Skip to content

Commit 57628b2

Browse files
authored
Merge pull request #2030 from shapeblue/snapshot-housekeeping
CLOUDSTACK-9864 cleanup stale worker VMs after job expiry time
2 parents 8854d4c + 084febb commit 57628b2

File tree

5 files changed

+43
-15
lines changed

5 files changed

+43
-15
lines changed

engine/storage/volume/src/org/apache/cloudstack/storage/volume/VolumeServiceImpl.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1993,8 +1993,13 @@ public SnapshotInfo takeSnapshot(VolumeInfo volume) {
19931993
SnapshotInfo snapshot = null;
19941994
try {
19951995
snapshot = snapshotMgr.takeSnapshot(volume);
1996+
} catch (CloudRuntimeException cre) {
1997+
s_logger.error("Take snapshot: " + volume.getId() + " failed", cre);
1998+
throw cre;
19961999
} catch (Exception e) {
1997-
s_logger.debug("Take snapshot: " + volume.getId() + " failed", e);
2000+
if(s_logger.isDebugEnabled()) {
2001+
s_logger.debug("unknown exception while taking snapshot for volume " + volume.getId() + " was caught", e);
2002+
}
19982003
throw new CloudRuntimeException("Failed to take snapshot", e);
19992004
}
20002005

framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@
8585

8686
public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, ClusterManagerListener, Configurable {
8787
// Advanced
88-
private static final ConfigKey<Long> JobExpireMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.expire.minutes", "1440",
88+
public static final ConfigKey<Long> JobExpireMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.expire.minutes", "1440",
8989
"Time (in minutes) for async-jobs to be kept in system", true, ConfigKey.Scope.Global);
90-
private static final ConfigKey<Long> JobCancelThresholdMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.cancel.threshold.minutes", "60",
90+
public static final ConfigKey<Long> JobCancelThresholdMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.cancel.threshold.minutes", "60",
9191
"Time (in minutes) for async-jobs to be forcely cancelled if it has been in process for long", true, ConfigKey.Scope.Global);
9292
private static final ConfigKey<Integer> VmJobLockTimeout = new ConfigKey<Integer>("Advanced",
9393
Integer.class, "vm.job.lock.timeout", "1800",

plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManager.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ public interface VmwareManager {
3535
public static final ConfigKey<Long> s_vmwareNicHotplugWaitTimeout = new ConfigKey<Long>("Advanced", Long.class, "vmware.nic.hotplug.wait.timeout", "15000",
3636
"Wait timeout (milli seconds) for hot plugged NIC of VM to be detected by guest OS.", false, ConfigKey.Scope.Global);
3737

38+
public static final ConfigKey<Boolean> s_vmwareCleanOldWorderVMs = new ConfigKey<Boolean>("Advanced", Boolean.class, "vmware.clean.old.worker.vms", "false",
39+
"If a worker vm is older then twice the 'job.expire.minutes' + 'job.cancel.threshold.minutes' , remove it.", true, ConfigKey.Scope.Global);
40+
3841
String composeWorkerName();
3942

4043
String getSystemVMIsoFileNameOnDatastore();

plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManagerImpl.java

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import java.net.URISyntaxException;
2323
import java.net.URL;
2424
import java.rmi.RemoteException;
25+
import java.time.Duration;
26+
import java.time.Instant;
2527
import java.util.ArrayList;
2628
import java.util.HashMap;
2729
import java.util.List;
@@ -35,6 +37,7 @@
3537
import javax.inject.Inject;
3638
import javax.naming.ConfigurationException;
3739

40+
import org.apache.cloudstack.framework.jobs.impl.AsyncJobManagerImpl;
3841
import org.apache.log4j.Logger;
3942

4043
import com.vmware.vim25.AboutInfo;
@@ -128,6 +131,7 @@
128131
public class VmwareManagerImpl extends ManagerBase implements VmwareManager, VmwareStorageMount, Listener, VmwareDatacenterService, Configurable {
129132
private static final Logger s_logger = Logger.getLogger(VmwareManagerImpl.class);
130133

134+
private static final long SECONDS_PER_MINUTE = 60;
131135
private static final int STARTUP_DELAY = 60000; // 60 seconds
132136
private static final long DEFAULT_HOST_SCAN_INTERVAL = 600000; // every 10 minutes
133137
private long _hostScanInterval = DEFAULT_HOST_SCAN_INTERVAL;
@@ -212,7 +216,7 @@ public String getConfigComponentName() {
212216

213217
@Override
214218
public ConfigKey<?>[] getConfigKeys() {
215-
return new ConfigKey<?>[] {s_vmwareNicHotplugWaitTimeout};
219+
return new ConfigKey<?>[] {s_vmwareNicHotplugWaitTimeout, s_vmwareCleanOldWorderVMs};
216220
}
217221

218222
@Override
@@ -534,7 +538,7 @@ public boolean needRecycle(String workerTag) {
534538
return false;
535539
}
536540

537-
Long.parseLong(tokens[0]);
541+
long startTick = Long.parseLong(tokens[0]);
538542
long msid = Long.parseLong(tokens[1]);
539543
long runid = Long.parseLong(tokens[2]);
540544

@@ -550,15 +554,22 @@ public boolean needRecycle(String workerTag) {
550554
return true;
551555
}
552556

553-
// disable time-out check until we have found out a VMware API that can check if
554-
// there are pending tasks on the subject VM
555-
/*
556-
if(System.currentTimeMillis() - startTick > _hungWorkerTimeout) {
557-
if(s_logger.isInfoEnabled())
558-
s_logger.info("Worker VM expired, seconds elapsed: " + (System.currentTimeMillis() - startTick) / 1000);
559-
return true;
560-
}
561-
*/
557+
// this time-out check was disabled
558+
// "until we have found out a VMware API that can check if there are pending tasks on the subject VM"
559+
// but as we expire jobs, and those stale worker VMs stay around until an MS reboot, we opt in to have them removed anyway
560+
Instant start = Instant.ofEpochMilli(startTick);
561+
Instant end = start.plusSeconds(2 * (AsyncJobManagerImpl.JobExpireMinutes.value() + AsyncJobManagerImpl.JobCancelThresholdMinutes.value()) * SECONDS_PER_MINUTE);
562+
Instant now = Instant.now();
563+
if(s_vmwareCleanOldWorderVMs.value() && now.isAfter(end)) {
564+
if(s_logger.isInfoEnabled()) {
565+
s_logger.info("Worker VM expired, seconds elapsed: " + Duration.between(start,now).getSeconds());
566+
}
567+
return true;
568+
}
569+
if (s_logger.isTraceEnabled()) {
570+
s_logger.trace("Worker VM with tag '" + workerTag + "' does not need recycling, yet." +
571+
"But in " + Duration.between(now,end).getSeconds() + " seconds, though");
572+
}
562573
return false;
563574
}
564575

server/src/com/cloud/storage/snapshot/SnapshotManagerImpl.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1114,8 +1114,17 @@ public SnapshotInfo takeSnapshot(VolumeInfo volume) throws ResourceAllocationExc
11141114
} catch (Exception e) {
11151115
s_logger.debug("post process snapshot failed", e);
11161116
}
1117+
} catch (CloudRuntimeException cre) {
1118+
if(s_logger.isDebugEnabled()) {
1119+
s_logger.debug("Failed to create snapshot" + cre.getLocalizedMessage());
1120+
}
1121+
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot);
1122+
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize()));
1123+
throw cre;
11171124
} catch (Exception e) {
1118-
s_logger.debug("Failed to create snapshot", e);
1125+
if(s_logger.isDebugEnabled()) {
1126+
s_logger.debug("Failed to create snapshot", e);
1127+
}
11191128
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot);
11201129
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize()));
11211130
throw new CloudRuntimeException("Failed to create snapshot", e);

0 commit comments

Comments
 (0)