Set a node offline even if there is an exception
In particular, an InterruptedException is likely in the portions of safeExecuteFunction that wait for the Jenkins job to complete. In those cases, we still want to return an exception, but we also want to make sure that once we have scheduled a build on a node, when that build is finished (even if it is due to some catastrophe such as a failure to communicate with the node), we still take the node offline.

We have seen the occasional scheduled job stuck in Jenkins because of a situation where a node fails, and the gearman plugin schedules another build on the node because the offline method has not run. Meanwhile, nodepool deletes the node (because Jenkins said the job finished) and the scheduled build gets stuck. Taking the node offline in a finally block, so that it happens even when an exception is thrown, should eliminate these stuck builds.

Change-Id: I69b1e4b21430b7427ed47c3cb43bd94e04213321
This commit is contained in:
parent
af21876dfe
commit
ad75b7e0b0
|
@ -178,65 +178,68 @@ public class StartJobWorker extends AbstractGearmanFunction {
|
|||
// check build and pass results back to client
|
||||
String jobData;
|
||||
|
||||
// This is a hack that relies on implementation knowledge. In
|
||||
// order to actually send a WORK_STATUS packet before the
|
||||
// completion of work, we need to directly drive the session
|
||||
// IO, which requires a session object. We happen to know
|
||||
// that's what our event listener is.
|
||||
GearmanJobServerSession sess = null;
|
||||
try {
|
||||
// This is a hack that relies on implementation knowledge. In
|
||||
// order to actually send a WORK_STATUS packet before the
|
||||
// completion of work, we need to directly drive the session
|
||||
// IO, which requires a session object. We happen to know
|
||||
// that's what our event listener is.
|
||||
GearmanJobServerSession sess = null;
|
||||
|
||||
for (GearmanIOEventListener listener : listeners) {
|
||||
if (listener instanceof GearmanJobServerSession) {
|
||||
sess = (GearmanJobServerSession)listener;
|
||||
}
|
||||
}
|
||||
|
||||
// wait for start of build
|
||||
Queue.Executable exec = future.getStartCondition().get();
|
||||
AbstractBuild<?, ?> currBuild = (AbstractBuild<?, ?>) exec;
|
||||
|
||||
if (!offlineWhenComplete) {
|
||||
// Unlock the monitor for this worker
|
||||
availability.unlock(worker);
|
||||
}
|
||||
|
||||
long now = new Date().getTime();
|
||||
int duration = (int) (now - currBuild.getStartTimeInMillis());
|
||||
int estimatedDuration = (int) currBuild.getEstimatedDuration();
|
||||
jobData = buildStatusData(currBuild);
|
||||
|
||||
sendData(jobData.getBytes());
|
||||
sess.driveSessionIO();
|
||||
sendStatus(estimatedDuration, duration);
|
||||
sess.driveSessionIO();
|
||||
|
||||
while (!future.isDone()) {
|
||||
// wait for jenkins build to complete
|
||||
try {
|
||||
future.get(10, TimeUnit.SECONDS);
|
||||
} catch (TimeoutException e) {
|
||||
now = new Date().getTime();
|
||||
duration = (int) (now - currBuild.getStartTimeInMillis());
|
||||
estimatedDuration = (int) currBuild.getEstimatedDuration();
|
||||
if (sess != null) {
|
||||
sendStatus(estimatedDuration, duration);
|
||||
sess.driveSessionIO();
|
||||
for (GearmanIOEventListener listener : listeners) {
|
||||
if (listener instanceof GearmanJobServerSession) {
|
||||
sess = (GearmanJobServerSession)listener;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exec = future.get();
|
||||
jobData = buildStatusData(currBuild);
|
||||
// wait for start of build
|
||||
Queue.Executable exec = future.getStartCondition().get();
|
||||
AbstractBuild<?, ?> currBuild = (AbstractBuild<?, ?>) exec;
|
||||
|
||||
if (offlineWhenComplete) {
|
||||
if (computer == null) {
|
||||
logger.error("---- Worker " + this.worker + " has no " +
|
||||
"computer while trying to take node offline.");
|
||||
} else {
|
||||
logger.info("---- Worker " + this.worker + " setting " +
|
||||
"node offline.");
|
||||
computer.setTemporarilyOffline(true,
|
||||
new OfflineCause.ByCLI("Offline due to Gearman request"));
|
||||
if (!offlineWhenComplete) {
|
||||
// Unlock the monitor for this worker
|
||||
availability.unlock(worker);
|
||||
}
|
||||
|
||||
long now = new Date().getTime();
|
||||
int duration = (int) (now - currBuild.getStartTimeInMillis());
|
||||
int estimatedDuration = (int) currBuild.getEstimatedDuration();
|
||||
jobData = buildStatusData(currBuild);
|
||||
|
||||
sendData(jobData.getBytes());
|
||||
sess.driveSessionIO();
|
||||
sendStatus(estimatedDuration, duration);
|
||||
sess.driveSessionIO();
|
||||
|
||||
while (!future.isDone()) {
|
||||
// wait for jenkins build to complete
|
||||
try {
|
||||
future.get(10, TimeUnit.SECONDS);
|
||||
} catch (TimeoutException e) {
|
||||
now = new Date().getTime();
|
||||
duration = (int) (now - currBuild.getStartTimeInMillis());
|
||||
estimatedDuration = (int) currBuild.getEstimatedDuration();
|
||||
if (sess != null) {
|
||||
sendStatus(estimatedDuration, duration);
|
||||
sess.driveSessionIO();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exec = future.get();
|
||||
jobData = buildStatusData(currBuild);
|
||||
|
||||
} finally {
|
||||
if (offlineWhenComplete) {
|
||||
if (computer == null) {
|
||||
logger.error("---- Worker " + this.worker + " has no " +
|
||||
"computer while trying to take node offline.");
|
||||
} else {
|
||||
logger.info("---- Worker " + this.worker + " setting " +
|
||||
"node offline.");
|
||||
computer.setTemporarilyOffline(true,
|
||||
new OfflineCause.ByCLI("Offline due to Gearman request"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue