Set a node offline even if there is an exception

In particular, an InterruptedException is likely in the portions
of safeExecuteFunction that wait for the Jenkins job to complete.
In those cases, we still want to return an exception, but we also
want to make sure that, once we have scheduled a build on a node,
we take the node offline when that build finishes, even if it
finishes because of some catastrophe such as a failure to
communicate with the node.

We have seen the occasional scheduled job stuck in Jenkins because
of a situation where a node fails, and the gearman plugin schedules
another build on the node because the offline method has not run.
Meanwhile, nodepool deletes the node (because Jenkins said the
job finished) and the scheduled build gets stuck.  Taking the node
offline in a finally block should eliminate that.

Change-Id: I69b1e4b21430b7427ed47c3cb43bd94e04213321
This commit is contained in:
James E. Blair 2013-09-11 14:15:59 -07:00
parent af21876dfe
commit ad75b7e0b0
1 changed file with 57 additions and 54 deletions

View File

@@ -178,65 +178,68 @@ public class StartJobWorker extends AbstractGearmanFunction {
// check build and pass results back to client
String jobData;
// This is a hack that relies on implementation knowledge. In
// order to actually send a WORK_STATUS packet before the
// completion of work, we need to directly drive the session
// IO, which requires a session object. We happen to know
// that's what our event listener is.
GearmanJobServerSession sess = null;
try {
// This is a hack that relies on implementation knowledge. In
// order to actually send a WORK_STATUS packet before the
// completion of work, we need to directly drive the session
// IO, which requires a session object. We happen to know
// that's what our event listener is.
GearmanJobServerSession sess = null;
for (GearmanIOEventListener listener : listeners) {
if (listener instanceof GearmanJobServerSession) {
sess = (GearmanJobServerSession)listener;
}
}
// wait for start of build
Queue.Executable exec = future.getStartCondition().get();
AbstractBuild<?, ?> currBuild = (AbstractBuild<?, ?>) exec;
if (!offlineWhenComplete) {
// Unlock the monitor for this worker
availability.unlock(worker);
}
long now = new Date().getTime();
int duration = (int) (now - currBuild.getStartTimeInMillis());
int estimatedDuration = (int) currBuild.getEstimatedDuration();
jobData = buildStatusData(currBuild);
sendData(jobData.getBytes());
sess.driveSessionIO();
sendStatus(estimatedDuration, duration);
sess.driveSessionIO();
while (!future.isDone()) {
// wait for jenkins build to complete
try {
future.get(10, TimeUnit.SECONDS);
} catch (TimeoutException e) {
now = new Date().getTime();
duration = (int) (now - currBuild.getStartTimeInMillis());
estimatedDuration = (int) currBuild.getEstimatedDuration();
if (sess != null) {
sendStatus(estimatedDuration, duration);
sess.driveSessionIO();
for (GearmanIOEventListener listener : listeners) {
if (listener instanceof GearmanJobServerSession) {
sess = (GearmanJobServerSession)listener;
}
}
}
exec = future.get();
jobData = buildStatusData(currBuild);
// wait for start of build
Queue.Executable exec = future.getStartCondition().get();
AbstractBuild<?, ?> currBuild = (AbstractBuild<?, ?>) exec;
if (offlineWhenComplete) {
if (computer == null) {
logger.error("---- Worker " + this.worker + " has no " +
"computer while trying to take node offline.");
} else {
logger.info("---- Worker " + this.worker + " setting " +
"node offline.");
computer.setTemporarilyOffline(true,
new OfflineCause.ByCLI("Offline due to Gearman request"));
if (!offlineWhenComplete) {
// Unlock the monitor for this worker
availability.unlock(worker);
}
long now = new Date().getTime();
int duration = (int) (now - currBuild.getStartTimeInMillis());
int estimatedDuration = (int) currBuild.getEstimatedDuration();
jobData = buildStatusData(currBuild);
sendData(jobData.getBytes());
sess.driveSessionIO();
sendStatus(estimatedDuration, duration);
sess.driveSessionIO();
while (!future.isDone()) {
// wait for jenkins build to complete
try {
future.get(10, TimeUnit.SECONDS);
} catch (TimeoutException e) {
now = new Date().getTime();
duration = (int) (now - currBuild.getStartTimeInMillis());
estimatedDuration = (int) currBuild.getEstimatedDuration();
if (sess != null) {
sendStatus(estimatedDuration, duration);
sess.driveSessionIO();
}
}
}
exec = future.get();
jobData = buildStatusData(currBuild);
} finally {
if (offlineWhenComplete) {
if (computer == null) {
logger.error("---- Worker " + this.worker + " has no " +
"computer while trying to take node offline.");
} else {
logger.info("---- Worker " + this.worker + " setting " +
"node offline.");
computer.setTemporarilyOffline(true,
new OfflineCause.ByCLI("Offline due to Gearman request"));
}
}
}