/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.oozie.command.wf;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.servlet.jsp.el.ELException;

import org.apache.hadoop.conf.Configuration;
import org.apache.oozie.ErrorCode;
import org.apache.oozie.FaultInjection;
import org.apache.oozie.SLAEventBean;
import org.apache.oozie.WorkflowActionBean;
import org.apache.oozie.WorkflowJobBean;
import org.apache.oozie.XException;
import org.apache.oozie.action.ActionExecutor;
import org.apache.oozie.action.ActionExecutorException;
import org.apache.oozie.action.control.ControlNodeActionExecutor;
import org.apache.oozie.client.OozieClient;
import org.apache.oozie.client.WorkflowAction;
import org.apache.oozie.client.WorkflowJob;
import org.apache.oozie.client.SLAEvent.SlaAppType;
import org.apache.oozie.client.SLAEvent.Status;
import org.apache.oozie.client.rest.JsonBean;
import org.apache.oozie.client.rest.JsonTags;
import org.apache.oozie.command.CommandException;
import org.apache.oozie.command.PreconditionException;
import org.apache.oozie.command.XCommand;
import org.apache.oozie.executor.jpa.BatchQueryExecutor.UpdateEntry;
import org.apache.oozie.executor.jpa.BatchQueryExecutor;
import org.apache.oozie.executor.jpa.JPAExecutorException;
import org.apache.oozie.executor.jpa.WorkflowActionQueryExecutor;
import org.apache.oozie.executor.jpa.WorkflowActionQueryExecutor.WorkflowActionQuery;
import org.apache.oozie.executor.jpa.WorkflowJobQueryExecutor;
import org.apache.oozie.executor.jpa.WorkflowJobQueryExecutor.WorkflowJobQuery;
import org.apache.oozie.service.ActionService;
import org.apache.oozie.service.EventHandlerService;
import org.apache.oozie.service.JPAService;
import org.apache.oozie.service.Services;
import org.apache.oozie.service.UUIDService;
import org.apache.oozie.util.DateUtils;
import org.apache.oozie.util.ELEvaluationException;
import org.apache.oozie.util.Instrumentation;
import org.apache.oozie.util.JobUtils;
import org.apache.oozie.util.LogUtils;
import org.apache.oozie.util.XLog;
import org.apache.oozie.util.XmlUtils;
import org.apache.oozie.util.db.SLADbXOperations;

/**
 * Command that starts a workflow action.
 * <p>
 * It loads the workflow job and action, resolves the action configuration through the context's EL evaluator,
 * invokes {@link ActionExecutor#start} on the executor registered for the action type, and writes the resulting
 * job/action updates and SLA events in one batch.
 */
@SuppressWarnings("deprecation")
public class ActionStartXCommand extends ActionXCommand<org.apache.oozie.command.wf.ActionXCommand.ActionExecutorContext> {
    /** Error code set on the context when EL evaluation of the action configuration fails with an ELException. */
    public static final String EL_ERROR = "EL_ERROR";
    /** Error code used for the transient ActionExecutorException raised on an ELEvaluationException. */
    public static final String EL_EVAL_ERROR = "EL_EVAL_ERROR";
    public static final String COULD_NOT_START = "COULD_NOT_START";
    /** Error code set when the executor started the action without marking the context as started. */
    public static final String START_DATA_MISSING = "START_DATA_MISSING";
    /** Error code set when the action completed without the executor marking the context as executed. */
    public static final String EXEC_DATA_MISSING = "EXEC_DATA_MISSING";

    private String jobId = null;
    protected String actionId = null;
    protected WorkflowJobBean wfJob = null;
    protected WorkflowActionBean wfAction = null;
    private JPAService jpaService = null;
    private ActionExecutor executor = null;
    // Pending DB updates and inserts; flushed together in the finally block of execute().
    private List<UpdateEntry> updateList = new ArrayList<UpdateEntry>();
    private List<JsonBean> insertList = new ArrayList<JsonBean>();
    protected ActionExecutorContext context = null;

    /**
     * Creates a start command for the given action; the job id is derived from the action id via
     * {@link UUIDService} and the job itself is loaded later in {@link #loadState()}.
     *
     * @param actionId id of the workflow action to start
     * @param type command type passed to the parent command
     */
    public ActionStartXCommand(String actionId, String type) {
        super("action.start", type, 0);
        this.actionId = actionId;
        this.jobId = Services.get().get(UUIDService.class).getId(actionId);
    }

    /**
     * Creates a start command with an already loaded workflow job, so that {@link #loadState()} does not have to
     * fetch the job again.
     *
     * @param job workflow job owning the action; must not be null since its id is read here
     * @param actionId id of the workflow action to start
     * @param type command type passed to the parent command
     */
    public ActionStartXCommand(WorkflowJobBean job, String actionId, String type) {
        super("action.start", type, 0);
        this.actionId = actionId;
        this.wfJob = job;
        this.jobId = wfJob.getId();
    }

    @Override
    protected void setLogInfo() {
        LogUtils.setLogInfo(actionId);
    }

    @Override
    protected boolean isLockRequired() {
        return true;
    }

    /**
     * @return the workflow job id, used as the entity key of this command
     */
    @Override
    public String getEntityKey() {
        return this.jobId;
    }

    /**
     * Loads the workflow job (unless one was supplied to the constructor) and the workflow action.
     *
     * @throws CommandException with E0610 if no {@link JPAService} is available, or wrapping any
     *         {@link XException} raised by the queries
     */
    @Override
    protected void loadState() throws CommandException {
        try {
            jpaService = Services.get().get(JPAService.class);
            if (jpaService != null) {
                if (wfJob == null) {
                    this.wfJob = WorkflowJobQueryExecutor.getInstance().get(WorkflowJobQuery.GET_WORKFLOW, jobId);
                }
                this.wfAction = WorkflowActionQueryExecutor.getInstance().get(WorkflowActionQuery.GET_ACTION, actionId);
                LogUtils.setLogInfo(wfJob);
                LogUtils.setLogInfo(wfAction);
            }
            else {
                throw new CommandException(ErrorCode.E0610);
            }
        }
        catch (XException ex) {
            throw new CommandException(ex);
        }
    }

    /**
     * Verifies that the job and action are loaded, that the action is pending in one of the startable states
     * (PREP, START_RETRY, START_MANUAL, USER_RETRY), that the job is RUNNING, and resolves the executor for the
     * action type.
     *
     * @throws PreconditionException if the job or action is missing (E0604/E0605), the job is not RUNNING (E0810)
     *         or the action is not in a startable state (E0816)
     * @throws CommandException with E0802 if no executor is registered for the action type
     */
    @Override
    protected void verifyPrecondition() throws CommandException, PreconditionException {
        if (wfJob == null) {
            throw new PreconditionException(ErrorCode.E0604, jobId);
        }
        if (wfAction == null) {
            throw new PreconditionException(ErrorCode.E0605, actionId);
        }
        if (wfAction.isPending()
                && (wfAction.getStatus() == WorkflowActionBean.Status.PREP
                        || wfAction.getStatus() == WorkflowActionBean.Status.START_RETRY
                        || wfAction.getStatus() == WorkflowActionBean.Status.START_MANUAL
                        || wfAction.getStatus() == WorkflowActionBean.Status.USER_RETRY
                        )) {
            if (wfJob.getStatus() != WorkflowJob.Status.RUNNING) {
                throw new PreconditionException(ErrorCode.E0810, WorkflowJob.Status.RUNNING.toString());
            }
        }
        else {
            throw new PreconditionException(ErrorCode.E0816, wfAction.isPending(), wfAction.getStatusStr());
        }

        executor = Services.get().get(ActionService.class).getExecutor(wfAction.getType());
        if (executor == null) {
            throw new CommandException(ErrorCode.E0802, wfAction.getType());
        }
    }

    /**
     * Starts the action.
     * <p>
     * Retry settings are taken from the job configuration for regular actions and forced to zero for control
     * nodes. The action configuration is EL-evaluated, the executor is started, and depending on the outcome an
     * {@link ActionEndXCommand} or {@link WorkflowNotificationXCommand} is queued (control nodes end synchronously).
     * All collected inserts/updates are written in the finally block regardless of the outcome.
     *
     * @return always null
     * @throws CommandException if error handling fails or the batch DB write fails
     */
    @Override
    protected ActionExecutorContext execute() throws CommandException {
        LOG.debug("STARTED ActionStartXCommand for wf actionId=" + actionId);
        Configuration conf = wfJob.getWorkflowInstance().getConf();

        int maxRetries = 0;
        long retryInterval = 0;
        boolean execSynchronous = false;

        // Control nodes keep retries disabled (0/0); other actions read them from the job configuration.
        if (!(executor instanceof ControlNodeActionExecutor)) {
            maxRetries = conf.getInt(OozieClient.ACTION_MAX_RETRIES, executor.getMaxRetries());
            retryInterval = conf.getLong(OozieClient.ACTION_RETRY_INTERVAL, executor.getRetryInterval());
        }

        executor.setMaxRetries(maxRetries);
        executor.setRetryInterval(retryInterval);

        try {
            boolean isRetry = false;
            if (wfAction.getStatus() == WorkflowActionBean.Status.START_RETRY
                    || wfAction.getStatus() == WorkflowActionBean.Status.START_MANUAL) {
                isRetry = true;
                prepareForRetry(wfAction);
            }
            boolean isUserRetry = false;
            if (wfAction.getStatus() == WorkflowActionBean.Status.USER_RETRY) {
                isUserRetry = true;
                prepareForRetry(wfAction);
            }
            context = getContext(isRetry, isUserRetry);
            // Set when configuration evaluation failed and was already handled; the start is then skipped.
            boolean caught = false;
            try {
                if (!(executor instanceof ControlNodeActionExecutor)) {
                    String tmpActionConf = XmlUtils.removeComments(wfAction.getConf());
                    String actionConf = context.getELEvaluator().evaluate(tmpActionConf, String.class);
                    wfAction.setConf(actionConf);
                    LOG.debug("Start, name [{0}] type [{1}] configuration{E}{E}{2}{E}", wfAction.getName(), wfAction
                            .getType(), actionConf);
                }
            }
            catch (ELEvaluationException ex) {
                caught = true;
                // Rethrown as TRANSIENT so the outer handler applies the retry logic.
                throw new ActionExecutorException(ActionExecutorException.ErrorType.TRANSIENT, EL_EVAL_ERROR, ex
                        .getMessage(), ex);
            }
            catch (ELException ex) {
                caught = true;
                context.setErrorInfo(EL_ERROR, ex.getMessage());
                LOG.warn("ELException in ActionStartXCommand {0}", ex.getMessage(), ex);
                handleError(context, wfJob, wfAction);
            }
            catch (org.jdom.JDOMException je) {
                caught = true;
                context.setErrorInfo("ParsingError", je.getMessage());
                LOG.warn("JDOMException in ActionStartXCommand {0}", je.getMessage(), je);
                handleError(context, wfJob, wfAction);
            }
            catch (Exception ex) {
                caught = true;
                context.setErrorInfo(EL_ERROR, ex.getMessage());
                LOG.warn("Exception in ActionStartXCommand {0}", ex.getMessage(), ex);
                handleError(context, wfJob, wfAction);
            }
            if (!caught) {
                wfAction.setErrorInfo(null, null);
                incrActionCounter(wfAction.getType(), 1);

                LOG.info("Start action [{0}] with user-retry state : userRetryCount [{1}], userRetryMax [{2}], userRetryInterval"
                        + " [{3}]",
                        wfAction.getId(), wfAction.getUserRetryCount(), wfAction.getUserRetryMax(), wfAction
                                .getUserRetryInterval());

                Instrumentation.Cron cron = new Instrumentation.Cron();
                cron.start();
                // do not override starttime for retries
                if (wfAction.getStartTime() == null) {
                    context.setStartTime();
                }
                context.setVar(JobUtils.getRetryKey(wfAction, JsonTags.WORKFLOW_ACTION_START_TIME),
                        String.valueOf(new Date().getTime()));
                executor.start(context, wfAction);
                cron.stop();
                FaultInjection.activate("org.apache.oozie.command.SkipCommitFaultInjection");
                addActionCron(wfAction.getType(), cron);

                wfAction.setRetries(0);
                if (wfAction.isExecutionComplete()) {
                    if (!context.isExecuted()) {
                        LOG.warn(XLog.OPS, "Action Completed, ActionExecutor [{0}] must call setExecutionData()", executor
                                .getType());
                        wfAction.setErrorInfo(EXEC_DATA_MISSING,
                                "Execution Complete, but Execution Data Missing from Action");
                        failJob(context);
                    } else {
                        wfAction.setPending();
                        if (!(executor instanceof ControlNodeActionExecutor)) {
                            queue(new ActionEndXCommand(wfAction.getId(), wfAction.getType()));
                        }
                        else {
                            // Control nodes are ended synchronously from the finally block below.
                            execSynchronous = true;
                        }
                    }
                }
                else {
                    if (!context.isStarted()) {
                        LOG.warn(XLog.OPS, "Action Started, ActionExecutor [{0}] must call setStartData()", executor
                                .getType());
                        wfAction.setErrorInfo(START_DATA_MISSING, "Execution Started, but Start Data Missing from Action");
                        failJob(context);
                    } else {
                        queue(new WorkflowNotificationXCommand(wfJob, wfAction));
                    }
                }

                LOG.info(XLog.STD, "[***" + wfAction.getId() + "***]" + "Action status=" + wfAction.getStatusStr());

                updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
                updateJobLastModified();
                // Add SLA status event (STARTED) for WF_ACTION
                SLAEventBean slaEvent = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(), Status.STARTED,
                        SlaAppType.WORKFLOW_ACTION);
                if (slaEvent != null) {
                    insertList.add(slaEvent);
                }
                LOG.info(XLog.STD, "[***" + wfAction.getId() + "***]" + "Action updated in DB!");
            }
        }
        catch (ActionExecutorException ex) {
            LOG.warn("Error starting action [{0}]. ErrorType [{1}], ErrorCode [{2}], Message [{3}]",
                    wfAction.getName(), ex.getErrorType(), ex.getErrorCode(), ex.getMessage(), ex);
            wfAction.setErrorInfo(ex.getErrorCode(), ex.getMessage());
            switch (ex.getErrorType()) {
                case TRANSIENT:
                    // When the transient handler declines, fall back to manual handling and reset retry state.
                    if (!handleTransient(context, executor, WorkflowAction.Status.START_RETRY)) {
                        handleNonTransient(context, executor, WorkflowAction.Status.START_MANUAL);
                        wfAction.setPendingAge(new Date());
                        wfAction.setRetries(0);
                        wfAction.setStartTime(null);
                    }
                    break;
                case NON_TRANSIENT:
                    handleNonTransient(context, executor, WorkflowAction.Status.START_MANUAL);
                    break;
                case ERROR:
                    handleError(context, executor, WorkflowAction.Status.ERROR.toString(), true,
                            WorkflowAction.Status.DONE);
                    break;
                case FAILED:
                    try {
                        failJob(context);
                        endWF();
                        SLAEventBean slaEvent1 = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(),
                                Status.FAILED,
                                SlaAppType.WORKFLOW_ACTION);
                        if (slaEvent1 != null) {
                            insertList.add(slaEvent1);
                        }
                    }
                    catch (XException x) {
                        // Best effort: the failure is logged (with message and stack trace) and not rethrown.
                        LOG.warn("ActionStartXCommand - case:FAILED {0}", x.getMessage(), x);
                    }
                    break;
            }
            updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
            updateJobLastModified();
        }
        finally {
            // NOTE(review): a CommandException thrown here replaces any exception propagating from the try
            // block above - confirm this masking is acceptable.
            try {
                BatchQueryExecutor.getInstance().executeBatchInsertUpdateDelete(insertList, updateList, null);
                if (!(executor instanceof ControlNodeActionExecutor) && EventHandlerService.isEnabled()) {
                    generateEvent(wfAction, wfJob.getUser());
                }
                if (execSynchronous) {
                    // Changing to synchronous call from asynchronous queuing to prevent
                    // undue delay from ::start:: to action due to queuing
                    callActionEnd();
                }
            }
            catch (JPAExecutorException e) {
                throw new CommandException(e);
            }
        }

        LOG.debug("ENDED ActionStartXCommand for wf actionId=" + actionId + ", jobId=" + jobId);

        return null;
    }

    /**
     * Synchronously runs an {@link ActionEndXCommand} for the current action.
     *
     * @throws CommandException if the end command fails
     */
    protected void callActionEnd() throws CommandException {
        new ActionEndXCommand(wfAction.getId(), wfAction.getType()).call();
    }

    /**
     * Get action executor context
     *
     * @param isRetry true when the action is being started from the START_RETRY or START_MANUAL status
     * @param isUserRetry true when the action is being started from the USER_RETRY status
     * @return ActionExecutorContext a new action executor context for the current job and action
     */
    protected ActionExecutorContext getContext(boolean isRetry, boolean isUserRetry) {
        return new ActionXCommand.ActionExecutorContext(wfJob, wfAction, isRetry, isUserRetry);
    }

    /**
     * Stamps the job with the current time as last-modified time and records the job update for the batch write.
     */
    protected void updateJobLastModified() {
        wfJob.setLastModifiedTime(new Date());
        updateList.add(new UpdateEntry<WorkflowJobQuery>(WorkflowJobQuery.UPDATE_WORKFLOW_STATUS_INSTANCE_MODIFIED, wfJob));
    }

    /**
     * Ends the workflow after a failure: updates the parent if necessary, runs {@link WfEndXCommand} and records
     * a FAILED SLA event for the workflow job when one is produced.
     *
     * @throws CommandException if updating the parent or ending the workflow fails
     */
    protected void endWF() throws CommandException {
        updateParentIfNecessary(wfJob, 3);
        new WfEndXCommand(wfJob).call(); // To delete the WF temp dir
        SLAEventBean slaEvent2 = SLADbXOperations.createStatusEvent(wfJob.getSlaXml(), wfJob.getId(), Status.FAILED,
                SlaAppType.WORKFLOW_JOB);
        if (slaEvent2 != null) {
            insertList.add(slaEvent2);
        }
    }

    /**
     * Fails the job for the given context, records the action and job updates, adds a FAILED SLA event for the
     * action when one is produced, and ends the workflow.
     * <p>
     * The action update entry is recorded for this command's {@code wfAction} field; the SLA event is built from
     * the {@code action} parameter.
     *
     * @param context executor context of the failing action
     * @param workflow workflow job bean (not read by this method)
     * @param action workflow action whose SLA definition and id are used for the FAILED event
     * @throws CommandException if failing the job or ending the workflow fails
     */
    protected void handleError(ActionExecutorContext context, WorkflowJobBean workflow, WorkflowActionBean action)
            throws CommandException {
        failJob(context);
        updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
        updateJobLastModified();
        SLAEventBean slaEvent1 = SLADbXOperations.createStatusEvent(action.getSlaXml(), action.getId(),
                Status.FAILED, SlaAppType.WORKFLOW_ACTION);
        if (slaEvent1 != null) {
            insertList.add(slaEvent1);
        }
        endWF();
    }

    /* (non-Javadoc)
     * @see org.apache.oozie.command.XCommand#getKey()
     */
    @Override
    public String getKey() {
        return getName() + "_" + actionId;
    }

    /**
     * Resets per-run state before a retry; for map-reduce actions the external child ids are cleared.
     *
     * @param wfAction action about to be retried
     */
    private void prepareForRetry(WorkflowActionBean wfAction) {
        // Constant-first comparison so a null action type cannot raise a NullPointerException.
        if ("map-reduce".equals(wfAction.getType())) {
            // need to delete child job id of original run
            wfAction.setExternalChildIDs("");
        }
    }

    @Override
    protected void queueCommandForTransientFailure(long retryDelayMillis) {
        queue(new ActionStartXCommand(wfAction.getId(), wfAction.getType()), retryDelayMillis);
    }

    /**
     * Queues a command with a delay, clearing the cached job of a requeued {@link ActionStartXCommand} first.
     *
     * @param command command to queue
     * @param msDelay delay in milliseconds before the command is executed
     */
    @Override
    protected void queue(XCommand<?> command, long msDelay) {
        // ActionStartXCommand is synchronously called from SignalXCommand passing wfJob so that it doesn't have to
        // reload wfJob again. We need to set wfJob to null, so that it gets reloaded when the requeued command executes.
        if (command instanceof ActionStartXCommand) {
            ((ActionStartXCommand) command).wfJob = null;
        }
        super.queue(command, msDelay);
    }
}