001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *      http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.oozie.command.wf;
020
021import java.util.ArrayList;
022import java.util.Date;
023import java.util.List;
024
025import javax.servlet.jsp.el.ELException;
026
027import org.apache.hadoop.conf.Configuration;
028import org.apache.oozie.ErrorCode;
029import org.apache.oozie.FaultInjection;
030import org.apache.oozie.SLAEventBean;
031import org.apache.oozie.WorkflowActionBean;
032import org.apache.oozie.WorkflowJobBean;
033import org.apache.oozie.XException;
034import org.apache.oozie.action.ActionExecutor;
035import org.apache.oozie.action.ActionExecutorException;
036import org.apache.oozie.action.control.ControlNodeActionExecutor;
037import org.apache.oozie.client.OozieClient;
038import org.apache.oozie.client.WorkflowAction;
039import org.apache.oozie.client.WorkflowJob;
040import org.apache.oozie.client.SLAEvent.SlaAppType;
041import org.apache.oozie.client.SLAEvent.Status;
042import org.apache.oozie.client.rest.JsonBean;
043import org.apache.oozie.client.rest.JsonTags;
044import org.apache.oozie.command.CommandException;
045import org.apache.oozie.command.PreconditionException;
046import org.apache.oozie.command.XCommand;
047import org.apache.oozie.executor.jpa.BatchQueryExecutor.UpdateEntry;
048import org.apache.oozie.executor.jpa.BatchQueryExecutor;
049import org.apache.oozie.executor.jpa.JPAExecutorException;
050import org.apache.oozie.executor.jpa.WorkflowActionQueryExecutor;
051import org.apache.oozie.executor.jpa.WorkflowActionQueryExecutor.WorkflowActionQuery;
052import org.apache.oozie.executor.jpa.WorkflowJobQueryExecutor;
053import org.apache.oozie.executor.jpa.WorkflowJobQueryExecutor.WorkflowJobQuery;
054import org.apache.oozie.service.ActionService;
055import org.apache.oozie.service.EventHandlerService;
056import org.apache.oozie.service.JPAService;
057import org.apache.oozie.service.Services;
058import org.apache.oozie.service.UUIDService;
059import org.apache.oozie.util.DateUtils;
060import org.apache.oozie.util.ELEvaluationException;
061import org.apache.oozie.util.Instrumentation;
062import org.apache.oozie.util.JobUtils;
063import org.apache.oozie.util.LogUtils;
064import org.apache.oozie.util.XLog;
065import org.apache.oozie.util.XmlUtils;
066import org.apache.oozie.util.db.SLADbXOperations;
067
068@SuppressWarnings("deprecation")
069public class ActionStartXCommand extends ActionXCommand<org.apache.oozie.command.wf.ActionXCommand.ActionExecutorContext> {
070    public static final String EL_ERROR = "EL_ERROR";
071    public static final String EL_EVAL_ERROR = "EL_EVAL_ERROR";
072    public static final String COULD_NOT_START = "COULD_NOT_START";
073    public static final String START_DATA_MISSING = "START_DATA_MISSING";
074    public static final String EXEC_DATA_MISSING = "EXEC_DATA_MISSING";
075
076    private String jobId = null;
077    protected String actionId = null;
078    protected WorkflowJobBean wfJob = null;
079    protected WorkflowActionBean wfAction = null;
080    private JPAService jpaService = null;
081    private ActionExecutor executor = null;
082    private List<UpdateEntry> updateList = new ArrayList<UpdateEntry>();
083    private List<JsonBean> insertList = new ArrayList<JsonBean>();
084    protected ActionExecutorContext context = null;
085
086    public ActionStartXCommand(String actionId, String type) {
087        super("action.start", type, 0);
088        this.actionId = actionId;
089        this.jobId = Services.get().get(UUIDService.class).getId(actionId);
090    }
091
092    public ActionStartXCommand(WorkflowJobBean job, String actionId, String type) {
093        super("action.start", type, 0);
094        this.actionId = actionId;
095        this.wfJob = job;
096        this.jobId = wfJob.getId();
097    }
098
099    @Override
100    protected void setLogInfo() {
101        LogUtils.setLogInfo(actionId);
102    }
103
104    @Override
105    protected boolean isLockRequired() {
106        return true;
107    }
108
109    @Override
110    public String getEntityKey() {
111        return this.jobId;
112    }
113
114    @Override
115    protected void loadState() throws CommandException {
116        try {
117            jpaService = Services.get().get(JPAService.class);
118            if (jpaService != null) {
119                if (wfJob == null) {
120                    this.wfJob = WorkflowJobQueryExecutor.getInstance().get(WorkflowJobQuery.GET_WORKFLOW, jobId);
121                }
122                this.wfAction = WorkflowActionQueryExecutor.getInstance().get(WorkflowActionQuery.GET_ACTION, actionId);
123                LogUtils.setLogInfo( wfJob);
124                LogUtils.setLogInfo(wfAction);
125            }
126            else {
127                throw new CommandException(ErrorCode.E0610);
128            }
129        }
130        catch (XException ex) {
131            throw new CommandException(ex);
132        }
133    }
134
135    @Override
136    protected void verifyPrecondition() throws CommandException, PreconditionException {
137        if (wfJob == null) {
138            throw new PreconditionException(ErrorCode.E0604, jobId);
139        }
140        if (wfAction == null) {
141            throw new PreconditionException(ErrorCode.E0605, actionId);
142        }
143        if (wfAction.isPending()
144                && (wfAction.getStatus() == WorkflowActionBean.Status.PREP
145                        || wfAction.getStatus() == WorkflowActionBean.Status.START_RETRY
146                        || wfAction.getStatus() == WorkflowActionBean.Status.START_MANUAL
147                        || wfAction.getStatus() == WorkflowActionBean.Status.USER_RETRY
148                        )) {
149            if (wfJob.getStatus() != WorkflowJob.Status.RUNNING) {
150                throw new PreconditionException(ErrorCode.E0810, WorkflowJob.Status.RUNNING.toString());
151            }
152        }
153        else {
154            throw new PreconditionException(ErrorCode.E0816, wfAction.isPending(), wfAction.getStatusStr());
155        }
156
157        executor = Services.get().get(ActionService.class).getExecutor(wfAction.getType());
158        if (executor == null) {
159            throw new CommandException(ErrorCode.E0802, wfAction.getType());
160        }
161    }
162
163    @Override
164    protected ActionExecutorContext execute() throws CommandException {
165        LOG.debug("STARTED ActionStartXCommand for wf actionId=" + actionId);
166        Configuration conf = wfJob.getWorkflowInstance().getConf();
167
168        int maxRetries = 0;
169        long retryInterval = 0;
170        boolean execSynchronous = false;
171
172        if (!(executor instanceof ControlNodeActionExecutor)) {
173            maxRetries = conf.getInt(OozieClient.ACTION_MAX_RETRIES, executor.getMaxRetries());
174            retryInterval = conf.getLong(OozieClient.ACTION_RETRY_INTERVAL, executor.getRetryInterval());
175        }
176
177        executor.setMaxRetries(maxRetries);
178        executor.setRetryInterval(retryInterval);
179
180        try {
181            boolean isRetry = false;
182            if (wfAction.getStatus() == WorkflowActionBean.Status.START_RETRY
183                    || wfAction.getStatus() == WorkflowActionBean.Status.START_MANUAL) {
184                isRetry = true;
185                prepareForRetry(wfAction);
186            }
187            boolean isUserRetry = false;
188            if (wfAction.getStatus() == WorkflowActionBean.Status.USER_RETRY) {
189                isUserRetry = true;
190                prepareForRetry(wfAction);
191            }
192            context = getContext(isRetry, isUserRetry);
193            boolean caught = false;
194            try {
195                if (!(executor instanceof ControlNodeActionExecutor)) {
196                    String tmpActionConf = XmlUtils.removeComments(wfAction.getConf());
197                    String actionConf = context.getELEvaluator().evaluate(tmpActionConf, String.class);
198                    wfAction.setConf(actionConf);
199                    LOG.debug("Start, name [{0}] type [{1}] configuration{E}{E}{2}{E}", wfAction.getName(), wfAction
200                            .getType(), actionConf);
201                }
202            }
203            catch (ELEvaluationException ex) {
204                caught = true;
205                throw new ActionExecutorException(ActionExecutorException.ErrorType.TRANSIENT, EL_EVAL_ERROR, ex
206                        .getMessage(), ex);
207            }
208            catch (ELException ex) {
209                caught = true;
210                context.setErrorInfo(EL_ERROR, ex.getMessage());
211                LOG.warn("ELException in ActionStartXCommand ", ex.getMessage(), ex);
212                handleError(context, wfJob, wfAction);
213            }
214            catch (org.jdom.JDOMException je) {
215                caught = true;
216                context.setErrorInfo("ParsingError", je.getMessage());
217                LOG.warn("JDOMException in ActionStartXCommand ", je.getMessage(), je);
218                handleError(context, wfJob, wfAction);
219            }
220            catch (Exception ex) {
221                caught = true;
222                context.setErrorInfo(EL_ERROR, ex.getMessage());
223                LOG.warn("Exception in ActionStartXCommand ", ex.getMessage(), ex);
224                handleError(context, wfJob, wfAction);
225            }
226            if(!caught) {
227                wfAction.setErrorInfo(null, null);
228                incrActionCounter(wfAction.getType(), 1);
229
230                LOG.info("Start action [{0}] with user-retry state : userRetryCount [{1}], userRetryMax [{2}], userRetryInterval"
231                        + " [{3}]",
232                                wfAction.getId(), wfAction.getUserRetryCount(), wfAction.getUserRetryMax(), wfAction
233                                        .getUserRetryInterval());
234
235                Instrumentation.Cron cron = new Instrumentation.Cron();
236                cron.start();
237                // do not override starttime for retries
238                if (wfAction.getStartTime() == null) {
239                    context.setStartTime();
240                }
241                context.setVar(JobUtils.getRetryKey(wfAction, JsonTags.WORKFLOW_ACTION_START_TIME),
242                        String.valueOf(new Date().getTime()));
243                executor.start(context, wfAction);
244                cron.stop();
245                FaultInjection.activate("org.apache.oozie.command.SkipCommitFaultInjection");
246                addActionCron(wfAction.getType(), cron);
247
248                wfAction.setRetries(0);
249                if (wfAction.isExecutionComplete()) {
250                    if (!context.isExecuted()) {
251                        LOG.warn(XLog.OPS, "Action Completed, ActionExecutor [{0}] must call setExecutionData()", executor
252                                .getType());
253                        wfAction.setErrorInfo(EXEC_DATA_MISSING,
254                                "Execution Complete, but Execution Data Missing from Action");
255                        failJob(context);
256                    } else {
257                        wfAction.setPending();
258                        if (!(executor instanceof ControlNodeActionExecutor)) {
259                            queue(new ActionEndXCommand(wfAction.getId(), wfAction.getType()));
260                        }
261                        else {
262                            execSynchronous = true;
263                        }
264                    }
265                }
266                else {
267                    if (!context.isStarted()) {
268                        LOG.warn(XLog.OPS, "Action Started, ActionExecutor [{0}] must call setStartData()", executor
269                                .getType());
270                        wfAction.setErrorInfo(START_DATA_MISSING, "Execution Started, but Start Data Missing from Action");
271                        failJob(context);
272                    } else {
273                        queue(new WorkflowNotificationXCommand(wfJob, wfAction));
274                    }
275                }
276
277                LOG.info(XLog.STD, "[***" + wfAction.getId() + "***]" + "Action status=" + wfAction.getStatusStr());
278
279                updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
280                updateJobLastModified();
281                // Add SLA status event (STARTED) for WF_ACTION
282                SLAEventBean slaEvent = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(), Status.STARTED,
283                        SlaAppType.WORKFLOW_ACTION);
284                if(slaEvent != null) {
285                    insertList.add(slaEvent);
286                }
287                LOG.info(XLog.STD, "[***" + wfAction.getId() + "***]" + "Action updated in DB!");
288            }
289        }
290        catch (ActionExecutorException ex) {
291            LOG.warn("Error starting action [{0}]. ErrorType [{1}], ErrorCode [{2}], Message [{3}]",
292                    wfAction.getName(), ex.getErrorType(), ex.getErrorCode(), ex.getMessage(), ex);
293            wfAction.setErrorInfo(ex.getErrorCode(), ex.getMessage());
294            switch (ex.getErrorType()) {
295                case TRANSIENT:
296                    if (!handleTransient(context, executor, WorkflowAction.Status.START_RETRY)) {
297                        handleNonTransient(context, executor, WorkflowAction.Status.START_MANUAL);
298                        wfAction.setPendingAge(new Date());
299                        wfAction.setRetries(0);
300                        wfAction.setStartTime(null);
301                    }
302                    break;
303                case NON_TRANSIENT:
304                    handleNonTransient(context, executor, WorkflowAction.Status.START_MANUAL);
305                    break;
306                case ERROR:
307                    handleError(context, executor, WorkflowAction.Status.ERROR.toString(), true,
308                            WorkflowAction.Status.DONE);
309                    break;
310                case FAILED:
311                    try {
312                        failJob(context);
313                        endWF();
314                        SLAEventBean slaEvent1 = SLADbXOperations.createStatusEvent(wfAction.getSlaXml(), wfAction.getId(),
315                                Status.FAILED,
316                                SlaAppType.WORKFLOW_ACTION);
317                        if(slaEvent1 != null) {
318                            insertList.add(slaEvent1);
319                        }
320                    }
321                    catch (XException x) {
322                        LOG.warn("ActionStartXCommand - case:FAILED ", x.getMessage());
323                    }
324                    break;
325            }
326            updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
327            updateJobLastModified();
328        }
329        finally {
330            try {
331                BatchQueryExecutor.getInstance().executeBatchInsertUpdateDelete(insertList, updateList, null);
332                if (!(executor instanceof ControlNodeActionExecutor) && EventHandlerService.isEnabled()) {
333                    generateEvent(wfAction, wfJob.getUser());
334                }
335                if (execSynchronous) {
336                    // Changing to synchronous call from asynchronous queuing to prevent
337                    // undue delay from ::start:: to action due to queuing
338                    callActionEnd();
339                }
340            }
341            catch (JPAExecutorException e) {
342                throw new CommandException(e);
343            }
344        }
345
346        LOG.debug("ENDED ActionStartXCommand for wf actionId=" + actionId + ", jobId=" + jobId);
347
348        return null;
349    }
350
351    protected void callActionEnd() throws CommandException {
352        new ActionEndXCommand(wfAction.getId(), wfAction.getType()).call();
353    }
354
355    /**
356     * Get action executor context
357     * @param isRetry
358     * @param isUserRetry
359     * @return ActionExecutorContext returns action executor context
360     */
361    protected ActionExecutorContext getContext(boolean isRetry, boolean isUserRetry) {
362        return new ActionXCommand.ActionExecutorContext(wfJob, wfAction, isRetry, isUserRetry);
363    }
364
365    protected void updateJobLastModified(){
366        wfJob.setLastModifiedTime(new Date());
367        updateList.add(new UpdateEntry<WorkflowJobQuery>(WorkflowJobQuery.UPDATE_WORKFLOW_STATUS_INSTANCE_MODIFIED, wfJob));
368    }
369
370    protected void endWF() throws CommandException{
371        updateParentIfNecessary(wfJob, 3);
372        new WfEndXCommand(wfJob).call(); // To delete the WF temp dir
373        SLAEventBean slaEvent2 = SLADbXOperations.createStatusEvent(wfJob.getSlaXml(), wfJob.getId(), Status.FAILED,
374                SlaAppType.WORKFLOW_JOB);
375        if(slaEvent2 != null) {
376            insertList.add(slaEvent2);
377        }
378    }
379
380    protected void handleError(ActionExecutorContext context, WorkflowJobBean workflow, WorkflowActionBean action)
381            throws CommandException {
382        failJob(context);
383        updateList.add(new UpdateEntry<WorkflowActionQuery>(WorkflowActionQuery.UPDATE_ACTION_START, wfAction));
384        updateJobLastModified();
385        SLAEventBean slaEvent1 = SLADbXOperations.createStatusEvent(action.getSlaXml(), action.getId(),
386                Status.FAILED, SlaAppType.WORKFLOW_ACTION);
387        if(slaEvent1 != null) {
388            insertList.add(slaEvent1);
389        }
390        endWF();
391        return;
392    }
393
394    /* (non-Javadoc)
395     * @see org.apache.oozie.command.XCommand#getKey()
396     */
397    @Override
398    public String getKey(){
399        return getName() + "_" + actionId;
400    }
401
402    private void prepareForRetry(WorkflowActionBean wfAction) {
403        if (wfAction.getType().equals("map-reduce")) {
404            // need to delete child job id of original run
405            wfAction.setExternalChildIDs("");
406        }
407    }
408
409    @Override
410    protected void queueCommandForTransientFailure(long retryDelayMillis){
411        queue(new ActionStartXCommand(wfAction.getId(), wfAction.getType()), retryDelayMillis);
412    }
413
414    protected void queue(XCommand<?> command, long msDelay) {
415        // ActionStartXCommand is synchronously called from SignalXCommand passing wfJob so that it doesn't have to
416        //reload wfJob again. We need set wfJob to null, so that it get reloaded when the requeued command executes.
417        if (command instanceof ActionStartXCommand) {
418            ((ActionStartXCommand)command).wfJob = null;
419        }
420        super.queue(command, msDelay);
421    }
422}