#!/bin/bash
#    Copyright 2016 Mirantis, Inc.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.
#
# See usage() function below for more details ...
#
# OCF instance parameters:
#   OCF_RESKEY_service_name
#   OCF_RESKEY_binary
#   OCF_RESKEY_config
#   OCF_RESKEY_log_file
#   OCF_RESKEY_user
#   OCF_RESKEY_watchdog_file
#   OCF_RESKEY_watchdog_timeout
#######################################################################
# Initialization:

# Keep a caller-supplied OCF_FUNCTIONS_DIR (even an empty one); only when the
# variable is unset is it derived from OCF_ROOT.  Then load ocf-shellfuncs
# from that directory.
if [ -z "${OCF_FUNCTIONS_DIR+set}" ]; then
    OCF_FUNCTIONS_DIR="${OCF_ROOT}/lib/heartbeat"
fi
source "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"

#######################################################################

# Fill in some defaults if no values are specified

# Fallback values for the optional resource parameters.  The watchdog_file
# default is empty, which makes service_status skip the watchdog check unless
# the parameter is set explicitly.
OCF_RESKEY_binary_default="/usr/bin/hekad"
OCF_RESKEY_user_default="root"
OCF_RESKEY_watchdog_file_default=
OCF_RESKEY_watchdog_timeout_default=20

# Apply a default only when the parameter is unset ('=' rather than ':=', so
# an explicitly empty value is kept).  The expansions are quoted so that the
# resulting values are not word-split or glob-expanded as arguments of ':'.
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"
: "${OCF_RESKEY_user=${OCF_RESKEY_user_default}}"
: "${OCF_RESKEY_watchdog_file=${OCF_RESKEY_watchdog_file_default}}"
: "${OCF_RESKEY_watchdog_timeout=${OCF_RESKEY_watchdog_timeout_default}}"

#######################################################################

# Print the list of supported actions, with a one-line description of each,
# on stdout.  The script name shown is taken from $0.
usage() {
    local script=$0

    cat <<USAGE_TEXT
        usage: ${script} (start|stop|validate-all|meta-data|status|monitor)

        ${script} manages the collector process as an HA resource

        The 'start' operation starts the collector
        The 'stop' operation stops the collector
        The 'validate-all' operation reports whether the parameters are valid
        The 'meta-data' operation reports this RA's meta-data information
        The 'status' operation reports whether the collector is running
        The 'monitor' operation reports whether the collector is running

USAGE_TEXT
}

# Write the resource agent's XML meta-data to stdout.  The default values
# advertised for the optional parameters are read from the
# OCF_RESKEY_*_default variables at call time.
meta_data() {
    local binary_dflt=${OCF_RESKEY_binary_default}
    local user_dflt=${OCF_RESKEY_user_default}
    local wd_file_dflt=${OCF_RESKEY_watchdog_file_default}
    local wd_timeout_dflt=${OCF_RESKEY_watchdog_timeout_default}

    cat <<META_XML
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="lma_collector">
<version>1.0</version>

<longdesc lang="en">
Manages the LMA collector daemon as a Pacemaker Resource.
</longdesc>
<shortdesc lang="en">Manages Log or Metric collector</shortdesc>
<parameters>

<parameter name="service_name" unique="0" required="1">
<longdesc lang="en">
Name of the collector service.
</longdesc>
<shortdesc lang="en">Collector service name</shortdesc>
<content type="string" />
</parameter>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Path of the LMA collector binary file that will be run.
</longdesc>
<shortdesc lang="en">LMA collector binary file</shortdesc>
<content type="string" default="${binary_dflt}" />
</parameter>

<parameter name="config" unique="0" required="1">
<longdesc lang="en">
Path to the LMA collector configuration file or directory
</longdesc>
<shortdesc lang="en">LMA collector configuration</shortdesc>
<content type="string" />
</parameter>

<parameter name="log_file" unique="0" required="1">
<longdesc lang="en">
Path to the LMA collector log file
</longdesc>
<shortdesc lang="en">LMA collector log file</shortdesc>
<content type="string" />
</parameter>

<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running the LMA collector process
</longdesc>
<shortdesc lang="en">LMA collector user</shortdesc>
<content type="string" default="${user_dflt}" />
</parameter>

<parameter name="watchdog_file" unique="0" required="0">
<longdesc lang="en">
The file to monitor that the process is up and running
</longdesc>
<shortdesc lang="en">LMA collector watchdog file</shortdesc>
<content type="string" default="${wd_file_dflt}" />
</parameter>

<parameter name="watchdog_timeout" unique="0" required="0">
<longdesc lang="en">
How much time the watchdog file can be left unmodified before claiming that
the process is unresponsive.
</longdesc>
<shortdesc lang="en">LMA collector watchdog timeout</shortdesc>
<content type="string" default="${wd_timeout_dflt}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
META_XML
}

#######################################################################
# Functions invoked by resource manager actions

# Validate the resource parameters and derive the PID file location.
#
# Globals:
#   PID_FILE (written) - ${HA_RSCTMP}/${__SCRIPT_NAME}/<service_name>.pid
#   OCF_RESKEY_binary, OCF_RESKEY_config, OCF_RESKEY_user,
#   OCF_RESKEY_watchdog_file, OCF_RESKEY_watchdog_timeout (read)
# Returns:
#   OCF_SUCCESS        - all checks passed
#   OCF_ERR_INSTALLED  - config path is neither a file nor a directory, or
#                        the user is unknown to getent
#   OCF_ERR_CONFIGURED - the watchdog is enabled but watchdog_timeout is not
#                        a non-negative integer
service_validate() {
    PID_FILE="${HA_RSCTMP}/${__SCRIPT_NAME}/${OCF_RESKEY_service_name}.pid"

    # check_binary comes from ocf-shellfuncs; its status is not inspected
    # here (presumably it aborts on its own when the binary is missing --
    # confirm against ocf-shellfuncs).
    check_binary "$OCF_RESKEY_binary"

    if [ ! -f "$OCF_RESKEY_config" ] && [ ! -d "$OCF_RESKEY_config" ]; then
        ocf_log err "Config $OCF_RESKEY_config doesn't exist"
        return "$OCF_ERR_INSTALLED"
    fi

    if ! getent passwd "$OCF_RESKEY_user" >/dev/null 2>&1; then
        ocf_log err "User $OCF_RESKEY_user doesn't exist"
        return "$OCF_ERR_INSTALLED"
    fi

    # The timeout is only used when a watchdog file is configured.  It is
    # compared with 'test ... -gt', which fails on a non-numeric operand; the
    # failed comparison would be taken as "not timed out" and silently
    # disable the watchdog, so reject such values up front.
    if [ -n "$OCF_RESKEY_watchdog_file" ]; then
        case "$OCF_RESKEY_watchdog_timeout" in
            ''|*[!0-9]*)
                ocf_log err "watchdog_timeout must be a non-negative integer (got '${OCF_RESKEY_watchdog_timeout}')"
                return "$OCF_ERR_CONFIGURED"
                ;;
        esac
    fi

    return "$OCF_SUCCESS"
}

# Report whether the collector process is running and, when a watchdog file
# is configured, whether that file has been modified recently enough.
#
# Side effect: creates the directory holding PID_FILE (owned by
# OCF_RESKEY_user, mode 755) when it does not exist yet.
#
# Globals (read): PID_FILE, OCF_RESKEY_user, OCF_RESKEY_watchdog_file,
#                 OCF_RESKEY_watchdog_timeout
# Returns:
#   OCF_SUCCESS     - the recorded PID answers to signal 0 and the watchdog
#                     check passed or is disabled
#   OCF_NOT_RUNNING - no PID file, the recorded PID is gone, or the watchdog
#                     file is missing / older than watchdog_timeout seconds
#   OCF_ERR_GENERIC - the PID file exists but is empty
service_status() {
    local pid
    local pid_dir
    local now
    local mtime

    # check and make PID file dir
    pid_dir=$( dirname "${PID_FILE}" )
    if [ ! -d "${pid_dir}" ] ; then
        ocf_log debug "Create pid file dir: ${pid_dir} and chown to ${OCF_RESKEY_user}"
        mkdir -p "${pid_dir}"
        chown -R "${OCF_RESKEY_user}" "${pid_dir}"
        chmod 755 "${pid_dir}"
    fi

    if [ ! -f "$PID_FILE" ]; then
        ocf_log info "LMA collector is not running"
        return "$OCF_NOT_RUNNING"
    fi

    pid=$(cat "$PID_FILE")
    if [ -z "${pid}" ]; then
        ocf_log err "PID file ${PID_FILE} is empty!"
        return "$OCF_ERR_GENERIC"
    fi

    # Signal 0 delivers nothing; it only tests that the PID can be signalled.
    if ! ocf_run -warn kill -s 0 "$pid"; then
        ocf_log info "Old PID file found, but LMA collector process isn't running"
        return "$OCF_NOT_RUNNING"
    fi

    if [ -z "$OCF_RESKEY_watchdog_file" ]; then
        ocf_log debug "Skip watchdog check since watchdog_file parameter is not set"
        return "$OCF_SUCCESS"
    fi

    if [ ! -f "$OCF_RESKEY_watchdog_file" ]; then
        ocf_log info "${OCF_RESKEY_watchdog_file} is missing"
        return "$OCF_NOT_RUNNING"
    fi

    now=$(date '+%s')
    # %Y is the modification time in seconds since the epoch.  The file can
    # vanish between the test above and this call; treat that like a missing
    # file instead of feeding an empty operand to the arithmetic below.
    if ! mtime=$(stat -c %Y "${OCF_RESKEY_watchdog_file}"); then
        ocf_log info "${OCF_RESKEY_watchdog_file} is missing"
        return "$OCF_NOT_RUNNING"
    fi

    if [ $(( now - mtime )) -gt "${OCF_RESKEY_watchdog_timeout}" ]; then
        ocf_log err "File ${OCF_RESKEY_watchdog_file} not modified since ${OCF_RESKEY_watchdog_timeout} seconds"
        return "$OCF_NOT_RUNNING"
    fi

    return "$OCF_SUCCESS"
}

# Monitor action: runs service_status and passes its exit status through
# unchanged.
service_monitor() {
    service_status
    return $?
}

# Start action.  Returns OCF_SUCCESS straight away when service_monitor
# already reports success; otherwise launches the binary in the background
# as OCF_RESKEY_user, records the PID printed by the launching shell in
# PID_FILE and polls service_monitor every 3 seconds until it succeeds.
# Exits with OCF_ERR_GENERIC when the monitor reports anything other than
# success or OCF_NOT_RUNNING while waiting.
service_start() {
    local state
    local launch_cmd

    service_monitor
    state=$?
    if [ "$state" -eq "$OCF_SUCCESS" ]; then
        ocf_log info "${OCF_RESKEY_service_name} is already running"
        return "$OCF_SUCCESS"
    fi

    # Command line handed to 'sh -c': the collector is sent to the
    # background with its output appended to the log file, and the shell
    # prints the background PID ($! stays literal here, it is expanded by
    # the inner shell).
    launch_cmd="${OCF_RESKEY_binary} -config=${OCF_RESKEY_config} >> ${OCF_RESKEY_log_file} 2>&1"
    launch_cmd="${launch_cmd}"' & echo $!'

    # See https://bugs.launchpad.net/lma-toolchain/+bug/1543289
    ulimit -n 102400

    su "${OCF_RESKEY_user}" -s /bin/sh -c "${launch_cmd}" > "$PID_FILE"

    # Spin waiting for the server to come up
    while :; do
        service_monitor
        state=$?
        if [ "$state" -eq "$OCF_SUCCESS" ]; then
            break
        elif [ "$state" -ne "$OCF_NOT_RUNNING" ]; then
            ocf_log err "${OCF_RESKEY_service_name} start failed"
            exit "$OCF_ERR_GENERIC"
        fi
        sleep 3
    done

    ocf_log info "${OCF_RESKEY_service_name} started"
    return "$OCF_SUCCESS"
}

# Stop action: send SIGTERM to the PID recorded in PID_FILE, wait for the
# process to exit and fall back to SIGKILL when it does not.
#
# Liveness is decided by probing the recorded PID with signal 0 and NOT via
# service_monitor: the monitor also returns OCF_NOT_RUNNING when the watchdog
# file is missing or stale, i.e. for a hung process that is still alive.
# Relying on it here would report such a process as "already stopped" and
# leave it running.
#
# Globals (read): PID_FILE, OCF_RESKEY_service_name,
#                 OCF_RESKEY_CRM_meta_timeout (milliseconds, optional)
# Returns:
#   OCF_SUCCESS when the process is gone (or was not running).
# Exits with OCF_ERR_GENERIC when the PID file is empty, SIGTERM cannot be
# sent, or the process is still alive after SIGKILL.
service_stop() {
    local rc
    local pid
    local shutdown_timeout
    local count

    if [ ! -f "$PID_FILE" ]; then
        ocf_log info "${OCF_RESKEY_service_name} is already stopped"
        return "$OCF_SUCCESS"
    fi

    pid=$(cat "$PID_FILE")
    if [ -z "$pid" ]; then
        # Nothing to signal and no way to tell whether the process runs.
        ocf_log err "PID file ${PID_FILE} is empty!"
        ocf_log err "${OCF_RESKEY_service_name} couldn't be stopped"
        exit "$OCF_ERR_GENERIC"
    fi

    if ! kill -s 0 "$pid" 2>/dev/null; then
        ocf_log info "${OCF_RESKEY_service_name} is already stopped"
        return "$OCF_SUCCESS"
    fi

    # Try SIGTERM
    ocf_run kill -s TERM "$pid"
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "${OCF_RESKEY_service_name} couldn't be stopped"
        exit "$OCF_ERR_GENERIC"
    fi

    # Wait for the process to exit: 15 seconds by default, or the operation
    # timeout (given in milliseconds) minus a 5 second reserve.
    shutdown_timeout=15
    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
        shutdown_timeout=$(( (OCF_RESKEY_CRM_meta_timeout/1000)-5 ))
    fi
    count=0
    while [ "$count" -lt "$shutdown_timeout" ]; do
        kill -s 0 "$pid" 2>/dev/null || break
        count=$(( count + 1 ))
        sleep 1
        ocf_log debug "${OCF_RESKEY_service_name} still hasn't stopped yet. Waiting ..."
    done

    if kill -s 0 "$pid" 2>/dev/null; then
        # SIGTERM didn't help either, try SIGKILL
        ocf_log info "${OCF_RESKEY_service_name} failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL ..."
        ocf_run kill -s KILL "${pid}"

        # Give the process a few seconds (within the 5 second reserve) to
        # disappear, then verify instead of assuming success.
        count=0
        while [ "$count" -lt 3 ] && kill -s 0 "$pid" 2>/dev/null; do
            count=$(( count + 1 ))
            sleep 1
        done
        if kill -s 0 "$pid" 2>/dev/null; then
            ocf_log err "${OCF_RESKEY_service_name} is still running after SIGKILL"
            exit "$OCF_ERR_GENERIC"
        fi
    fi

    ocf_log info "${OCF_RESKEY_service_name} stopped"

    ocf_log debug "Delete pid file: ${PID_FILE} with content ${pid}"
    rm -f "${PID_FILE}"

    return "${OCF_SUCCESS}"
}

#######################################################################

# Informational actions are served before any parameter validation.
if [ "$1" = "meta-data" ]; then
    meta_data
    exit "$OCF_SUCCESS"
fi
if [ "$1" = "usage" ] || [ "$1" = "help" ]; then
    usage
    exit "$OCF_SUCCESS"
fi

# Anything except meta-data and help must pass validation
service_validate || exit $?

# Dispatch the requested action.  The status of the selected handler is the
# last status of the script and therefore its exit status; validate-all has
# nothing left to do once service_validate has passed.
case "$1" in
  validate-all)
      ;;
  start)
      service_start
      ;;
  stop)
      service_stop
      ;;
  status)
      service_status
      ;;
  monitor)
      service_monitor
      ;;
  *)
      usage
      exit "$OCF_ERR_UNIMPLEMENTED"
      ;;
esac
