#! /bin/ksh
#
# esm-ssp585-ocn-alk_EXP3.progress
#
# Generated by Make Experiments! (mkexp) 1.1.5rc2
#
# $Id$
#
# $Id: DEFAULT.config 1 2021-02-03 20:53:47Z m221078 $
# $Id$
# $Id$
# $Id: CDRSYNTRA_OUTPUT.config 9899 2019-07-01 09:57:06Z m221078 $
# $Id: levante.config $
#

#
# Setup for levante (SLURM)
#
# $Id: levante.tmpl $
#
#SBATCH --job-name=esm-ssp585-ocn-alk_EXP3_progress
#SBATCH --partition=shared
#SBATCH --tasks-per-node=1
#SBATCH --time=01:00:00
#SBATCH --output=%x_%j.log
#SBATCH --mail-type=FAIL,ARRAY_TASKS
#SBATCH --account=bm1241
#SBATCH --propagate=STACK,CORE

# Workaround for SLURM bug in chained jobs, part 1:
# set the job name explicitly for this script.
SLURM_JOB_NAME='esm-ssp585-ocn-alk_EXP3_progress'

# Workaround for SLURM bug in chained jobs, part 2:
# shadow the sbatch command with a wrapper that removes SLURM_MEM_PER_CPU
# and SLURM_NTASKS_PER_NODE from this shell before handing all arguments,
# unchanged, to the real sbatch command. Presumably this keeps the settings
# of the current job from leaking into the job being submitted.
# Arguments: passed through verbatim to sbatch
# Returns:   exit status of sbatch
sbatch ()
{
    unset SLURM_NTASKS_PER_NODE SLURM_MEM_PER_CPU
    command sbatch "$@"
}

#

# Verbosity of this script; a value of 2 or more switches on shell
# command tracing (set -x) further down
DEBUG_LEVEL=0

# Log style output: shadows the shell's own print so that every message is
# preceded by the current date and time ("YYYY-MM-DD HH:MM:SS:").
# Arguments: the words of the message, passed on to the real print
# Outputs:   the time stamp followed by the message, wherever the caller
#            directs standard output
print ()
{
    command print "$(date '+%F %T'):" "$@"
}

# Whenever a command fails, report the offending line on standard error ...
trap 'print Error at line $LINENO >&2' ERR
# ... and bail out: stop on failing commands (-e) and on the use of
# unset variables (-u)
set -e -u
# Print command info (shell tracing) from debug level 2 upwards
if [[ $DEBUG_LEVEL -ge 2 ]]
then
    set -x
fi

#
# Main script starts here
#

# Usage: <this script> JOB_ID JOB_NAME
#   JOB_ID    SLURM id of the model job whose progress is monitored
#   JOB_NAME  suffix of the job script esm-ssp585-ocn-alk_EXP3.JOB_NAME
#             that is submitted again after the monitored job was cancelled
#
# While the job is running, its progress is sampled once per minute and
# logged to DATA_FILE. As soon as the predicted run time exceeds the time
# limit of the job, the job is cancelled and its script is resubmitted.
JOB_ID=${1:?usage: $0 JOB_ID JOB_NAME}
JOB_NAME=${2:?usage: $0 JOB_ID JOB_NAME}

PATH=/home/m/m300966/CDRSynTra-mpiesm-1.2.01p7/util:$PATH

# One line per sample:
# progress (%), predicted run time (min), local time stamp, lagging model
DATA_FILE=esm-ssp585-ocn-alk_EXP3_progress_$JOB_ID.dat

print "monitoring of model progress for $JOB_ID started"

# squeue prints one 'true' for each running job matching JOB_ID and
# nothing otherwise, so any output at all means the job is still running
while sleep 60 &&
      [[ -n $(squeue -t r -h -o true -j "$JOB_ID" 2> /dev/null) ]]
do
    if BUFFER=$(mpiesm_eta "$JOB_ID")
    then
        read PROGRESS PREDICTION TIMESTAMP LAST_MODEL << ---
        $(
            echo "$BUFFER" | perl -lne '
                BEGIN {
                    use POSIX qw(strftime);
                    $t = strftime("%Y-%m-%dT%H:%M:%S", localtime());
                }
                /^estimated flight time \(.*?\): ([\d.]+)/ and $y = $1;
                /^estimated time of arrival.* ([\d.]+) % done$/ and $p = $1;
                /^lagging model: (\d+)/ and $m = $1;
                END {
                    # Stay silent unless progress and prediction were found:
                    # an empty leading field would shift all columns when
                    # the line is split by the shell
                    print "$p $y $t $m" if defined $p and defined $y;
                }
            '
        )
---
        # No usable estimate in this sample; try again with the next one
        if [[ -z $PROGRESS || -z $PREDICTION ]]
        then
            continue
        fi
        echo $PROGRESS $PREDICTION $TIMESTAMP $LAST_MODEL >> "$DATA_FILE"
        # Need time limit ([[d-]H:]M:S) in minutes; only the first line
        # that carries a time is converted, anything else yields nothing
        TIME_LIMIT=$(
            squeue -o '%l' -P -h -j "$JOB_ID" |
            perl -ne 'if (/(((\d+)-)?(\d+):)?(\d+):(\d+)/) {
                          print(($3*24. + $4)*60. + $5 + $6/60.);
                          exit;
                      }'
        )
        # Without a numeric limit (job no longer queued, or its limit is
        # not given as a time) there is nothing to compare against
        case $TIME_LIMIT in
            ''|*[!0-9.]*) continue ;;
        esac
        # Prediction is reliable only after the first 10% have passed
        if (( PROGRESS > 10 && PREDICTION > TIME_LIMIT ))
        then
            print "Sorry: predicted run time for $JOB_ID ($PREDICTION min) exceeds limit ($TIME_LIMIT min); cancelling and restarting esm-ssp585-ocn-alk_EXP3.$JOB_NAME"
            scancel "$JOB_ID"
            sbatch "esm-ssp585-ocn-alk_EXP3.$JOB_NAME"
            break
        fi
    fi
done

print "monitoring of model progress for $JOB_ID finished"
