#!/bin/bash
#
# runseg.sh
#
# WESTPA runs this script for each trajectory segment. WESTPA supplies
# environment variables that are unique to each segment, such as:
#
# WEST_CURRENT_SEG_DATA_REF: A path to where the current trajectory segment's
# data will be stored. This will become "WEST_PARENT_DATA_REF" for any
# child segments that spawn from this segment
# WEST_PARENT_DATA_REF: A path to a file or directory containing data for the
# parent segment.
# WEST_CURRENT_SEG_INITPOINT_TYPE: Specifies whether this segment is starting
# anew, or if this segment continues from where another segment left off.
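# WEST_RAND16: A random integer, used below as the random seed written
# into md.mdp
# This script itself also reads WEST_CURRENT_ITER, WEST_CURRENT_SEG_ID,
# WEST_SIM_TMP and WEST_SIM_ROOT (working and crash-archive directories), and
# WEST_TRAJECTORY_RETURN, WEST_RESTART_RETURN and WEST_LOG_RETURN (locations
# where results are returned to WESTPA).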
#
# This script has the following three jobs:
# 1. Create a directory for the current trajectory segment, and set up the
# directory for running gmx mdrun
# 2. Run the dynamics
# 3. Calculate the progress coordinates and return data to WESTPA
# Timestamp (seconds.nanoseconds since the epoch) taken as the script begins;
# it is handed to print_timing at the very end to report the total run time.
SCRIPT_START_TIME="$(date +%s.%N)"
#######################################
# Print the wall-clock time elapsed since a given start timestamp.
# Arguments:
#   $1 - human-readable label for the step that was timed
#   $2 - start timestamp in the format produced by `date +%s.%N`
# Outputs:
#   Writes "[TIMER] <label>: <elapsed> seconds" to stdout.
#######################################
print_timing() {
  local desc=$1
  local start_time=$2
  # Declared separately from the assignments so that the exit status of the
  # command substitutions is not replaced by the (always zero) status of
  # `local`.
  local end_time elapsed_time
  end_time=$(date +%s.%N)
  # bc keeps full nanosecond precision and is tried first. Its stderr is
  # silenced on purpose: when bc is missing or produces nothing, the awk
  # fallback below supplies the value instead of leaving the timer blank.
  if ! elapsed_time=$(bc <<<"$end_time - $start_time" 2>/dev/null) \
    || [[ -z "$elapsed_time" ]]; then
    elapsed_time=$(awk -v e="$end_time" -v s="$start_time" \
      'BEGIN { printf "%.6f", e - s }')
  fi
  printf '[TIMER] %s: %s seconds\n' "$desc" "$elapsed_time"
}
# Debug aid: when SEG_DEBUG holds a non-empty value, print the complete
# environment in sorted order so the variables passed to this segment can be
# inspected. Nothing is printed otherwise.
[[ -z "$SEG_DEBUG" ]] || env | sort
######################## Set up for running the dynamics #######################
# Build the scratch directory for this segment,
#   $WEST_SIM_TMP/traj_segs/<iteration>/<segment>
# (both indices zero-padded to six digits), and make it the working directory.
# Every relative file name used afterwards (md.mdp, seg.*, parent.trr) is
# resolved inside it.
ITER=$(printf '%06d' "$WEST_CURRENT_ITER")
SEG=$(printf '%06d' "$WEST_CURRENT_SEG_ID")
CALC_TMPDIR="$WEST_SIM_TMP/traj_segs/$ITER/$SEG"
if ! mkdir -pv -- "$CALC_TMPDIR"; then
  echo "ERROR: could not create segment directory '$CALC_TMPDIR'" >&2
  exit 1
fi
# Stop here if the directory cannot be entered: continuing would write the
# segment files into (and later delete relative to) the wrong location.
if ! cd -- "$CALC_TMPDIR"; then
  echo "ERROR: could not change into segment directory '$CALC_TMPDIR'" >&2
  exit 1
fi
# The weighted ensemble algorithm requires that dynamics are stochastic.
# Every occurrence of the placeholder string "RAND" in the template $MD_MDP is
# replaced with the seed from $WEST_RAND16 and the result is written to md.mdp.
if ! sed "s/RAND/$WEST_RAND16/g" "$MD_MDP" > md.mdp; then
  echo "ERROR: could not generate md.mdp from '$MD_MDP'" >&2
  exit 1
fi
# Setup GROMACS process and GPU indexes
# WM_PROCESS_INDEX is a 0-based integer identifying the process among the set of processes started on a given node
# It is not defined for all work managers, e.g. serial, but needed for westraj map_worker
if [[ -z "$WM_PROCESS_INDEX" ]]; then
  export WM_PROCESS_INDEX=0
fi
# westraj's map_worker prints shell code that assigns a GPU_IDX, CPU_IDX, and
# NUMA_IDX for the current worker (CPU_RANGE is read below as well). Its output
# is captured first so that a failing map_worker is detected; passing the
# command substitution straight to eval would hide the failure and leave the
# indexes empty.
# NOTE(review): eval runs whatever map_worker prints - acceptable only because
# it is a trusted project tool.
START_TIME=$(date +%s.%N)
# $PYTHON is expanded unquoted so it may carry extra words (e.g. options).
# shellcheck disable=SC2086
if ! worker_assignments=$($PYTHON -m westraj.cli.map_worker); then
  echo "ERROR: westraj.cli.map_worker failed; cannot assign GPU/CPU resources" >&2
  exit 1
fi
eval "$worker_assignments"
echo "WORKER_IDX: $WM_PROCESS_INDEX, GPU_IDX: $GPU_IDX, CPU_IDX: $CPU_IDX, NUMA_IDX: $NUMA_IDX, CPU_RANGE: $CPU_RANGE"
print_timing "NUMA Node, GPU, and CPU assignment" "$START_TIME"
# Assigns the correct GPU/CPUs for this worker
export CUDA_VISIBLE_DEVICES=$GPU_IDX
# NUMA_CONFIG is a command prefix placed in front of later commands. It stays
# a plain string (not an array) because those commands expand it unquoted and
# rely on word splitting; when empty it simply disappears.
if [[ "$NUMA_AFFINITY_ENABLED" == true ]]; then
  NUMA_CONFIG="$NUMACTL --physcpubind=$CPU_RANGE"
else
  NUMA_CONFIG=""
fi
# Thread/pinning options appended to the gmx mdrun command line.
MDRUN_CPU_CONFIG="-ntmpi 1 -nt $OMP_NUM_THREADS -pin on -pinoffset $CPU_IDX -pinstride 1"
echo "MDRUN_CPU_CONFIG: $MDRUN_CPU_CONFIG, NUMA_CONFIG: $NUMA_CONFIG"
# Run the GROMACS preprocessor
# Produces the run input file seg.tpr from md.mdp, the structure $REF_GRO, the
# topology $TOPOL_TOP, the index file $INDEX_NDX and the state in parent.trr.
# NOTE(review): parent.trr is read from the current directory and is not
# created by this script - presumably it is staged by WESTPA; confirm.
START_TIME=$(date +%s.%N)
# $NUMA_CONFIG and $GMX are command prefixes that must be word-split, and
# $NULL is kept unquoted so its expansion behaves exactly as before.
# shellcheck disable=SC2086
$NUMA_CONFIG $GMX grompp -f md.mdp -c "$REF_GRO" -p "$TOPOL_TOP" \
  -t parent.trr -o seg.tpr -po $NULL -n "$INDEX_NDX"
GROMPP_STATUS=$?
print_timing "gmx grompp" "$START_TIME"
# Without seg.tpr the dynamics cannot run, so stop immediately instead of
# letting the failure show up later as repeated mdrun errors.
if ((GROMPP_STATUS != 0)); then
  echo "ERROR: gmx grompp failed with exit status $GROMPP_STATUS" >&2
  exit 1
fi
############################## Run the dynamics with re-tries on failure ################################
# Retry settings come from the environment:
#   MDRUN_RETRY_MAX  - number of retries allowed after the first failed attempt
#   MDRUN_RETRY_WAIT - delay handed to `sleep` between attempts
# An unset or non-numeric MDRUN_RETRY_MAX used to make the limit check itself
# fail, so a persistently failing mdrun was retried forever. Such values are
# now treated as 0 (no retries); an unset wait time becomes 0 seconds.
if ! [[ "$MDRUN_RETRY_MAX" =~ ^[0-9]+$ ]]; then
  echo "WARNING: MDRUN_RETRY_MAX='$MDRUN_RETRY_MAX' is not a non-negative integer; using 0" >&2
  MDRUN_RETRY_MAX=0
fi
MDRUN_RETRY_WAIT=${MDRUN_RETRY_WAIT:-0}
TRY=0
while true; do
  # Propagate the segment using gmx mdrun
  START_TIME=$(date +%s.%N)
  # $NUMA_CONFIG, $GMX and $MDRUN_CPU_CONFIG hold several words each and must
  # be word-split; $NULL is kept unquoted so its expansion is unchanged.
  # shellcheck disable=SC2086
  $NUMA_CONFIG $GMX mdrun $MDRUN_CPU_CONFIG -tunepme no -update gpu -nb gpu -pme gpu \
    -pmefft gpu -bonded cpu -deffnm seg -cpt -1 -nocpnum -cpo $NULL -noconfout
  EXIT_STATUS=$?
  print_timing "gmx mdrun" "$START_TIME"
  if ((EXIT_STATUS == 0)); then
    # GROMACS exited without error, so we can break out of the loop
    break
  fi
  # Archive the crashed simulation files, one directory per attempt
  echo "mdrun failed with exit status $EXIT_STATUS. Copying simulation files to crash archive."
  ARCHIVE_DIR="$WEST_SIM_ROOT/crashes/$ITER/$SEG/try_$TRY"
  mkdir -p -- "$ARCHIVE_DIR"
  cp -r -- "$CALC_TMPDIR"/* "$ARCHIVE_DIR"
  # If try meets or exceeds the maximum number of retries, then exit.
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  if ((TRY >= 10#$MDRUN_RETRY_MAX)); then
    echo "max mdrun retry attempts ($MDRUN_RETRY_MAX) reached. Exiting..."
    # Remove the scratch directory only when it is separate from the
    # simulation root; the crash archive above keeps a copy of its contents.
    if [[ "$WEST_SIM_TMP" != "$WEST_SIM_ROOT" ]]; then
      rm -r -- "${CALC_TMPDIR:?}"
    fi
    exit 1
  fi
  # Increment the number of tries and wait before retrying
  TRY=$((TRY + 1))
  echo "Retrying in $MDRUN_RETRY_WAIT seconds... (Attempt $TRY of $MDRUN_RETRY_MAX)"
  sleep "$MDRUN_RETRY_WAIT"
done
########################## Transform Coordinates ##########################
# 1: Unwrap chains with pbc whole (much faster than MDAnalysis!)
# The raw mdrun output is kept as seg_orig.xtc and the processed trajectory is
# written back under the original name seg.xtc.
START_TIME=$(date +%s.%N)
# Rename first so trjconv does not overwrite the original xtc file.
if ! mv -- seg.xtc seg_orig.xtc; then
  echo "ERROR: could not rename seg.xtc to seg_orig.xtc" >&2
  exit 1
fi
# "SOLU" is fed to trjconv on stdin as the index group to write out.
# $NUMA_CONFIG and $GMX are command prefixes that must be word-split.
# shellcheck disable=SC2086
echo "SOLU" | $NUMA_CONFIG $GMX trjconv -f seg_orig.xtc -s seg.tpr -n "$INDEX_NDX" -pbc whole -o seg.xtc
# Without pipefail the pipeline status is that of trjconv, the last command.
TRJCONV_STATUS=$?
print_timing "gmx trajconv to remove pbc artifacts" "$START_TIME"
# seg.xtc no longer exists if trjconv failed, so later steps cannot succeed.
if ((TRJCONV_STATUS != 0)); then
  echo "ERROR: gmx trjconv -pbc whole failed with exit status $TRJCONV_STATUS" >&2
  exit 1
fi
########################## Calculate and return data ###########################
# Link the ref.pdb topology file to WEST_TRAJECTORY_RETURN
ln -s "$REF_PDB" "$WEST_TRAJECTORY_RETURN/ref.pdb"
# The $CALC_PCOORD script calculates pcoords and auxdata and returns them to westpa
START_TIME=$(date +%s.%N)
# $NUMA_CONFIG and $CALC_PCOORD may each hold several words (prefix / command
# plus arguments) and must be word-split.
# shellcheck disable=SC2086
$NUMA_CONFIG $CALC_PCOORD
PCOORD_STATUS=$?
print_timing "Calculated and returned progress coordinates" "$START_TIME"
# The script's exit status would otherwise be that of its last command, so a
# failed progress-coordinate calculation has to be reported explicitly.
if ((PCOORD_STATUS != 0)); then
  echo "ERROR: progress coordinate calculation failed with exit status $PCOORD_STATUS" >&2
  exit 1
fi
# Only pass on the final frame of the trr file to the next iteration to save space
# NOTE(review): "-b 1" keeps frames from t = 1 ps onwards, so this is only the
# final frame if the segment ends at 1 ps - confirm against md.mdp.
START_TIME=$(date +%s.%N)
# shellcheck disable=SC2086
$NUMA_CONFIG $GMX trjconv -f seg.trr -o "$WEST_RESTART_RETURN/parent.trr" -b 1
RESTART_STATUS=$?
print_timing "gmx trjconv to truncate trajectory restart file" "$START_TIME"
# The file written here is named parent.trr, the same name grompp reads as the
# starting state, so a missing restart file must fail the segment.
if ((RESTART_STATUS != 0)); then
  echo "ERROR: writing the restart file failed with exit status $RESTART_STATUS" >&2
  exit 1
fi
# Return the gromacs log to westpa
cp seg.log "$WEST_LOG_RETURN"
# If everything ran correctly, clean up all the files that we don't need to save.
# But only if $WEST_SIM_TMP is not the same as $WEST_SIM_ROOT.
if [[ "$WEST_SIM_TMP" != "$WEST_SIM_ROOT" ]]; then
  # Quoted so the path is never word-split or glob-expanded; ":?" aborts
  # instead of running rm with an empty path, and "--" ends option parsing.
  rm -r -- "${CALC_TMPDIR:?}"
fi
# Final script timing
print_timing "Total script execution time" "$SCRIPT_START_TIME"