winnet_monitor_run.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # Author: M. Reichert
3 import numpy as np
4 import matplotlib.pyplot as plt
5 import pandas as pd
6 from tqdm import tqdm
7 from datetime import datetime,time,timedelta
8 from winnet_class import winnet
9 import sys
10 import subprocess
11 import os
12 import h5py
13 
# Slurm user whose jobs are looked up in the queue. Setting it is not
# strictly necessary, but it may improve the estimate.
user = "User"

# Set to True to take the timing from the slurm queue instead of reading the
# elapsed time of every finished run; this speeds up the monitoring.
use_slurm_times = False

# Folder that contains the run folders: first command-line argument if one
# was given, otherwise the current directory.
path = sys.argv[1] if len(sys.argv) >= 2 else "."
26 
# Number of slurm processes: count the queue lines that mention `user` and
# contain an "R"
running_jobs_cmd = "squeue | grep " + user + " | grep R | wc -l"
nr_process = int(subprocess.check_output(running_jobs_cmd, shell=True).strip())
31 
# Counters over the run folders found directly below `path`
allTotal = 0  # folders that contain a "winnet" and/or a "blocked" file
allRem = 0    # folders with both files (reported as "Running")
allFin = 0    # folders with "blocked" only and a final-abundance result
allFail = 0   # folders with "blocked" only and no final-abundance result

# Elapsed times of the finished runs as read by the winnet class
# (stays empty when use_slurm_times is set)
eltime = []
all_folders = os.listdir(path)
for p in all_folders:
    tot_path = os.path.join(path, p)
    if not os.path.isdir(tot_path):
        continue

    w_path = os.path.join(tot_path, "winnet")
    b_path = os.path.join(tot_path, "blocked")
    f_path = os.path.join(tot_path, "finab.dat")
    h_path = os.path.join(tot_path, "WinNet_data.h5")
    o_path = os.path.join(tot_path, "OUT")

    has_winnet = os.path.isfile(w_path)
    is_blocked = os.path.isfile(b_path)

    # Some other folder
    if not has_winnet and not is_blocked:
        continue

    if not has_winnet and is_blocked:
        # Either failed or finished: finished runs have a finab.dat file or
        # a "finab/" entry in their HDF5 output
        if os.path.isfile(f_path):
            run_finished = True
        elif os.path.isfile(h_path):
            # Close the file again right away; the original left one open
            # handle behind for every folder checked this way
            with h5py.File(h_path, "r") as ftmp:
                run_finished = "finab/" in ftmp
        else:
            run_finished = False

        if run_finished:
            # Read elapsed time of finished runs
            if not use_slurm_times:
                w = winnet(tot_path)
                w.read_OUT()
                eltime.append(w.elapsed_time)
            allFin += 1
        else:
            allFail += 1
    elif has_winnet and is_blocked:
        allRem += 1
    # Folders with "winnet" but without "blocked" only enter the total
    # (presumably runs that have not been started yet -- TODO confirm)

    allTotal += 1
78 
def _slurm_time_to_hours(field):
    """Convert one time entry of the squeue listing to hours.

    Understands the layouts ``D-HH:MM:SS``, ``HH:MM:SS`` and ``MM:SS``.
    With any other number of ``:``-separated parts only the day part (if
    present) is counted.
    """
    day_split = field.split("-")
    if len(day_split) == 2:
        hours = float(day_split[0])*24
        clock = day_split[1]
    else:
        hours = 0
        clock = day_split[0]
    parts = clock.split(":")

    if len(parts) == 2:
        hours += float(parts[0])/60. + float(parts[1])/60./60.
    elif len(parts) == 3:
        hours += float(parts[0]) + float(parts[1])/60. + float(parts[2])/60./60.
    return hours


# Fallback rate [runs/h] for the case that no usable timing exists yet:
# 15 min per run, spread over the runs that are worked on in parallel.
# This is the same rough guess the script makes when nothing has finished.
if allRem != 0:
    _n_parallel = allRem
elif nr_process != 0:
    _n_parallel = nr_process
else:
    _n_parallel = 100
_rough_per_h = 1./((1.5/6.)/_n_parallel)

if use_slurm_times:
    # Get the time running already
    cmd = "squeue | grep "+user+" | grep R | awk '{print $6}' "
    # universal_newlines=True makes check_output return text instead of
    # bytes; splitting bytes with "\n" raises a TypeError on Python 3
    x = subprocess.check_output(cmd, shell=True, universal_newlines=True)

    # Time in hours, one entry per non-empty output line
    ttt = [_slurm_time_to_hours(t) for t in x.split("\n") if t.strip() != ""]

    # Average time [h] the processes were running (0 if none is listed;
    # averaging an empty list would give NaN)
    av_time = np.average(ttt) if len(ttt) > 0 else 0.

    # The original never set av_per_h in this branch, which ended in a
    # NameError as soon as a run had finished.
    # NOTE(review): the rate is taken as the inverse of the relation
    # av_time = allFin/av_per_h used in the other branch -- confirm that
    # this is the intended estimate.
    if allFin != 0 and av_time > 0:
        av_per_h = allFin/av_time
    else:
        av_per_h = _rough_per_h
else:
    ttt = np.array(eltime)/60./60.

    # Mean elapsed time [h] of the finished runs, ignoring NaN entries
    mean_h = np.nanmean(eltime)/60./60. if len(eltime) > 0 else np.nan

    if allFin != 0 and np.isfinite(mean_h) and mean_h > 0:
        # NOTE(review): the last two cases carry an extra 1/allFin factor
        # that the first one lacks; kept as in the original -- confirm
        # which form is intended.
        if allRem != 0:
            av_per_h = 1./(mean_h/allRem)
        elif nr_process != 0:
            av_per_h = 1./allFin/(mean_h/nr_process)
        else:
            av_per_h = 1./allFin/(mean_h/100.)
        av_time = allFin/av_per_h
    else:
        # Nothing finished yet (or no usable time was read): the original
        # divided by zero or carried a NaN into the time formatting here
        av_per_h = _rough_per_h
        av_time = 0.
118 
# Decide whether the measured rate can be used, whether everything is done,
# or whether a rough guess for the runs per hour is needed
have_timing = allFin != 0 and len(ttt) > 0
finished = (not have_timing) and (allFin + allFail == allTotal)

if not have_timing and not finished:
    # Number of runs assumed to be worked on at the same time
    if allRem != 0:
        workers = allRem
    elif nr_process != 0:
        workers = nr_process
    else:
        workers = 100
    # Make rough estimate, 15min per run
    av_per_h = 1./((1.5/6.)/workers)
133 
134 
def _hours_to_hms(duration_h):
    """Split a duration in hours into whole hours, minutes and seconds.

    The duration is rounded to full seconds first. A non-finite duration
    (NaN or infinity) cannot be converted to integers and is reported as
    zero instead of raising.
    """
    total_s = np.round(duration_h*60*60)
    if not np.isfinite(total_s):
        return 0, 0, 0
    whole_min, s = divmod(int(total_s), 60)
    h, m = divmod(whole_min, 60)
    return h, m, s


# Estimated rest time
rest_runs = allTotal-allFin-allFail
if not finished:
    hours, mins, secs = _hours_to_hms(rest_runs/av_per_h)
else:
    hours, mins, secs = 0, 0, 0

# Make it well readable
timestr = str(hours).zfill(2)+":"+str(mins).zfill(2)+":"+str(secs).zfill(2)

# Duration
date_object = timedelta(hours=hours, minutes=mins, seconds=secs)
# Current date
now = datetime.now()

# Estimated time of finish
finishtime = now+date_object

# Average time per tracer. The original subtracted `secs` and `mins` of the
# estimate above here instead of `secs2` and `mins2`, which distorted the
# minutes and hours; both values now go through the same conversion.
hours2, mins2, secs2 = _hours_to_hms(av_time)
timestr2 = str(hours2).zfill(2)+":"+str(mins2).zfill(2)+":"+str(secs2).zfill(2)
169 
170 
# Create the output: collect the report row by row, then join it once
row_end = " | \n"
report_rows = [
    " WinNet monitoring " + "\n",
    "====================================" + "\n",
    # Run counters
    "| Number of runs | " + str(allTotal).rjust(10) + row_end,
    "| Running | " + str(allRem).rjust(10) + row_end,
    "| Finished | " + str(allFin).rjust(10) + row_end,
    "| Failed | " + str(allFail).rjust(10) + row_end,
    "| --------------------------------" + row_end,
    # Timing
    "| Av. time / tracer | " + timestr2.rjust(10) + row_end,
    "| Estimated duration | " + timestr.rjust(10) + row_end,
    "| Estimated finish | " + finishtime.strftime("%H:%M:%S").rjust(10) + row_end,
    "|" + "_"*34 + "| \n",
]
outstr = "".join(report_rows)

# Output
print(outstr)