compton_combiner.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  1. """Script to combine compton measurements with runs and process these data
  2. """
  3. import argparse
  4. from configparser import ConfigParser
  5. from datetime import datetime, timedelta, timezone
  6. import sqlite3
  7. from typing import Union, Tuple, Optional
  8. from compton_filter import CalibrdbHandler
  9. from iminuit import Minuit
  10. import matplotlib.dates as mdates
  11. import matplotlib.pyplot as plt
  12. from mysql.connector import connect, Error
  13. import numpy as np
  14. import pandas as pd
  15. from tqdm import tqdm
  16. SEASONS = {
  17. 'name': ['HIGH2017', 'RHO2018', 'HIGH2019', 'LOW2020', 'HIGH2020', 'HIGH2021', 'NNBAR2021'],
  18. 'start_run': [36872, 48938, 70014, 85224, 89973, 98116, 107342, None],
  19. }
  20. class RunsDBHandler():
  21. def __init__(self, host: str = 'cmddb', database: str = 'online', user: str = None, password: str = None):
  22. self.conn = connect(host = host, database = database, user = user, password = password)
  23. self.cur = self.conn.cursor()
  24. self.cur.execute("SET time_zone = '+07:00';")
  25. @property
  26. def fields(self) -> list:
  27. """Returns a list of available columns in the RunsDB
  28. """
  29. self.cur.execute("""DESCRIBE Runlog""")
  30. return self.cur.fetchall()
  31. def load_tables(self, range: Union[Tuple[int, Optional[int]], Tuple[datetime, datetime]]):
  32. """
  33. Parameters
  34. ----------
  35. range : Union[Tuple[int, Optional[int]], Tuple[datetime, datetime]]
  36. selection range
  37. int range defines an interval in runs
  38. datetime range defines a time interval (NSK: +7:00 time)
  39. """
  40. cond = ""
  41. if isinstance(range[0], int):
  42. cond = f" AND run >= {range[0]} "
  43. if range[1] is not None:
  44. cond += f" AND run <= {range[1]} "
  45. elif isinstance(range[0], datetime):
  46. cond = f" AND stoptime BETWEEN %s AND %s"
  47. sql_query = f"""
  48. SELECT
  49. run,
  50. starttime,
  51. stoptime,
  52. energy,
  53. luminosity
  54. FROM Runlog
  55. WHERE
  56. quality = "Y"
  57. {cond}
  58. AND luminosity > 0
  59. AND stoptime > starttime
  60. AND nevent > 0
  61. ORDER BY run DESC"""
  62. if isinstance(range[0], datetime):
  63. self.cur.execute(sql_query, range)
  64. else:
  65. self.cur.execute(sql_query)
  66. field_names = [i[0] for i in self.cur.description]
  67. res = self.cur.fetchall()
  68. return res, field_names
  69. def __del__(self):
  70. self.conn.close()
  71. class Combiner():
  72. """Combines a dataframe with runs and a dataframe with compton measurements together
  73. """
  74. def __init__(self, runsdb: Tuple[list, list], clbrdb: Tuple[list, list]):
  75. """
  76. Parameters
  77. ----------
  78. runsdb : Tuple[list, list]
  79. table of runs (rows and field names)
  80. clbrdb : Tuple[list, list]
  81. table of compton measurements (rows and field names)
  82. """
  83. rdb_rows, r_fld = runsdb
  84. cdb_rows, c_fld = clbrdb
  85. self.conn = sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES)
  86. self.cur = self.conn.cursor()
  87. self.cur.execute(f"CREATE table runs (run, elabel, starttime timestamp, stoptime timestamp, luminosity)")
  88. self.cur.execute(f"CREATE table compton (begintime timestamp, endtime timestamp, e_mean, e_std, spread_mean, spread_std)")
  89. run_row_generator = map(lambda x: (x[r_fld.index("run")], x[r_fld.index("energy")],
  90. x[r_fld.index("starttime")], x[r_fld.index("stoptime")],
  91. x[r_fld.index("luminosity")]), rdb_rows)
  92. c_data_idx = c_fld.index("data")
  93. compton_row_generator = map(lambda x: (x[c_fld.index("begintime")], x[c_fld.index("endtime")],
  94. float(x[c_data_idx][0]), float(x[c_data_idx][1]),
  95. float(x[c_data_idx][2]), float(x[c_data_idx][3])), cdb_rows)
  96. self.cur.executemany(f"""INSERT into runs VALUES ({','.join(['?']*5)})""", run_row_generator)
  97. self.cur.executemany(f"""INSERT into compton VALUES ({','.join(['?']*6)})""", compton_row_generator)
  98. self.__create_combined_table()
  99. def __create_combined_table(self):
  100. create_combined_query = """
  101. CREATE TABLE combined_table AS
  102. SELECT
  103. runs.run AS run,
  104. runs.elabel AS elabel,
  105. runs.starttime as "run_start [timestamp]",
  106. runs.stoptime AS "run_stop [timestamp]",
  107. compton.begintime AS "compton_start [timestamp]",
  108. compton.endtime AS "compton_stop [timestamp]",
  109. runs.luminosity, compton.e_mean, compton.e_std, compton.spread_mean, compton.spread_std
  110. FROM runs, compton
  111. WHERE
  112. (runs.starttime BETWEEN compton.begintime AND compton.endtime)
  113. OR (runs.stoptime BETWEEN compton.begintime AND compton.endtime)
  114. OR (compton.begintime BETWEEN runs.starttime AND runs.stoptime)
  115. OR (compton.endtime BETWEEN runs.starttime AND runs.stoptime);
  116. """
  117. self.cur.execute(create_combined_query)
  118. return
  119. def combined_table(self) -> pd.DataFrame:
  120. """Returns combined dataframe
  121. """
  122. sql_query = """
  123. SELECT * FROM combined_table;
  124. """
  125. df = pd.read_sql(sql_query, self.conn)
  126. df['common_duration'] = df[['run_stop', 'compton_stop']].min(axis=1) - df[['run_start', 'compton_start']].max(axis=1)
  127. df['run_duration'] = df['run_stop'] - df['run_start']
  128. df['run_in_measurement'] = df['common_duration']/df['run_duration']
  129. df = df.sort_values(by='run_in_measurement', ascending=False).drop_duplicates(subset='run').sort_values(by='run')
  130. df = df.drop(['run_duration', 'common_duration', 'run_start', 'run_stop'], axis=1)
  131. return df
  132. def __del__(self):
  133. self.conn.close()
  134. class Likelihood():
  135. """
  136. Likelihood function
  137. """
  138. def __init__(self, means: np.array, sigmas: np.array, weights: np.array):
  139. """
  140. Parameters
  141. ----------
  142. means : np.array
  143. array of means, [MeV]
  144. sigmas : np.array
  145. array of standard deviations, [MeV]
  146. weights : np.array
  147. array of luminosities
  148. """
  149. self.means = means
  150. self.sigmas = sigmas
  151. self.weights = weights/weights.mean()
  152. def __call__(self, mean: float, sigma: float):
  153. """
  154. Calls likelihood calculation
  155. Parameters
  156. ----------
  157. mean : float
  158. expected mean
  159. sigma : float
  160. expected standard deviation
  161. """
  162. sigma_total = np.sqrt(sigma**2 + self.sigmas**2)
  163. ln_L = -np.sum( self.weights*( ((mean - self.means)**2)/(2*(sigma_total**2)) + np.log(sigma_total) ) )
  164. return -ln_L
  165. def calculate_point(comb_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame) -> dict:
  166. """Calculates parameters of the energy (mean, std, spread) in this dataFrame
  167. Parameters
  168. ----------
  169. comb_df : pd.DataFrame
  170. table of the measurements linked with runs
  171. runs_df : pd.DataFrame
  172. table of the runs
  173. compton_df : pd.DataFrame
  174. table of the comptons
  175. Returns
  176. -------
  177. dict
  178. average parameters on this DataFrame
  179. """
  180. if (len(comb_df) == 1) and pd.isnull(comb_df.iloc[0].at['compton_start']):
  181. # no direct measurements of the compton during data runs
  182. # estimate energy by the nearest points
  183. # print('Hello')
  184. min_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_first']].iloc[0].at['starttime']
  185. max_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_last']].iloc[0].at['stoptime']
  186. nearest_row_before = compton_df.iloc[pd.Index(compton_df.endtime).get_loc(min_run_time, 'nearest')]
  187. nearest_row_after = compton_df.iloc[pd.Index(compton_df.begintime).get_loc(max_run_time, 'nearest')]
  188. # regulatization
  189. nearest_row_before['data'][1] = max(nearest_row_before['data'][3], 1e-3)
  190. nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3)
  191. nearest_row_before['data'][1] = max(nearest_row_before['data'][1], 1e-3)
  192. nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3)
  193. mean_energy = (nearest_row_before['data'][0] + nearest_row_after['data'][0])/2
  194. mean_spread = (nearest_row_before['data'][2] + nearest_row_after['data'][2])/2
  195. std_energy = np.sqrt(1/(1/(nearest_row_before['data'][1])**2 + 1/(nearest_row_after['data'][1])**2))
  196. std_spread = np.sqrt(1/(1/(nearest_row_before['data'][3])**2 + 1/(nearest_row_after['data'][3])**2))
  197. sys_energy = np.std([nearest_row_before['data'][0], nearest_row_after['data'][0]])
  198. return {
  199. 'energy_point': comb_df.elabel.min(),
  200. 'first_run': comb_df.run_first.min(),
  201. 'last_run': comb_df.run_last.max(),
  202. 'mean_energy': mean_energy,
  203. 'mean_energy_stat_err': std_energy,
  204. 'mean_energy_sys_err': sys_energy,
  205. 'mean_spread': mean_spread,
  206. 'mean_spread_stat_err': std_spread,
  207. 'used_lum': 0,
  208. 'comment': 'indirect measurement',
  209. }, pd.DataFrame()
  210. df = comb_df.copy()
  211. df.spread_std = np.where(df.spread_std < 1e-4, 1e-4, df.spread_std)
  212. df = df[df.e_std > 0]
  213. mean_energy = np.sum(df.e_mean*df.luminosity/(df.e_std**2))/np.sum(df.luminosity/(df.e_std**2))
  214. # std_energy = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.e_std**2))
  215. good_criterion = np.abs((df.e_mean - mean_energy)/np.sqrt(df.e_mean.std()**2 + df.e_std**2)) < 5
  216. good_criterion = good_criterion
  217. df = df[good_criterion]
  218. m = Minuit(Likelihood(df.e_mean, df.e_std, df.luminosity), mean=df.e_mean.mean(), sigma=df.e_mean.std())
  219. m.errordef = 0.5
  220. m.limits['sigma'] = (0, None)
  221. m.migrad();
  222. sys_err = m.values['sigma']
  223. mean_en = m.values['mean']
  224. mean_spread = np.sum(df.spread_mean*df.luminosity/(df.spread_std**2))/np.sum(df.luminosity/(df.spread_std**2))
  225. std_spread = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.spread_std**2))
  226. res_dict = {
  227. 'energy_point': comb_df.elabel.min(),
  228. 'first_run': comb_df.run_first.min(),
  229. 'last_run': comb_df.run_last.max(),
  230. 'mean_energy': mean_en,
  231. 'mean_energy_stat_err': m.errors['mean'],
  232. 'mean_energy_sys_err': sys_err,
  233. 'mean_spread': mean_spread,
  234. 'mean_spread_stat_err': std_spread,
  235. 'used_lum': df.luminosity.sum()/comb_df.luminosity_total.sum(),
  236. 'comment': '',
  237. }
  238. return res_dict, df
  239. def process_combined(combined_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame, pics_folder: Optional[str] = None) -> pd.DataFrame:
  240. if pics_folder is not None:
  241. plt.ioff()
  242. plt.style.use('ggplot')
  243. locator = mdates.AutoDateLocator(minticks=5)
  244. formatter = mdates.ConciseDateFormatter(locator)
  245. formatter.formats = ['%y', '%b', '%d', '%H:%M', '%H:%M', '%S.%f', ]
  246. formatter.zero_formats = [''] + formatter.formats[:-1]
  247. formatter.zero_formats[3] = '%d-%b'
  248. formatter.offset_formats = ['', '%Y', '%b %Y', '%d %b %Y', '%d %b %Y', '%d %b %Y %H:%M', ]
  249. runs_df = runs_df.rename({'luminosity': 'luminosity_full', 'energy': 'elabel'}, axis=1)
  250. combined_df = pd.merge(combined_df.drop(['elabel'], axis=1), runs_df[['run', 'elabel', 'luminosity_full']], how='outer')
  251. combined_df = combined_df.sort_values(by='run')
  252. combined_df['luminosity'] = combined_df['luminosity'].fillna(0)
  253. combined_df['point_idx'] = np.cumsum(~np.isclose(combined_df.elabel, combined_df.elabel.shift(1), atol=1e-4))
  254. combined_df = combined_df.groupby(['point_idx', 'compton_start'], dropna=False).agg(
  255. elabel=('elabel', 'min'), elabel_test=('elabel', 'max'),
  256. run_first=('run', 'min'), run_last=('run', 'max'),
  257. luminosity=('luminosity', 'sum'), luminosity_total=('luminosity_full', 'sum'),
  258. compton_stop=('compton_stop', 'min'), compton_stop_test=('compton_stop', 'max'),
  259. e_mean=('e_mean', 'min'), e_mean_test=('e_mean', 'max'),
  260. e_std=('e_std', 'min'), e_std_test=('e_std', 'max'),
  261. spread_mean=('spread_mean', 'min'), spread_mean_test=('spread_mean', 'max'),
  262. spread_std=('spread_std', 'min'), spread_std_test=('spread_std', 'max'),
  263. ).reset_index().set_index('point_idx')
  264. # return combined_df
  265. result_df = pd.DataFrame(columns=['energy_point', 'first_run', 'last_run', 'mean_energy', 'mean_energy_stat_err', 'mean_energy_sys_err', 'mean_spread', 'mean_spread_stat_err', 'used_lum', 'comment'])
  266. for i, table in tqdm(combined_df.groupby('point_idx', dropna=False)):
  267. res_dict, good_df = calculate_point(table, runs_df, compton_df)
  268. result_df = result_df.append(res_dict, ignore_index=True)
  269. if pics_folder is not None:
  270. plt_table = good_df.dropna()
  271. if len(plt_table) == 0:
  272. continue
  273. total_error = np.sqrt(res_dict["mean_energy_stat_err"]**2 + res_dict["mean_energy_sys_err"]**2)
  274. half_timedelta = (plt_table.compton_stop - plt_table.compton_start)/2
  275. time = plt_table.compton_start + half_timedelta
  276. dlt0 = timedelta(days=1)
  277. fig, ax = plt.subplots(1, 1, dpi=120, tight_layout=True)
  278. ax.errorbar(time, plt_table.e_mean, xerr=half_timedelta, yerr=plt_table.e_std, fmt='.')
  279. ax.axhline(res_dict['mean_energy'], color='black', zorder=3, label='Mean')
  280. ax.fill_between([plt_table.compton_stop.min(), plt_table.compton_stop.max()],
  281. [res_dict['mean_energy'] - total_error]*2,
  282. [res_dict['mean_energy'] + total_error]*2, color='green', zorder=1, alpha=0.4)
  283. ax.tick_params(axis='x', labelrotation=45)
  284. ax.xaxis.set_major_locator(locator)
  285. ax.xaxis.set_major_formatter(formatter)
  286. ax.set(title=f'{res_dict["energy_point"]}, E = {res_dict["mean_energy"]:.3f} ± {res_dict["mean_energy_stat_err"]:.3f} ± {res_dict["mean_energy_sys_err"]:.3f} MeV', xlabel='Time, NSK', ylabel='Energy, [MeV]',
  287. xlim=[plt_table.compton_stop.min(), plt_table.compton_stop.max()])
  288. plt.savefig(f'{pics_folder}/{res_dict["first_run"]}_{res_dict["energy_point"]}.png')
  289. plt.close()
  290. return result_df
  291. def final_table_to_clbrdb(df: pd.DataFrame, clbrdb: CalibrdbHandler, runs_df: pd.DataFrame, season: str):
  292. """Write good values from the averaged table into clbrdb
  293. """
  294. good_values = (df.comment=='')|((df.comment!='')&((df.mean_energy.astype(float) - df.energy_point).abs()<5))
  295. df_clbrdb = df.loc[good_values].drop(['comment', 'used_lum'], axis=1)
  296. df_clbrdb = pd.merge(df_clbrdb, runs_df[['run', 'starttime']], how='left', left_on='first_run', right_on='run').drop(['run'], axis=1)
  297. df_clbrdb = pd.merge(df_clbrdb, runs_df[['run', 'stoptime']], how='left', left_on='last_run', right_on='run').drop(['run'], axis=1)
  298. df_clbrdb = df_clbrdb.assign(writetime=lambda df: df['stoptime'])
  299. df_clbrdb = df_clbrdb[['writetime', 'starttime', 'stoptime',
  300. 'energy_point', 'first_run', 'last_run', 'mean_energy',
  301. 'mean_energy_stat_err', 'mean_energy_sys_err', 'mean_spread', 'mean_spread_stat_err']].values.tolist()
  302. clbrdb.insert(df_clbrdb, 'Misc', 'RunHeader', 'Compton_run_avg', 'Default', comment = season)
  303. clbrdb.commit()
  304. # python scripts/compton_combiner.py -s NNBAR2021 -c database.ini --csv --clbrdb
  305. def main():
  306. parser = argparse.ArgumentParser(description = 'Mean compton energy measurements from clbrdb')
  307. parser.add_argument('-s', '--season', help = 'Name of the season')
  308. parser.add_argument('-c', '--config', help = 'Config file containing information for access to databases')
  309. parser.add_argument('--csv', action = 'store_true', help = 'Save csv file with data or not')
  310. parser.add_argument('--clbrdb', action = 'store_true', help = 'Update Compton_run_avg clbrdb or not')
  311. parser.add_argument('--pics_folder', help = 'Path to the directory for saving the pictures')
  312. args = parser.parse_args()
  313. # logging.info(f"Arguments: season: {args.season}, config {args.config}")
  314. parser = ConfigParser()
  315. parser.read(args.config);
  316. rdb = RunsDBHandler(**parser['cmdruns'])
  317. idx = SEASONS['name'].index(args.season)
  318. runs_range = (SEASONS['start_run'][idx], SEASONS['start_run'][idx+1])
  319. res_rdb = rdb.load_tables(runs_range)
  320. runs_df = pd.DataFrame(res_rdb[0], columns=res_rdb[1])
  321. tdlt0 = timedelta(days=2)
  322. time_range = (runs_df.starttime.min() - tdlt0, runs_df.stoptime.max() + tdlt0)
  323. clbrdb = CalibrdbHandler(**parser['clbrDB'])
  324. res_clbrdb = clbrdb.load_table('Misc', 'RunHeader', 'Compton_run', num_last_rows = None, timerange = time_range)
  325. cb = Combiner(res_rdb, res_clbrdb)
  326. comb_df = cb.combined_table()
  327. compton_df = pd.DataFrame(res_clbrdb[0], columns=res_clbrdb[1])
  328. cdf = process_combined(comb_df, runs_df, compton_df, args.pics_folder)
  329. if args.csv:
  330. cdf.to_csv(f'{args.season}.csv', index=False, float_format='%g')
  331. if args.clbrdb:
  332. final_table_to_clbrdb(cdf, clbrdb, runs_df, args.season)
  333. return
  334. if __name__ == "__main__":
  335. main()