0
0

compton_combiner.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. """Script to combine compton measurements with runs and process these data
  2. """
  3. import argparse
  4. from configparser import ConfigParser
  5. from datetime import datetime, timedelta
  6. import sqlite3
  7. from typing import Union, Tuple, Optional
  8. from compton_filter import CalibrdbHandler
  9. from iminuit import Minuit
  10. import matplotlib.dates as mdates
  11. import matplotlib.pyplot as plt
  12. from mysql.connector import connect, Error
  13. import numpy as np
  14. import pandas as pd
  15. from tqdm import tqdm
  16. SEASONS = {
  17. 'name': ['HIGH2017', 'RHO2018', 'HIGH2019', 'LOW2020', 'HIGH2020', 'HIGH2021', 'NNBAR2021'],
  18. 'start_run': [36872, 48938, 70014, 85224, 89973, 98116, 107342, None],
  19. }
  20. class RunsDBHandler():
  21. def __init__(self, host: str = 'cmddb', database: str = 'online', user: str = None, password: str = None):
  22. self.conn = connect(host = host, database = database, user = user, password = password)
  23. self.cur = self.conn.cursor()
  24. self.cur.execute("SET time_zone = 'UTC';")
  25. @property
  26. def fields(self) -> list:
  27. """Returns a list of available columns in the RunsDB
  28. """
  29. self.cur.execute("""DESCRIBE Runlog""")
  30. return self.cur.fetchall()
  31. def load_tables(self, range: Union[Tuple[int, Optional[int]], Tuple[datetime, datetime]]):
  32. cond = ""
  33. if isinstance(range[0], int):
  34. cond = f" AND run >= {range[0]} "
  35. if range[1] is not None:
  36. cond += f" AND run <= {range[1]} "
  37. elif isinstance(range[0], datetime):
  38. cond = f" AND stoptime BETWEEN %s AND %s"
  39. sql_query = f"""
  40. SELECT
  41. run,
  42. starttime,
  43. stoptime,
  44. energy,
  45. luminosity
  46. FROM Runlog
  47. WHERE
  48. quality = "Y"
  49. {cond}
  50. AND luminosity > 0
  51. AND stoptime > starttime
  52. AND nevent > 0
  53. ORDER BY run DESC"""
  54. if isinstance(range[0], datetime):
  55. self.cur.execute(sql_query, range)
  56. else:
  57. self.cur.execute(sql_query)
  58. field_names = [i[0] for i in self.cur.description]
  59. res = self.cur.fetchall()
  60. return res, field_names
  61. def __del__(self):
  62. self.conn.close()
  63. class Combiner():
  64. """Combines a dataframe with runs and a dataframe with compton measurements together
  65. """
  66. def __init__(self, runsdb: Tuple[list, list], clbrdb: Tuple[list, list]):
  67. """
  68. Parameters
  69. ----------
  70. runsdb : Tuple[list, list]
  71. table of runs (rows and field names)
  72. clbrdb : Tuple[list, list]
  73. table of compton measurements (rows and field names)
  74. """
  75. rdb_rows, r_fld = runsdb
  76. cdb_rows, c_fld = clbrdb
  77. self.conn = sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES)
  78. self.cur = self.conn.cursor()
  79. self.cur.execute(f"CREATE table runs (run, elabel, starttime timestamp, stoptime timestamp, luminosity)")
  80. self.cur.execute(f"CREATE table compton (begintime timestamp, endtime timestamp, e_mean, e_std, spread_mean, spread_std)")
  81. run_row_generator = map(lambda x: (x[r_fld.index("run")], x[r_fld.index("energy")],
  82. x[r_fld.index("starttime")], x[r_fld.index("stoptime")],
  83. x[r_fld.index("luminosity")]), rdb_rows)
  84. c_data_idx = c_fld.index("data")
  85. compton_row_generator = map(lambda x: (x[c_fld.index("begintime")], x[c_fld.index("endtime")],
  86. float(x[c_data_idx][0]), float(x[c_data_idx][1]),
  87. float(x[c_data_idx][2]), float(x[c_data_idx][3])), cdb_rows)
  88. self.cur.executemany(f"""INSERT into runs VALUES ({','.join(['?']*5)})""", run_row_generator)
  89. self.cur.executemany(f"""INSERT into compton VALUES ({','.join(['?']*6)})""", compton_row_generator)
  90. self.__create_combined_table()
  91. def __create_combined_table(self):
  92. create_combined_query = """
  93. CREATE TABLE combined_table AS
  94. SELECT
  95. runs.run AS run,
  96. runs.elabel AS elabel,
  97. runs.starttime as "run_start [timestamp]",
  98. runs.stoptime AS "run_stop [timestamp]",
  99. compton.begintime AS "compton_start [timestamp]",
  100. compton.endtime AS "compton_stop [timestamp]",
  101. runs.luminosity, compton.e_mean, compton.e_std, compton.spread_mean, compton.spread_std
  102. FROM runs, compton
  103. WHERE
  104. (runs.starttime BETWEEN compton.begintime AND compton.endtime)
  105. OR (runs.stoptime BETWEEN compton.begintime AND compton.endtime)
  106. OR (compton.begintime BETWEEN runs.starttime AND runs.stoptime)
  107. OR (compton.endtime BETWEEN runs.starttime AND runs.stoptime);
  108. """
  109. self.cur.execute(create_combined_query)
  110. return
  111. def combined_table(self) -> pd.DataFrame:
  112. """Returns combined dataframe
  113. """
  114. sql_query = """
  115. SELECT * FROM combined_table;
  116. """
  117. df = pd.read_sql(sql_query, self.conn)
  118. df['common_duration'] = df[['run_stop', 'compton_stop']].min(axis=1) - df[['run_start', 'compton_start']].max(axis=1)
  119. df['run_duration'] = df['run_stop'] - df['run_start']
  120. df['run_in_measurement'] = df['common_duration']/df['run_duration']
  121. df = df.sort_values(by='run_in_measurement', ascending=False).drop_duplicates(subset='run').sort_values(by='run')
  122. df = df.drop(['run_duration', 'common_duration', 'run_start', 'run_stop'], axis=1)
  123. return df
  124. def __del__(self):
  125. self.conn.close()
  126. class Likelihood():
  127. """
  128. Likelihood function
  129. """
  130. def __init__(self, means: np.array, sigmas: np.array, weights: np.array):
  131. """
  132. Parameters
  133. ----------
  134. means : np.array
  135. array of means, [MeV]
  136. sigmas : np.array
  137. array of standard deviations, [MeV]
  138. weights : np.array
  139. array of luminosities
  140. """
  141. self.means = means
  142. self.sigmas = sigmas
  143. self.weights = weights/weights.mean()
  144. def __call__(self, mean: float, sigma: float):
  145. """
  146. Calls likelihood calculation
  147. Parameters
  148. ----------
  149. mean : float
  150. expected mean
  151. sigma : float
  152. expected standard deviation
  153. """
  154. sigma_total = np.sqrt(sigma**2 + self.sigmas**2)
  155. ln_L = -np.sum( self.weights*( ((mean - self.means)**2)/(2*(sigma_total**2)) + np.log(sigma_total) ) )
  156. return -ln_L
  157. def calculate_point(comb_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame) -> dict:
  158. """Calculates parameters of the energy (mean, std, spread) in this dataFrame
  159. Parameters
  160. ----------
  161. comb_df : pd.DataFrame
  162. table of the measurements linked with runs
  163. runs_df : pd.DataFrame
  164. table of the runs
  165. compton_df : pd.DataFrame
  166. table of the comptons
  167. Returns
  168. -------
  169. dict
  170. average parameters on this DataFrame
  171. """
  172. if (len(comb_df) == 1) and pd.isnull(comb_df.iloc[0].at['compton_start']):
  173. # no direct measurements of the compton during data runs
  174. # estimate energy by the nearest points
  175. # print('Hello')
  176. min_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_first']].iloc[0].at['starttime']
  177. max_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_last']].iloc[0].at['stoptime']
  178. nearest_row_before = compton_df.iloc[pd.Index(compton_df.endtime).get_loc(min_run_time, 'nearest')]
  179. nearest_row_after = compton_df.iloc[pd.Index(compton_df.begintime).get_loc(max_run_time, 'nearest')]
  180. # regulatization
  181. nearest_row_before['data'][1] = max(nearest_row_before['data'][3], 1e-3)
  182. nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3)
  183. nearest_row_before['data'][1] = max(nearest_row_before['data'][1], 1e-3)
  184. nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3)
  185. mean_energy = (nearest_row_before['data'][0] + nearest_row_after['data'][0])/2
  186. mean_spread = (nearest_row_before['data'][2] + nearest_row_after['data'][2])/2
  187. std_energy = np.sqrt(1/(1/(nearest_row_before['data'][1])**2 + 1/(nearest_row_after['data'][1])**2))
  188. std_spread = np.sqrt(1/(1/(nearest_row_before['data'][3])**2 + 1/(nearest_row_after['data'][3])**2))
  189. sys_energy = np.std([nearest_row_before['data'][0], nearest_row_after['data'][0]])
  190. return {
  191. 'energy_point': comb_df.elabel.min(),
  192. 'first_run': comb_df.run_first.min(),
  193. 'last_run': comb_df.run_last.max(),
  194. 'mean_energy': mean_energy,
  195. 'mean_energy_stat_err': std_energy,
  196. 'mean_energy_sys_err': sys_energy,
  197. 'mean_spread': mean_spread,
  198. 'mean_spread_stat_err': std_spread,
  199. 'used_lum': 0,
  200. 'comment': 'indirect measurement',
  201. }, pd.DataFrame()
  202. df = comb_df.copy()
  203. df.spread_std = np.where(df.spread_std < 1e-4, 1e-4, df.spread_std)
  204. df = df[df.e_std > 0]
  205. mean_energy = np.sum(df.e_mean*df.luminosity/(df.e_std**2))/np.sum(df.luminosity/(df.e_std**2))
  206. # std_energy = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.e_std**2))
  207. good_criterion = np.abs((df.e_mean - mean_energy)/np.sqrt(df.e_mean.std()**2 + df.e_std**2)) < 5
  208. good_criterion = good_criterion
  209. df = df[good_criterion]
  210. m = Minuit(Likelihood(df.e_mean, df.e_std, df.luminosity), mean=df.e_mean.mean(), sigma=df.e_mean.std())
  211. m.errordef = 0.5
  212. m.limits['sigma'] = (0, None)
  213. m.migrad();
  214. sys_err = m.values['sigma']
  215. mean_en = m.values['mean']
  216. mean_spread = np.sum(df.spread_mean*df.luminosity/(df.spread_std**2))/np.sum(df.luminosity/(df.spread_std**2))
  217. std_spread = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.spread_std**2))
  218. res_dict = {
  219. 'energy_point': comb_df.elabel.min(),
  220. 'first_run': comb_df.run_first.min(),
  221. 'last_run': comb_df.run_last.max(),
  222. 'mean_energy': mean_en,
  223. 'mean_energy_stat_err': m.errors['mean'],
  224. 'mean_energy_sys_err': sys_err,
  225. 'mean_spread': mean_spread,
  226. 'mean_spread_stat_err': std_spread,
  227. 'used_lum': df.luminosity.sum()/comb_df.luminosity_total.sum(),
  228. 'comment': '',
  229. }
  230. return res_dict, df
  231. def process_combined(combined_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame, pics_folder: Optional[str] = None) -> pd.DataFrame:
  232. if pics_folder is not None:
  233. plt.ioff()
  234. plt.style.use('ggplot')
  235. locator = mdates.AutoDateLocator(minticks=5)
  236. formatter = mdates.ConciseDateFormatter(locator)
  237. formatter.formats = ['%y', '%b', '%d', '%H:%M', '%H:%M', '%S.%f', ]
  238. formatter.zero_formats = [''] + formatter.formats[:-1]
  239. formatter.zero_formats[3] = '%d-%b'
  240. formatter.offset_formats = ['', '%Y', '%b %Y', '%d %b %Y', '%d %b %Y', '%d %b %Y %H:%M', ]
  241. runs_df = runs_df.rename({'luminosity': 'luminosity_full', 'energy': 'elabel'}, axis=1)
  242. combined_df = pd.merge(combined_df.drop(['elabel'], axis=1), runs_df[['run', 'elabel', 'luminosity_full']], how='outer')
  243. combined_df = combined_df.sort_values(by='run')
  244. combined_df['luminosity'] = combined_df['luminosity'].fillna(0)
  245. combined_df['point_idx'] = np.cumsum(~np.isclose(combined_df.elabel, combined_df.elabel.shift(1), atol=1e-4))
  246. combined_df = combined_df.groupby(['point_idx', 'compton_start'], dropna=False).agg(
  247. elabel=('elabel', 'min'), elabel_test=('elabel', 'max'),
  248. run_first=('run', 'min'), run_last=('run', 'max'),
  249. luminosity=('luminosity', 'sum'), luminosity_total=('luminosity_full', 'sum'),
  250. compton_stop=('compton_stop', 'min'), compton_stop_test=('compton_stop', 'max'),
  251. e_mean=('e_mean', 'min'), e_mean_test=('e_mean', 'max'),
  252. e_std=('e_std', 'min'), e_std_test=('e_std', 'max'),
  253. spread_mean=('spread_mean', 'min'), spread_mean_test=('spread_mean', 'max'),
  254. spread_std=('spread_std', 'min'), spread_std_test=('spread_std', 'max'),
  255. ).reset_index().set_index('point_idx')
  256. # return combined_df
  257. result_df = pd.DataFrame(columns=['energy_point', 'first_run', 'last_run', 'mean_energy', 'mean_energy_stat_err', 'mean_energy_sys_err', 'mean_spread', 'mean_spread_stat_err', 'used_lum', 'comment'])
  258. for i, table in tqdm(combined_df.groupby('point_idx', dropna=False)):
  259. res_dict, good_df = calculate_point(table, runs_df, compton_df)
  260. result_df = result_df.append(res_dict, ignore_index=True)
  261. if pics_folder is not None:
  262. plt_table = good_df.dropna()
  263. if len(plt_table) == 0:
  264. continue
  265. total_error = np.sqrt(res_dict["mean_energy_stat_err"]**2 + res_dict["mean_energy_sys_err"]**2)
  266. half_timedelta = (plt_table.compton_stop - plt_table.compton_start)/2
  267. time = plt_table.compton_start + half_timedelta
  268. dlt0 = timedelta(days=1)
  269. fig, ax = plt.subplots(1, 1, dpi=120, tight_layout=True)
  270. ax.errorbar(time, plt_table.e_mean, xerr=half_timedelta, yerr=plt_table.e_std, fmt='.')
  271. ax.axhline(res_dict['mean_energy'], color='black', zorder=3, label='Mean')
  272. ax.fill_between([plt_table.compton_stop.min(), plt_table.compton_stop.max()],
  273. [res_dict['mean_energy'] - total_error]*2,
  274. [res_dict['mean_energy'] + total_error]*2, color='green', zorder=1, alpha=0.4)
  275. ax.tick_params(axis='x', labelrotation=45)
  276. ax.xaxis.set_major_locator(locator)
  277. ax.xaxis.set_major_formatter(formatter)
  278. ax.set(title=f'{res_dict["energy_point"]}, E = {res_dict["mean_energy"]:.3f} ± {res_dict["mean_energy_stat_err"]:.3f} ± {res_dict["mean_energy_sys_err"]:.3f} MeV', xlabel='Time, UTC', ylabel='Energy, [MeV]',
  279. xlim=[plt_table.compton_stop.min(), plt_table.compton_stop.max()])
  280. plt.savefig(f'{pics_folder}/{res_dict["first_run"]}_{res_dict["energy_point"]}.png')
  281. plt.close()
  282. return result_df
  283. # python scripts/compton_combiner.py --season NNBAR2021 --config database.ini
  284. def main():
  285. parser = argparse.ArgumentParser(description = 'Mean compton energy measurements from clbrdb')
  286. parser.add_argument('--season', help = 'Name of the season')
  287. parser.add_argument('--config', help = 'Config file containing information for access to databases')
  288. args = parser.parse_args()
  289. # logging.info(f"Arguments: season: {args.season}, config {args.config}")
  290. parser = ConfigParser()
  291. parser.read(args.config);
  292. rdb = RunsDBHandler(**parser['cmdruns'])
  293. idx = SEASONS['name'].index(args.season)
  294. runs_range = (SEASONS['start_run'][idx], SEASONS['start_run'][idx+1])
  295. res_rdb = rdb.load_tables(runs_range)
  296. runs_df = pd.DataFrame(res_rdb[0], columns=res_rdb[1])
  297. tdlt0 = timedelta(days=2)
  298. time_range = (runs_df.starttime.min() - tdlt0, runs_df.stoptime.max() + tdlt0)
  299. clbrdb = CalibrdbHandler(**parser['clbrDB'])
  300. res_clbrdb = clbrdb.load_table('Misc', 'RunHeader', 'Compton_run', num_last_rows = None, timerange = time_range)
  301. cb = Combiner(res_rdb, res_clbrdb)
  302. comb_df = cb.combined_table()
  303. compton_df = pd.DataFrame(res_clbrdb[0], columns=res_clbrdb[1])
  304. cdf = process_combined(comb_df, runs_df, compton_df, './pics')
  305. cdf.to_csv(f'{args.season}.csv', index=False, float_format='%g')
  306. return
  307. if __name__ == "__main__":
  308. main()