"""Script to combine compton measurements with runs and process these data """ import argparse from configparser import ConfigParser from datetime import datetime, timedelta import sqlite3 from typing import Union, Tuple, Optional from compton_filter import CalibrdbHandler from iminuit import Minuit import matplotlib.dates as mdates import matplotlib.pyplot as plt from mysql.connector import connect, Error import numpy as np import pandas as pd from tqdm import tqdm SEASONS = { 'name': ['HIGH2017', 'RHO2018', 'HIGH2019', 'LOW2020', 'HIGH2020', 'HIGH2021', 'NNBAR2021'], 'start_run': [36872, 48938, 70014, 85224, 89973, 98116, 107342, None], } class RunsDBHandler(): def __init__(self, host: str = 'cmddb', database: str = 'online', user: str = None, password: str = None): self.conn = connect(host = host, database = database, user = user, password = password) self.cur = self.conn.cursor() self.cur.execute("SET time_zone = 'UTC';") @property def fields(self) -> list: """Returns a list of available columns in the RunsDB """ self.cur.execute("""DESCRIBE Runlog""") return self.cur.fetchall() def load_tables(self, range: Union[Tuple[int, Optional[int]], Tuple[datetime, datetime]]): cond = "" if isinstance(range[0], int): cond = f" AND run >= {range[0]} " if range[1] is not None: cond += f" AND run <= {range[1]} " elif isinstance(range[0], datetime): cond = f" AND stoptime BETWEEN %s AND %s" sql_query = f""" SELECT run, starttime, stoptime, energy, luminosity FROM Runlog WHERE quality = "Y" {cond} AND luminosity > 0 AND stoptime > starttime AND nevent > 0 ORDER BY run DESC""" if isinstance(range[0], datetime): self.cur.execute(sql_query, range) else: self.cur.execute(sql_query) field_names = [i[0] for i in self.cur.description] res = self.cur.fetchall() return res, field_names def __del__(self): self.conn.close() class Combiner(): """Combines a dataframe with runs and a dataframe with compton measurements together """ def __init__(self, runsdb: Tuple[list, list], clbrdb: Tuple[list, list]): """ Parameters ---------- runsdb : Tuple[list, list] table of runs (rows and field names) clbrdb : Tuple[list, list] table of compton measurements (rows and field names) """ rdb_rows, r_fld = runsdb cdb_rows, c_fld = clbrdb self.conn = sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) self.cur = self.conn.cursor() self.cur.execute(f"CREATE table runs (run, elabel, starttime timestamp, stoptime timestamp, luminosity)") self.cur.execute(f"CREATE table compton (begintime timestamp, endtime timestamp, e_mean, e_std, spread_mean, spread_std)") run_row_generator = map(lambda x: (x[r_fld.index("run")], x[r_fld.index("energy")], x[r_fld.index("starttime")], x[r_fld.index("stoptime")], x[r_fld.index("luminosity")]), rdb_rows) c_data_idx = c_fld.index("data") compton_row_generator = map(lambda x: (x[c_fld.index("begintime")], x[c_fld.index("endtime")], float(x[c_data_idx][0]), float(x[c_data_idx][1]), float(x[c_data_idx][2]), float(x[c_data_idx][3])), cdb_rows) self.cur.executemany(f"""INSERT into runs VALUES ({','.join(['?']*5)})""", run_row_generator) self.cur.executemany(f"""INSERT into compton VALUES ({','.join(['?']*6)})""", compton_row_generator) self.__create_combined_table() def __create_combined_table(self): create_combined_query = """ CREATE TABLE combined_table AS SELECT runs.run AS run, runs.elabel AS elabel, runs.starttime as "run_start [timestamp]", runs.stoptime AS "run_stop [timestamp]", compton.begintime AS "compton_start [timestamp]", compton.endtime AS "compton_stop [timestamp]", runs.luminosity, compton.e_mean, compton.e_std, compton.spread_mean, compton.spread_std FROM runs, compton WHERE (runs.starttime BETWEEN compton.begintime AND compton.endtime) OR (runs.stoptime BETWEEN compton.begintime AND compton.endtime) OR (compton.begintime BETWEEN runs.starttime AND runs.stoptime) OR (compton.endtime BETWEEN runs.starttime AND runs.stoptime); """ self.cur.execute(create_combined_query) return def combined_table(self) -> pd.DataFrame: """Returns combined dataframe """ sql_query = """ SELECT * FROM combined_table; """ df = pd.read_sql(sql_query, self.conn) df['common_duration'] = df[['run_stop', 'compton_stop']].min(axis=1) - df[['run_start', 'compton_start']].max(axis=1) df['run_duration'] = df['run_stop'] - df['run_start'] df['run_in_measurement'] = df['common_duration']/df['run_duration'] df = df.sort_values(by='run_in_measurement', ascending=False).drop_duplicates(subset='run').sort_values(by='run') df = df.drop(['run_duration', 'common_duration', 'run_start', 'run_stop'], axis=1) return df def __del__(self): self.conn.close() class Likelihood(): """ Likelihood function """ def __init__(self, means: np.array, sigmas: np.array, weights: np.array): """ Parameters ---------- means : np.array array of means, [MeV] sigmas : np.array array of standard deviations, [MeV] weights : np.array array of luminosities """ self.means = means self.sigmas = sigmas self.weights = weights/weights.mean() def __call__(self, mean: float, sigma: float): """ Calls likelihood calculation Parameters ---------- mean : float expected mean sigma : float expected standard deviation """ sigma_total = np.sqrt(sigma**2 + self.sigmas**2) ln_L = -np.sum( self.weights*( ((mean - self.means)**2)/(2*(sigma_total**2)) + np.log(sigma_total) ) ) return -ln_L def calculate_point(comb_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame) -> dict: """Calculates parameters of the energy (mean, std, spread) in this dataFrame Parameters ---------- comb_df : pd.DataFrame table of the measurements linked with runs runs_df : pd.DataFrame table of the runs compton_df : pd.DataFrame table of the comptons Returns ------- dict average parameters on this DataFrame """ if (len(comb_df) == 1) and pd.isnull(comb_df.iloc[0].at['compton_start']): # no direct measurements of the compton during data runs # estimate energy by the nearest points # print('Hello') min_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_first']].iloc[0].at['starttime'] max_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_last']].iloc[0].at['stoptime'] nearest_row_before = compton_df.iloc[pd.Index(compton_df.endtime).get_loc(min_run_time, 'nearest')] nearest_row_after = compton_df.iloc[pd.Index(compton_df.begintime).get_loc(max_run_time, 'nearest')] # regulatization nearest_row_before['data'][1] = max(nearest_row_before['data'][3], 1e-3) nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3) nearest_row_before['data'][1] = max(nearest_row_before['data'][1], 1e-3) nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3) mean_energy = (nearest_row_before['data'][0] + nearest_row_after['data'][0])/2 mean_spread = (nearest_row_before['data'][2] + nearest_row_after['data'][2])/2 std_energy = np.sqrt(1/(1/(nearest_row_before['data'][1])**2 + 1/(nearest_row_after['data'][1])**2)) std_spread = np.sqrt(1/(1/(nearest_row_before['data'][3])**2 + 1/(nearest_row_after['data'][3])**2)) sys_energy = np.std([nearest_row_before['data'][0], nearest_row_after['data'][0]]) return { 'energy_point': comb_df.elabel.min(), 'first_run': comb_df.run_first.min(), 'last_run': comb_df.run_last.max(), 'mean_energy': mean_energy, 'mean_energy_stat_err': std_energy, 'mean_energy_sys_err': sys_energy, 'mean_spread': mean_spread, 'mean_spread_stat_err': std_spread, 'used_lum': 0, 'comment': 'indirect measurement', }, pd.DataFrame() df = comb_df.copy() df.spread_std = np.where(df.spread_std < 1e-4, 1e-4, df.spread_std) df = df[df.e_std > 0] mean_energy = np.sum(df.e_mean*df.luminosity/(df.e_std**2))/np.sum(df.luminosity/(df.e_std**2)) # std_energy = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.e_std**2)) good_criterion = np.abs((df.e_mean - mean_energy)/np.sqrt(df.e_mean.std()**2 + df.e_std**2)) < 5 good_criterion = good_criterion df = df[good_criterion] m = Minuit(Likelihood(df.e_mean, df.e_std, df.luminosity), mean=df.e_mean.mean(), sigma=df.e_mean.std()) m.errordef = 0.5 m.limits['sigma'] = (0, None) m.migrad(); sys_err = m.values['sigma'] mean_en = m.values['mean'] mean_spread = np.sum(df.spread_mean*df.luminosity/(df.spread_std**2))/np.sum(df.luminosity/(df.spread_std**2)) std_spread = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.spread_std**2)) res_dict = { 'energy_point': comb_df.elabel.min(), 'first_run': comb_df.run_first.min(), 'last_run': comb_df.run_last.max(), 'mean_energy': mean_en, 'mean_energy_stat_err': m.errors['mean'], 'mean_energy_sys_err': sys_err, 'mean_spread': mean_spread, 'mean_spread_stat_err': std_spread, 'used_lum': df.luminosity.sum()/comb_df.luminosity_total.sum(), 'comment': '', } return res_dict, df def process_combined(combined_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame, pics_folder: Optional[str] = None) -> pd.DataFrame: if pics_folder is not None: plt.ioff() plt.style.use('ggplot') locator = mdates.AutoDateLocator(minticks=5) formatter = mdates.ConciseDateFormatter(locator) formatter.formats = ['%y', '%b', '%d', '%H:%M', '%H:%M', '%S.%f', ] formatter.zero_formats = [''] + formatter.formats[:-1] formatter.zero_formats[3] = '%d-%b' formatter.offset_formats = ['', '%Y', '%b %Y', '%d %b %Y', '%d %b %Y', '%d %b %Y %H:%M', ] runs_df = runs_df.rename({'luminosity': 'luminosity_full', 'energy': 'elabel'}, axis=1) combined_df = pd.merge(combined_df.drop(['elabel'], axis=1), runs_df[['run', 'elabel', 'luminosity_full']], how='outer') combined_df = combined_df.sort_values(by='run') combined_df['luminosity'] = combined_df['luminosity'].fillna(0) combined_df['point_idx'] = np.cumsum(~np.isclose(combined_df.elabel, combined_df.elabel.shift(1), atol=1e-4)) combined_df = combined_df.groupby(['point_idx', 'compton_start'], dropna=False).agg( elabel=('elabel', 'min'), elabel_test=('elabel', 'max'), run_first=('run', 'min'), run_last=('run', 'max'), luminosity=('luminosity', 'sum'), luminosity_total=('luminosity_full', 'sum'), compton_stop=('compton_stop', 'min'), compton_stop_test=('compton_stop', 'max'), e_mean=('e_mean', 'min'), e_mean_test=('e_mean', 'max'), e_std=('e_std', 'min'), e_std_test=('e_std', 'max'), spread_mean=('spread_mean', 'min'), spread_mean_test=('spread_mean', 'max'), spread_std=('spread_std', 'min'), spread_std_test=('spread_std', 'max'), ).reset_index().set_index('point_idx') # return combined_df result_df = pd.DataFrame(columns=['energy_point', 'first_run', 'last_run', 'mean_energy', 'mean_energy_stat_err', 'mean_energy_sys_err', 'mean_spread', 'mean_spread_stat_err', 'used_lum', 'comment']) for i, table in tqdm(combined_df.groupby('point_idx', dropna=False)): res_dict, good_df = calculate_point(table, runs_df, compton_df) result_df = result_df.append(res_dict, ignore_index=True) if pics_folder is not None: plt_table = good_df.dropna() if len(plt_table) == 0: continue total_error = np.sqrt(res_dict["mean_energy_stat_err"]**2 + res_dict["mean_energy_sys_err"]**2) half_timedelta = (plt_table.compton_stop - plt_table.compton_start)/2 time = plt_table.compton_start + half_timedelta dlt0 = timedelta(days=1) fig, ax = plt.subplots(1, 1, dpi=120, tight_layout=True) ax.errorbar(time, plt_table.e_mean, xerr=half_timedelta, yerr=plt_table.e_std, fmt='.') ax.axhline(res_dict['mean_energy'], color='black', zorder=3, label='Mean') ax.fill_between([plt_table.compton_stop.min(), plt_table.compton_stop.max()], [res_dict['mean_energy'] - total_error]*2, [res_dict['mean_energy'] + total_error]*2, color='green', zorder=1, alpha=0.4) ax.tick_params(axis='x', labelrotation=45) ax.xaxis.set_major_locator(locator) ax.xaxis.set_major_formatter(formatter) ax.set(title=f'{res_dict["energy_point"]}, E = {res_dict["mean_energy"]:.3f} ± {res_dict["mean_energy_stat_err"]:.3f} ± {res_dict["mean_energy_sys_err"]:.3f} MeV', xlabel='Time, UTC', ylabel='Energy, [MeV]', xlim=[plt_table.compton_stop.min(), plt_table.compton_stop.max()]) plt.savefig(f'{pics_folder}/{res_dict["first_run"]}_{res_dict["energy_point"]}.png') plt.close() return result_df # python scripts/compton_combiner.py --season NNBAR2021 --config database.ini def main(): parser = argparse.ArgumentParser(description = 'Mean compton energy measurements from clbrdb') parser.add_argument('--season', help = 'Name of the season') parser.add_argument('--config', help = 'Config file containing information for access to databases') args = parser.parse_args() # logging.info(f"Arguments: season: {args.season}, config {args.config}") parser = ConfigParser() parser.read(args.config); rdb = RunsDBHandler(**parser['cmdruns']) idx = SEASONS['name'].index(args.season) runs_range = (SEASONS['start_run'][idx], SEASONS['start_run'][idx+1]) res_rdb = rdb.load_tables(runs_range) runs_df = pd.DataFrame(res_rdb[0], columns=res_rdb[1]) tdlt0 = timedelta(days=2) time_range = (runs_df.starttime.min() - tdlt0, runs_df.stoptime.max() + tdlt0) clbrdb = CalibrdbHandler(**parser['clbrDB']) res_clbrdb = clbrdb.load_table('Misc', 'RunHeader', 'Compton_run', num_last_rows = None, timerange = time_range) cb = Combiner(res_rdb, res_clbrdb) comb_df = cb.combined_table() compton_df = pd.DataFrame(res_clbrdb[0], columns=res_clbrdb[1]) cdf = process_combined(comb_df, runs_df, compton_df, './pics') cdf.to_csv(f'{args.season}.csv', index=False, float_format='%g') return if __name__ == "__main__": main()