compton
/
sources


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
							"""Script to combine compton measurements with runs and process these data
"""

import argparse
from configparser import ConfigParser
from datetime import datetime, timedelta
import sqlite3
from typing import Union, Tuple, Optional

from compton_filter import CalibrdbHandler
from iminuit import Minuit
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from mysql.connector import connect, Error
import numpy as np
import pandas as pd
from tqdm import tqdm

SEASONS = {
    'name': ['HIGH2017', 'RHO2018', 'HIGH2019', 'LOW2020', 'HIGH2020', 'HIGH2021', 'NNBAR2021'],
    'start_run': [36872, 48938, 70014, 85224, 89973, 98116, 107342, None],
}

class RunsDBHandler():
    def __init__(self, host: str = 'cmddb', database: str = 'online', user: str = None, password: str = None):
        self.conn = connect(host = host, database = database, user = user, password = password)
        self.cur = self.conn.cursor()
        self.cur.execute("SET time_zone = 'UTC';")
        
    @property
    def fields(self) -> list:
        """Returns a list of available columns in the RunsDB
        """
        
        self.cur.execute("""DESCRIBE Runlog""")
        return self.cur.fetchall()
        
    def load_tables(self, range: Union[Tuple[int, Optional[int]], Tuple[datetime, datetime]]):
        
        cond = ""
        if isinstance(range[0], int):
            cond = f" AND run >= {range[0]} "
            if range[1] is not None:
                cond += f" AND run <= {range[1]} "
        elif isinstance(range[0], datetime):
            cond = f" AND stoptime BETWEEN %s AND %s"
            
        sql_query = f"""
        SELECT 
            run, 
            starttime, 
            stoptime, 
            energy, 
            luminosity
        FROM Runlog 
        WHERE 
            quality = "Y"
            {cond}
            AND luminosity > 0
            AND stoptime > starttime
            AND nevent > 0
        ORDER BY run DESC"""
        
        if isinstance(range[0], datetime):
            self.cur.execute(sql_query, range)
        else:
            self.cur.execute(sql_query)
        
        field_names = [i[0] for i in self.cur.description]
        res = self.cur.fetchall()
        return res, field_names
        
    def __del__(self):
        self.conn.close()
        
class Combiner():
    """Combines a dataframe with runs and a dataframe with compton measurements together
    """
    
    def __init__(self, runsdb: Tuple[list, list], clbrdb: Tuple[list, list]):
        """
        Parameters
        ----------
        runsdb : Tuple[list, list]
            table of runs (rows and field names)
        clbrdb : Tuple[list, list]
            table of compton measurements (rows and field names)            
        """
        
        rdb_rows, r_fld = runsdb
        cdb_rows, c_fld = clbrdb
        
        self.conn = sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES)
        self.cur = self.conn.cursor()
        self.cur.execute(f"CREATE table runs (run, elabel, starttime timestamp, stoptime timestamp, luminosity)")
        self.cur.execute(f"CREATE table compton (begintime timestamp, endtime timestamp, e_mean, e_std, spread_mean, spread_std)")
        
        run_row_generator = map(lambda x: (x[r_fld.index("run")], x[r_fld.index("energy")],
                                           x[r_fld.index("starttime")], x[r_fld.index("stoptime")],
                                           x[r_fld.index("luminosity")]), rdb_rows)
        c_data_idx = c_fld.index("data")
        compton_row_generator = map(lambda x: (x[c_fld.index("begintime")], x[c_fld.index("endtime")], 
                                               float(x[c_data_idx][0]), float(x[c_data_idx][1]), 
                                               float(x[c_data_idx][2]), float(x[c_data_idx][3])), cdb_rows)
        
        self.cur.executemany(f"""INSERT into runs VALUES ({','.join(['?']*5)})""", run_row_generator)
        self.cur.executemany(f"""INSERT into compton VALUES ({','.join(['?']*6)})""", compton_row_generator)
        
        self.__create_combined_table()        
        
    def __create_combined_table(self):
        create_combined_query = """
        CREATE TABLE combined_table AS
        SELECT 
            runs.run AS run,
            runs.elabel AS elabel,
            runs.starttime as "run_start [timestamp]", 
            runs.stoptime AS "run_stop [timestamp]", 
            compton.begintime AS "compton_start [timestamp]", 
            compton.endtime AS "compton_stop [timestamp]",
            runs.luminosity, compton.e_mean, compton.e_std, compton.spread_mean, compton.spread_std
        FROM runs, compton
        WHERE 
            (runs.starttime BETWEEN compton.begintime AND compton.endtime) 
            OR (runs.stoptime BETWEEN compton.begintime AND compton.endtime)
            OR (compton.begintime BETWEEN runs.starttime AND runs.stoptime)
            OR (compton.endtime BETWEEN runs.starttime AND runs.stoptime);
        """
        self.cur.execute(create_combined_query)
        return
        
    def combined_table(self) -> pd.DataFrame:
        """Returns combined dataframe
        """
        
        sql_query = """
        SELECT * FROM combined_table;
        """
        df = pd.read_sql(sql_query, self.conn)
        df['common_duration'] = df[['run_stop', 'compton_stop']].min(axis=1) - df[['run_start', 'compton_start']].max(axis=1)
        df['run_duration'] = df['run_stop'] - df['run_start']
        df['run_in_measurement'] = df['common_duration']/df['run_duration']
        df = df.sort_values(by='run_in_measurement', ascending=False).drop_duplicates(subset='run').sort_values(by='run')
        df = df.drop(['run_duration', 'common_duration', 'run_start', 'run_stop'], axis=1)
        return df
        
    def __del__(self):
        self.conn.close()
        
class Likelihood():
    """
    Likelihood function
    """
    
    def __init__(self, means: np.array, sigmas: np.array, weights: np.array):
        """
        Parameters
        ----------
        means : np.array
            array of means, [MeV]
        sigmas : np.array
            array of standard deviations, [MeV]
        weights : np.array
            array of luminosities
        """
        
        self.means = means
        self.sigmas = sigmas
        self.weights = weights/weights.mean()
        
    def __call__(self, mean: float, sigma: float):
        """
        Calls likelihood calculation
        
        Parameters
        ----------
        mean : float
            expected mean
        sigma : float
            expected standard deviation
        """
        
        sigma_total = np.sqrt(sigma**2 + self.sigmas**2)
        ln_L = -np.sum( self.weights*( ((mean - self.means)**2)/(2*(sigma_total**2)) + np.log(sigma_total) ) )
        return -ln_L
    
def calculate_point(comb_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame) -> dict:
    """Calculates parameters of the energy (mean, std, spread) in this dataFrame
    
    Parameters
    ----------
    comb_df : pd.DataFrame
        table of the measurements linked with runs
    runs_df : pd.DataFrame
        table of the runs
    compton_df : pd.DataFrame
        table of the comptons
    
    Returns
    -------
    dict
        average parameters on this DataFrame
    """
        
    if (len(comb_df) == 1) and pd.isnull(comb_df.iloc[0].at['compton_start']):
        # no direct measurements of the compton during data runs
        # estimate energy by the nearest points
        # print('Hello')
        min_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_first']].iloc[0].at['starttime']
        max_run_time = runs_df[runs_df.run == comb_df.iloc[0].at['run_last']].iloc[0].at['stoptime']
        
        nearest_row_before = compton_df.iloc[pd.Index(compton_df.endtime).get_loc(min_run_time, 'nearest')]
        nearest_row_after = compton_df.iloc[pd.Index(compton_df.begintime).get_loc(max_run_time, 'nearest')]
        
        # regulatization
        nearest_row_before['data'][1] = max(nearest_row_before['data'][3], 1e-3)
        nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3)
        nearest_row_before['data'][1] = max(nearest_row_before['data'][1], 1e-3)
        nearest_row_after['data'][3] = max(nearest_row_after['data'][3], 1e-3)
                
        mean_energy = (nearest_row_before['data'][0] + nearest_row_after['data'][0])/2
        mean_spread = (nearest_row_before['data'][2] + nearest_row_after['data'][2])/2
        std_energy = np.sqrt(1/(1/(nearest_row_before['data'][1])**2 + 1/(nearest_row_after['data'][1])**2))
        std_spread = np.sqrt(1/(1/(nearest_row_before['data'][3])**2 + 1/(nearest_row_after['data'][3])**2))
        sys_energy = np.std([nearest_row_before['data'][0], nearest_row_after['data'][0]])
        
        
        return {
            'energy_point': comb_df.elabel.min(),
            'first_run': comb_df.run_first.min(),
            'last_run': comb_df.run_last.max(), 
            'mean_energy': mean_energy, 
            'mean_energy_stat_err': std_energy, 
            'mean_energy_sys_err': sys_energy, 
            'mean_spread': mean_spread,
            'mean_spread_stat_err': std_spread, 
            'used_lum': 0, 
            'comment': 'indirect measurement',
        }, pd.DataFrame()
        
        
    
    df = comb_df.copy()
    df.spread_std = np.where(df.spread_std < 1e-4, 1e-4, df.spread_std)
    
    df = df[df.e_std > 0]
    mean_energy = np.sum(df.e_mean*df.luminosity/(df.e_std**2))/np.sum(df.luminosity/(df.e_std**2))
    # std_energy = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.e_std**2))
    
    good_criterion = np.abs((df.e_mean - mean_energy)/np.sqrt(df.e_mean.std()**2 + df.e_std**2)) < 5
    good_criterion = good_criterion
    df = df[good_criterion]
    
    m = Minuit(Likelihood(df.e_mean, df.e_std, df.luminosity), mean=df.e_mean.mean(), sigma=df.e_mean.std())
    m.errordef = 0.5 
    m.limits['sigma'] = (0, None)
    m.migrad();
    sys_err = m.values['sigma']
    mean_en = m.values['mean']
    
    mean_spread = np.sum(df.spread_mean*df.luminosity/(df.spread_std**2))/np.sum(df.luminosity/(df.spread_std**2))
    std_spread = np.sqrt(1/np.sum((df.luminosity/df.luminosity.mean())/df.spread_std**2))
    
    res_dict = {
        'energy_point': comb_df.elabel.min(), 
        'first_run': comb_df.run_first.min(),
        'last_run': comb_df.run_last.max(), 
        'mean_energy': mean_en, 
        'mean_energy_stat_err': m.errors['mean'], 
        'mean_energy_sys_err': sys_err, 
        'mean_spread': mean_spread,
        'mean_spread_stat_err': std_spread, 
        'used_lum': df.luminosity.sum()/comb_df.luminosity_total.sum(), 
        'comment': '',
    }
    return res_dict, df

def process_combined(combined_df: pd.DataFrame, runs_df: pd.DataFrame, compton_df: pd.DataFrame, pics_folder: Optional[str] = None) -> pd.DataFrame:
    
    if pics_folder is not None:
        plt.ioff()
        plt.style.use('ggplot')
        locator = mdates.AutoDateLocator(minticks=5)
        formatter = mdates.ConciseDateFormatter(locator)
        formatter.formats = ['%y', '%b', '%d', '%H:%M', '%H:%M', '%S.%f', ]
        formatter.zero_formats = [''] + formatter.formats[:-1]
        formatter.zero_formats[3] = '%d-%b'
        formatter.offset_formats = ['', '%Y', '%b %Y', '%d %b %Y', '%d %b %Y', '%d %b %Y %H:%M', ]
    
    runs_df = runs_df.rename({'luminosity': 'luminosity_full', 'energy': 'elabel'}, axis=1)
    combined_df = pd.merge(combined_df.drop(['elabel'], axis=1), runs_df[['run', 'elabel', 'luminosity_full']], how='outer')
    combined_df = combined_df.sort_values(by='run')
    combined_df['luminosity'] = combined_df['luminosity'].fillna(0)
    
    combined_df['point_idx'] = np.cumsum(~np.isclose(combined_df.elabel, combined_df.elabel.shift(1), atol=1e-4))
    combined_df = combined_df.groupby(['point_idx', 'compton_start'], dropna=False).agg(
        elabel=('elabel', 'min'), elabel_test=('elabel', 'max'),
        run_first=('run', 'min'), run_last=('run', 'max'),
        luminosity=('luminosity', 'sum'), luminosity_total=('luminosity_full', 'sum'),
        compton_stop=('compton_stop', 'min'), compton_stop_test=('compton_stop', 'max'),
        e_mean=('e_mean', 'min'), e_mean_test=('e_mean', 'max'),
        e_std=('e_std', 'min'), e_std_test=('e_std', 'max'),
        spread_mean=('spread_mean', 'min'), spread_mean_test=('spread_mean', 'max'),
        spread_std=('spread_std', 'min'), spread_std_test=('spread_std', 'max'),
    ).reset_index().set_index('point_idx')
    # return combined_df
    
    result_df = pd.DataFrame(columns=['energy_point', 'first_run', 'last_run', 'mean_energy', 'mean_energy_stat_err', 'mean_energy_sys_err', 'mean_spread', 'mean_spread_stat_err', 'used_lum', 'comment'])
    
    for i, table in tqdm(combined_df.groupby('point_idx', dropna=False)):
        res_dict, good_df = calculate_point(table, runs_df, compton_df)
        result_df = result_df.append(res_dict, ignore_index=True)
        
        if pics_folder is not None:
            plt_table = good_df.dropna()
            if len(plt_table) == 0:
                continue
            
            total_error = np.sqrt(res_dict["mean_energy_stat_err"]**2 + res_dict["mean_energy_sys_err"]**2)
            half_timedelta = (plt_table.compton_stop - plt_table.compton_start)/2 
            time = plt_table.compton_start + half_timedelta
            dlt0 = timedelta(days=1)
            
            fig, ax = plt.subplots(1, 1, dpi=120, tight_layout=True)
            ax.errorbar(time, plt_table.e_mean, xerr=half_timedelta, yerr=plt_table.e_std, fmt='.')
            ax.axhline(res_dict['mean_energy'], color='black', zorder=3, label='Mean')
            ax.fill_between([plt_table.compton_stop.min(), plt_table.compton_stop.max()], 
                            [res_dict['mean_energy'] - total_error]*2, 
                            [res_dict['mean_energy'] + total_error]*2, color='green', zorder=1, alpha=0.4)
            ax.tick_params(axis='x', labelrotation=45)
            ax.xaxis.set_major_locator(locator)
            ax.xaxis.set_major_formatter(formatter)
            ax.set(title=f'{res_dict["energy_point"]}, E = {res_dict["mean_energy"]:.3f} ± {res_dict["mean_energy_stat_err"]:.3f} ± {res_dict["mean_energy_sys_err"]:.3f} MeV', xlabel='Time, UTC', ylabel='Energy, [MeV]', 
                   xlim=[plt_table.compton_stop.min(), plt_table.compton_stop.max()])
            plt.savefig(f'{pics_folder}/{res_dict["first_run"]}_{res_dict["energy_point"]}.png')
            plt.close()
    
    return result_df
    
# python scripts/compton_combiner.py --season NNBAR2021 --config database.ini
def main():
    parser = argparse.ArgumentParser(description = 'Mean compton energy measurements from clbrdb')
    parser.add_argument('--season', help = 'Name of the season')
    parser.add_argument('--config', help = 'Config file containing information for access to databases')
    
    args = parser.parse_args()
    # logging.info(f"Arguments: season: {args.season}, config {args.config}")

    parser = ConfigParser()
    parser.read(args.config);
    
    rdb = RunsDBHandler(**parser['cmdruns'])
    
    idx = SEASONS['name'].index(args.season)
    runs_range = (SEASONS['start_run'][idx], SEASONS['start_run'][idx+1])
    res_rdb = rdb.load_tables(runs_range)
    runs_df = pd.DataFrame(res_rdb[0], columns=res_rdb[1])
    
    tdlt0 = timedelta(days=2)
    time_range = (runs_df.starttime.min() - tdlt0, runs_df.stoptime.max() + tdlt0)
    clbrdb = CalibrdbHandler(**parser['clbrDB'])
    res_clbrdb = clbrdb.load_table('Misc', 'RunHeader', 'Compton_run', num_last_rows = None, timerange = time_range)
    
    cb = Combiner(res_rdb, res_clbrdb)
    comb_df = cb.combined_table()
    
    compton_df = pd.DataFrame(res_clbrdb[0], columns=res_clbrdb[1])
    
    cdf = process_combined(comb_df, runs_df, compton_df, './pics')
    cdf.to_csv(f'{args.season}.csv', index=False, float_format='%g')
    return 

if __name__ == "__main__":
    main()