can now read and write simple sfcf measurements

Justus Kuhlmann 2024-06-19 12:55:42 +00:00
parent 5ec34ad98b
commit c3aa7577fb
7 changed files with 408 additions and 0 deletions

20
backlogger/__init__.py Normal file

@@ -0,0 +1,20 @@
"""
The aim of this project is to extend pyerrors so that measurements from different projects can be collected and made easily accessible to
the research group. The idea is to build a database in which researchers can search for measurements on a per-correlator basis,
so that these may be reused.
As the standard format to store the measurements, we use the .json.gz format from pyerrors.
This allows us to organize a library of files and exported dictionaries.
The format is compressible, but also human readable in uncompressed form.
The project creates a database with a table to store the measurements, and a folder to store the .json.gz files, and it tracks changes to that folder automatically using datalad.
This way, we can harness the power of datalad as a backend to reproducibly build our database.
Projects that are themselves datalad datasets can be linked to the backlog of correlators as subdatasets, such that, using datalad's rerun function,
it can easily be seen where each measurement came from and how it may be reproduced.
For now, we are interested in collecting primary observables only, as these are the most computationally expensive and time consuming to calculate.
"""
from .main import *
from .input import *
from .initialization import *
from .io import *

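A minimal end-to-end sketch of the intended workflow, using only the functions introduced in this commit; the backlog path, project url, ensemble name and file names are hypothetical:

import backlogger as bl

# create a fresh backlog (hypothetical location)
bl.create("/data/backlog")

# register a datalad project as a subdataset and get its uuid
uuid = bl.import_project("https://example.com/my_project.git", "/data/backlog")

# read an sfcf measurement from that project ...
param = bl.input.sfcf.read_param("/data/backlog", uuid, "infiles/my_run.in")
data, specs = bl.input.sfcf.read_data("/data/backlog", uuid, "out", "my_run", param)

# ... and archive it as .json.gz plus database entries
bl.write_measurement("/data/backlog", "A653", data, uuid, "sfcf", str(param), "infiles/my_run.in")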
31
backlogger/git_tools.py Normal file

@@ -0,0 +1,31 @@
import os
import datalad.api as dl


def move_submodule(repo_path, old_path, new_path):
    """
    Move a submodule to a new location.

    Parameters
    ----------
    repo_path: str
        The path to the repository that contains the submodule.
    old_path: str
        The old path of the module.
    new_path: str
        The new path of the module.
    """
    os.rename(repo_path + "/" + old_path, repo_path + "/" + new_path)
    with open(repo_path + '/.gitmodules', 'r') as fp:
        lines = fp.readlines()
    # rewrite the entries in place via their index; reassigning the
    # loop variable would not modify the list
    for i, line in enumerate(lines):
        if line.startswith('[submodule "' + old_path + '"]') or line.startswith('\tpath'):
            lines[i] = line.replace(old_path, new_path)
    with open(repo_path + '/.gitmodules', 'w') as fp:
        fp.writelines(lines)
    dl.save(repo_path, message="Move module from " + old_path + " to " + new_path, dataset=repo_path)

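For reference, on the intended call move_submodule(path, 'projects/tmp', 'projects/' + uuid), the function rewrites a standard .gitmodules entry of the following form; the url line (hypothetical here) stays untouched:

[submodule "projects/tmp"]
	path = projects/tmp
	url = https://example.com/my_project.git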
46
backlogger/initialization.py Normal file

@@ -0,0 +1,46 @@
import sqlite3
import datalad.api as dl
import os


def _create_db(path):
    """
    Create the database file and its tables.
    """
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS backlogs
                 (id INTEGER PRIMARY KEY,
                 name TEXT,
                 ensemble TEXT,
                 code TEXT,
                 path TEXT,
                 project TEXT,
                 parameters TEXT,
                 parameter_file TEXT,
                 created_at TEXT,
                 updated_at TEXT)''')
    c.execute('''CREATE TABLE IF NOT EXISTS projects
                 (id TEXT PRIMARY KEY,
                 aliases TEXT,
                 code TEXT,
                 created_at TEXT,
                 updated_at TEXT)''')
    conn.commit()
    conn.close()


def create(path):
    """
    Create the backlogger folder.
    """
    dl.create(path)
    _create_db(path + '/backlogger.db')
    os.chmod(path + '/backlogger.db', 0o666)
    os.makedirs(path + '/projects')
    os.makedirs(path + '/projects/tmp')
    os.makedirs(path + '/archive')
    # directory for the per-project import scripts, expected by import_project()
    os.makedirs(path + '/import_scripts')
    dl.save(path, dataset=path, message="Initialize backlogger directory.")

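Once initialized, the database can also be queried directly with sqlite3; a short lookup sketch, with hypothetical backlog path and ensemble name:

import sqlite3

conn = sqlite3.connect("/data/backlog/backlogger.db")
c = conn.cursor()
# all archived f_A measurements on one ensemble
for name, meas_path, project in c.execute(
        "SELECT name, path, project FROM backlogs WHERE ensemble=? AND name=?",
        ("A653", "f_A")):
    print(name, meas_path, project)
conn.close()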
5
backlogger/input/__init__.py Normal file

@@ -0,0 +1,5 @@
"""
Import functions for different codes.
"""
from . import sfcf

195
backlogger/input/sfcf.py Normal file

@@ -0,0 +1,195 @@
import pyerrors as pe
import datalad.api as dl


def read_param(path, project, file_in_project):
    """
    Read the parameters from an sfcf parameter file.

    Parameters
    ----------
    path : str
        The path to the backlogger folder.
    project : str
        The uuid of the project.
    file_in_project : str
        The path of the parameter file within the project.

    Returns
    -------
    dict
        The parameters in the sfcf file.
    """
    file = path + "/projects/" + project + '/' + file_in_project
    dl.get(file, dataset=path)
    with open(file, 'r') as f:
        lines = f.readlines()

    params = {}
    params['wf_offsets'] = []
    params['wf_basis'] = []
    params['wf_coeff'] = []
    params['qr'] = {}
    params['mrr'] = []
    params['crr'] = []
    params['qs'] = {}
    params['mrs'] = []
    params['crs'] = []

    # iterate with the line index; lines.index(line) would return the
    # first occurrence of a duplicated line
    for n, line in enumerate(lines):
        if line.startswith('#'):
            continue
        if line.startswith('\n'):
            continue
        if line.startswith('wf_offsets'):
            num_wf_offsets = line.split()[1]
            for i in range(int(num_wf_offsets)):
                params['wf_offsets'].append([float(x) for x in lines[n + i + 1].split("#")[0].split()])
        if line.startswith('wf_basis'):
            num_wf_basis = line.split()[1]
            for i in range(int(num_wf_basis)):
                params['wf_basis'].append([float(x) for x in lines[n + i + 1].split("#")[0].split()])
        if line.startswith('wf_coeff'):
            num_wf_coeff = line.split()[1]
            for i in range(int(num_wf_coeff)):
                params['wf_coeff'].append([float(x) for x in lines[n + i + 1].split("#")[0].split()])
        if line.startswith('qr'):
            num_qr = line.split()[1]
            for i in range(int(num_qr)):
                dat = lines[n + i + 1].split("#")[0].strip().split()[:-1]
                params['qr'][dat[0]] = {}
                params['qr'][dat[0]]['mass'] = float(dat[1])
                params['qr'][dat[0]]['thetas'] = [float(x) for x in dat[2:5]]
        if line.startswith('mrr'):
            num_mrr = line.split()[1]
            for i in range(int(num_mrr)):
                params['mrr'].append(lines[n + i + 1].split("#")[0].strip())
        if line.startswith('crr'):
            num_crr = line.split()[1]
            for i in range(int(num_crr)):
                params['crr'].append(lines[n + i + 1].split("#")[0].strip())
        if line.startswith('qs'):
            num_qs = line.split()[1]
            for i in range(int(num_qs)):
                dat = lines[n + i + 1].split("#")[0].strip().split()[:-1]
                params['qs'][dat[0]] = {}
                params['qs'][dat[0]]['mass'] = float(dat[1])
                params['qs'][dat[0]]['thetas'] = [float(x) for x in dat[2:5]]
        if line.startswith('mrs'):
            num_mrs = line.split()[1]
            for i in range(int(num_mrs)):
                params['mrs'].append(lines[n + i + 1].split("#")[0].strip())
        if line.startswith('crs'):
            num_crs = line.split()[1]
            for i in range(int(num_crs)):
                # this appended to params['mrs'] before, dropping the
                # boundary-to-boundary correlator names
                params['crs'].append(lines[n + i + 1].split("#")[0].strip())

    # catch standard cases
    if params['wf_offsets'] == []:
        params['wf_offsets'] = [[0, 0, 0]]
    if params['wf_coeff'] == []:
        params['wf_coeff'] = [[0, -1]]
    return params


def _map_params(params, spec_list):
    """
    Map the extracted parameters to the extracted data.
    """
    new_specs = {}
    # quarks
    quarks = spec_list[0].split(" ")
    new_specs['quarks'] = (params['qr'][quarks[0]], params['qr'][quarks[1]])
    # offset
    new_specs['offset'] = params['wf_offsets'][int(spec_list[1])]
    # wf1
    contribs = []
    for i, coeff in enumerate(params['wf_coeff'][int(spec_list[2])]):
        if coeff != 0:
            contribs.append((coeff, params['wf_basis'][i]))
    new_specs['wf1'] = contribs
    if len(spec_list) == 4:
        # wf2
        contribs = []
        for i, coeff in enumerate(params['wf_coeff'][int(spec_list[3])]):
            if coeff != 0:
                contribs.append((coeff, params['wf_basis'][i]))
        new_specs['wf2'] = contribs
    return new_specs


def read_data(path, project, dir_in_project, prefix, param, version='1.0c', cfg_separator='n', sep='/', **kwargs):
    """
    Extract the data from the sfcf output files.

    Returns
    -------
    sorted_data : dict
        The data from the sfcf files, sorted by correlator.
    specs : dict
        The mapped parameters for each key of sorted_data.
    """
    names = kwargs.get('names', None)
    corr_types = {
        'f_A': 'bi',
        'f_P': 'bi',
        'g_A': 'bi',
        'g_P': 'bi',
        'f_1': 'bb',
        'k_1': 'bb',
    }
    directory = path + "/projects/" + project + '/' + dir_in_project
    dl.get(directory, dataset=path)
    corr_type_list = []
    for corr_name in param['crr']:
        if corr_name not in corr_types:
            raise ValueError('Correlator type not known.')
        corr_type_list.append(corr_types[corr_name])

    data = {}
    if param['crr'] != []:
        if names is not None:
            data_crr = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crr'], param['mrr'], corr_type_list, range(len(param['wf_offsets'])),
                                                     range(len(param['wf_basis'])), range(len(param['wf_basis'])), version, cfg_separator, keyed_out=True, names=names)
        else:
            data_crr = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crr'], param['mrr'], corr_type_list, range(len(param['wf_offsets'])),
                                                     range(len(param['wf_basis'])), range(len(param['wf_basis'])), version, cfg_separator, keyed_out=True)
        data.update(data_crr)
    if param['crs'] != []:
        # mirror the crr call for the boundary-to-boundary correlators
        crs_type_list = [corr_types[c] for c in param['crs']]
        data_crs = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crs'], param['mrs'], crs_type_list, range(len(param['wf_offsets'])),
                                                 range(len(param['wf_basis'])), range(len(param['wf_basis'])), version, cfg_separator, keyed_out=True)
        data.update(data_crs)

    # sort data by correlator and keep the specs of every key instead
    # of only those of the last one
    sorted_data = {}
    specs = {}
    for key in data.keys():
        key_parts = key.split(sep)
        corr = key_parts[0]
        if corr_types[corr] == 'bi':
            spec_parts = key_parts[1:-1]
        else:
            spec_parts = key_parts[1:]
        sub_key = sep.join(key_parts[1:])
        if corr not in sorted_data:
            sorted_data[corr] = {}
            specs[corr] = {}
        sorted_data[corr][sub_key] = data[key]
        specs[corr][sub_key] = _map_params(param, spec_parts)
    return sorted_data, specs

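Derived from the parser above, a hypothetical excerpt of a parameter file that read_param accepts: each block starts with a keyword and a count, followed by that many entry lines; inline comments after '#' are stripped, and the trailing column of the qr/qs lines is dropped by the [:-1]:

wf_offsets 1
0 0 0
wf_basis 1
3.0
wf_coeff 1
-1 0
qr 1
q1 0.1365 0.0 0.0 0.0 12
mrr 1
q1 q1
crr 2
f_A
f_P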
46
backlogger/io.py Normal file

@@ -0,0 +1,46 @@
from pyerrors.input import json as pj
import os
import datalad.api as dl
import sqlite3


def write_measurement(path, ensemble, measurement, uuid, code, parameters, parameter_file):
    """
    Write a measurement to the backlog.
    If the file for the measurement already exists, update the measurement.

    Parameters
    ----------
    path: str
        The path to the backlogger folder.
    ensemble: str
        The ensemble of the measurement.
    measurement: dict
        Measurements to be captured in the backlogging system.
    uuid: str
        The uuid of the project.
    code: str
        The code that was used to create the measurement.
    parameters: str
        The parameters of the measurement.
    parameter_file: str
        The path of the parameter file within the project.
    """
    for corr in measurement.keys():
        file = path + "/archive/" + ensemble + "/" + corr + '/' + uuid + '.json.gz'
        if not os.path.exists(path + "/archive/" + ensemble + "/" + corr):
            os.makedirs(path + "/archive/" + ensemble + "/" + corr)
        conn = sqlite3.connect(path + '/backlogger.db')
        c = conn.cursor()
        if os.path.exists(file):
            dl.unlock(file, dataset=path)
            known_meas = pj.load_json_dict(file)
            for key in measurement[corr].keys():
                known_meas[key] = measurement[corr][key]
        else:
            known_meas = measurement[corr]
        # dump the merged dict, not only the new keys
        pj.dump_dict_to_json(known_meas, file)
        for subkey in measurement[corr].keys():
            meas_path = file + "::" + subkey
            # the table is keyed per sub-measurement, so check each
            # subkey separately
            if c.execute("SELECT id FROM backlogs WHERE path=?", (meas_path,)).fetchone() is None:
                c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", (corr, ensemble, code, meas_path, uuid, parameters, parameter_file))
            else:
                c.execute("UPDATE backlogs SET updated_at=datetime('now') WHERE path=?", (meas_path,))
        conn.commit()
        conn.close()
        dl.save([path + '/backlogger.db', file], message="Add measurement to database", dataset=path)

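Note that the rows of the backlogs table are keyed per sub-measurement: the path column holds file + "::" + subkey, where subkey is the spec part of the keys produced by read_data, e.g. (hypothetical ensemble, uuid and specs):

/data/backlog/archive/A653/f_A/8f9270c4-57fb-4b82-b3d6-8b8c7cb3b4ae.json.gz::q1 q1/0/0/0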
65
backlogger/main.py Normal file

@@ -0,0 +1,65 @@
import sqlite3
import datalad.api as dl
import os
from .git_tools import move_submodule


def create_project(path, uuid, aliases=None, code=None):
    """
    Create a new project entry in the database.

    Parameters
    ----------
    path: str
        The path to the backlogger folder.
    uuid: str
        The uuid of the project.
    aliases: str (optional)
        Custom name for the project (e.g. 'cA determination on exponential clover').
    code: str (optional)
        The code that was used to create the measurements.
    """
    conn = sqlite3.connect(path + "/backlogger.db")
    c = conn.cursor()
    known_projects = c.execute("SELECT * FROM projects WHERE id=?", (uuid,))
    if known_projects.fetchone():
        raise ValueError("Project already imported, use update_project() instead.")
    dl.unlock(path + "/backlogger.db", dataset=path)
    c.execute("INSERT INTO projects (id, aliases, code, created_at, updated_at) VALUES (?, ?, ?, datetime('now'), datetime('now'))", (uuid, aliases, code))
    conn.commit()
    conn.close()
    dl.save(path + "/backlogger.db", message="Added entry for project " + uuid + " to database", dataset=path)


def import_project(url, path, aliases=None, code=None):
    """
    Import a project into the backlog as a datalad subdataset.

    Parameters
    ----------
    url: str
        The url of the project to import. This can be any url that datalad can handle.
    path: str
        The path to the backlogger folder.
    aliases: str (optional)
        Custom name of the project, alias of the project.
    code: str (optional)
        Code that was used to create the measurements.
    """
    # install in tmp to find the uuid
    tmp_path = path + '/projects/tmp'
    dl.install(path=tmp_path, source=url, dataset=path)
    with open(tmp_path + "/.datalad/config") as fp:
        for line in fp:
            if line.startswith("\tid"):
                uuid = line.split()[2]
                break
    create_project(path, uuid, aliases, code)
    move_submodule(path, 'projects/tmp', 'projects/' + uuid)
    os.mkdir(path + '/projects/tmp')
    os.mkdir(path + '/import_scripts/' + uuid)
    dl.save(path, message="Import project from " + url, dataset=path)
    return uuid
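The uuid detection in import_project relies on the dataset id that datalad stores in .datalad/config, which looks like the (hypothetical) entry below; line.split()[2] picks the value after 'id' and '=':

[datalad "dataset"]
	id = 8f9270c4-57fb-4b82-b3d6-8b8c7cb3b4ae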