diff --git a/backlogger/__init__.py b/backlogger/__init__.py
new file mode 100644
index 0000000..91da91c
--- /dev/null
+++ b/backlogger/__init__.py
@@ -0,0 +1,20 @@
+"""
+The aim of this project is to extend pyerrors so that measurements from different projects can be collected and made easily accessible to
+the research group. The idea is to build a database in which a researcher can search for measurements on a per-correlator basis,
+so that existing results may be reused.
+As the standard storage format for measurements we use the .json.gz format from pyerrors.
+This allows us to organize the library as files of exported dictionaries.
+The format is compressible, yet human readable in uncompressed form.
+The project creates a database with a table to store the measurements and a folder to store the .json.gz files, and it tracks changes to that folder automatically using datalad.
+This way, we can harness the power of datalad as a backend to build our database reproducibly.
+Projects that are themselves datalad datasets can be linked to the backlog of correlators as subdatasets, such that, using datalad's rerun function,
+it is easy to see where a given measurement came from and how it may be reproduced.
+
+For now, we are interested in collecting primary observables only, as these are the most computationally expensive and time-consuming to calculate.
+"""
+
+
+from .main import *
+from .input import *
+from .initialization import *
+from .io import *
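To make the storage convention described in the module docstring concrete, here is a minimal sketch (not part of the patch) of how a dictionary of pyerrors observables round-trips through the .json.gz format; the file name and the toy observable are invented for illustration:

```python
import pyerrors as pe
from pyerrors.input import json as pj

# toy observable standing in for a real correlator measurement (invented values)
obs = pe.pseudo_Obs(0.25, 0.01, 'A654')

# write a gzipped, human-readable JSON file and read it back
pj.dump_dict_to_json({'f_A': obs}, 'example.json.gz')
restored = pj.load_json_dict('example.json.gz')
```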
diff --git a/backlogger/git_tools.py b/backlogger/git_tools.py
new file mode 100644
index 0000000..c003e29
--- /dev/null
+++ b/backlogger/git_tools.py
@@ -0,0 +1,32 @@
+import os
+import datalad.api as dl
+
+
+def move_submodule(repo_path, old_path, new_path):
+    """
+    Move a submodule to a new location.
+
+    Parameters
+    ----------
+    repo_path: str
+        The path to the repository containing the submodule.
+    old_path: str
+        The old path of the module.
+    new_path: str
+        The new path of the module.
+    """
+
+    os.rename(repo_path + "/" + old_path, repo_path + "/" + new_path)
+    with open(repo_path + '/.gitmodules', 'r') as fp:
+        lines = fp.readlines()
+
+    # rebinding the loop variable would not change the list, so the section
+    # header and path entry of the moved submodule are updated via their indices
+    for i, line in enumerate(lines):
+        if line.startswith('[submodule "' + old_path + '"]'):
+            lines[i] = line.replace(old_path, new_path)
+        elif line.startswith('\tpath') and line.split('=')[-1].strip() == old_path:
+            lines[i] = line.replace(old_path, new_path)
+    with open(repo_path + '/.gitmodules', 'w') as fp:
+        fp.writelines(lines)
+    dl.save(repo_path, message="Move module from " + old_path + " to " + new_path, dataset=repo_path)
diff --git a/backlogger/initialization.py b/backlogger/initialization.py
new file mode 100644
index 0000000..590fed9
--- /dev/null
+++ b/backlogger/initialization.py
@@ -0,0 +1,51 @@
+import sqlite3
+import datalad.api as dl
+import os
+
+
+def _create_db(path):
+    """
+    Create the database file and its tables.
+
+    """
+    conn = sqlite3.connect(path)
+    c = conn.cursor()
+    c.execute('''CREATE TABLE IF NOT EXISTS backlogs
+                 (id INTEGER PRIMARY KEY,
+                  name TEXT,
+                  ensemble TEXT,
+                  code TEXT,
+                  path TEXT,
+                  project TEXT,
+                  parameters TEXT,
+                  parameter_file TEXT,
+                  created_at TEXT,
+                  updated_at TEXT)''')
+    c.execute('''CREATE TABLE IF NOT EXISTS projects
+                 (id TEXT PRIMARY KEY,
+                  aliases TEXT,
+                  code TEXT,
+                  created_at TEXT,
+                  updated_at TEXT)''')
+    conn.commit()
+    conn.close()
+
+
+def create(path):
+    """
+    Create the backlogger dataset, its database and the folder structure.
+
+    Parameters
+    ----------
+    path: str
+        The path at which the backlogger dataset is created.
+    """
+    dl.create(path)
+    _create_db(path + '/backlogger.db')
+    os.chmod(path + '/backlogger.db', 0o666)
+    os.makedirs(path + '/projects/tmp')
+    os.makedirs(path + '/archive')
+    os.makedirs(path + '/import_scripts')
+    # provide an empty template for project import scripts
+    open(path + '/import_scripts/template.py', 'w').close()
+    dl.save(path, dataset=path, message="Initialize backlogger directory.")
diff --git a/backlogger/input/__init__.py b/backlogger/input/__init__.py
new file mode 100644
index 0000000..f917e2a
--- /dev/null
+++ b/backlogger/input/__init__.py
@@ -0,0 +1,5 @@
+"""
+Import functions for different codes.
+"""
+
+from . import sfcf
diff --git a/backlogger/input/sfcf.py b/backlogger/input/sfcf.py
new file mode 100644
index 0000000..a8bb33d
--- /dev/null
+++ b/backlogger/input/sfcf.py
@@ -0,0 +1,222 @@
+import pyerrors as pe
+import datalad.api as dl
+
+
+def read_param(path, project, file_in_project):
+    """
+    Read the parameters from an sfcf parameter file.
+
+    Parameters
+    ----------
+    path : str
+        The path to the backlogger folder.
+    project : str
+        The uuid of the project.
+    file_in_project : str
+        The path of the parameter file within the project.
+
+    Returns
+    -------
+    dict
+        The parameters in the sfcf parameter file.
+
+    """
+
+    file = path + "/projects/" + project + '/' + file_in_project
+    dl.get(file, dataset=path)
+    with open(file, 'r') as f:
+        lines = f.readlines()
+
+    params = {}
+    params['wf_offsets'] = []
+    params['wf_basis'] = []
+    params['wf_coeff'] = []
+    params['qr'] = {}
+    params['mrr'] = []
+    params['crr'] = []
+    params['qs'] = {}
+    params['mrs'] = []
+    params['crs'] = []
+
+    # iterate with the explicit line index; lines.index(line) would return the
+    # first occurrence of a duplicated line and scale quadratically
+    for ln, line in enumerate(lines):
+        if line.startswith('#') or line.startswith('\n'):
+            continue
+        if line.startswith('wf_offsets'):
+            num_wf_offsets = int(line.split()[1])
+            for i in range(num_wf_offsets):
+                params['wf_offsets'].append([float(x) for x in lines[ln + i + 1].split("#")[0].split()])
+
+        if line.startswith('wf_basis'):
+            num_wf_basis = int(line.split()[1])
+            for i in range(num_wf_basis):
+                params['wf_basis'].append([float(x) for x in lines[ln + i + 1].split("#")[0].split()])
+
+        if line.startswith('wf_coeff'):
+            num_wf_coeff = int(line.split()[1])
+            for i in range(num_wf_coeff):
+                params['wf_coeff'].append([float(x) for x in lines[ln + i + 1].split("#")[0].split()])
+
+        if line.startswith('qr'):
+            num_qr = int(line.split()[1])
+            for i in range(num_qr):
+                dat = lines[ln + i + 1].split("#")[0].strip().split()[:-1]
+                params['qr'][dat[0]] = {}
+                params['qr'][dat[0]]['mass'] = float(dat[1])
+                params['qr'][dat[0]]['thetas'] = [float(x) for x in dat[2:5]]
+
+        if line.startswith('mrr'):
+            num_mrr = int(line.split()[1])
+            for i in range(num_mrr):
+                params['mrr'].append(lines[ln + i + 1].split("#")[0].strip())
+
+        if line.startswith('crr'):
+            num_crr = int(line.split()[1])
+            for i in range(num_crr):
+                params['crr'].append(lines[ln + i + 1].split("#")[0].strip())
+
+        if line.startswith('qs'):
+            num_qs = int(line.split()[1])
+            for i in range(num_qs):
+                dat = lines[ln + i + 1].split("#")[0].strip().split()[:-1]
+                params['qs'][dat[0]] = {}
+                params['qs'][dat[0]]['mass'] = float(dat[1])
+                params['qs'][dat[0]]['thetas'] = [float(x) for x in dat[2:5]]
+
+        if line.startswith('mrs'):
+            num_mrs = int(line.split()[1])
+            for i in range(num_mrs):
+                params['mrs'].append(lines[ln + i + 1].split("#")[0].strip())
+
+        if line.startswith('crs'):
+            num_crs = int(line.split()[1])
+            for i in range(num_crs):
+                params['crs'].append(lines[ln + i + 1].split("#")[0].strip())
+
+    # catch standard cases
+    if params['wf_offsets'] == []:
+        params['wf_offsets'] = [[0, 0, 0]]
+
+    if params['wf_coeff'] == []:
+        params['wf_coeff'] = [[0, -1]]
+
+    return params
+
+
+def _map_params(params, spec_list):
+    """
+    Map the extracted parameters to the extracted data.
+
+    """
+
+    new_specs = {}
+    # quarks
+    quarks = spec_list[0].split(" ")
+    new_specs['quarks'] = (params['qr'][quarks[0]], params['qr'][quarks[1]])
+
+    # offset
+    new_specs['offset'] = params['wf_offsets'][int(spec_list[1])]
+
+    # wf1
+    contribs = []
+    for i, coeff in enumerate(params['wf_coeff'][int(spec_list[2])]):
+        if coeff != 0:
+            contribs.append((coeff, params['wf_basis'][i]))
+    new_specs['wf1'] = contribs
+
+    if len(spec_list) == 4:
+        # wf2
+        contribs = []
+        for i, coeff in enumerate(params['wf_coeff'][int(spec_list[3])]):
+            if coeff != 0:
+                contribs.append((coeff, params['wf_basis'][i]))
+        new_specs['wf2'] = contribs
+    return new_specs
+
+
+def read_data(path, project, dir_in_project, prefix, param, version='1.0c', cfg_separator='n', sep='/', **kwargs):
+    """
+    Extract the data from the sfcf output of a project.
+
+    Parameters
+    ----------
+    path : str
+        The path to the backlogger folder.
+    project : str
+        The uuid of the project.
+    dir_in_project : str
+        The directory within the project that contains the sfcf output.
+    prefix : str
+        The prefix of the sfcf output files.
+    param : dict
+        The parameters of the run, as returned by read_param.
+
+    Returns
+    -------
+    dict
+        The data from the sfcf output, sorted by correlator.
+    dict
+        The mapped specs of each measurement, keyed like the data.
+
+    """
+    names = kwargs.get('names', None)
+    corr_types = {
+        'f_A': 'bi',
+        'f_P': 'bi',
+        'g_A': 'bi',
+        'g_P': 'bi',
+        'f_1': 'bb',
+        'k_1': 'bb',
+    }
+    directory = path + "/projects/" + project + '/' + dir_in_project
+    dl.get(directory, dataset=path)
+
+    def _corr_type_list(corr_names):
+        type_list = []
+        for corr_name in corr_names:
+            if corr_name not in corr_types:
+                raise ValueError('Correlator type not known.')
+            type_list.append(corr_types[corr_name])
+        return type_list
+
+    read_kwargs = {}
+    if names is not None:
+        read_kwargs['names'] = names
+
+    data_crr = {}
+    if not param['crr'] == []:
+        data_crr = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crr'], param['mrr'], _corr_type_list(param['crr']),
+                                                 range(len(param['wf_offsets'])), range(len(param['wf_basis'])), range(len(param['wf_basis'])),
+                                                 version, cfg_separator, keyed_out=True, **read_kwargs)
+
+    # the crs correlators are read with the same signature as the crr ones
+    data_crs = {}
+    if not param['crs'] == []:
+        data_crs = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crs'], param['mrs'], _corr_type_list(param['crs']),
+                                                 range(len(param['wf_offsets'])), range(len(param['wf_basis'])), range(len(param['wf_basis'])),
+                                                 version, cfg_separator, keyed_out=True, **read_kwargs)
+
+    data = {}
+    for key in data_crr.keys():
+        data[key] = data_crr[key]
+    for key in data_crs.keys():
+        data[key] = data_crs[key]
+
+    # sort data by correlator and collect the mapped specs alongside,
+    # keyed in the same way as the data
+    sorted_data = {}
+    sorted_specs = {}
+    for key in data.keys():
+        key_parts = key.split(sep)
+        corr = key_parts[0]
+        if corr_types[corr] == 'bi':
+            specs = _map_params(param, key_parts[1:-1])
+        else:
+            specs = _map_params(param, key_parts[1:])
+        if corr not in sorted_data:
+            sorted_data[corr] = {}
+            sorted_specs[corr] = {}
+        sorted_data[corr][sep.join(key_parts[1:])] = data[key]
+        sorted_specs[corr][sep.join(key_parts[1:])] = specs
+    return sorted_data, sorted_specs
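The sfcf parameter-file syntax itself is not documented in this patch, so the following sketch reconstructs a minimal input from the behaviour of read_param above; the quark name, mass and correlator choices are invented:

```python
# a minimal parameter file reconstructed from the parser's behaviour:
#
#   wf_offsets 1
#   0.0 0.0 0.0
#   wf_coeff 1
#   1.0 0.0
#   qr 1
#   q1 0.13 0.0 0.0 0.0 x   # name, mass, three thetas; the last token is dropped
#   crr 1
#   f_A
#   mrr 1
#   q1 q1
#
# for such a file, read_param would return roughly:
expected = {
    'wf_offsets': [[0.0, 0.0, 0.0]],
    'wf_basis': [],
    'wf_coeff': [[1.0, 0.0]],
    'qr': {'q1': {'mass': 0.13, 'thetas': [0.0, 0.0, 0.0]}},
    'mrr': ['q1 q1'],
    'crr': ['f_A'],
    'qs': {},
    'mrs': [],
    'crs': [],
}
```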
diff --git a/backlogger/io.py b/backlogger/io.py
new file mode 100644
index 0000000..ed01132
--- /dev/null
+++ b/backlogger/io.py
@@ -0,0 +1,56 @@
+from pyerrors.input import json as pj
+import os
+import datalad.api as dl
+import sqlite3
+
+
+def write_measurement(path, ensemble, measurement, uuid, code, parameters, parameter_file):
+    """
+    Write a measurement to the backlog.
+    If the file for the measurement already exists, update the measurement.
+
+    Parameters
+    ----------
+    path: str
+        The path to the backlogger folder.
+    ensemble: str
+        The ensemble of the measurement.
+    measurement: dict
+        Measurements to be captured in the backlogging system.
+    uuid: str
+        The uuid of the project.
+    code: str
+        The code that was used to create the measurement.
+    parameters: str
+        The parameters of the measurement.
+    parameter_file: str
+        The path of the parameter file within the project.
+    """
+
+    for corr in measurement.keys():
+        file = path + "/archive/" + ensemble + "/" + corr + '/' + uuid + '.json.gz'
+        if not os.path.exists(path + "/archive/" + ensemble + "/" + corr):
+            os.makedirs(path + "/archive/" + ensemble + "/" + corr)
+        conn = sqlite3.connect(path + '/backlogger.db')
+        c = conn.cursor()
+        if os.path.exists(file):
+            dl.unlock(file, dataset=path)
+            known_meas = pj.load_json_dict(file)
+            for key in measurement[corr].keys():
+                known_meas[key] = measurement[corr][key]
+        else:
+            known_meas = measurement[corr]
+        # dump the merged dictionary, not only the newly added keys
+        pj.dump_dict_to_json(known_meas, file)
+        for subkey in measurement[corr].keys():
+            meas_path = file + "::" + subkey
+            # the file always exists once it has been dumped, so query the
+            # database to decide between insert and update
+            if c.execute("SELECT id FROM backlogs WHERE path=?", (meas_path,)).fetchone() is None:
+                c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))",
+                          (corr, ensemble, code, meas_path, uuid, parameters, parameter_file))
+            else:
+                c.execute("UPDATE backlogs SET updated_at=datetime('now') WHERE path=?", (meas_path,))
+        conn.commit()
+        conn.close()
+        dl.save([path + '/backlogger.db', file], message="Add measurement to database", dataset=path)
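Since write_measurement registers each observable in the database under the path `<file>::<subkey>`, individual correlator entries can be looked up directly. A small query sketch, assuming an existing backlog at a placeholder location and invented correlator and ensemble names:

```python
import sqlite3

conn = sqlite3.connect('/data/backlog/backlogger.db')  # placeholder path
c = conn.cursor()
# list all archived f_A measurements on ensemble A654
for name, meas_path, project in c.execute(
        "SELECT name, path, project FROM backlogs WHERE name=? AND ensemble=?",
        ('f_A', 'A654')):
    fname, subkey = meas_path.split('::')
    print(project, fname, subkey)
conn.close()
```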
diff --git a/backlogger/main.py b/backlogger/main.py
new file mode 100644
index 0000000..2a8670e
--- /dev/null
+++ b/backlogger/main.py
@@ -0,0 +1,66 @@
+import sqlite3
+import datalad.api as dl
+import os
+from .git_tools import move_submodule
+
+
+def create_project(path, uuid, aliases=None, code=None):
+    """
+    Create a new project entry in the database.
+
+    Parameters
+    ----------
+    path: str
+        The path to the backlogger folder.
+    uuid: str
+        The uuid of the project.
+    aliases: str (optional)
+        Custom name for the project (e.g. 'cA determination on exponential clover').
+    code: str (optional)
+        The code that was used to create the measurements.
+    """
+    conn = sqlite3.connect(path + "/backlogger.db")
+    c = conn.cursor()
+    known_projects = c.execute("SELECT * FROM projects WHERE id=?", (uuid,))
+    if known_projects.fetchone():
+        raise ValueError("Project already imported, use update_project() instead.")
+
+    dl.unlock(path + "/backlogger.db", dataset=path)
+    c.execute("INSERT INTO projects (id, aliases, code, created_at, updated_at) VALUES (?, ?, ?, datetime('now'), datetime('now'))", (uuid, aliases, code))
+    conn.commit()
+    conn.close()
+    dl.save(path + "/backlogger.db", message="Added entry for project " + uuid + " to database", dataset=path)
+
+
+def import_project(url, path, aliases=None, code=None):
+    """
+    Import a project into the backlog as a subdataset.
+
+    Parameters
+    ----------
+    url: str
+        The url of the project to import. This can be any url that datalad can handle.
+    path: str
+        The path to the backlogger folder.
+    aliases: str (optional)
+        Custom name of the project, alias of the project.
+    code: str (optional)
+        Code that was used to create the measurements.
+    """
+
+    # install in tmp to read the dataset uuid from the datalad config
+    tmp_path = path + '/projects/tmp'
+    dl.install(path=tmp_path, source=url, dataset=path)
+    with open(tmp_path + "/.datalad/config") as fp:
+        for line in fp:
+            if line.startswith("\tid"):
+                uuid = line.split()[2]
+                break
+
+    create_project(path, uuid, aliases, code)
+    move_submodule(path, 'projects/tmp', 'projects/' + uuid)
+    os.mkdir(path + '/projects/tmp')
+
+    os.mkdir(path + '/import_scripts/' + uuid)
+    dl.save(path, message="Import project from " + url, dataset=path)
+    return uuid
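Taken together, the intended workflow looks roughly like the following sketch, using only functions added in this patch; the paths, ensemble name, project url and the locations of the parameter file and output directory inside the imported project are placeholders:

```python
import backlogger as bl
from backlogger.input import sfcf

path = '/data/backlog'  # placeholder location for the backlog dataset
bl.create(path)
uuid = bl.import_project('https://example.com/my_project.git', path)

# read run parameters and data from the imported project (placeholder names)
param = sfcf.read_param(path, uuid, 'in/run1.in')
data, specs = sfcf.read_data(path, uuid, 'out', 'run1', param)

# archive the measurements and register them in the database
bl.write_measurement(path, 'A654', data, uuid, code='sfcf',
                     parameters=str(param), parameter_file='in/run1.in')
```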