can now read and write simple sfcf measurements

Justus Kuhlmann 2024-06-19 12:55:42 +00:00
parent 5ec34ad98b
commit c3aa7577fb
7 changed files with 408 additions and 0 deletions

20
backlogger/__init__.py Normal file

@@ -0,0 +1,20 @@
"""
The aim of this project is to extend pyerrors so that measurements from different projects can be collected and made easily accessible to
the research group. The idea is to build a database in which researchers can search for measurements on a per-correlator basis,
so that these may be reused.
As the standard format to store the measurements, we use the .json.gz format from pyerrors.
This allows us to organize a library of files and exported dictionaries.
The format is compressible, but also human readable in uncompressed form.
The project creates a database with a table to store the measurements, and a folder to store the .json.gz files, and it tracks changes to that folder automatically using datalad.
This way, we can harness the power of datalad as a backend to reproducibly build our database.
Projects that are themselves datalad datasets can be linked to the backlog of correlators as subdatasets, such that, using datalad's rerun function,
it can easily be seen where each measurement came from and how it may be reproduced.
For now, we are interested in collecting primary observables only, as these are the most computationally expensive and time consuming to calculate.
"""
from .main import *
from .input import *
from .initialization import *
from .io import *

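A minimal end-to-end sketch of the intended workflow, using only the functions introduced in this commit; the backlog path, project url, ensemble name and file names are hypothetical:

import backlogger as bl

# create a fresh backlog (hypothetical location)
bl.create("/data/backlog")

# register a datalad project as a subdataset and get its uuid
uuid = bl.import_project("https://example.com/my_project.git", "/data/backlog")

# read an sfcf measurement from that project ...
param = bl.input.sfcf.read_param("/data/backlog", uuid, "infiles/my_run.in")
data, specs = bl.input.sfcf.read_data("/data/backlog", uuid, "out", "my_run", param)

# ... and archive it as .json.gz plus database entries
bl.write_measurement("/data/backlog", "A653", data, uuid, "sfcf", str(param), "infiles/my_run.in")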
31
backlogger/git_tools.py Normal file

@@ -0,0 +1,31 @@
import os
import datalad.api as dl


def move_submodule(repo_path, old_path, new_path):
    """
    Move a submodule to a new location.

    Parameters
    ----------
    repo_path: str
        The path to the repository that contains the submodule.
    old_path: str
        The old path of the module.
    new_path: str
        The new path of the module.
    """
    os.rename(repo_path + "/" + old_path, repo_path + "/" + new_path)
    with open(repo_path + '/.gitmodules', 'r') as fp:
        lines = fp.readlines()
    # rewrite the entries in place via their index; reassigning the
    # loop variable would not modify the list
    for i, line in enumerate(lines):
        if line.startswith('[submodule "' + old_path + '"]') or line.startswith('\tpath'):
            lines[i] = line.replace(old_path, new_path)
    with open(repo_path + '/.gitmodules', 'w') as fp:
        fp.writelines(lines)
    dl.save(repo_path, message="Move module from " + old_path + " to " + new_path, dataset=repo_path)

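For reference, on the intended call move_submodule(path, 'projects/tmp', 'projects/' + uuid), the function rewrites a standard .gitmodules entry of the following form; the url line (hypothetical here) stays untouched:

[submodule "projects/tmp"]
	path = projects/tmp
	url = https://example.com/my_project.git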
46
backlogger/initialization.py Normal file

@@ -0,0 +1,46 @@
import sqlite3
import datalad.api as dl
import os


def _create_db(path):
    """
    Create the database file and its tables.
    """
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS backlogs
                 (id INTEGER PRIMARY KEY,
                 name TEXT,
                 ensemble TEXT,
                 code TEXT,
                 path TEXT,
                 project TEXT,
                 parameters TEXT,
                 parameter_file TEXT,
                 created_at TEXT,
                 updated_at TEXT)''')
    c.execute('''CREATE TABLE IF NOT EXISTS projects
                 (id TEXT PRIMARY KEY,
                 aliases TEXT,
                 code TEXT,
                 created_at TEXT,
                 updated_at TEXT)''')
    conn.commit()
    conn.close()


def create(path):
    """
    Create the backlogger folder.
    """
    dl.create(path)
    _create_db(path + '/backlogger.db')
    os.chmod(path + '/backlogger.db', 0o666)
    os.makedirs(path + '/projects')
    os.makedirs(path + '/projects/tmp')
    os.makedirs(path + '/archive')
    # directory for the per-project import scripts, expected by import_project()
    os.makedirs(path + '/import_scripts')
    dl.save(path, dataset=path, message="Initialize backlogger directory.")

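Once initialized, the database can also be queried directly with sqlite3; a short lookup sketch, with hypothetical backlog path and ensemble name:

import sqlite3

conn = sqlite3.connect("/data/backlog/backlogger.db")
c = conn.cursor()
# all archived f_A measurements on one ensemble
for name, meas_path, project in c.execute(
        "SELECT name, path, project FROM backlogs WHERE ensemble=? AND name=?",
        ("A653", "f_A")):
    print(name, meas_path, project)
conn.close()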
5
backlogger/input/__init__.py Normal file

@@ -0,0 +1,5 @@
"""
Import functions for different codes.
"""
from . import sfcf

195
backlogger/input/sfcf.py Normal file

@@ -0,0 +1,195 @@
import pyerrors as pe
import datalad.api as dl


def read_param(path, project, file_in_project):
    """
    Read the parameters from an sfcf parameter file.

    Parameters
    ----------
    path : str
        The path to the backlogger folder.
    project : str
        The uuid of the project.
    file_in_project : str
        The path of the parameter file within the project.

    Returns
    -------
    dict
        The parameters in the sfcf file.
    """
    file = path + "/projects/" + project + '/' + file_in_project
    dl.get(file, dataset=path)
    with open(file, 'r') as f:
        lines = f.readlines()

    params = {}
    params['wf_offsets'] = []
    params['wf_basis'] = []
    params['wf_coeff'] = []
    params['qr'] = {}
    params['mrr'] = []
    params['crr'] = []
    params['qs'] = {}
    params['mrs'] = []
    params['crs'] = []

    # iterate with the line index; lines.index(line) would return the
    # first occurrence of a duplicated line
    for n, line in enumerate(lines):
        if line.startswith('#'):
            continue
        if line.startswith('\n'):
            continue
        if line.startswith('wf_offsets'):
            num_wf_offsets = line.split()[1]
            for i in range(int(num_wf_offsets)):
                params['wf_offsets'].append([float(x) for x in lines[n + i + 1].split("#")[0].split()])
        if line.startswith('wf_basis'):
            num_wf_basis = line.split()[1]
            for i in range(int(num_wf_basis)):
                params['wf_basis'].append([float(x) for x in lines[n + i + 1].split("#")[0].split()])
        if line.startswith('wf_coeff'):
            num_wf_coeff = line.split()[1]
            for i in range(int(num_wf_coeff)):
                params['wf_coeff'].append([float(x) for x in lines[n + i + 1].split("#")[0].split()])
        if line.startswith('qr'):
            num_qr = line.split()[1]
            for i in range(int(num_qr)):
                dat = lines[n + i + 1].split("#")[0].strip().split()[:-1]
                params['qr'][dat[0]] = {}
                params['qr'][dat[0]]['mass'] = float(dat[1])
                params['qr'][dat[0]]['thetas'] = [float(x) for x in dat[2:5]]
        if line.startswith('mrr'):
            num_mrr = line.split()[1]
            for i in range(int(num_mrr)):
                params['mrr'].append(lines[n + i + 1].split("#")[0].strip())
        if line.startswith('crr'):
            num_crr = line.split()[1]
            for i in range(int(num_crr)):
                params['crr'].append(lines[n + i + 1].split("#")[0].strip())
        if line.startswith('qs'):
            num_qs = line.split()[1]
            for i in range(int(num_qs)):
                dat = lines[n + i + 1].split("#")[0].strip().split()[:-1]
                params['qs'][dat[0]] = {}
                params['qs'][dat[0]]['mass'] = float(dat[1])
                params['qs'][dat[0]]['thetas'] = [float(x) for x in dat[2:5]]
        if line.startswith('mrs'):
            num_mrs = line.split()[1]
            for i in range(int(num_mrs)):
                params['mrs'].append(lines[n + i + 1].split("#")[0].strip())
        if line.startswith('crs'):
            num_crs = line.split()[1]
            for i in range(int(num_crs)):
                # this appended to params['mrs'] before, dropping the
                # boundary-to-boundary correlator names
                params['crs'].append(lines[n + i + 1].split("#")[0].strip())

    # catch standard cases
    if params['wf_offsets'] == []:
        params['wf_offsets'] = [[0, 0, 0]]
    if params['wf_coeff'] == []:
        params['wf_coeff'] = [[0, -1]]
    return params


def _map_params(params, spec_list):
    """
    Map the extracted parameters to the extracted data.
    """
    new_specs = {}
    # quarks
    quarks = spec_list[0].split(" ")
    new_specs['quarks'] = (params['qr'][quarks[0]], params['qr'][quarks[1]])
    # offset
    new_specs['offset'] = params['wf_offsets'][int(spec_list[1])]
    # wf1
    contribs = []
    for i, coeff in enumerate(params['wf_coeff'][int(spec_list[2])]):
        if coeff != 0:
            contribs.append((coeff, params['wf_basis'][i]))
    new_specs['wf1'] = contribs
    if len(spec_list) == 4:
        # wf2
        contribs = []
        for i, coeff in enumerate(params['wf_coeff'][int(spec_list[3])]):
            if coeff != 0:
                contribs.append((coeff, params['wf_basis'][i]))
        new_specs['wf2'] = contribs
    return new_specs


def read_data(path, project, dir_in_project, prefix, param, version='1.0c', cfg_separator='n', sep='/', **kwargs):
    """
    Extract the data from the sfcf output files.

    Returns
    -------
    sorted_data : dict
        The data from the sfcf files, sorted by correlator.
    specs : dict
        The mapped parameters for each key of sorted_data.
    """
    names = kwargs.get('names', None)
    corr_types = {
        'f_A': 'bi',
        'f_P': 'bi',
        'g_A': 'bi',
        'g_P': 'bi',
        'f_1': 'bb',
        'k_1': 'bb',
    }
    directory = path + "/projects/" + project + '/' + dir_in_project
    dl.get(directory, dataset=path)
    corr_type_list = []
    for corr_name in param['crr']:
        if corr_name not in corr_types:
            raise ValueError('Correlator type not known.')
        corr_type_list.append(corr_types[corr_name])

    data = {}
    if param['crr'] != []:
        if names is not None:
            data_crr = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crr'], param['mrr'], corr_type_list, range(len(param['wf_offsets'])),
                                                     range(len(param['wf_basis'])), range(len(param['wf_basis'])), version, cfg_separator, keyed_out=True, names=names)
        else:
            data_crr = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crr'], param['mrr'], corr_type_list, range(len(param['wf_offsets'])),
                                                     range(len(param['wf_basis'])), range(len(param['wf_basis'])), version, cfg_separator, keyed_out=True)
        data.update(data_crr)
    if param['crs'] != []:
        # mirror the crr call for the boundary-to-boundary correlators
        crs_type_list = [corr_types[c] for c in param['crs']]
        data_crs = pe.input.sfcf.read_sfcf_multi(directory, prefix, param['crs'], param['mrs'], crs_type_list, range(len(param['wf_offsets'])),
                                                 range(len(param['wf_basis'])), range(len(param['wf_basis'])), version, cfg_separator, keyed_out=True)
        data.update(data_crs)

    # sort data by correlator and keep the specs of every key instead
    # of only those of the last one
    sorted_data = {}
    specs = {}
    for key in data.keys():
        key_parts = key.split(sep)
        corr = key_parts[0]
        if corr_types[corr] == 'bi':
            spec_parts = key_parts[1:-1]
        else:
            spec_parts = key_parts[1:]
        sub_key = sep.join(key_parts[1:])
        if corr not in sorted_data:
            sorted_data[corr] = {}
            specs[corr] = {}
        sorted_data[corr][sub_key] = data[key]
        specs[corr][sub_key] = _map_params(param, spec_parts)
    return sorted_data, specs

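Derived from the parser above, a hypothetical excerpt of a parameter file that read_param accepts: each block starts with a keyword and a count, followed by that many entry lines; inline comments after '#' are stripped, and the trailing column of the qr/qs lines is dropped by the [:-1]:

wf_offsets 1
0 0 0
wf_basis 1
3.0
wf_coeff 1
-1 0
qr 1
q1 0.1365 0.0 0.0 0.0 12
mrr 1
q1 q1
crr 2
f_A
f_P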
46
backlogger/io.py Normal file

@@ -0,0 +1,46 @@
from pyerrors.input import json as pj
import os
import datalad.api as dl
import sqlite3


def write_measurement(path, ensemble, measurement, uuid, code, parameters, parameter_file):
    """
    Write a measurement to the backlog.
    If the file for the measurement already exists, update the measurement.

    Parameters
    ----------
    path: str
        The path to the backlogger folder.
    ensemble: str
        The ensemble of the measurement.
    measurement: dict
        Measurements to be captured in the backlogging system.
    uuid: str
        The uuid of the project.
    code: str
        The code that was used to create the measurement.
    parameters: str
        The parameters of the measurement.
    parameter_file: str
        The path of the parameter file within the project.
    """
    for corr in measurement.keys():
        file = path + "/archive/" + ensemble + "/" + corr + '/' + uuid + '.json.gz'
        if not os.path.exists(path + "/archive/" + ensemble + "/" + corr):
            os.makedirs(path + "/archive/" + ensemble + "/" + corr)
        conn = sqlite3.connect(path + '/backlogger.db')
        c = conn.cursor()
        if os.path.exists(file):
            dl.unlock(file, dataset=path)
            known_meas = pj.load_json_dict(file)
            for key in measurement[corr].keys():
                known_meas[key] = measurement[corr][key]
        else:
            known_meas = measurement[corr]
        # dump the merged dict, not only the new keys
        pj.dump_dict_to_json(known_meas, file)
        for subkey in measurement[corr].keys():
            meas_path = file + "::" + subkey
            # the table is keyed per sub-measurement, so check each
            # subkey separately
            if c.execute("SELECT id FROM backlogs WHERE path=?", (meas_path,)).fetchone() is None:
                c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", (corr, ensemble, code, meas_path, uuid, parameters, parameter_file))
            else:
                c.execute("UPDATE backlogs SET updated_at=datetime('now') WHERE path=?", (meas_path,))
        conn.commit()
        conn.close()
        dl.save([path + '/backlogger.db', file], message="Add measurement to database", dataset=path)

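Note that the rows of the backlogs table are keyed per sub-measurement: the path column holds file + "::" + subkey, where subkey is the spec part of the keys produced by read_data, e.g. (hypothetical ensemble, uuid and specs):

/data/backlog/archive/A653/f_A/8f9270c4-57fb-4b82-b3d6-8b8c7cb3b4ae.json.gz::q1 q1/0/0/0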
65
backlogger/main.py Normal file

@@ -0,0 +1,65 @@
import sqlite3
import datalad.api as dl
import os
from .git_tools import move_submodule


def create_project(path, uuid, aliases=None, code=None):
    """
    Create a new project entry in the database.

    Parameters
    ----------
    path: str
        The path to the backlogger folder.
    uuid: str
        The uuid of the project.
    aliases: str (optional)
        Custom name for the project (e.g. 'cA determination on exponential clover').
    code: str (optional)
        The code that was used to create the measurements.
    """
    conn = sqlite3.connect(path + "/backlogger.db")
    c = conn.cursor()
    known_projects = c.execute("SELECT * FROM projects WHERE id=?", (uuid,))
    if known_projects.fetchone():
        raise ValueError("Project already imported, use update_project() instead.")
    dl.unlock(path + "/backlogger.db", dataset=path)
    c.execute("INSERT INTO projects (id, aliases, code, created_at, updated_at) VALUES (?, ?, ?, datetime('now'), datetime('now'))", (uuid, aliases, code))
    conn.commit()
    conn.close()
    dl.save(path + "/backlogger.db", message="Added entry for project " + uuid + " to database", dataset=path)


def import_project(url, path, aliases=None, code=None):
    """
    Import a project into the backlog as a datalad subdataset.

    Parameters
    ----------
    url: str
        The url of the project to import. This can be any url that datalad can handle.
    path: str
        The path to the backlogger folder.
    aliases: str (optional)
        Custom name of the project, alias of the project.
    code: str (optional)
        Code that was used to create the measurements.
    """
    # install in tmp to find the uuid
    tmp_path = path + '/projects/tmp'
    dl.install(path=tmp_path, source=url, dataset=path)
    with open(tmp_path + "/.datalad/config") as fp:
        for line in fp:
            if line.startswith("\tid"):
                uuid = line.split()[2]
                break
    create_project(path, uuid, aliases, code)
    move_submodule(path, 'projects/tmp', 'projects/' + uuid)
    os.mkdir(path + '/projects/tmp')
    os.mkdir(path + '/import_scripts/' + uuid)
    dl.save(path, message="Import project from " + url, dataset=path)
    return uuid
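The uuid detection in import_project relies on the dataset id that datalad stores in .datalad/config, which looks like the (hypothetical) entry below; line.split()[2] picks the value after 'id' and '=':

[datalad "dataset"]
	id = 8f9270c4-57fb-4b82-b3d6-8b8c7cb3b4ae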