# corrlib/corrlib/find.py

import sqlite3
import os
import json
import pandas as pd
import numpy as np
from .input.implementations import codes
from .tools import k2m, get_db_file
from .tracker import get
from .integrity import check_time_validity
from typing import Any, Optional, Union
from pathlib import Path
import datetime as dt
from collections.abc import Callable
import warnings


def _project_lookup_by_alias(db: Path, alias: str) -> str:
    """
    Lookup a projects UUID by its (human-readable) alias.

    Parameters
    ----------
    db: Path
        The database file to look up the project in.
    alias: str
        The alias to look up. It is compared for equality with the ``aliases`` column.

    Returns
    -------
    uuid: str
        The UUID of the project with the given alias (first column of the first matching row).

    Raises
    ------
    Exception
        If no project with the given alias is found.
    """
    conn = sqlite3.connect(db)
    try:
        c = conn.cursor()
        # Bind the alias as a parameter: quotes or SQL fragments inside the alias
        # must never become part of the statement itself.
        c.execute("SELECT * FROM projects WHERE aliases = ?", (alias,))
        results = c.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
    if len(results) > 1:
        # An ambiguous alias is only reported; the first match is still returned.
        print("Error: multiple projects found with alias " + alias)
    elif len(results) == 0:
        raise Exception("Error: no project found with alias " + alias)
    return str(results[0][0])


def _project_lookup_by_id(db: Path, uuid: str) -> list[tuple[str, ...]]:
    """
    Return the project information available in the database by UUID.

    Parameters
    ----------
    db: Path
        The database file to look up the project in.
    uuid: str
        The uuid of the project in question. It is compared for equality with the ``id`` column.

    Returns
    -------
    results: list
        All rows of the ``projects`` table whose ``id`` matches (empty if there is none).
    """
    conn = sqlite3.connect(db)
    try:
        c = conn.cursor()
        # Bind the uuid as a parameter instead of formatting it into the SQL text.
        c.execute("SELECT * FROM projects WHERE id = ?", (uuid,))
        results = c.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
    return results


def _time_filter(results: pd.DataFrame, created_before: Optional[str]=None, created_after: Optional[Any]=None, updated_before: Optional[Any]=None, updated_after: Optional[Any]=None) -> pd.DataFrame:
    """
    Filter the results from the database in terms of the creation and update times.

    Every row needs the columns ``created_at`` and ``updated_at`` (ISO formatted strings)
    and ``path`` (only used in the error message).

    NOTE(review): the comparisons keep a record whose time stamp is equal to a bound,
    i.e. the bounds are effectively inclusive. Confirm whether exclusive bounds are intended.

    Parameters
    ----------
    results: pd.DataFrame
        The dataframe holding the unfiltered results from the database.
    created_before: str
        Constraint on the creation date in datetime.datetime.isoformat. Records created later than this are removed.
    created_after: str
        Constraint on the creation date in datetime.datetime.isoformat. Records created earlier than this are removed.
    updated_before: str
        Constraint on the date of the last update in datetime.datetime.isoformat. Records updated later than this are removed.
    updated_after: str
        Constraint on the date of the last update in datetime.datetime.isoformat. Records updated earlier than this are removed.

    Returns
    -------
    results: pd.DataFrame
        The rows that satisfy all given constraints, with their original index labels.

    Raises
    ------
    ValueError
        If the time stamps of a row are rejected by ``check_time_validity``
        or if one of the bounds is not a valid ISO formatted string.
    """
    # The bounds do not depend on the row, so they are parsed only once.
    date_created_before = None if created_before is None else dt.datetime.fromisoformat(created_before)
    date_created_after = None if created_after is None else dt.datetime.fromisoformat(created_after)
    date_updated_before = None if updated_before is None else dt.datetime.fromisoformat(updated_before)
    date_updated_after = None if updated_after is None else dt.datetime.fromisoformat(updated_after)

    # Rows are selected by position: dropping positional numbers by *label*
    # is only correct as long as the frame carries a default RangeIndex.
    keep = []
    for ind in range(len(results)):
        result = results.iloc[ind]
        created_at = dt.datetime.fromisoformat(result['created_at'])
        updated_at = dt.datetime.fromisoformat(result['updated_at'])
        db_times_valid = check_time_validity(created_at=created_at, updated_at=updated_at)
        if not db_times_valid:
            raise ValueError('Time stamps not valid for result with path', result["path"])

        if date_created_before is not None and date_created_before < created_at:
            continue
        if date_created_after is not None and date_created_after > created_at:
            continue
        if date_updated_before is not None and date_updated_before < updated_at:
            continue
        if date_updated_after is not None and date_updated_after > updated_at:
            continue
        keep.append(ind)
    return results.iloc[keep]


def _db_lookup(db: Path, ensemble: str, correlator_name: str, code: str, project: Optional[str]=None,  parameters: Optional[str]=None) -> pd.DataFrame:
    """
    Look up a correlator record in the database by the data given to the method.

    The ``backlogs`` table is queried for rows matching the correlator name and the ensemble.
    ``project``, ``code`` and ``parameters`` only restrict the query if they are non-empty.

    Parameters
    ----------
    db: Path
        The database file to look up the record in.
    ensemble: str
        The ensemble the record is associated with.
    correlator_name: str
        The name of the correlator in question.
    code: str
        The name of the code which was used to calculate the correlator.
    project: str, optional
        The UUID of the project the correlator was calculated in.
    parameters: str, optional
        The exact parameter string of the measurement as it is held in the database.

    Returns
    -------
    results: pd.DataFrame
        A pandas DataFrame holding the information received from the DB query.
    """
    # Build the statement with placeholders only; all user supplied values are
    # bound separately, so quotes inside them cannot alter the query.
    search_expr = "SELECT * FROM backlogs WHERE name = ? AND ensemble = ?"
    values: list[str] = [correlator_name, ensemble]
    if project:
        search_expr += " AND project = ?"
        values.append(project)
    if code:
        search_expr += " AND code = ?"
        values.append(code)
    if parameters:
        search_expr += " AND parameters = ?"
        values.append(parameters)
    conn = sqlite3.connect(db)
    try:
        results = pd.read_sql(search_expr, conn, params=values)
    finally:
        # Release the connection even if the query fails.
        conn.close()
    return results


def _outside_target(requested: Any, actual: Any, label: str) -> bool:
    """
    Check a stored value against a requested value or a requested interval.

    If ``requested`` is a list, it has to be of the form ``[lower, upper]`` and the stored
    value has to lie within these bounds (bounds included). Otherwise the stored value
    has to be close to ``requested`` in the sense of ``np.isclose``.

    Returns True if the stored value does NOT fulfil the request.
    ``label`` is only used for the error message if the list has the wrong length.
    """
    if not isinstance(requested, list):
        return not np.isclose(requested, actual)
    if len(requested) != 2:
        raise ValueError(f"{label} has to have length 2")
    return bool(requested[0] > actual or requested[1] < actual)


def _sfcf_drop(param: dict[str, Any], **kwargs: Any) -> bool:
    """
    Decide whether a record with the SFCF parameters ``param`` is to be dropped.

    Only the keywords that are present in ``kwargs`` are checked. The values are compared
    to the entries ``param['offset']``, ``param['quarks'][i]['mass']``,
    ``param['quarks'][i]['thetas']``, ``param['wf1']`` and ``param['wf2']``.
    The ``mass`` entry of a quark is compared directly to kappa values and is converted
    with ``k2m`` before it is compared to masses.

    Parameters
    ----------
    param: dict
        The parameters of the record as stored in the database.
    offset: optional
        Has to be equal to the stored offset.
    quark_kappas: optional
        Kappa values of the two quarks, in the stored order.
    quark_masses: optional
        Masses of the two quarks, in the stored order.
    qk1, qk2: float or list, optional
        Kappa of the first/second quark, either a single value or an interval ``[lower, upper]``.
    qm1, qm2: float or list, optional
        Mass of the first/second quark, either a single value or an interval ``[lower, upper]``.
    quark_thetas: optional
        The thetas of the two quarks. A record matches if the pair agrees with the stored
        thetas in either quark order.
    wf1, wf2: optional
        Only the first contribution (index 0) is compared.

    Returns
    -------
    drop: bool
        True if at least one of the given constraints is violated.

    Raises
    ------
    ValueError
        If one of ``qk1``, ``qk2``, ``qm1``, ``qm2`` is a list that is not of length 2.
    """
    if 'offset' in kwargs:
        if kwargs.get('offset') != param['offset']:
            return True
    if 'quark_kappas' in kwargs:
        kappas = kwargs['quark_kappas']
        if (not np.isclose(kappas[0], param['quarks'][0]['mass']) or not np.isclose(kappas[1], param['quarks'][1]['mass'])):
            return True
    if 'quark_masses' in kwargs:
        masses = kwargs['quark_masses']
        if (not np.isclose(masses[0], k2m(param['quarks'][0]['mass'])) or not np.isclose(masses[1], k2m(param['quarks'][1]['mass']))):
            return True
    # Single quark constraints on kappa: value or [lower, upper] interval.
    for key, quark, label in (('qk1', 0, 'quark_kappa1'), ('qk2', 1, 'quark_kappa2')):
        if key in kwargs and _outside_target(kwargs[key], param['quarks'][quark]['mass'], label):
            return True
    # The same for the masses, which are derived from the stored kappa.
    for key, quark, label in (('qm1', 0, 'quark_mass1'), ('qm2', 1, 'quark_mass2')):
        if key in kwargs and _outside_target(kwargs[key], k2m(param['quarks'][quark]['mass']), label):
            return True
    if 'quark_thetas' in kwargs:
        quark_thetas = kwargs['quark_thetas']
        thetas0 = param['quarks'][0]['thetas']
        thetas1 = param['quarks'][1]['thetas']
        # A record matches if the requested pair equals the stored pair in direct
        # or in swapped order; it is dropped only if neither ordering matches.
        direct = quark_thetas[0] == thetas0 and quark_thetas[1] == thetas1
        swapped = quark_thetas[0] == thetas1 and quark_thetas[1] == thetas0
        if not (direct or swapped):
            return True
    # careful, this is not safe when multiple contributions are present!
    for key in ('wf1', 'wf2'):
        if key in kwargs:
            wanted = kwargs[key]
            stored = param[key]
            if not (np.isclose(wanted[0][0], stored[0][0], 1e-8) and np.isclose(wanted[0][1][0], stored[0][1][0], 1e-8) and np.isclose(wanted[0][1][1], stored[0][1][1], 1e-8)):
                return True
    return False


def sfcf_filter(results: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
    r"""
    Filter method for the Database entries holding SFCF calculations.

    The column ``parameters`` of every row is parsed as JSON and checked against the
    given keyword arguments. Rows that do not fit are removed.

    Parameters
    ----------
    results: pd.DataFrame
        The unfiltered pandas DataFrame holding the entries from the database.

    offset: list[float], optional
    quark_kappas: list[float], optional
    quark_masses: list[float], optional
    qk1: float or list[float], optional
        Mass parameter $\kappa_1$ of the first quark, a single value or an interval [lower, upper].
    qk2: float or list[float], optional
        Mass parameter $\kappa_2$ of the second quark, a single value or an interval [lower, upper].
    qm1: float or list[float], optional
        Bare quark mass $m_1$ of the first quark, a single value or an interval [lower, upper].
    qm2: float or list[float], optional
        Bare quark mass $m_2$ of the second quark, a single value or an interval [lower, upper].
    quark_thetas: list[list[float]], optional
    wf1: optional
    wf2: optional

    Returns
    -------
    results: pd.DataFrame
        The filtered DataFrame, only holding the records that fit to the parameters given.
    """
    # Select the surviving rows by position. The frame may already have lost rows in an
    # earlier filter step, so its index labels do not have to agree with the positions.
    keep = []
    for position, (_, result) in enumerate(results.iterrows()):
        param = json.loads(result['parameters'])
        if not _sfcf_drop(param, **kwargs):
            keep.append(position)
    return results.iloc[keep]


def openQCD_filter(results:pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
    """
    Placeholder for a filter on the parameters of openQCD.

    No filtering is done at the moment: a warning is emitted and the input is
    handed back as it is. The keyword arguments are accepted but not used.

    Parameters
    ----------
    results: pd.DataFrame
        The unfiltered list of results from the database.

    Returns
    -------
    results: pd.DataFrame
        The very same object that was passed in.

    """
    notice = "A filter for openQCD parameters is no implemented yet."
    warnings.warn(notice, category=Warning)
    return results


def _code_filter(results: pd.DataFrame, code: str, **kwargs: Any) -> pd.DataFrame:
    """
    Dispatch the results to the filter that belongs to the given code.

    At the moment, only openQCD and SFCF are known. The keywords that can be used
    are described in the code-specific filters.

    Parameters
    ----------
    results: pd.DataFrame
        The unfiltered list of results from the database.
    code: str
        The name of the code that produced the record at hand.
    kwargs:
        The keyword args that are handed over to the code-specific filter.

    Returns
    -------
    results: pd.DataFrame
        The filtered results.

    Raises
    ------
    ValueError
        If the code is neither "sfcf" nor "openQCD".
    """
    if code == "sfcf":
        filtered = sfcf_filter(results, **kwargs)
        return filtered
    if code == "openQCD":
        filtered = openQCD_filter(results, **kwargs)
        return filtered
    raise ValueError(f"Code {code} is not known.")


def find_record(path: Path, ensemble: str, correlator_name: str, code: str, project: Optional[str]=None, parameters: Optional[str]=None,
                created_before: Optional[str]=None, created_after: Optional[str]=None, updated_before: Optional[str]=None, updated_after: Optional[str]=None,
                revision: Optional[str]=None,
                customFilter: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
                **kwargs: Any) -> pd.DataFrame:
    """
    Find the records of a correlator in the library.

    The database is queried for the given ensemble, correlator name, code and, optionally,
    project and parameter string. Afterwards the result is narrowed down by the time
    constraints (if any is given), by the code-specific filter and finally by ``customFilter``.
    The number of records found is printed.

    Parameters
    ----------
    path: Path
        The path of the library.
    ensemble: str
        The ensemble the record is associated with.
    correlator_name: str
        The name of the correlator in question.
    code: str
        The name of the code which was used to calculate the correlator.
    project: str, optional
        The UUID of the project the correlator was calculated in.
    parameters: str, optional
        The exact parameter string of the measurement as it is held in the database.
    created_before, created_after, updated_before, updated_after: str, optional
        Constraints on the creation and update time in datetime.datetime.isoformat.
    revision: str, optional
        Not used by this function at the moment.
    customFilter: Callable, optional
        A function that takes the DataFrame of results and returns a filtered DataFrame.
        It is applied after all other filters.
    kwargs:
        The keyword args that are handed over to the code-specific filter.

    Returns
    -------
    results: pd.DataFrame
        The records found. The index is reset, so the previous index labels
        end up in an additional column.

    Raises
    ------
    ValueError
        If the code is not one of the known codes.
    """
    db_file = get_db_file(path)
    db = path / db_file
    if code not in codes:
        raise ValueError("Code " + code + " unknown, take one of the following: " + ", ".join(codes))
    # presumably makes sure that the database file is available locally - TODO confirm
    get(path, db_file)
    results = _db_lookup(db, ensemble, correlator_name, code, project, parameters=parameters)
    time_bounds = (created_before, created_after, updated_before, updated_after)
    if any(bound is not None for bound in time_bounds):
        results = _time_filter(results, created_before, created_after, updated_before, updated_after)
    results = _code_filter(results, code, **kwargs)
    if customFilter is not None:
        results = customFilter(results)
    print("Found " + str(len(results)) + " result" + ("s" if len(results)>1 else ""))
    return results.reset_index()


def find_project(path: Path, name: str) -> str:
    """
    Find a project by its human readable name.

    Parameters
    ----------
    path: Path
        The path of the library.
    name: str
        The name (alias) of the project to look for in the library.

    Returns
    -------
    uuid: str
        The uuid of the project in question.
    """
    db_name = get_db_file(path)
    get(path, db_name)
    database = path / db_name
    return _project_lookup_by_alias(database, name)


def list_projects(path: Path) -> list[tuple[str, str]]:
    """
    List all projects known to the library.

    Parameters
    ----------
    path: Path
        The path of the library.

    Returns
    -------
    results: list[tuple[str, str]]
        One ``(id, aliases)`` tuple for every row of the ``projects`` table.
    """
    db_file = get_db_file(path)
    get(path, db_file)
    conn = sqlite3.connect(os.path.join(path, db_file))
    try:
        c = conn.cursor()
        c.execute("SELECT id,aliases FROM projects")
        results = c.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
    return results