implemented idl into sfcf-read method

2026-02-20 12:02:47 +01:00 · 2021-12-17 15:16:17 +01:00 · 2021-12-17 15:16:17 +01:00 · c5292f8342
commit c5292f8342
parent 5e5a9df404
2 changed files with 104 additions and 166 deletions
--- a/pyerrors/input/sfcf.py
+++ b/pyerrors/input/sfcf.py
@ -6,125 +6,41 @@ import fnmatch
 import re
 import numpy as np  # Thinly-wrapped numpy
 from ..obs import Obs
-
-
-def read_sfcf_old(path, prefix, name, quarks, noffset = 0, wf=0, wf2=0, **kwargs):
-    """Read sfcf format (from around 2012) from given folder structure.
-
-    Keyword arguments
-    -----------------
-    im -- if True, read imaginary instead of real part of the correlation function.
-    single -- if True, read a boundary-to-boundary correlation function with a single value
-    b2b -- if True, read a time-dependent boundary-to-boundary correlation function
-    names -- Alternative labeling for replicas/ensembles. Has to have the appropriate length
-    """
-    if kwargs.get('im'):
-        im = 1
-        part = 'imaginary'
-    else:
-        im = 0
-        part = 'real'
-        
-    b2b = 0
-
-    if kwargs.get('b2b'):
-        b2b = 1
-    
-    quarks = quarks.split(" ")
-    read = 0
-    T = 0
-    start = 0
-    ls = []
-    for (dirpath, dirnames, filenames) in os.walk(path):
-        ls.extend(dirnames)
-        break
-    if not ls:
-        print('Error, directory not found')
-        #sys.exit()
-    for exc in ls:
-        if fnmatch.fnmatch(exc, prefix + '*'):
-            ls = list(set(ls) - set(exc))
-    if len(ls) > 1:
-        ls.sort(key=lambda x: int(re.findall(r'\d+', x[len(prefix):])[0]))
-    replica = len(ls)
-    print('Read', part, 'part of', name, 'from', prefix, ',', replica, 'replica')
-    if 'names' in kwargs:
-        new_names = kwargs.get('names')
-        if len(new_names) != replica:
-            raise Exception('Names does not have the required length', replica)
-    else:
-        new_names = ls
-    print(replica, 'replica')
-    for i, item in enumerate(ls):
-        print(item)
-        sub_ls = []
-        for (dirpath, dirnames, filenames) in os.walk(path+'/'+item):
-            sub_ls.extend(dirnames)
-            break
-        for exc in sub_ls:
-            if fnmatch.fnmatch(exc, 'cfg*'):
-                sub_ls = list(set(sub_ls) - set(exc))
-        sub_ls.sort(key=lambda x: int(x[3:]))
-        no_cfg = len(sub_ls)
-        print(no_cfg, 'configurations')
-        if i == 0:
-            with open(path + '/' + item + '/' + sub_ls[0] + '/' + name) as fp:
-                for k, line in enumerate(fp):
-                    #check if this is really the right file
-                    pattern = "# "+name+" : offset "+str(noffset)+", wf "+"0"
-                    #if b2b, a second wf is needed
-                    if b2b:
-                        pattern+=", wf_2 "+"0"
-                    pattern+=" : "+quarks[0]+" - "+quarks[1]
-
-                    if read == 1 and not line.strip() and k > start + 1:
-                        break
-                    if read == 1 and k >= start:
-                        T += 1
-                    if pattern in line:
-                        #print(line)
-                        read = 1
-                        start = k+1
-                print(str(T)+" entries found.")
-
-            deltas = []
-            for j in range(T):
-                deltas.append([])
-
-        sublength = len(sub_ls)
-        for j in range(T):
-            deltas[j].append(np.zeros(sublength))
-
-        for cnfg, subitem in enumerate(sub_ls):
-            with open(path + '/' + item + '/' + subitem + '/'+name) as fp:
-                for k, line in enumerate(fp):
-                    if(k >= start and k < start + T):
-                        floats = list(map(float, line.split()))
-                        deltas[k-start][i][cnfg] = floats[im]
-                        
-
-    result = []
-    for t in range(T):
-        result.append(Obs(deltas[t], new_names))
-
-    return result
-
+from . import utils

 def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs):
    """Read sfcf c format from given folder structure.

    Parameters
    ----------
-    quarks -- Label of the quarks used in the sfcf input file
-    noffset -- Offset of the source (only relevant when wavefunctions are used)
-    wf -- ID of wave function
-    wf2 -- ID of the second wavefunction (only relevant for boundary-to-boundary correlation functions)
-    im -- if True, read imaginary instead of real part of the correlation function.
-    b2b -- if True, read a time-dependent boundary-to-boundary correlation function
-    single -- if True, read time independent boundary to boundary correlation function
-    names -- Alternative labeling for replicas/ensembles. Has to have the appropriate length
+    quarks: str
+        Label of the quarks used in the sfcf input file. e.g. "quark quark"
+        for version 0.0 this does NOT need to be given with the typical " - " that is present in the output file,
+        this is done automatically for this version
+    noffset: int
+        Offset of the source (only relevant when wavefunctions are used)
+    wf: int
+        ID of wave function
+    wf2: int
+        ID of the second wavefunction (only relevant for boundary-to-boundary correlation functions)
+    im: bool
+        if True, read imaginary instead of real part of the correlation function.
+    b2b: bool
+        if True, read a time-dependent boundary-to-boundary correlation function
+    single: bool
+        if True, read time independent boundary to boundary correlation function
+    names: list
+        Alternative labeling for replicas/ensembles. Has to have the appropriate length
    ens_name : str
        replaces the name of the ensemble
+    version: str
+        version of SFCF with which the measurement was done. If the compact output option (-c) was specified, append a c to the version (e.g. "1.0c")
+    replica: list
+        list of replica to be read, default is all
+    files: list
+        list of files to be read per replica, default is all. For the non-compact output format, hand the folders to be read here.
+    check_configs:
+        list of lists of expected configs, e.g. [range(1,1000)] for one replicum with 1000 configs
    """
    if kwargs.get('im'):
        im = 1
@ -142,8 +58,8 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
        else:
            b2b = 0
        single = 0
-
-    files = []
+    if "replica" in kwargs:
+        reps = kwargs.get("replica")
    if "files" in kwargs:
        files = kwargs.get("files")

@ -172,8 +88,8 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
    if not ls:
        raise Exception('Error, directory not found')
    # Exclude folders with different names
-    if len(files) != 0:
-        ls = files
+    if "replica" in kwargs:
+        ls = reps
    else:
        for exc in ls:
            if not fnmatch.fnmatch(exc, prefix + '*'):
@ -182,9 +98,11 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
        ls.sort(key=lambda x: int(re.findall(r'\d+', x[len(prefix):])[0]))  # New version, to cope with ids, etc.
    replica = len(ls)
    print('Read', part, 'part of', name, 'from', prefix[:-1], ',', replica, 'replica')
-
+    idl = []
    if 'names' in kwargs:
        new_names = kwargs.get('names')
+        if len(new_names)!=len(set(new_names)):
+            raise Exception("names are nor unique!")
        if len(new_names) != replica:
            raise Exception('Names does not have the required length', replica)
    else:
@ -194,59 +112,65 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
            try:
                idx = entry.index('r')
            except:
-                idx = len(entry)-2
+                raise Exception("Automatic recognition of replicum failed, please enter the key word 'names'.")
+        
            if 'ens_name' in kwargs:
                new_names.append(kwargs.get('ens_name') + '|' + entry[idx:])
            else:
                new_names.append(entry[:idx] + '|' + entry[idx:])
    for i, item in enumerate(ls):
        sub_ls = []
-        for (dirpath, dirnames, filenames) in os.walk(path + '/' + item):
-            if compact:
-                sub_ls.extend(filenames)
-            else:
-                sub_ls.extend(dirnames)
-            break
-        
-        #print(sub_ls)
-        for exc in sub_ls:    
-            if compact:
-                if not fnmatch.fnmatch(exc, prefix + '*'):
-                    sub_ls = list(set(sub_ls) - set([exc]))
-                sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
-            else:
-                if not fnmatch.fnmatch(exc, 'cfg*'):
-                    sub_ls = list(set(sub_ls) - set([exc]))
-                sub_ls.sort(key=lambda x: int(x[3:]))
-        
-        if compact:
-            first_cfg = int(re.findall(r'\d+', sub_ls[0])[-1])
-
-            last_cfg = len(sub_ls) + first_cfg - 1
-
-            for cfg in range(1, len(sub_ls)):
-                if int(re.findall(r'\d+', sub_ls[cfg])[-1]) != first_cfg + cfg:
-                    last_cfg = cfg + first_cfg - 1
-                    break
-
-            no_cfg = last_cfg - first_cfg + 1
-            print(item, ':', no_cfg, 'evenly spaced configurations (', first_cfg, '-', last_cfg, ') ,', len(sub_ls) - no_cfg, 'configs omitted\n')
+        if "files" in kwargs:
+            sub_ls = kwargs.get("files")
+            sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
        else:
-            no_cfg = len(sub_ls)
-            print(no_cfg, 'configurations')
-
-        #here we have found all the files we need to look into.
+            for (dirpath, dirnames, filenames) in os.walk(path + '/' + item):
+                if compact:
+                    sub_ls.extend(filenames)
+                else:
+                    sub_ls.extend(dirnames)
+                break
+        
+            #print(sub_ls)
+            for exc in sub_ls:    
+                if compact:
+                    if not fnmatch.fnmatch(exc, prefix + '*'):
+                        sub_ls = list(set(sub_ls) - set([exc]))
+                    sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
+                else:
+                    if not fnmatch.fnmatch(exc, 'cfg*'):
+                        sub_ls = list(set(sub_ls) - set([exc]))
+                    sub_ls.sort(key=lambda x: int(x[3:]))
+        #print(sub_ls)
+        rep_idl = []
+        no_cfg = len(sub_ls)
+        for cfg in sub_ls:
+            try:
+                if compact:
+                    rep_idl.append(int(cfg.split("n")[-1]))
+                else:
+                    rep_idl.append(int(cfg[3:]))
+            except:
+                raise Exception("Couldn't parse idl from directroy, problem with file "+cfg)
+        rep_idl.sort()
+        #maybe there is a better way to print the idls
+        print(item, ':', no_cfg, ' configurations')
+        idl.append(rep_idl)
+    #here we have found all the files we need to look into.
        if i == 0:
+            #here, we want to find the place within the file, where the correlator we need is stored.
+            
            if compact:
-    
+                #to do so, the pattern needed is put together from the input values
                pattern = 'name      ' + name + '\nquarks    ' + quarks + '\noffset    ' + str(noffset) + '\nwf        ' + str(wf)
                if b2b:
                    pattern += '\nwf_2      ' + str(wf2)
-
+                #and the file is parsed through to find the pattern
                with open(path + '/' + item + '/' + sub_ls[0], 'r') as file:
                    content = file.read()
                    match = re.search(pattern, content)
                    if match:
+                        #the start and end point of the correlator in question is extracted for later use in the other files
                        start_read = content.count('\n', 0, match.start()) + 5 + b2b
                        end_match = re.search(r'\n\s*\n', content[match.start():])
                        T = content[match.start():].count('\n', 0, end_match.start()) - 4 - b2b
@ -255,11 +179,11 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
                    else:
                        raise Exception('Correlator with pattern\n' + pattern + '\nnot found.')
            else:
-                #print(path + '/' + item + '/')# + sub_ls[0] + '/' + name)
+                #this part does the same as above, but for non-compactified versions of the files
                with open(path + '/' + item + '/' + sub_ls[0] + '/' + name) as fp:
                    for k, line in enumerate(fp):
                        if version == "0.0":
-                            #check if this is really the right file
+                            #check if this is really the right file by matching a pattern similar to the one above
                            pattern = "# "+name+" : offset "+str(noffset)+", wf "+str(wf)
                            #if b2b, a second wf is needed
                            if b2b:
@ -284,19 +208,24 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
                                T -= b2b
                    print(str(T)+" entries found.")
            #we found where the correlator that is to be read is in the files
+            #after preparing the datastructure the correlators get parsed into...
            deltas = []
            for j in range(T):
                deltas.append([])
-
+        
        sublength = no_cfg
        for j in range(T):
            deltas[j].append(np.zeros(sublength))
+        #... the actual parsing can start. we iterate through all measurement files in the path given...
        if compact:
            for cfg in range(no_cfg):
                with open(path + '/' + item + '/' + sub_ls[cfg]) as fp:
                    lines = fp.readlines()
+                    #check, if the correlator is in fact printed completely
                    if(start_read + T>len(lines)):
                        raise Exception("EOF before end of correlator data! Maybe "+path + '/' + item + '/' + sub_ls[cfg]+" is corrupted?")
+                    #and start to read the correlator.
+                    #the range here is chosen like this, since this allows for implementing a security check for every read correlator later...
                    for k in range(start_read - 6,start_read + T):
                        if k == start_read - 5 - b2b:
                            if lines[k].strip() != 'name      ' + name:
@ -307,6 +236,8 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
        else:
            for cnfg, subitem in enumerate(sub_ls):
                with open(path + '/' + item + '/' + subitem + '/' + name) as fp:
+                    #since the non-compactified files are typically not so long, we can iterate over the whole file.
+                    #here one can also implement the check from above.
                    for k, line in enumerate(fp):
                        if(k >= start and k < start + T):
                            floats = list(map(float, line.split()))
@ -315,9 +246,17 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
                            else:
                                deltas[k - start][i][cnfg] = floats[1 + im - single]

-
+    if "check_configs" in kwargs:
+        print("Chekcing for missing configs...")
+        che = kwargs.get("check_configs")
+        if not (len(che) == len(idl)):
+            raise Exception("check_configs has to be the same length as replica!")
+        for r in range(len(idl)):
+            print("checking "+new_names[r])
+            utils.check_idl(idl[r], che[r])
+        print("Done")
    result = []
    for t in range(T):
-        result.append(Obs(deltas[t], new_names))
+        result.append(Obs(deltas[t], new_names, idl = idl))
    return result

--- a/pyerrors/input/utils.py
+++ b/pyerrors/input/utils.py
@ -1,14 +1,13 @@
-import fnmatch
+"""Utilities for the input"""

-def check_missing(idl,che):
+def check_idl(idl,che):
    missing = []
-    for ind in che:
-            if not ind in idl:
-                missing.append(ind)
-    if(len(missing) == 0):
-        print("There are no measurements missing.")
-    else:
-        print(len(missing),"measurements missing")
+    for c in che:
+        if not c in idl:
+            missing.append(c)
+    #print missing such that it can directly be parsed to slurm terminal
+    if not (len(missing) == 0):
+        print(len(missing),"configs missing")
        miss_str = str(missing[0])
        for i in missing[1:]:
            miss_str += ","+str(i)