implemented read_sfcf for append-mode output, bug fixes

2026-02-20 03:52:45 +01:00 · 2022-01-03 21:34:04 +01:00 · 2022-01-03 21:34:04 +01:00 · ebdc17aa66
commit ebdc17aa66
parent 31c2ada963
1 changed files with 245 additions and 148 deletions
--- a/pyerrors/input/sfcf.py
+++ b/pyerrors/input/sfcf.py
@ -8,7 +8,7 @@ import numpy as np  # Thinly-wrapped numpy
 from ..obs import Obs
 from . import utils

-def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, version = "1.0", **kwargs):
+def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, version = "1.0c", **kwargs):
    """Read sfcf c format from given folder structure.

    Parameters
@ -65,186 +65,283 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, version =

    #due to higher usage in current projects, compact file format is default
    compact = True
+    appended = False
    #get version string
    known_versions = ["0.0","1.0","2.0","1.0c","2.0c","1.0a","2.0a"]
-    if "version" in kwargs:
-        version = kwargs.get("version")
-        if not version in known_versions:
-            raise Exception("This version is not known!")
-        #if the letter c is appended to the version, the compact fileformat is used (former read_sfcf_c)
-        if(version[-1] == "c"):
-            compact = True
-            version = version[:-1]
-        else:
-            compact = False
+
+    if not version in known_versions:
+        raise Exception("This version is not known!")
+    #if the letter c is appended to the version, the compact fileformat is used (former read_sfcf_c)
+    if(version[-1] == "c"):
+        appended = False
+        compact = True
+        version = version[:-1]
+    elif(version[-1] == "a"):
+        appended = True
+        compact = False
+        version = version[:-1]
+    else:
+        compact = False
+        appended = False
    read = 0
    T = 0
    start = 0
    ls = []
-    for (dirpath, dirnames, filenames) in os.walk(path):
-        ls.extend(dirnames)
-        break
-    if not ls:
-        raise Exception('Error, directory not found')
-    # Exclude folders with different names
    if "replica" in kwargs:
        ls = reps
    else:
+        for (dirpath, dirnames, filenames) in os.walk(path):
+            if not appended:
+                ls.extend(dirnames)
+            else:
+                ls.extend(filenames)
+            break
+        if not ls:
+            raise Exception('Error, directory not found')
+        # Exclude folders with different names
        for exc in ls:
            if not fnmatch.fnmatch(exc, prefix + '*'):
                ls = list(set(ls) - set([exc]))
    if len(ls) > 1:
        ls.sort(key=lambda x: int(re.findall(r'\d+', x[len(prefix):])[0]))  # New version, to cope with ids, etc.
-    replica = len(ls)
+    if not appended:
+        replica = len(ls)
+    else:
+        replica = len([l.split(".")[-1] for l in ls])//len(set([l.split(".")[-1] for l in ls]))
    print('Read', part, 'part of', name, 'from', prefix[:-1], ',', replica, 'replica')
-    idl = []
    if 'names' in kwargs:
        new_names = kwargs.get('names')
        if len(new_names)!=len(set(new_names)):
-            raise Exception("names are nor unique!")
+            raise Exception("names are not unique!")
        if len(new_names) != replica:
            raise Exception('Names does not have the required length', replica)
    else:
        # Adjust replica names to new bookmarking system
+
        new_names = []
-        for entry in ls:
-            try:
-                idx = entry.index('r')
-            except:
-                raise Exception("Automatic recognition of replicum failed, please enter the key word 'names'.")
-        
-            if 'ens_name' in kwargs:
-                new_names.append(kwargs.get('ens_name') + '|' + entry[idx:])
-            else:
-                new_names.append(entry[:idx] + '|' + entry[idx:])
-    for i, item in enumerate(ls):
-        sub_ls = []
-        if "files" in kwargs:
-            sub_ls = kwargs.get("files")
-            sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
+        if not appended:
+            for entry in ls:
+                try:
+                    idx = entry.index('r')
+                except:
+                    raise Exception("Automatic recognition of replicum failed, please enter the key word 'names'.")
+                    
+                if 'ens_name' in kwargs:
+                    new_names.append(kwargs.get('ens_name') + '|' + entry[idx:])
+                else:
+                    new_names.append(entry[:idx] + '|' + entry[idx:])
        else:
-            for (dirpath, dirnames, filenames) in os.walk(path + '/' + item):
-                if compact:
-                    sub_ls.extend(filenames)
-                else:
-                    sub_ls.extend(dirnames)
-                break
-        
-            #print(sub_ls)
-            for exc in sub_ls:    
-                if compact:
-                    if not fnmatch.fnmatch(exc, prefix + '*'):
-                        sub_ls = list(set(sub_ls) - set([exc]))
-                    sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
-                else:
-                    if not fnmatch.fnmatch(exc, 'cfg*'):
-                        sub_ls = list(set(sub_ls) - set([exc]))
-                    sub_ls.sort(key=lambda x: int(x[3:]))
-        #print(sub_ls)
-        rep_idl = []
-        no_cfg = len(sub_ls)
-        for cfg in sub_ls:
-            try:
-                if compact:
-                    rep_idl.append(int(cfg.split("n")[-1]))
-                else:
-                    rep_idl.append(int(cfg[3:]))
-            except:
-                raise Exception("Couldn't parse idl from directroy, problem with file "+cfg)
-        rep_idl.sort()
-        #maybe there is a better way to print the idls
-        print(item, ':', no_cfg, ' configurations')
-        idl.append(rep_idl)
-    #here we have found all the files we need to look into.
-        if i == 0:
-            #here, we want to find the place within the file, where the correlator we need is stored.
            
-            if compact:
-                #to do so, the pattern needed is put together from the input values
-                pattern = 'name      ' + name + '\nquarks    ' + quarks + '\noffset    ' + str(noffset) + '\nwf        ' + str(wf)
-                if b2b:
-                    pattern += '\nwf_2      ' + str(wf2)
-                #and the file is parsed through to find the pattern
-                with open(path + '/' + item + '/' + sub_ls[0], 'r') as file:
-                    content = file.read()
-                    match = re.search(pattern, content)
-                    if match:
-                        #the start and end point of the correlator in quaetion is extracted for later use in the other files
-                        start_read = content.count('\n', 0, match.start()) + 5 + b2b
-                        end_match = re.search(r'\n\s*\n', content[match.start():])
-                        T = content[match.start():].count('\n', 0, end_match.start()) - 4 - b2b
-                        assert T > 0
-                        print(T, 'entries, starting to read in line', start_read)
-                    else:
-                        raise Exception('Correlator with pattern\n' + pattern + '\nnot found.')
+            for exc in ls:
+                if not fnmatch.fnmatch(exc, prefix + '*.'+name):
+                    ls = list(set(ls) - set([exc]))
+            ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
+            for entry in ls:
+                myentry = entry.removesuffix("."+name)
+                try:
+                    idx = myentry.index('r')
+                except:
+                    raise Exception("Automatic recognition of replicum failed, please enter the key word 'names'.")
+                
+                if 'ens_name' in kwargs:
+                    new_names.append(kwargs.get('ens_name') + '|' + myentry[idx:])
+                else:
+                    new_names.append(myentry[:idx] + '|' + myentry[idx:])
+            #print(new_names)
+    idl = []
+    if not appended:
+        for i, item in enumerate(ls):
+            sub_ls = []
+            if "files" in kwargs:
+                sub_ls = kwargs.get("files")
+                sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
            else:
-                #this part does the same as above, but for non-compactified versions of the files
-                with open(path + '/' + item + '/' + sub_ls[0] + '/' + name) as fp:
-                    for k, line in enumerate(fp):
-                        if version == "0.0":
-                            #check if this is really the right file by matchin pattern similar to above
-                            pattern = "# "+name+" : offset "+str(noffset)+", wf "+str(wf)
-                            #if b2b, a second wf is needed
-                            if b2b:
-                                pattern+=", wf_2 "+str(wf2)
-                            qs = quarks.split(" ")
-                            pattern+=" : "+qs[0]+" - "+qs[1]
-                            #print(pattern)
-                        if read == 1 and not line.strip() and k > start + 1:
-                            break
-                        if read == 1 and k >= start:
-                            T += 1
-
-                        if version == "0.0":
-                            if pattern in line:
-                                #print(line)
-                                read = 1
-                                start = k+1
+                for (dirpath, dirnames, filenames) in os.walk(path + '/' + item):
+                    if compact:
+                        sub_ls.extend(filenames)
+                    else:
+                        sub_ls.extend(dirnames)
+                    break
+            
+                #print(sub_ls)
+                for exc in sub_ls:    
+                    if compact:
+                        if not fnmatch.fnmatch(exc, prefix + '*'):
+                            sub_ls = list(set(sub_ls) - set([exc]))
+                        sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
+                    else:
+                        if not fnmatch.fnmatch(exc, 'cfg*'):
+                            sub_ls = list(set(sub_ls) - set([exc]))
+                        sub_ls.sort(key=lambda x: int(x[3:]))
+            #print(sub_ls)
+            rep_idl = []
+            no_cfg = len(sub_ls)
+            for cfg in sub_ls:
+                try:
+                    if compact:
+                        rep_idl.append(int(cfg.split("n")[-1]))
+                    else:
+                        rep_idl.append(int(cfg[3:]))
+                except:
+                    raise Exception("Couldn't parse idl from directroy, problem with file "+cfg)
+            rep_idl.sort()
+            #maybe there is a better way to print the idls
+            print(item, ':', no_cfg, ' configurations')
+            idl.append(rep_idl)
+        #here we have found all the files we need to look into.
+            if i == 0:
+                #here, we want to find the place within the file, where the correlator we need is stored.
+                if compact:
+                    #to do so, the pattern needed is put together from the input values
+                    pattern = 'name      ' + name + '\nquarks    ' + quarks + '\noffset    ' + str(noffset) + '\nwf        ' + str(wf)
+                    if b2b:
+                        pattern += '\nwf_2      ' + str(wf2)
+                    #and the file is parsed through to find the pattern
+                    with open(path + '/' + item + '/' + sub_ls[0], 'r') as file:
+                        content = file.read()
+                        match = re.search(pattern, content)
+                        if match:
+                            #the start and end point of the correlator in question is extracted for later use in the other files
+                            start_read = content.count('\n', 0, match.start()) + 5 + b2b
+                            end_match = re.search(r'\n\s*\n', content[match.start():])
+                            T = content[match.start():].count('\n', 0, end_match.start()) - 4 - b2b
+                            assert T > 0
+                            print(T, 'entries, starting to read in line', start_read)
                        else:
-                            if '[correlator]' in line:
-                                read = 1
-                                start = k + 7 + b2b
-                                T -= b2b
-                    print(str(T)+" entries found.")
-            #we found where the correlator that is to be read is in the files
-            #after preparing the datastructure the correlators get parsed into...
-            deltas = []
-            for j in range(T):
-                deltas.append([])
-        
-        sublength = no_cfg
-        for j in range(T):
-            deltas[j].append(np.zeros(sublength))
-        #... the actual parsing can start. we iterate through all measurement files in the path given...
-        if compact:
-            for cfg in range(no_cfg):
-                with open(path + '/' + item + '/' + sub_ls[cfg]) as fp:
-                    lines = fp.readlines()
-                    #check, if the correlator is in fact printed completely
-                    if(start_read + T>len(lines)):
-                        raise Exception("EOF before end of correlator data! Maybe "+path + '/' + item + '/' + sub_ls[cfg]+" is corrupted?")
-                    #and start to read the correlator.
-                    #the range here is chosen like this, since this allows for implementing a security check for every read correlator later...
-                    for k in range(start_read - 6,start_read + T):
-                        if k == start_read - 5 - b2b:
-                            if lines[k].strip() != 'name      ' + name:
-                                raise Exception('Wrong format', sub_ls[cfg])
-                        if(k >= start_read and k < start_read + T):
-                            floats = list(map(float, lines[k].split()))
-                            deltas[k - start_read][i][cfg] = floats[-2:][im]
-        else:
-            for cnfg, subitem in enumerate(sub_ls):
-                with open(path + '/' + item + '/' + subitem + '/' + name) as fp:
-                    #since the non-compatified files are typically not so long, we can iterate over the whole file.
-                    #here one can also implement the chekc from above.
-                    for k, line in enumerate(fp):
-                        if(k >= start and k < start + T):
-                            floats = list(map(float, line.split()))
+                            raise Exception('Correlator with pattern\n' + pattern + '\nnot found.')
+                else:
+                    #this part does the same as above, but for non-compactified versions of the files
+                    with open(path + '/' + item + '/' + sub_ls[0] + '/' + name) as fp:
+                        for k, line in enumerate(fp):
                            if version == "0.0":
-                                deltas[k-start][i][cnfg] = floats[im]
-                            else:
-                                deltas[k - start][i][cnfg] = floats[1 + im - single]
+                                #check if this is really the right file by matching a pattern similar to above
+                                pattern = "# "+name+" : offset "+str(noffset)+", wf "+str(wf)
+                                #if b2b, a second wf is needed
+                                if b2b:
+                                    pattern+=", wf_2 "+str(wf2)
+                                qs = quarks.split(" ")
+                                pattern+=" : "+qs[0]+" - "+qs[1]
+                                #print(pattern)
+                            if read == 1 and not line.strip() and k > start + 1:
+                                break
+                            if read == 1 and k >= start:
+                                T += 1

+                            if version == "0.0":
+                                if pattern in line:
+                                    #print(line)
+                                    read = 1
+                                    start = k+1
+                            else:
+                                if '[correlator]' in line:
+                                    read = 1
+                                    start = k + 7 + b2b
+                                    T -= b2b
+                        print(str(T)+" entries found.")
+                #we found where the correlator that is to be read is in the files
+                #after preparing the datastructure the correlators get parsed into...
+                deltas = []
+                for j in range(T):
+                    deltas.append([])
+            
+            
+            for t in range(T):
+                deltas[t].append(np.zeros(no_cfg))
+            #... the actual parsing can start. we iterate through all measurement files in the path given...
+            if compact:
+                for cfg in range(no_cfg):
+                    with open(path + '/' + item + '/' + sub_ls[cfg]) as fp:
+                        lines = fp.readlines()
+                        #check, if the correlator is in fact printed completely
+                        if(start_read + T>len(lines)):
+                            raise Exception("EOF before end of correlator data! Maybe "+path + '/' + item + '/' + sub_ls[cfg]+" is corrupted?")
+                        #and start to read the correlator.
+                        #the range here is chosen like this, since this allows for implementing a security check for every read correlator later...
+                        for k in range(start_read - 6,start_read + T):
+                            if k == start_read - 5 - b2b:
+                                if lines[k].strip() != 'name      ' + name:
+                                    raise Exception('Wrong format', sub_ls[cfg])
+                            if(k >= start_read and k < start_read + T):
+                                floats = list(map(float, lines[k].split()))
+                                deltas[k - start_read][i][cfg] = floats[-2:][im]
+            else:
+                for cnfg, subitem in enumerate(sub_ls):
+                    with open(path + '/' + item + '/' + subitem + '/' + name) as fp:
+                        #since the non-compactified files are typically not so long, we can iterate over the whole file.
+                        #here one can also implement the check from above.
+                        for k, line in enumerate(fp):
+                            if(k >= start and k < start + T):
+                                floats = list(map(float, line.split()))
+                                if version == "0.0":
+                                    deltas[k-start][i][cnfg] = floats[im]
+                                else:
+                                    deltas[k - start][i][cnfg] = floats[1 + im - single]
+                                        
+    else:
+        for exc in ls:
+            if not fnmatch.fnmatch(exc, prefix + '*.'+name):
+                ls = list(set(ls) - set([exc]))
+            ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
+        #print(ls)
+        pattern = 'name      ' + name + '\nquarks    ' + quarks + '\noffset    ' + str(noffset) + '\nwf        ' + str(wf)
+        if b2b:
+            pattern += '\nwf_2      ' + str(wf2)
+        for rep,file in enumerate(ls):
+            rep_idl = []
+            with open(path + '/' + file, 'r') as fp:
+                content = fp.readlines()
+                data_starts = []
+                for l,line in enumerate(content):
+                    if "[run]" in line:
+                        data_starts.append(l)
+                if len(set([data_starts[i]-data_starts[i-1] for i in range(1,len(data_starts))])) > 1:
+                    raise Exception ("Irregularities in file structure found, not all runs have the same output length")
+                #print(data_starts)
+                #first chunk of data
+                chunk = content[:data_starts[1]]
+                for l,line in enumerate(chunk):
+                    if line.startswith("gauge_name"):
+                        gauge_line = l
+                        #meta_data["gauge_name"] = (line.strip()).split("/")[-1]
+                    elif line.startswith("[correlator]"):
+                        corr_line = l
+                        found_pat = ""
+                        for li in chunk[corr_line+1:corr_line+6+b2b]:
+                            found_pat += li
+                        if re.search(pattern,found_pat):
+                            start_read = corr_line+7+b2b
+                            T=len(chunk)-1-start_read
+                if rep == 0:
+                    deltas = []
+                    for t in range(T):
+                        deltas.append([])
+                for t in range(T):
+                    deltas[t].append(np.zeros(len(data_starts)))
+                #all other chunks should follow the same structure
+                for cnfg in range(len(data_starts)):
+                    start = data_starts[cnfg]
+                    stop = start+data_starts[1]
+                    chunk = content[start:stop]
+                    #meta_data = {}
+                    
+                    try:
+                        rep_idl.append(int(chunk[gauge_line].split("n")[-1]))
+                    except:
+                        raise Exception("Couldn't parse idl from directroy, problem with chunk around line "+gauge_line)
+                    
+                    found_pat = ""
+                    for li in chunk[corr_line+1:corr_line+6+b2b]:
+                        found_pat += li
+                    if re.search(pattern,found_pat):
+                        #print("found pattern")
+                        for t,line in enumerate(chunk[start_read:start_read+T]):
+                            floats = list(map(float, line.split()))
+                            deltas[t][rep][cnfg] = floats[-2:][im]
+            idl.append(rep_idl)
+
+    #print(new_names)
+    #print(deltas)    
+    #print(idl)
    if "check_configs" in kwargs:
        print("Checking for missing configs...")
        che = kwargs.get("check_configs")