
Commit 775d5a2

Tiefen-boop authored and Eran Geva committed
[SW-174155] Fix race condition bug when reading scales
Implement an inter-process reader-writer lock.
Implement locking mechanism at save_file/load_file.

Change-Id: I140fdc05814286796bb47e6be8170b2ae9dd5154
1 parent a529cf4 commit 775d5a2
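
For context, the race in SW-174155 comes from one process reading a scales file while another process is rewriting it. The fix coordinates them through fcntl.flock advisory locks on a sidecar ".lock" file: readers take a shared lock (LOCK_SH) and the writer takes an exclusive lock (LOCK_EX). The snippet below is a minimal standalone sketch of that pattern, not code from the commit; the file name and helper functions are illustrative only.

    import fcntl

    SCALES_FILE = "scales.json"  # illustrative path, not from the commit

    def write_scales(text):
        # Writer side: exclusive flock blocks concurrent readers and writers.
        with open(SCALES_FILE + ".lock", "w") as lock:
            fcntl.flock(lock, fcntl.LOCK_EX)
            try:
                with open(SCALES_FILE, "w") as f:
                    f.write(text)
            finally:
                fcntl.flock(lock, fcntl.LOCK_UN)

    def read_scales():
        # Reader side: shared flock lets readers overlap but waits out a writer.
        with open(SCALES_FILE + ".lock", "w") as lock:
            fcntl.flock(lock, fcntl.LOCK_SH)
            try:
                with open(SCALES_FILE) as f:
                    return f.read()
            finally:
                fcntl.flock(lock, fcntl.LOCK_UN)

Note that flock locks are advisory: they only serialize processes that agree to take the lock, which is what the ProcessSafeReaderLock/ProcessSafeWriterLock context managers introduced in this commit do around load_file and save_file.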

File tree

1 file changed (+69 −24 lines changed)

  • neural_compressor/torch/algorithms/fp8_quant/_core


neural_compressor/torch/algorithms/fp8_quant/_core/common.py

Lines changed: 69 additions & 24 deletions
@@ -19,6 +19,7 @@
 
 import numpy as np
 import torch
+import fcntl
 
 from .._quant_common.helper_modules import *
 from .._quant_common.quant_config import get_hqt_config
@@ -105,22 +106,69 @@ def load_npz(fname):
     return d["arr_0"].item()
 
 
+class ProcessSafeReaderLock:
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def __enter__(self):
+        self.lock = open(self.file_path + ".lock", 'w')
+        fcntl.flock(self.lock, fcntl.LOCK_SH)  # Shared lock for reading
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        fcntl.flock(self.lock, fcntl.LOCK_UN)  # Unlock the file
+        self.lock.close()
+
+
+class ProcessSafeWriterLock:
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def __enter__(self):
+        self.lock = open(self.file_path + ".lock", 'w')
+        fcntl.flock(self.lock, fcntl.LOCK_EX)  # Exclusive lock for writing
+        return self
+
+    def __exit__(self, *args):
+        fcntl.flock(self.lock, fcntl.LOCK_UN)  # Unlock the file
+        self.lock.close()
+
+
 def save_file(model, d, source_format, fname, mode):
     config = get_hqt_config(model)
     logger.debug("Saving %s file: %s", mode, fname)
     ext = os.path.splitext(fname)[1]
-    target_format = file_functions[ext][0]
+    target_format = file_functions[ext]['format']
     dc = rec_fn(d, format_functions[(source_format, target_format)])
     df = {
         "GlobalRank": config.cfg["global_rank"],
         "LocalRank": config.cfg["local_rank"],
         "Mode": mode,
         "Nodes": dc,
     }
-    try:
-        file_functions[ext][1](df, fname)
-    except:
-        pass
+    with ProcessSafeWriterLock(fname):
+        try:
+            file_functions[ext]['save'](df, fname)
+        except:
+            pass
+
+
+def load_file(fname, target_format, fail_on_file_not_exist):
+    logger.debug("Loading file: %s", fname)
+    ext = os.path.splitext(fname)[1]
+    source_format = file_functions[ext]['format']
+    d = {}
+    if os.path.isfile(fname):
+        with ProcessSafeReaderLock(fname):
+            d = file_functions[ext]['load'](fname)
+    elif fail_on_file_not_exist:
+        raise FileNotFoundError(f"Failed to load file {fname}")
+    if "Nodes" in d:
+        dc = {k: ModuleConfig(**fix_fields(d["Nodes"][k])) for k in d["Nodes"]}
+        dc = {k: module_convert(dc[k], format_functions[(source_format, target_format)]) for k in dc}
+    else:
+        dc = {}
+    return dc
 
 
 # convert module config data to other format
@@ -147,29 +195,26 @@ def fix_fields(d):
     return d
 
 
-def load_file(fname, target_format, fail_on_file_not_exist):
-    logger.debug("Loading file: %s", fname)
-    ext = os.path.splitext(fname)[1]
-    source_format = file_functions[ext][0]
-    d = {}
-    if os.path.isfile(fname):
-        d = file_functions[ext][2](fname)
-    elif fail_on_file_not_exist:
-        raise FileNotFoundError(f"Failed to load file {fname}")
-    if "Nodes" in d:
-        dc = {k: ModuleConfig(**fix_fields(d["Nodes"][k])) for k in d["Nodes"]}
-        dc = {k: module_convert(dc[k], format_functions[(source_format, target_format)]) for k in dc}
-    else:
-        dc = {}
-    return dc
-
-
 def save_scales(model, d, source_format, fname):
+    """Saves scales measured of a given model.
+
+    Args:
+        model : The measured model.
+        d : Modules_names to configuration dictionary.
+        source_format : How the data is stored in memory.
+        fname : File to save the scales to.
+    """
     dc = {k: d[k].__dict__ for k in d}
     save_file(model, dc, source_format, fname, "Scale")
 
 
 def load_scales(fname, target_format):
+    """Loads scales from given file.
+
+    Args:
+        fname : File to load the scales from.
+        target_format: How the data is stored in file.
+    """
     logger.debug("Loading scales file %s", fname)
     d = load_file(fname, target_format, False)
     return d
@@ -184,8 +229,8 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype):
 
 
 file_functions = {
-    ".json": (list, save_json, load_json),
-    ".npz": (np.ndarray, save_npz, load_npz),
+    ".json": {'format': list, 'save': save_json, 'load': load_json},
+    ".npz": {'format': np.ndarray, 'save': save_npz, 'load': load_npz}
 }
 
 format_functions = {
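
As a hedged usage sketch (not part of the commit), the new reader lock and the dict-based file_functions registry compose roughly as follows; the scales path is hypothetical, and the import assumes common.py is reachable at its package path:

    import os
    from neural_compressor.torch.algorithms.fp8_quant._core.common import (
        ProcessSafeReaderLock,
        file_functions,
    )

    fname = "scales.npz"  # hypothetical scales file
    ext = os.path.splitext(fname)[1]
    if os.path.isfile(fname):
        with ProcessSafeReaderLock(fname):            # shared flock on fname + ".lock"
            raw = file_functions[ext]["load"](fname)  # dispatch via the new dict registry

In practice callers go through load_scales/save_scales, which now take the locks internally via load_file/save_file.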

0 commit comments
