when writing features do it safely and atomically

This commit is contained in:
Andrej Karpathy
2021-11-26 20:00:37 -08:00
parent c3161b2a49
commit aa877c9397
2 changed files with 69 additions and 4 deletions
+69 -3
View File
@@ -5,11 +5,78 @@ Any of the file system I/O and the associated settings are in this single file.
""" """
import os import os
import sqlite3, zlib, pickle import sqlite3, zlib, pickle, tempfile
from sqlitedict import SqliteDict from sqlitedict import SqliteDict
from contextlib import contextmanager
# -----------------------------------------------------------------------------
# global configuration
DATA_DIR = 'data' DATA_DIR = 'data'
# -----------------------------------------------------------------------------
# utilities for safe writing of a pickle file
# Context managers for atomic writes courtesy of
# http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python
@contextmanager
def _tempfile(*args, **kws):
    """ Context manager yielding the path of a fresh temporary file.

    The file is created with tempfile.mkstemp and its descriptor is closed
    right away, so only the path is handed to the caller. On leaving the
    context the file is deleted; if it is already gone (for instance because
    the caller moved it elsewhere) that is silently accepted.

    Parameters
    ----------
    *args, **kws : mixed
        forwarded unchanged to tempfile.mkstemp

    Yields
    ------
    name : string
        path of the temporary file

    Raises
    ------
    OSError
        if deleting the file fails for any reason other than the file
        no longer existing
    """
    fd, name = tempfile.mkstemp(*args, **kws)
    # only the path is needed here; callers reopen it themselves
    os.close(fd)
    try:
        yield name
    finally:
        try:
            os.remove(name)
        except FileNotFoundError:
            # errno 2 (ENOENT): the file was already moved or removed,
            # so there is nothing left to clean up
            pass
@contextmanager
def open_atomic(filepath, *args, **kwargs):
    """ Open temporary file object that atomically moves to destination upon
    exiting.

    The temporary file is created in the same directory as `filepath`, and
    is moved over `filepath` only after the body of the `with` statement has
    finished without raising. If the body raises, the move is skipped and
    the temporary file is deleted, so an existing `filepath` is untouched.

    Allows reading and writing to and from the same filename.

    Parameters
    ----------
    filepath : string
        the file path to be opened
    fsync : bool
        whether to force write the file to disk before it is moved
        (keyword only, default False)
    args, kwargs : mixed
        Any valid arguments for :code:`open`, e.g. the mode

    Yields
    ------
    f : file object
        the open temporary file
    """
    fsync = kwargs.pop('fsync', False)

    with _tempfile(dir=os.path.dirname(filepath)) as tmppath:
        with open(tmppath, *args, **kwargs) as f:
            yield f
            if fsync:
                # must happen while the file is still open
                f.flush()
                os.fsync(f.fileno())
        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows when the destination already exists
        os.replace(tmppath, filepath)
def safe_pickle_dump(obj, fname):
    """
    pickles obj into fname by way of a temporary file. writing straight
    into fname would let another process read a half-written pickle and
    crash; instead the data goes to a temporary file first, which is
    moved onto fname once the dump has finished.
    """
    newest_protocol = pickle.HIGHEST_PROTOCOL  # what a protocol of -1 selects
    with open_atomic(fname, 'wb') as out:
        pickle.dump(obj, out, protocol=newest_protocol)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
class CompressedSqliteDict(SqliteDict): class CompressedSqliteDict(SqliteDict):
@@ -62,8 +129,7 @@ FEATURES_FILE = os.path.join(DATA_DIR, 'features.p')
def save_features(features): def save_features(features):
""" takes the features dict and save it to disk in a simple pickle file """ """ takes the features dict and save it to disk in a simple pickle file """
with open(FEATURES_FILE, 'wb') as f: safe_pickle_dump(features, FEATURES_FILE)
pickle.dump(features, f)
def load_features(): def load_features():
""" loads the features dict from disk """ """ loads the features dict from disk """
-1
View File
@@ -2,7 +2,6 @@
Extracts tfidf features from all paper abstracts and saves them to disk. Extracts tfidf features from all paper abstracts and saves them to disk.
""" """
import pickle
import argparse import argparse
import numpy as np import numpy as np