when writing features do it safely and atomically

This commit is contained in:
Andrej Karpathy
2021-11-26 20:00:37 -08:00
parent c3161b2a49
commit aa877c9397
2 changed files with 69 additions and 4 deletions
+69 -3
View File
@@ -5,11 +5,78 @@ Any of the file system I/O and the associated settings are in this single file.
"""
import os
import sqlite3, zlib, pickle
import sqlite3, zlib, pickle, tempfile
from sqlitedict import SqliteDict
from contextlib import contextmanager
# -----------------------------------------------------------------------------
# global configuration
DATA_DIR = 'data'
# -----------------------------------------------------------------------------
# utilities for safe writing of a pickle file
# Context managers for atomic writes courtesy of
# http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python
@contextmanager
def _tempfile(*args, **kws):
    """ Context for temporary file.

    Will find a free temporary filename upon entering
    and will try to delete the file on leaving

    Parameters
    ----------
    args, kws : mixed
        forwarded unchanged to :code:`tempfile.mkstemp`
        (e.g. suffix, prefix, dir)

    Yields
    ------
    name : string
        path of the freshly created, already closed, temporary file
    """
    fd, name = tempfile.mkstemp(*args, **kws)
    # only the path is handed out, so release the OS-level descriptor now
    os.close(fd)
    try:
        yield name
    finally:
        try:
            os.remove(name)
        except FileNotFoundError:
            # the file was already moved or deleted inside the with body;
            # any other OSError still propagates to the caller
            pass
@contextmanager
def open_atomic(filepath, *args, **kwargs):
    """ Open temporary file object that atomically moves to destination upon
    exiting.

    Allows reading and writing to and from the same filename.

    The data is written to a temporary file created in the directory of
    :code:`filepath` and is only moved over :code:`filepath` after the
    :code:`with` body finished without raising. If the body raises, the
    move is skipped and the destination is left as it was.

    Parameters
    ----------
    filepath : string
        the file path to be opened
    fsync : bool
        whether to force write the file to disk
    args, kwargs : mixed
        Any valid arguments for :code:`open`
    """
    # fsync is our own option, it must not be forwarded to open()
    fsync = kwargs.pop('fsync', False)
    with _tempfile(dir=os.path.dirname(filepath)) as tmppath:
        with open(tmppath, *args, **kwargs) as f:
            yield f
            if fsync:
                f.flush()
                os.fsync(f.fileno())
        # the file is closed at this point. os.replace overwrites an existing
        # destination on every platform, whereas os.rename raises on Windows
        # when filepath already exists
        os.replace(tmppath, filepath)
def safe_pickle_dump(obj, fname):
    """
    pickle obj into fname without another process ever reading a
    half-written file and crashing on it. the bytes are written to a
    temporary file first, which is then moved into place.
    """
    with open_atomic(fname, 'wb') as sink:
        # equivalent to passing -1: the highest binary protocol available
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
# -----------------------------------------------------------------------------
class CompressedSqliteDict(SqliteDict):
@@ -62,8 +129,7 @@ FEATURES_FILE = os.path.join(DATA_DIR, 'features.p')
def save_features(features):
    """ takes the features dict and save it to disk in a simple pickle file

    the write goes through safe_pickle_dump only: FEATURES_FILE is not opened
    for writing in place, since that would expose a partially written file to
    a process reading it at the same time
    """
    safe_pickle_dump(features, FEATURES_FILE)
def load_features():
""" loads the features dict from disk """
-1
View File
@@ -2,7 +2,6 @@
Extracts tfidf features from all paper abstracts and saves them to disk.
"""
import pickle
import argparse
import numpy as np