# aslite/db.py -- the code carried by this patch (index ecef5c6..548f278), laid
# out as it reads once the patch is applied. The patch text had lost its line
# breaks; lines the patch removes are described in comments, not reproduced.
#
# Context the patch only shows in passing and that is therefore not repeated
# here: the tail of the module docstring ("Any of the file system I/O and the
# associated settings are in this single file."), the import
# `from sqlitedict import SqliteDict`, `class CompressedSqliteDict(SqliteDict)`
# and `load_features()`.

import os
import pickle
import sqlite3
import tempfile
import zlib
from contextlib import contextmanager

# -----------------------------------------------------------------------------
# global configuration

DATA_DIR = 'data'

# -----------------------------------------------------------------------------
# utilities for safe writing of a pickle file

# Context managers for atomic writes courtesy of
# http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python
@contextmanager
def _tempfile(*args, **kws):
    """ Context for temporary file.

    Will find a free temporary filename upon entering
    and will try to delete the file on leaving.

    Parameters
    ----------
    *args, **kws :
        forwarded unchanged to :func:`tempfile.mkstemp`
        (e.g. ``suffix``, ``prefix``, ``dir``)

    Yields
    ------
    name : string
        path of the freshly created, already closed temporary file
    """
    fd, name = tempfile.mkstemp(*args, **kws)
    # Only the name is handed out; callers reopen the file themselves.
    os.close(fd)
    try:
        yield name
    finally:
        try:
            os.remove(name)
        except FileNotFoundError:
            # Already gone, e.g. because the caller moved it onto its final
            # destination. Any other OSError still propagates.
            pass


@contextmanager
def open_atomic(filepath, *args, **kwargs):
    """ Open temporary file object that atomically moves to destination upon
    exiting.

    Allows reading and writing to and from the same filename.

    The temporary file is created in the destination's own directory so the
    final move never has to cross a filesystem boundary. If the ``with`` body
    raises, the move is skipped, the temporary file is deleted and whatever
    was at ``filepath`` before is left untouched.

    NOTE: the destination ends up with the permissions ``tempfile.mkstemp``
    gave the temporary file (owner read/write only on POSIX), not the
    umask-derived mode a plain ``open()`` would have produced.

    Parameters
    ----------
    filepath : string
        the file path to be opened
    fsync : bool
        whether to force write the file to disk
    args, kwargs : mixed
        Any valid arguments for :code:`open` (mode, encoding, ...)

    Yields
    ------
    f : file object
        the open temporary file; its content becomes ``filepath`` on exit
    """
    fsync = kwargs.pop('fsync', False)

    # abspath() makes the directory explicit even for a bare file name, where
    # dirname() alone would return ''.
    target_dir = os.path.dirname(os.path.abspath(filepath))

    with _tempfile(dir=target_dir) as tmppath:
        with open(tmppath, *args, **kwargs) as f:
            yield f
            if fsync:
                f.flush()
                os.fsync(f.fileno())
        # The file is closed at this point. os.replace overwrites an existing
        # destination on every platform, whereas os.rename raises on Windows
        # when the destination already exists.
        os.replace(tmppath, filepath)


def safe_pickle_dump(obj, fname):
    """
    prevents a case where one process could be writing a pickle file
    while another process is reading it, causing a crash. the solution
    is to write the pickle file to a temporary file and then move it.

    Parameters
    ----------
    obj : any picklable object
        the object to serialize
    fname : string
        destination path of the pickle file
    """
    with open_atomic(fname, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)  # highest binary protocol (same as -1)


# -----------------------------------------------------------------------------
# features pickle file (second hunk of the patch, @@ -62,8 +129,7 @@)

FEATURES_FILE = os.path.join(DATA_DIR, 'features.p')


def save_features(features):
    """ takes the features dict and save it to disk in a simple pickle file """
    # The patch replaces a direct `open(FEATURES_FILE, 'wb')` + `pickle.dump`
    # with the atomic writer above.
    safe_pickle_dump(features, FEATURES_FILE)


# -----------------------------------------------------------------------------
# compute.py (index 6961fa9..17f1aef, hunk @@ -2,7 +2,6 @@): the same patch
# deletes that script's `import pickle` line; its module docstring ("Extracts
# tfidf features from all paper abstracts and saves them to disk.") and the
# `import argparse` / `import numpy as np` lines are unchanged context.