remove the tinyshakespeare dataset until i can bring it back later in a nicer form, otherwise right now we just have a ton of copy paste code here

2023-08-13 02:18:30 +00:00
parent f5fc0c245f
commit 00a61dc7f9
2 changed files with 1 additions and 144 deletions
@@ -1,140 +0,0 @@
 """
 Download, preprocess and serve the TinyShakespeare dataset as a DataLoader.
 Follows the same interface as the TinyStories dataset.
 """
 import argparse
 import os
 import random
 import numpy as np
 import requests
 import torch
 import torch.distributed as dist
 from tqdm import tqdm
 from tokenizer import Tokenizer
 DATA_CACHE_DIR = "data"
 def download_file(url: str, fname: str, chunk_size=1024):
    """Helper function to download a file from a given url"""
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get("content-length", 0))
    with open(fname, "wb") as file, tqdm(
        desc=fname,
        total=total,
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=chunk_size):
            size = file.write(data)
            bar.update(size)
 def download():
    """Downloads the dataset to disk."""
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    # download the TinyShakespeare dataset, unless it's already downloaded
    data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    data_filename = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")
    if not os.path.exists(data_filename):
        print(f"Downloading {data_url} to {data_filename}...")
        download_file(data_url, data_filename)
    else:
        print(f"{data_filename} already exists, skipping download...")
    print("Download done.")
 def pretokenize():
    enc = Tokenizer()
    data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")
    all_tokens = []
    with open(data_file, "r") as f:
        for line in f:
            text = line.strip()
            tokens = enc.encode(text, bos=True, eos=False)
            all_tokens.extend(tokens)
    all_tokens = np.array(all_tokens, dtype=np.uint16)
    print(f"Total tokens: {len(all_tokens)}")
    with open(data_file.replace(".txt", ".bin"), "wb") as f:
        f.write(all_tokens.tobytes())
    print(f"Saved {data_file.replace('.txt', '.bin')}")
    print("Done.")
 class PretokDataset(torch.utils.data.IterableDataset):
    """Loads pretokenized examples from disk and yields them as PyTorch tensors."""
    def __init__(self, split, max_seq_len):
        super().__init__()
        self.split = split
        self.max_seq_len = max_seq_len
    def __iter__(self):
        # get worker info within a DataLoader
        worker_info = torch.utils.data.get_worker_info()
        worker_id = worker_info.id if worker_info else 0
        # get DDP rank info
        rank = dist.get_rank() if dist.is_initialized() else 0
        # combine the worker_id and worker_rank to create a unique seed for rng
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")
        data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.bin")
        m_all = np.memmap(data_file, dtype=np.uint16, mode="r")
        # split out 10% of the data for validation
        split_ix = int(len(m_all) * 0.9)
        if self.split == "train":
            m = m_all[:split_ix]
        else:
            m = m_all[split_ix:]
        num_batches = len(m) // self.max_seq_len
        num_batches -= 1  # drop the last partial batch
        assert num_batches > 0, "this split is way too small? investigate."
        while True:
            ixs = list(range(num_batches))
            rng.shuffle(ixs)
            for ix in ixs:
                start = ix * self.max_seq_len
                end = start + self.max_seq_len + 1
                # calling .astype will copy the data into a new numpy array, now in RAM
                chunk = torch.from_numpy((m[start:end]).astype(np.int64))
                x = chunk[:-1]
                y = chunk[1:]
                yield x, y
 class ShakespeareTask:
    @staticmethod
    def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
        ds = PretokDataset(split, max_seq_len)
        dl = torch.utils.data.DataLoader(
            ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers
        )
        for x, y in dl:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            yield x, y
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"])
    args = parser.parse_args()
    # depending on the stage call the appropriate function
    fun = {
        "download": download,
        "pretokenize": pretokenize,
    }
    fun[args.stage]()
@@ -29,7 +29,6 @@ from torch.distributed import destroy_process_group, init_process_group
 from torch.nn.parallel import DistributedDataParallel as DDP
 from tinystories import Task
 from tinyshakespeare import ShakespeareTask
 # -----------------------------------------------------------------------------
 # I/O
@@ -49,7 +48,6 @@ batch_size = 128  # if gradient_accumulation_steps > 1, this is the micro-batch
 max_seq_len = 256
 vocab_source = "custom" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
 vocab_size = 512
 dataset = "tinystories"  # tinystories|tinyshakespeare
 # model
 dim = 288
 n_layers = 6
@@ -129,9 +127,8 @@ ctx = (
 )
 # task-specific setup
 task = {'tinystories': Task, 'tinyshakespeare': ShakespeareTask}[dataset]
 iter_batches = partial(
-    task.iter_batches,
+    Task.iter_batches,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
    vocab_size=vocab_size,