We’ll be using the oscar-1GB.jsonl dataset, along with the gpt2-vocab.json and gpt2-merges.txt files from Hugging Face.
# Fetch the dataset and the GPT-2 vocab/merges files into /fsx/data.
# Every step is chained with && and the directory is addressed by absolute
# path, so a failed mkdir/cd (e.g. /fsx missing) or a failed download stops
# the sequence instead of writing files into whatever directory the shell
# happens to be in.
mkdir -p /fsx/data \
  && cd /fsx/data \
  && wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz \
  && wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json \
  && wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt \
  && xz -d oscar-1GB.jsonl.xz  # decompress; leaves oscar-1GB.jsonl in place of the .xz