diff --git a/metaseq/data/jsonl_dataset.py b/metaseq/data/jsonl_dataset.py index 51f56c1119017f26e26b0ee143498afef6a07b2a..117bf3c5df15f6eca80cc937bff5d4c22547d8ac 100644 --- a/metaseq/data/jsonl_dataset.py +++ b/metaseq/data/jsonl_dataset.py @@ -49,8 +49,11 @@ class JsonlDataset(torch.utils.data.Dataset): self.tokenizer = tokenizer self.threadlocal = threading.local() + # resolve symlinks for cached indexes. This lets us re-use indexes + # across our experiments using differently composed datasets + resolved_path = Path(path).resolve() # TODO(susan): Fix this fairseq reference. _build_index fails otherwise. - self.cache = Path(f"{path}.fairseq.idx.npy") + self.cache = Path(f"{resolved_path}.fairseq.idx.npy") # only build the cache in on the primary worker to prevent overloading nfs if distributed_utils.get_global_rank() != 0: distributed_utils.global_barrier()