diff --git a/metaseq/data/jsonl_dataset.py b/metaseq/data/jsonl_dataset.py
index 51f56c1119017f26e26b0ee143498afef6a07b2a..117bf3c5df15f6eca80cc937bff5d4c22547d8ac 100644
--- a/metaseq/data/jsonl_dataset.py
+++ b/metaseq/data/jsonl_dataset.py
@@ -49,8 +49,11 @@ class JsonlDataset(torch.utils.data.Dataset):
         self.tokenizer = tokenizer
 
         self.threadlocal = threading.local()
+        # resolve symlinks so the cached index is keyed on the real file path. This
+        # lets us re-use indexes across experiments using differently composed datasets
+        resolved_path = Path(path).resolve()
         # TODO(susan): Fix this fairseq reference. _build_index fails otherwise.
-        self.cache = Path(f"{path}.fairseq.idx.npy")
+        self.cache = Path(f"{resolved_path}.fairseq.idx.npy")
         # only build the cache in on the primary worker to prevent overloading nfs
         if distributed_utils.get_global_rank() != 0:
             distributed_utils.global_barrier()