From b5c5384d48729327cf1bcb51cb65c5696e5d0007 Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Tue, 11 Oct 2022 15:02:19 +0000
Subject: [PATCH 01/14] remove old codepath, only use unified tokenizer file

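After this change the task loads tokenizers exclusively through the
unified HF JSON format. A minimal sketch of the sole remaining codepath
(the file name is illustrative):

    from tokenizers import Tokenizer

    # one JSON file bundles vocab, merges, and pre/post-processing config
    tokenizer = Tokenizer.from_file("gpt2-unified.json")
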
---
 metaseq/tasks/streaming_language_modeling.py | 24 ++++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py
index 4e31c89..84154f6 100644
--- a/metaseq/tasks/streaming_language_modeling.py
+++ b/metaseq/tasks/streaming_language_modeling.py
@@ -51,12 +51,12 @@ class StreamingLanguageModelingConfig(MetaseqDataclass):
     hf_tokenizer: Optional[str] = field(
         default="", metadata={"help": "path to a HF tokenizer json file."}
     )
-    vocab_filename: Optional[str] = field(
-        default="", metadata={"help": "path to bpe-vocab.json"}
-    )
-    merges_filename: Optional[str] = field(
-        default="", metadata={"help": "path to bpe-merges.txt"}
-    )
+    # vocab_filename: Optional[str] = field(
+    #     default="", metadata={"help": "path to bpe-vocab.json"}
+    # )
+    # merges_filename: Optional[str] = field(
+    #     default="", metadata={"help": "path to bpe-merges.txt"}
+    # )
     end_of_document_symbol: Optional[str] = field(
         default="</s>", metadata={"help": "symbol indicating an end-of-document"}
     )
@@ -127,12 +127,12 @@ class StreamingLanguageModelingTask(LegacyTask):
         if not has_hf_tokenizers:
             raise ImportError("Please install tokenizers with: pip install tokenizers")
 
-        if args.hf_tokenizer:
-            self.tokenizer = Tokenizer.from_file(args.hf_tokenizer)
-        else:
-            self.tokenizer = ByteLevelBPETokenizer.from_file(
-                args.vocab_filename, args.merges_filename
-            )
+        # if args.hf_tokenizer:
+        self.tokenizer = Tokenizer.from_file(args.hf_tokenizer)
+        # else:
+        #     self.tokenizer = ByteLevelBPETokenizer.from_file(
+        #         args.vocab_filename, args.merges_filename
+        #     )
 
         if max(args.update_freq) > 1:
             raise NotImplementedError(
-- 
GitLab


From d9d3b1ba1c59e1bd0955399e8de9bd399d4b2736 Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Wed, 12 Oct 2022 14:19:03 +0000
Subject: [PATCH 02/14] Remove old two-file tokenizer codepath, replacing it
 with the unified single-file HF format.

Then whack-a-moles a bunch of bugs until tests pass, namely:
- `state_dict` of `StreamingEpochBatchIterator` would return an empty list for `sequences_consumed` if the current epoch had finished and `num_workers == 0`
- the `DDP_BACKEND_CHOICES` enum was missing a `no_c10d` testing option
- `TransformerDecoder` was missing a null check on `embed_positions` before moving it to device/dtype
- added kwargs to the base optimiser's `clip_grad_norm` signature to pacify tests
- copied `fixed_schedule.py` from fairseq

Also added a `unify_tokenizer.py` script to naively merge *vocab.json and *merges.txt files into a single tokenizer file (sketched below).
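
For reference, the conversion the script performs is essentially the
following (a minimal sketch; file names are illustrative):

    from tokenizers import ByteLevelBPETokenizer

    # load the legacy two-file BPE and re-save it as one unified JSON file
    old_tokenizer = ByteLevelBPETokenizer("gpt2-vocab.json", "gpt2-merges.txt")
    old_tokenizer.save("gpt2-unified.json")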
---
 cpu_tests/test_utils.py                       | 14 ++++
 gpu_tests/test_hf_compatibility.py            | 23 ++++--
 metaseq/data/iterators.py                     |  2 +-
 metaseq/dataclass/constants.py                |  1 +
 metaseq/hub_utils.py                          | 11 ++-
 metaseq/models/transformer_decoder.py         |  3 +-
 metaseq/optim/base_optimizer.py               |  4 +-
 metaseq/optim/lr_scheduler/fixed_schedule.py  | 79 +++++++++++++++++++
 metaseq/scripts/unify_tokenizer.py            | 11 +++
 metaseq/tasks/streaming_language_modeling.py  |  9 +--
 .../test_streaming_language_modeling_task.py  | 44 +++++++----
 11 files changed, 166 insertions(+), 35 deletions(-)
 create mode 100644 metaseq/optim/lr_scheduler/fixed_schedule.py
 create mode 100644 metaseq/scripts/unify_tokenizer.py

diff --git a/cpu_tests/test_utils.py b/cpu_tests/test_utils.py
index 8ea0083..918fd29 100644
--- a/cpu_tests/test_utils.py
+++ b/cpu_tests/test_utils.py
@@ -51,6 +51,20 @@ def write_dummy_bpe(data_dir):
     return vocab, merges
 
 
+def write_dummy_bpe_unified(tokenizer_file):
+    from tokenizers import ByteLevelBPETokenizer
+
+    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
+    tokenizer.train(
+        [],
+        vocab_size=500,
+        special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
+        show_progress=False,
+    )
+    tokenizer.save(tokenizer_file)
+    return tokenizer_file
+
+
 class TestUtils(unittest.TestCase):
     def test_make_positions(self):
         pad = 1
diff --git a/gpu_tests/test_hf_compatibility.py b/gpu_tests/test_hf_compatibility.py
index 8b83599..d655f2e 100644
--- a/gpu_tests/test_hf_compatibility.py
+++ b/gpu_tests/test_hf_compatibility.py
@@ -15,7 +15,12 @@ from metaseq.scripts.convert_to_singleton import create_generation_config_with_d
 from metaseq.distributed import utils as distributed_utils
 from metaseq.distributed import fsdp_enable_wrap, fsdp_wrap
 from metaseq.dataclass.configs import MetaseqConfig
-from metaseq.hub_utils import tensorize_input, get_next_token, setup_vocab_and_merges
+from metaseq.hub_utils import (
+    tensorize_input,
+    get_next_token,
+    setup_vocab_and_merges,
+    setup_unified_tokenizer,
+)
 from megatron.mpu import destroy_model_parallel
 
 
@@ -101,13 +106,15 @@ def load_mp_model_and_run_eval(cfg: MetaseqConfig, **kwargs):
 class TestHFCompatibility(unittest.TestCase):
     def test_singleton_metaseq_hf_compatibility(self):
         model_path = os.path.join(os.path.dirname(__file__), "125m")
-        vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path)
+        # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path)
+        unified_file, tokenizer = setup_unified_tokenizer(model_path)
 
         checkpoint = checkpoint_utils.load_model_ensemble_and_task(
             [os.path.join(model_path, "restored.pt")],
             arg_overrides={
-                "vocab_filename": vocab_file,
-                "merges_filename": merges_file,
+                # "vocab_filename": vocab_file,
+                # "merges_filename": merges_file,
+                "hf_tokenizer": unified_file,
             },
         )
 
@@ -142,12 +149,14 @@ class TestHFCompatibility(unittest.TestCase):
 
         # Verify that the generated logits match the consolidated model logits
 
-        vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path)
+        # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path)
+        unified_file, tokenizer = setup_unified_tokenizer(model_path)
         checkpoint = checkpoint_utils.load_model_ensemble_and_task(
             [os.path.join(model_path, "restored.pt")],
             arg_overrides={
-                "vocab_filename": vocab_file,
-                "merges_filename": merges_file,
+                # "vocab_filename": vocab_file,
+                # "merges_filename": merges_file,
+                "hf_tokenizer": unified_file,
             },
         )
         model = checkpoint[0][0].eval()
diff --git a/metaseq/data/iterators.py b/metaseq/data/iterators.py
index 92ad0d7..196f921 100644
--- a/metaseq/data/iterators.py
+++ b/metaseq/data/iterators.py
@@ -280,7 +280,7 @@ class StreamingEpochBatchIterator(EpochBatchIterating):
             # small optimization: we advance the epoch before saving, so that
             # when loading later we don't end up fast-forwarding the iterator
             epoch = self.epoch + 1
-            sequences_consumed = [0 for _ in range(self.num_workers)]
+            sequences_consumed = [0 for _ in range(max(1, self.num_workers))]
             n = 0
             next_worker = 0
         else:
diff --git a/metaseq/dataclass/constants.py b/metaseq/dataclass/constants.py
index fc1fc33..bf531ce 100644
--- a/metaseq/dataclass/constants.py
+++ b/metaseq/dataclass/constants.py
@@ -40,6 +40,7 @@ DDP_BACKEND_CHOICES = ChoiceEnum(
         "c10d",  # alias for pytorch_ddp
         "fully_sharded",  # FullyShardedDataParallel from fairscale
         "pytorch_ddp",
+        "no_c10d",  # Deflake tests
     ]
 )
 DATASET_IMPL_CHOICES = ChoiceEnum(["raw", "lazy", "cached", "mmap", "fasta"])
diff --git a/metaseq/hub_utils.py b/metaseq/hub_utils.py
index a7ff7fa..ce54aaa 100644
--- a/metaseq/hub_utils.py
+++ b/metaseq/hub_utils.py
@@ -11,7 +11,7 @@ import os
 import time
 from argparse import Namespace
 from typing import Any, Dict, Iterator, List, Optional
-from tokenizers import ByteLevelBPETokenizer
+from tokenizers import ByteLevelBPETokenizer, Tokenizer
 
 import numpy as np
 import torch
@@ -125,6 +125,15 @@ def setup_vocab_and_merges(model_path):
     return vocab_file, merges_file, tokenizer
 
 
+def setup_unified_tokenizer(model_path):
+    unified_file = os.path.join(model_path, "gpt2-unified.json")
+    if not os.path.exists(unified_file):
+        _, _, tokenizer = setup_vocab_and_merges(model_path)
+        tokenizer.save(unified_file)
+    tokenizer = Tokenizer.from_file(unified_file)
+    return unified_file, tokenizer
+
+
 class GeneratorHubInterface(nn.Module):
     """
     PyTorch Hub interface for generating sequences from a pre-trained
diff --git a/metaseq/models/transformer_decoder.py b/metaseq/models/transformer_decoder.py
index 0e4cabb..49b673e 100644
--- a/metaseq/models/transformer_decoder.py
+++ b/metaseq/models/transformer_decoder.py
@@ -119,7 +119,8 @@ class TransformerDecoder(IncrementalDecoder):
             if args.decoder_learned_pos and not self.use_alibi
             else None
         )
-        self.embed_positions.to(device).to(dtype)
+        if self.embed_positions is not None:
+            self.embed_positions.to(device).to(dtype)
 
         self.cross_self_attention = getattr(args, "cross_self_attention", False)
 
diff --git a/metaseq/optim/base_optimizer.py b/metaseq/optim/base_optimizer.py
index bcca8cc..49c461d 100644
--- a/metaseq/optim/base_optimizer.py
+++ b/metaseq/optim/base_optimizer.py
@@ -108,7 +108,9 @@ class BaseOptimizer(object):
                     c = c.to(p.grad.device)
                 p.grad.data.mul_(c)
 
-    def clip_grad_norm(self, max_norm, norm_type="l2", aggregate_norm_fn=None):
+    def clip_grad_norm(
+        self, max_norm, norm_type="l2", aggregate_norm_fn=None, **kwargs
+    ):
         """Clips gradient norm."""
         return utils.clip_grad_norm_(
             self.params, max_norm, norm_type, aggregate_norm_fn
diff --git a/metaseq/optim/lr_scheduler/fixed_schedule.py b/metaseq/optim/lr_scheduler/fixed_schedule.py
new file mode 100644
index 0000000..cea9100
--- /dev/null
+++ b/metaseq/optim/lr_scheduler/fixed_schedule.py
@@ -0,0 +1,79 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+from typing import Optional, List
+from omegaconf import II
+
+from metaseq.dataclass import MetaseqDataclass
+from metaseq.optim.lr_scheduler import BaseLRScheduler, register_lr_scheduler
+
+# from fairseq.dataclass import FairseqDataclass
+# from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class FixedLRScheduleConfig(MetaseqDataclass):
+    force_anneal: Optional[int] = field(
+        default=None,
+        metadata={"help": "force annealing at specified epoch"},
+    )
+    lr_shrink: float = field(
+        default=0.1,
+        metadata={"help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"},
+    )
+    warmup_updates: int = field(
+        default=0,
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    lr: List[float] = II("optimization.lr")
+
+
+@register_lr_scheduler("fixed", dataclass=FixedLRScheduleConfig)
+class FixedLRSchedule(BaseLRScheduler):
+    """Decay the LR on a fixed schedule."""
+
+    def __init__(self, cfg: FixedLRScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+
+        self.lr = cfg.lr[0]
+        if cfg.warmup_updates > 0:
+            self.warmup_factor = 1.0 / cfg.warmup_updates
+        else:
+            self.warmup_factor = 1
+
+    def state_dict(self):
+        return {"lr": self.lr}
+
+    def load_state_dict(self, state_dict):
+        if "lr" in state_dict:
+            self.lr = state_dict["lr"]
+
+    def get_next_lr(self, epoch):
+        lrs = self.cfg.lr
+        if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal:
+            # use fixed LR schedule
+            next_lr = lrs[min(epoch - 1, len(lrs) - 1)]
+        else:
+            # anneal based on lr_shrink
+            next_lr = lrs[-1] * self.cfg.lr_shrink ** (
+                epoch + 1 - self.cfg.force_anneal
+            )
+        return next_lr
+
+    def step_begin_epoch(self, epoch):
+        """Update the learning rate at the beginning of the given epoch."""
+        self.lr = self.get_next_lr(epoch)
+        self.optimizer.set_lr(self.warmup_factor * self.lr)
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        if self.cfg.warmup_updates > 0 and num_updates < self.cfg.warmup_updates:
+            self.warmup_factor = (num_updates + 1) / float(self.cfg.warmup_updates)
+            self.optimizer.set_lr(self.warmup_factor * self.lr)
+        else:
+            self.optimizer.set_lr(self.lr)
+        return self.optimizer.get_lr()
diff --git a/metaseq/scripts/unify_tokenizer.py b/metaseq/scripts/unify_tokenizer.py
new file mode 100644
index 0000000..c2457b0
--- /dev/null
+++ b/metaseq/scripts/unify_tokenizer.py
@@ -0,0 +1,11 @@
+from tokenizers import ByteLevelBPETokenizer
+from fire import Fire
+
+
+def main(vocab_file, merges_file, unified_path):
+    old_tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)
+    old_tokenizer.save(unified_path)
+
+
+if __name__ == "__main__":
+    Fire(main)
diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py
index 84154f6..634bdf8 100644
--- a/metaseq/tasks/streaming_language_modeling.py
+++ b/metaseq/tasks/streaming_language_modeling.py
@@ -31,7 +31,7 @@ from metaseq.tasks import LegacyTask, register_task
 from metaseq.data.document_to_sequence import DocumentToSequenceDataset
 
 try:
-    from tokenizers import ByteLevelBPETokenizer, Tokenizer
+    from tokenizers import Tokenizer
 
     has_hf_tokenizers = True
 except ImportError:
@@ -51,12 +51,7 @@ class StreamingLanguageModelingConfig(MetaseqDataclass):
     hf_tokenizer: Optional[str] = field(
         default="", metadata={"help": "path to a HF tokenizer json file."}
     )
-    # vocab_filename: Optional[str] = field(
-    #     default="", metadata={"help": "path to bpe-vocab.json"}
-    # )
-    # merges_filename: Optional[str] = field(
-    #     default="", metadata={"help": "path to bpe-merges.txt"}
-    # )
+
     end_of_document_symbol: Optional[str] = field(
         default="</s>", metadata={"help": "symbol indicating an end-of-document"}
     )
diff --git a/tests/test_streaming_language_modeling_task.py b/tests/test_streaming_language_modeling_task.py
index 9952fa4..e7be941 100644
--- a/tests/test_streaming_language_modeling_task.py
+++ b/tests/test_streaming_language_modeling_task.py
@@ -11,7 +11,11 @@ import unittest
 import torch
 
 from tests.utils import train_language_model
-from cpu_tests.test_utils import write_dummy_jsonl_data_dir, write_dummy_bpe
+from cpu_tests.test_utils import (
+    write_dummy_jsonl_data_dir,
+    write_dummy_bpe,
+    write_dummy_bpe_unified,
+)
 
 try:
     import tokenizers  # noqa
@@ -41,7 +45,8 @@ class TestReproducibility(unittest.TestCase):
 
         with tempfile.TemporaryDirectory(name) as data_dir:
             write_dummy_jsonl_data_dir(data_dir)
-            vocab, merges = write_dummy_bpe(data_dir)
+            # vocab, merges = write_dummy_bpe(data_dir)
+            tokenizer_path = write_dummy_bpe_unified(f"{data_dir}/tokenizer.json")
 
             # train epochs 1 and 2 together
             with self.assertLogs() as logs:
@@ -49,10 +54,12 @@ class TestReproducibility(unittest.TestCase):
                     data_dir=data_dir,
                     arch="transformer_lm_gpt2_tiny",
                     extra_flags=[
-                        "--vocab-filename",
-                        vocab,
-                        "--merges-filename",
-                        merges,
+                        # "--vocab-filename",
+                        # vocab,
+                        # "--merges-filename",
+                        # merges,
+                        "--hf-tokenizer",
+                        tokenizer_path,
                         "--dropout",
                         "0.0",
                         "--log-format",
@@ -68,7 +75,7 @@ class TestReproducibility(unittest.TestCase):
                     task="streaming_language_modeling",
                     max_tokens=None,
                 )
-            train_log = get_last_log_stats_containing_string(logs.records, "train_loss")
+            train_log = get_last_log_stats_containing_string(logs.records, '"loss"')
             valid_log = get_last_log_stats_containing_string(logs.records, "valid_loss")
 
             # train epoch 2, resuming from previous checkpoint 1
@@ -81,10 +88,12 @@ class TestReproducibility(unittest.TestCase):
                     data_dir=data_dir,
                     arch="transformer_lm_gpt2_tiny",
                     extra_flags=[
-                        "--vocab-filename",
-                        vocab,
-                        "--merges-filename",
-                        merges,
+                        # "--vocab-filename",
+                        # vocab,
+                        # "--merges-filename",
+                        # merges,
+                        "--hf-tokenizer",
+                        tokenizer_path,
                         "--dropout",
                         "0.0",
                         "--log-format",
@@ -100,14 +109,12 @@ class TestReproducibility(unittest.TestCase):
                     task="streaming_language_modeling",
                     max_tokens=None,
                 )
-            train_res_log = get_last_log_stats_containing_string(
-                logs.records, "train_loss"
-            )
+            train_res_log = get_last_log_stats_containing_string(logs.records, "loss")
             valid_res_log = get_last_log_stats_containing_string(
                 logs.records, "valid_loss"
             )
 
-            for k in ["train_loss", "train_ppl", "train_num_updates", "train_gnorm"]:
+            for k in ["loss", "ppl", "num_updates", "gnorm"]:
                 self.assertAlmostEqual(
                     float(train_log[k]), float(train_res_log[k]), delta=delta
                 )
@@ -122,7 +129,10 @@ class TestReproducibility(unittest.TestCase):
                 )
 
     def test_reproducibility(self):
-        self._test_reproducibility("test_reproducibility")
+        self._test_reproducibility(
+            "test_reproducibility",
+            ["--save-interval-updates", "3"],
+        )
 
     @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
     def test_reproducibility_fp16(self):
@@ -151,7 +161,7 @@ class TestReproducibility(unittest.TestCase):
         self._test_reproducibility(
             "test_mid_epoch_reproducibility",
             ["--save-interval-updates", "3"],
-            resume_checkpoint="checkpoint_1_3.pt",
+            resume_checkpoint="checkpoint_3.pt",
             max_epoch=1,
         )
 
-- 
GitLab


From c5ea071ee2c82675ee363e39009fa1aef7cb797b Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Wed, 12 Oct 2022 15:21:27 +0000
Subject: [PATCH 03/14] [WIP] CircleCI debug

---
 metaseq/scripts/unify_tokenizer.py           | 9 +++++++--
 metaseq/tasks/streaming_language_modeling.py | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/metaseq/scripts/unify_tokenizer.py b/metaseq/scripts/unify_tokenizer.py
index c2457b0..cd18cd3 100644
--- a/metaseq/scripts/unify_tokenizer.py
+++ b/metaseq/scripts/unify_tokenizer.py
@@ -1,10 +1,15 @@
 from tokenizers import ByteLevelBPETokenizer
 from fire import Fire
+import os
 
 
-def main(vocab_file, merges_file, unified_path):
+def main(path):
+    merges_file = os.path.join(path, "gpt2-merges.txt")
+    vocab_file = os.path.join(path, "gpt2-vocab.json")
+    unified_file = os.path.join(path, "gpt2-unified.json")
+
     old_tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)
-    old_tokenizer.save(unified_path)
+    old_tokenizer.save(unified_file)
 
 
 if __name__ == "__main__":
diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py
index 634bdf8..6c42212 100644
--- a/metaseq/tasks/streaming_language_modeling.py
+++ b/metaseq/tasks/streaming_language_modeling.py
@@ -121,7 +121,7 @@ class StreamingLanguageModelingTask(LegacyTask):
 
         if not has_hf_tokenizers:
             raise ImportError("Please install tokenizers with: pip install tokenizers")
-
+        print(args.hf_tokenizer)
         # if args.hf_tokenizer:
         self.tokenizer = Tokenizer.from_file(args.hf_tokenizer)
         # else:
-- 
GitLab


From e3e04ed5cf2b72856a97150061b576c0c37c2bbc Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Wed, 12 Oct 2022 15:34:59 +0000
Subject: [PATCH 04/14] ditto

---
 metaseq/tasks/streaming_language_modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py
index 6c42212..6432e0f 100644
--- a/metaseq/tasks/streaming_language_modeling.py
+++ b/metaseq/tasks/streaming_language_modeling.py
@@ -121,7 +121,7 @@ class StreamingLanguageModelingTask(LegacyTask):
 
         if not has_hf_tokenizers:
             raise ImportError("Please install tokenizers with: pip install tokenizers")
-        print(args.hf_tokenizer)
+        logger.info(f"loading tokenizer: {args.hf_tokenizer}")
         # if args.hf_tokenizer:
         self.tokenizer = Tokenizer.from_file(args.hf_tokenizer)
         # else:
-- 
GitLab


From ede26170cda9d7dc370c80889d0136b713561fd3 Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Thu, 13 Oct 2022 07:23:11 +0000
Subject: [PATCH 05/14] Correct arg_overrides

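Task-level options passed through arg_overrides must be nested under the
"task" key. An illustrative call, mirroring the tests below:

    checkpoint = checkpoint_utils.load_model_ensemble_and_task(
        [os.path.join(model_path, "restored.pt")],
        arg_overrides={"task": {"hf_tokenizer": unified_file}},
    )
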
---
 gpu_tests/test_hf_compatibility.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/gpu_tests/test_hf_compatibility.py b/gpu_tests/test_hf_compatibility.py
index d655f2e..113fb06 100644
--- a/gpu_tests/test_hf_compatibility.py
+++ b/gpu_tests/test_hf_compatibility.py
@@ -106,15 +106,12 @@ def load_mp_model_and_run_eval(cfg: MetaseqConfig, **kwargs):
 class TestHFCompatibility(unittest.TestCase):
     def test_singleton_metaseq_hf_compatibility(self):
         model_path = os.path.join(os.path.dirname(__file__), "125m")
-        # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path)
         unified_file, tokenizer = setup_unified_tokenizer(model_path)
 
         checkpoint = checkpoint_utils.load_model_ensemble_and_task(
             [os.path.join(model_path, "restored.pt")],
             arg_overrides={
-                # "vocab_filename": vocab_file,
-                # "merges_filename": merges_file,
-                "hf_tokenizer": unified_file,
+                "task": {"hf_tokenizer": unified_file},
             },
         )
 
@@ -149,14 +146,11 @@ class TestHFCompatibility(unittest.TestCase):
 
         # Verify that the generated logits match the consolidated model logits
 
-        # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path)
         unified_file, tokenizer = setup_unified_tokenizer(model_path)
         checkpoint = checkpoint_utils.load_model_ensemble_and_task(
             [os.path.join(model_path, "restored.pt")],
             arg_overrides={
-                # "vocab_filename": vocab_file,
-                # "merges_filename": merges_file,
-                "hf_tokenizer": unified_file,
+                "task": {"hf_tokenizer": unified_file},
             },
         )
         model = checkpoint[0][0].eval()
-- 
GitLab


From 6a5e6e627dea9c488afdb8903a6f593295a85c3c Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Thu, 13 Oct 2022 07:46:15 +0000
Subject: [PATCH 06/14] Add hf_tokenizer arg

---
 gpu_tests/test_hub_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gpu_tests/test_hub_utils.py b/gpu_tests/test_hub_utils.py
index 2e15698..23b82a2 100644
--- a/gpu_tests/test_hub_utils.py
+++ b/gpu_tests/test_hub_utils.py
@@ -88,6 +88,7 @@ def generate_using_generator_hub_interface(cfg: MetaseqConfig, **kwargs):
         "merges_filename": os.path.join(kwargs["model_path"], "gpt2-merges.txt"),
         "bpe_vocab": os.path.join(kwargs["model_path"], "gpt2-vocab.json"),
         "vocab_filename": os.path.join(kwargs["model_path"], "gpt2-vocab.json"),
+        "hf_tokenizer": os.path.join(kwargs["model_path"], "gpt2-unified.json"),
         "bpe_add_prefix_space": False,
         "specify_arch": True,
         "tensor_parallel_init_model_on_gpu": True,
-- 
GitLab


From 3f29d9ee53a4a7742cb964790fb096c4cf7c457e Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Thu, 13 Oct 2022 15:28:14 +0000
Subject: [PATCH 07/14] Fix train integrity test

---
 gpu_tests/test_hub_utils.py       |  6 +++++-
 metaseq/launcher/opt_baselines.py | 20 +++++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/gpu_tests/test_hub_utils.py b/gpu_tests/test_hub_utils.py
index 23b82a2..94e373b 100644
--- a/gpu_tests/test_hub_utils.py
+++ b/gpu_tests/test_hub_utils.py
@@ -88,7 +88,11 @@ def generate_using_generator_hub_interface(cfg: MetaseqConfig, **kwargs):
         "merges_filename": os.path.join(kwargs["model_path"], "gpt2-merges.txt"),
         "bpe_vocab": os.path.join(kwargs["model_path"], "gpt2-vocab.json"),
         "vocab_filename": os.path.join(kwargs["model_path"], "gpt2-vocab.json"),
-        "hf_tokenizer": os.path.join(kwargs["model_path"], "gpt2-unified.json"),
+        "task": {
+            "hf_tokenizer": os.path.join(kwargs["model_path"], "gpt2-unified.json"),
+            "vocab_filename": os.path.join(kwargs["model_path"], "gpt2-vocab.json"),
+            "merges_filename": os.path.join(kwargs["model_path"], "gpt2-merges.txt"),
+        },
         "bpe_add_prefix_space": False,
         "specify_arch": True,
         "tensor_parallel_init_model_on_gpu": True,
diff --git a/metaseq/launcher/opt_baselines.py b/metaseq/launcher/opt_baselines.py
index 3da3c5a..b77220a 100644
--- a/metaseq/launcher/opt_baselines.py
+++ b/metaseq/launcher/opt_baselines.py
@@ -81,6 +81,11 @@ def get_grid(args):
         if args.circleci:
             data_loc_by_env = "./gpu_tests/circleci"
             valid_subsets = ["BookCorpusFair"]
+            import metaseq.scripts.unify_tokenizer as unify_tokenizer
+
+            # import metaseq.pdb; metaseq.pdb.set_trace_rank0()
+            # print("data loc:", data_loc_by_env)
+            unify_tokenizer.main(os.path.join(data_loc_by_env, "tokenizers"))
         args.data = os.path.join(data_loc_by_env, "corpus_dedup_10_10_1_0.05_exp29")
         if os.path.exists(args.data):
             DATA_ROOT = data_loc_by_env
@@ -124,14 +129,19 @@ def get_grid(args):
             "none",
             save_dir_key=lambda val: f"bm_{val}" if not no_save_params else "",
         ),
+        # hyperparam(
+        #     "--vocab-filename",
+        #     os.path.join(DATA_ROOT, "tokenizers/gpt2-vocab.json"),
+        #     save_dir_key=lambda _: "gpt2" if not no_save_params else "",
+        # ),
+        # hyperparam(
+        #     "--merges-filename", os.path.join(DATA_ROOT, "tokenizers/gpt2-merges.txt")
+        # ),
         hyperparam(
-            "--vocab-filename",
-            os.path.join(DATA_ROOT, "tokenizers/gpt2-vocab.json"),
+            "--hf-tokenizer",
+            os.path.join(DATA_ROOT, "tokenizers/gpt2-unified.json"),
             save_dir_key=lambda _: "gpt2" if not no_save_params else "",
         ),
-        hyperparam(
-            "--merges-filename", os.path.join(DATA_ROOT, "tokenizers/gpt2-merges.txt")
-        ),
     ]
 
     # separate task config for dummy_lm
-- 
GitLab


From a49c8d6e36e56ddc93d0163b19ed0e232b250f6d Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Thu, 13 Oct 2022 15:39:59 +0000
Subject: [PATCH 08/14] Remove old comments

---
 metaseq/launcher/opt_baselines.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/metaseq/launcher/opt_baselines.py b/metaseq/launcher/opt_baselines.py
index b77220a..ef67303 100644
--- a/metaseq/launcher/opt_baselines.py
+++ b/metaseq/launcher/opt_baselines.py
@@ -83,8 +83,6 @@ def get_grid(args):
             valid_subsets = ["BookCorpusFair"]
             import metaseq.scripts.unify_tokenizer as unify_tokenizer
 
-            # import metaseq.pdb; metaseq.pdb.set_trace_rank0()
-            # print("data loc:", data_loc_by_env)
             unify_tokenizer.main(os.path.join(data_loc_by_env, "tokenizers"))
         args.data = os.path.join(data_loc_by_env, "corpus_dedup_10_10_1_0.05_exp29")
         if os.path.exists(args.data):
@@ -129,14 +127,6 @@ def get_grid(args):
             "none",
             save_dir_key=lambda val: f"bm_{val}" if not no_save_params else "",
         ),
-        # hyperparam(
-        #     "--vocab-filename",
-        #     os.path.join(DATA_ROOT, "tokenizers/gpt2-vocab.json"),
-        #     save_dir_key=lambda _: "gpt2" if not no_save_params else "",
-        # ),
-        # hyperparam(
-        #     "--merges-filename", os.path.join(DATA_ROOT, "tokenizers/gpt2-merges.txt")
-        # ),
         hyperparam(
             "--hf-tokenizer",
             os.path.join(DATA_ROOT, "tokenizers/gpt2-unified.json"),
-- 
GitLab


From eefc89d163858a266f754cec2909043bc4340fbe Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Mon, 17 Oct 2022 15:47:38 +0000
Subject: [PATCH 09/14] Raise error and give mitigation steps if hf_tokenizer
 not specified

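The error points users at the conversion script. Hypothetical usage,
assuming a directory containing gpt2-vocab.json and gpt2-merges.txt:

    python -m metaseq.scripts.unify_tokenizer /path/to/tokenizers
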
---
 metaseq/scripts/unify_tokenizer.py           |  6 ++++++
 metaseq/tasks/streaming_language_modeling.py | 12 +++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/metaseq/scripts/unify_tokenizer.py b/metaseq/scripts/unify_tokenizer.py
index cd18cd3..216d1a7 100644
--- a/metaseq/scripts/unify_tokenizer.py
+++ b/metaseq/scripts/unify_tokenizer.py
@@ -7,6 +7,12 @@ def main(path):
     merges_file = os.path.join(path, "gpt2-merges.txt")
     vocab_file = os.path.join(path, "gpt2-vocab.json")
     unified_file = os.path.join(path, "gpt2-unified.json")
+    if os.path.exists(unified_file):
+        return
+    if not (os.path.exists(merges_file) and os.path.exists(vocab_file)):
+        raise ValueError(
+            f"Path {path} is missing one or both of {merges_file} and {vocab_file}"
+        )
 
     old_tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)
     old_tokenizer.save(unified_file)
diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py
index 6432e0f..6646cb0 100644
--- a/metaseq/tasks/streaming_language_modeling.py
+++ b/metaseq/tasks/streaming_language_modeling.py
@@ -117,17 +117,19 @@ class StreamingLanguageModelingTask(LegacyTask):
     """
 
     def __init__(self, args):
+        if not getattr(args, "hf_tokenizer", None):
+            msg = """`StreamingLanguageModelingTask` expects a unified tokenizer format, not
+            the merges/vocab dual format from days of yore. You can convert the old format to the new by running
+            `python -m metaseq.scripts.unify_tokenizer <PATH_TO_TOKENIZERS>`
+            """
+            raise ValueError(msg)
         super().__init__(args)
 
         if not has_hf_tokenizers:
             raise ImportError("Please install tokenizers with: pip install tokenizers")
         logger.info(f"loading tokenizer: {args.hf_tokenizer}")
-        # if args.hf_tokenizer:
+
         self.tokenizer = Tokenizer.from_file(args.hf_tokenizer)
-        # else:
-        #     self.tokenizer = ByteLevelBPETokenizer.from_file(
-        #         args.vocab_filename, args.merges_filename
-        #     )
 
         if max(args.update_freq) > 1:
             raise NotImplementedError(
-- 
GitLab


From 99784dd1a0113656fa5ce5500dc17bc6e64e88a5 Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Mon, 17 Oct 2022 16:11:32 +0000
Subject: [PATCH 10/14] Fix sneaky cpu tests

---
 cpu_tests/test_streaming_language_modeling_task.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpu_tests/test_streaming_language_modeling_task.py b/cpu_tests/test_streaming_language_modeling_task.py
index 7dd65de..4c0b5ab 100644
--- a/cpu_tests/test_streaming_language_modeling_task.py
+++ b/cpu_tests/test_streaming_language_modeling_task.py
@@ -14,6 +14,7 @@ from metaseq.tasks.streaming_language_modeling import StreamingLanguageModelingT
 from cpu_tests.test_utils import (
     write_one_jsonl,
     write_dummy_bpe,
+    write_dummy_bpe_unified,
 )
 from metaseq.dataclass.utils import convert_namespace_to_omegaconf
 
@@ -34,7 +35,10 @@ class TestDatasetLoading(unittest.TestCase):
             shard_00 = os.path.join(train_dir, "00")
             shard_01 = os.path.join(train_dir, "01")
 
-            vocab_file, merges_file = write_dummy_bpe(data_dir)
+            # vocab_file, merges_file = write_dummy_bpe(data_dir)
+            tokenizer_path = write_dummy_bpe_unified(
+                os.path.join(data_dir, "unified.json")
+            )
 
             # Create shard folders, and jsonl files
             os.makedirs(shard_00)
@@ -62,10 +66,8 @@ class TestDatasetLoading(unittest.TestCase):
                     "--task",
                     "streaming_language_modeling",
                     data_dir,
-                    "--vocab-filename",
-                    vocab_file,
-                    "--merges-filename",
-                    merges_file,
+                    "--hf-tokenizer",
+                    tokenizer_path,
                     "--sample-break-mode",
                     "complete",
                 ],
-- 
GitLab


From eb800442f744bd36528f9f064491647cefef8dfc Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Mon, 17 Oct 2022 16:19:14 +0000
Subject: [PATCH 11/14] Appease the linter

---
 cpu_tests/test_streaming_language_modeling_task.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpu_tests/test_streaming_language_modeling_task.py b/cpu_tests/test_streaming_language_modeling_task.py
index 4c0b5ab..e88200f 100644
--- a/cpu_tests/test_streaming_language_modeling_task.py
+++ b/cpu_tests/test_streaming_language_modeling_task.py
@@ -13,7 +13,6 @@ from metaseq import options
 from metaseq.tasks.streaming_language_modeling import StreamingLanguageModelingTask
 from cpu_tests.test_utils import (
     write_one_jsonl,
-    write_dummy_bpe,
     write_dummy_bpe_unified,
 )
 from metaseq.dataclass.utils import convert_namespace_to_omegaconf
@@ -35,7 +34,6 @@ class TestDatasetLoading(unittest.TestCase):
             shard_00 = os.path.join(train_dir, "00")
             shard_01 = os.path.join(train_dir, "01")
 
-            # vocab_file, merges_file = write_dummy_bpe(data_dir)
             tokenizer_path = write_dummy_bpe_unified(
                 os.path.join(data_dir, "unified.json")
             )
-- 
GitLab


From b22625a7764cee63cf93eecca294a06113465709 Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Mon, 17 Oct 2022 16:34:59 +0000
Subject: [PATCH 12/14] Correctly appease the linter

---
 tests/test_streaming_language_modeling_task.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_streaming_language_modeling_task.py b/tests/test_streaming_language_modeling_task.py
index e7be941..f0e1deb 100644
--- a/tests/test_streaming_language_modeling_task.py
+++ b/tests/test_streaming_language_modeling_task.py
@@ -13,7 +13,6 @@ import torch
 from tests.utils import train_language_model
 from cpu_tests.test_utils import (
     write_dummy_jsonl_data_dir,
-    write_dummy_bpe,
     write_dummy_bpe_unified,
 )
 
-- 
GitLab


From 6034abe4a37b1096778ea157bfc29302bf668ccf Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Tue, 18 Oct 2022 09:58:55 +0000
Subject: [PATCH 13/14] remove comments

---
 tests/test_streaming_language_modeling_task.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test_streaming_language_modeling_task.py b/tests/test_streaming_language_modeling_task.py
index f0e1deb..e412e37 100644
--- a/tests/test_streaming_language_modeling_task.py
+++ b/tests/test_streaming_language_modeling_task.py
@@ -87,10 +87,6 @@ class TestReproducibility(unittest.TestCase):
                     data_dir=data_dir,
                     arch="transformer_lm_gpt2_tiny",
                     extra_flags=[
-                        # "--vocab-filename",
-                        # vocab,
-                        # "--merges-filename",
-                        # merges,
                         "--hf-tokenizer",
                         tokenizer_path,
                         "--dropout",
-- 
GitLab


From 7cd1c29daaf1409e6e5cdc4e90a01edf518cf31f Mon Sep 17 00:00:00 2001
From: Andrew Poulton <awdpoulton@gmail.com>
Date: Wed, 19 Oct 2022 16:54:25 +0000
Subject: [PATCH 14/14] A few tidyups per review

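An illustrative call site for the narrowed `clip_grad_norm` signature
(not part of this diff); the flag is accepted but ignored by the base
optimizer:

    optimizer.clip_grad_norm(max_norm=1.0, skip_gradient_update_on_clip_norm=True)
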
---
 metaseq/models/transformer_decoder.py        |  4 +-
 metaseq/models/transformer_lm.py             |  4 +-
 metaseq/optim/base_optimizer.py              | 10 ++-
 metaseq/optim/lr_scheduler/__init__.py       |  2 +-
 metaseq/optim/lr_scheduler/fixed_schedule.py | 79 --------------------
 5 files changed, 13 insertions(+), 86 deletions(-)
 delete mode 100644 metaseq/optim/lr_scheduler/fixed_schedule.py

diff --git a/metaseq/models/transformer_decoder.py b/metaseq/models/transformer_decoder.py
index 49b673e..56e88dd 100644
--- a/metaseq/models/transformer_decoder.py
+++ b/metaseq/models/transformer_decoder.py
@@ -119,8 +119,8 @@ class TransformerDecoder(IncrementalDecoder):
             if args.decoder_learned_pos and not self.use_alibi
             else None
         )
-        if self.embed_positions is not None:
-            self.embed_positions.to(device).to(dtype)
+
+        self.embed_positions.to(device).to(dtype)
 
         self.cross_self_attention = getattr(args, "cross_self_attention", False)
 
diff --git a/metaseq/models/transformer_lm.py b/metaseq/models/transformer_lm.py
index 988a94b..acd1c05 100644
--- a/metaseq/models/transformer_lm.py
+++ b/metaseq/models/transformer_lm.py
@@ -60,7 +60,7 @@ class TransformerLanguageModelConfig(MetaseqDataclass):
         default=False, metadata={"help": "share decoder input and output embeddings"}
     )
     decoder_learned_pos: bool = field(
-        default=False,
+        default=True,
         metadata={"help": "use learned positional embeddings in the decoder"},
     )
     decoder_learned_sinusoidal: bool = field(
@@ -209,7 +209,7 @@ def base_lm_architecture(args):
     args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
     args.decoder_layers = getattr(args, "decoder_layers", 6)
     args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
-    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True)
     args.decoder_learned_sinusoidal = getattr(args, "decoder_learned_sinusoidal", False)
     args.activation_fn = getattr(args, "activation_fn", "relu")
 
diff --git a/metaseq/optim/base_optimizer.py b/metaseq/optim/base_optimizer.py
index 49c461d..3affc66 100644
--- a/metaseq/optim/base_optimizer.py
+++ b/metaseq/optim/base_optimizer.py
@@ -109,9 +109,15 @@ class BaseOptimizer(object):
                 p.grad.data.mul_(c)
 
     def clip_grad_norm(
-        self, max_norm, norm_type="l2", aggregate_norm_fn=None, **kwargs
+        self,
+        max_norm,
+        norm_type="l2",
+        aggregate_norm_fn=None,
+        skip_gradient_update_on_clip_norm=False,
     ):
-        """Clips gradient norm."""
+        """Clips gradient norm.
+        `skip_gradient_update_on_clip_norm` is accepted but unused; it is kept to preserve the API.
+        """
         return utils.clip_grad_norm_(
             self.params, max_norm, norm_type, aggregate_norm_fn
         )
diff --git a/metaseq/optim/lr_scheduler/__init__.py b/metaseq/optim/lr_scheduler/__init__.py
index 4327178..96d03f2 100644
--- a/metaseq/optim/lr_scheduler/__init__.py
+++ b/metaseq/optim/lr_scheduler/__init__.py
@@ -18,7 +18,7 @@ from omegaconf import DictConfig
     LR_SCHEDULER_REGISTRY,
     LR_SCHEDULER_DATACLASS_REGISTRY,
 ) = registry.setup_registry(
-    "--lr-scheduler", base_class=BaseLRScheduler, default="fixed"
+    "--lr-scheduler", base_class=BaseLRScheduler, default="inverse_sqrt"
 )
 
 
diff --git a/metaseq/optim/lr_scheduler/fixed_schedule.py b/metaseq/optim/lr_scheduler/fixed_schedule.py
deleted file mode 100644
index cea9100..0000000
--- a/metaseq/optim/lr_scheduler/fixed_schedule.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-from dataclasses import dataclass, field
-from typing import Optional, List
-from omegaconf import II
-
-from metaseq.dataclass import MetaseqDataclass
-from metaseq.optim.lr_scheduler import BaseLRScheduler, register_lr_scheduler
-
-# from fairseq.dataclass import FairseqDataclass
-# from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
-
-
-@dataclass
-class FixedLRScheduleConfig(MetaseqDataclass):
-    force_anneal: Optional[int] = field(
-        default=None,
-        metadata={"help": "force annealing at specified epoch"},
-    )
-    lr_shrink: float = field(
-        default=0.1,
-        metadata={"help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"},
-    )
-    warmup_updates: int = field(
-        default=0,
-        metadata={"help": "warmup the learning rate linearly for the first N updates"},
-    )
-    lr: List[float] = II("optimization.lr")
-
-
-@register_lr_scheduler("fixed", dataclass=FixedLRScheduleConfig)
-class FixedLRSchedule(BaseLRScheduler):
-    """Decay the LR on a fixed schedule."""
-
-    def __init__(self, cfg: FixedLRScheduleConfig, optimizer):
-        super().__init__(cfg, optimizer)
-
-        self.lr = cfg.lr[0]
-        if cfg.warmup_updates > 0:
-            self.warmup_factor = 1.0 / cfg.warmup_updates
-        else:
-            self.warmup_factor = 1
-
-    def state_dict(self):
-        return {"lr": self.lr}
-
-    def load_state_dict(self, state_dict):
-        if "lr" in state_dict:
-            self.lr = state_dict["lr"]
-
-    def get_next_lr(self, epoch):
-        lrs = self.cfg.lr
-        if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal:
-            # use fixed LR schedule
-            next_lr = lrs[min(epoch - 1, len(lrs) - 1)]
-        else:
-            # anneal based on lr_shrink
-            next_lr = lrs[-1] * self.cfg.lr_shrink ** (
-                epoch + 1 - self.cfg.force_anneal
-            )
-        return next_lr
-
-    def step_begin_epoch(self, epoch):
-        """Update the learning rate at the beginning of the given epoch."""
-        self.lr = self.get_next_lr(epoch)
-        self.optimizer.set_lr(self.warmup_factor * self.lr)
-        return self.optimizer.get_lr()
-
-    def step_update(self, num_updates):
-        """Update the learning rate after each update."""
-        if self.cfg.warmup_updates > 0 and num_updates < self.cfg.warmup_updates:
-            self.warmup_factor = (num_updates + 1) / float(self.cfg.warmup_updates)
-            self.optimizer.set_lr(self.warmup_factor * self.lr)
-        else:
-            self.optimizer.set_lr(self.lr)
-        return self.optimizer.get_lr()
-- 
GitLab