From b5c5384d48729327cf1bcb51cb65c5696e5d0007 Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Tue, 11 Oct 2022 15:02:19 +0000 Subject: [PATCH 01/14] remove old codepath, only use unified tokenizer file --- metaseq/tasks/streaming_language_modeling.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py index 4e31c89..84154f6 100644 --- a/metaseq/tasks/streaming_language_modeling.py +++ b/metaseq/tasks/streaming_language_modeling.py @@ -51,12 +51,12 @@ class StreamingLanguageModelingConfig(MetaseqDataclass): hf_tokenizer: Optional[str] = field( default="", metadata={"help": "path to a HF tokenizer json file."} ) - vocab_filename: Optional[str] = field( - default="", metadata={"help": "path to bpe-vocab.json"} - ) - merges_filename: Optional[str] = field( - default="", metadata={"help": "path to bpe-merges.txt"} - ) + # vocab_filename: Optional[str] = field( + # default="", metadata={"help": "path to bpe-vocab.json"} + # ) + # merges_filename: Optional[str] = field( + # default="", metadata={"help": "path to bpe-merges.txt"} + # ) end_of_document_symbol: Optional[str] = field( default="</s>", metadata={"help": "symbol indicating an end-of-document"} ) @@ -127,12 +127,12 @@ class StreamingLanguageModelingTask(LegacyTask): if not has_hf_tokenizers: raise ImportError("Please install tokenizers with: pip install tokenizers") - if args.hf_tokenizer: - self.tokenizer = Tokenizer.from_file(args.hf_tokenizer) - else: - self.tokenizer = ByteLevelBPETokenizer.from_file( - args.vocab_filename, args.merges_filename - ) + # if args.hf_tokenizer: + self.tokenizer = Tokenizer.from_file(args.hf_tokenizer) + # else: + # self.tokenizer = ByteLevelBPETokenizer.from_file( + # args.vocab_filename, args.merges_filename + # ) if max(args.update_freq) > 1: raise NotImplementedError( -- GitLab From d9d3b1ba1c59e1bd0955399e8de9bd399d4b2736 Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Wed, 12 Oct 2022 14:19:03 +0000 Subject: [PATCH 02/14] Removes old two-file codepath for tokenizer replacing with universal HF format in a single file. 
Then whack-a-moles a bunch of bugs until tests pass, namely: - `state_dict` of `StreamingEpochBatchIterator` would return empty list for `sequences_consumed` if the current epoch had finished and `num_workers==0` - `DDP_BACKEND_CHOICES` enum missed a `no_c10d` testing option - `TransformerDecoder` missed null check on `embed_positions` before moving to device/dtype - added kwargs to base optimiser `clip_grad_norm` signature to pacify tests - Copied `fixed_schedule.py` from fairseq Also added `unify_tokenizers.py` script to merge *vocab.json, *merges.txt files super naively --- cpu_tests/test_utils.py | 14 ++++ gpu_tests/test_hf_compatibility.py | 23 ++++-- metaseq/data/iterators.py | 2 +- metaseq/dataclass/constants.py | 1 + metaseq/hub_utils.py | 11 ++- metaseq/models/transformer_decoder.py | 3 +- metaseq/optim/base_optimizer.py | 4 +- metaseq/optim/lr_scheduler/fixed_schedule.py | 79 +++++++++++++++++++ metaseq/scripts/unify_tokenizer.py | 11 +++ metaseq/tasks/streaming_language_modeling.py | 9 +-- .../test_streaming_language_modeling_task.py | 44 +++++++---- 11 files changed, 166 insertions(+), 35 deletions(-) create mode 100644 metaseq/optim/lr_scheduler/fixed_schedule.py create mode 100644 metaseq/scripts/unify_tokenizer.py diff --git a/cpu_tests/test_utils.py b/cpu_tests/test_utils.py index 8ea0083..918fd29 100644 --- a/cpu_tests/test_utils.py +++ b/cpu_tests/test_utils.py @@ -51,6 +51,20 @@ def write_dummy_bpe(data_dir): return vocab, merges +def write_dummy_bpe_unified(tokenizer_file): + from tokenizers import ByteLevelBPETokenizer + + tokenizer = ByteLevelBPETokenizer(add_prefix_space=True) + tokenizer.train( + [], + vocab_size=500, + special_tokens=["<s>", "<pad>", "</s>", "<unk>"], + show_progress=False, + ) + tokenizer.save(tokenizer_file) + return tokenizer_file + + class TestUtils(unittest.TestCase): def test_make_positions(self): pad = 1 diff --git a/gpu_tests/test_hf_compatibility.py b/gpu_tests/test_hf_compatibility.py index 8b83599..d655f2e 100644 --- a/gpu_tests/test_hf_compatibility.py +++ b/gpu_tests/test_hf_compatibility.py @@ -15,7 +15,12 @@ from metaseq.scripts.convert_to_singleton import create_generation_config_with_d from metaseq.distributed import utils as distributed_utils from metaseq.distributed import fsdp_enable_wrap, fsdp_wrap from metaseq.dataclass.configs import MetaseqConfig -from metaseq.hub_utils import tensorize_input, get_next_token, setup_vocab_and_merges +from metaseq.hub_utils import ( + tensorize_input, + get_next_token, + setup_vocab_and_merges, + setup_unified_tokenizer, +) from megatron.mpu import destroy_model_parallel @@ -101,13 +106,15 @@ def load_mp_model_and_run_eval(cfg: MetaseqConfig, **kwargs): class TestHFCompatibility(unittest.TestCase): def test_singleton_metaseq_hf_compatibility(self): model_path = os.path.join(os.path.dirname(__file__), "125m") - vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path) + # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path) + unified_file, tokenizer = setup_unified_tokenizer(model_path) checkpoint = checkpoint_utils.load_model_ensemble_and_task( [os.path.join(model_path, "restored.pt")], arg_overrides={ - "vocab_filename": vocab_file, - "merges_filename": merges_file, + # "vocab_filename": vocab_file, + # "merges_filename": merges_file, + "hf_tokenizer": unified_file, }, ) @@ -142,12 +149,14 @@ class TestHFCompatibility(unittest.TestCase): # Verify that the generated logits match the consolidated model logits - vocab_file, merges_file, tokenizer = 
setup_vocab_and_merges(model_path) + # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path) + unified_file, tokenizer = setup_unified_tokenizer(model_path) checkpoint = checkpoint_utils.load_model_ensemble_and_task( [os.path.join(model_path, "restored.pt")], arg_overrides={ - "vocab_filename": vocab_file, - "merges_filename": merges_file, + # "vocab_filename": vocab_file, + # "merges_filename": merges_file, + "hf_tokenizer": unified_file, }, ) model = checkpoint[0][0].eval() diff --git a/metaseq/data/iterators.py b/metaseq/data/iterators.py index 92ad0d7..196f921 100644 --- a/metaseq/data/iterators.py +++ b/metaseq/data/iterators.py @@ -280,7 +280,7 @@ class StreamingEpochBatchIterator(EpochBatchIterating): # small optimization: we advance the epoch before saving, so that # when loading later we don't end up fast-forwarding the iterator epoch = self.epoch + 1 - sequences_consumed = [0 for _ in range(self.num_workers)] + sequences_consumed = [0 for _ in range(max(1, self.num_workers))] n = 0 next_worker = 0 else: diff --git a/metaseq/dataclass/constants.py b/metaseq/dataclass/constants.py index fc1fc33..bf531ce 100644 --- a/metaseq/dataclass/constants.py +++ b/metaseq/dataclass/constants.py @@ -40,6 +40,7 @@ DDP_BACKEND_CHOICES = ChoiceEnum( "c10d", # alias for pytorch_ddp "fully_sharded", # FullyShardedDataParallel from fairscale "pytorch_ddp", + "no_c10d", # Deflake tests ] ) DATASET_IMPL_CHOICES = ChoiceEnum(["raw", "lazy", "cached", "mmap", "fasta"]) diff --git a/metaseq/hub_utils.py b/metaseq/hub_utils.py index a7ff7fa..ce54aaa 100644 --- a/metaseq/hub_utils.py +++ b/metaseq/hub_utils.py @@ -11,7 +11,7 @@ import os import time from argparse import Namespace from typing import Any, Dict, Iterator, List, Optional -from tokenizers import ByteLevelBPETokenizer +from tokenizers import ByteLevelBPETokenizer, Tokenizer import numpy as np import torch @@ -125,6 +125,15 @@ def setup_vocab_and_merges(model_path): return vocab_file, merges_file, tokenizer +def setup_unified_tokenizer(model_path): + unified_file = os.path.join(model_path, "gpt2-unified.json") + if not os.path.exists(unified_file): + _, _, tokenizer = setup_vocab_and_merges(model_path) + tokenizer.save(unified_file) + tokenizer = Tokenizer.from_file(unified_file) + return unified_file, tokenizer + + class GeneratorHubInterface(nn.Module): """ PyTorch Hub interface for generating sequences from a pre-trained diff --git a/metaseq/models/transformer_decoder.py b/metaseq/models/transformer_decoder.py index 0e4cabb..49b673e 100644 --- a/metaseq/models/transformer_decoder.py +++ b/metaseq/models/transformer_decoder.py @@ -119,7 +119,8 @@ class TransformerDecoder(IncrementalDecoder): if args.decoder_learned_pos and not self.use_alibi else None ) - self.embed_positions.to(device).to(dtype) + if self.embed_positions is not None: + self.embed_positions.to(device).to(dtype) self.cross_self_attention = getattr(args, "cross_self_attention", False) diff --git a/metaseq/optim/base_optimizer.py b/metaseq/optim/base_optimizer.py index bcca8cc..49c461d 100644 --- a/metaseq/optim/base_optimizer.py +++ b/metaseq/optim/base_optimizer.py @@ -108,7 +108,9 @@ class BaseOptimizer(object): c = c.to(p.grad.device) p.grad.data.mul_(c) - def clip_grad_norm(self, max_norm, norm_type="l2", aggregate_norm_fn=None): + def clip_grad_norm( + self, max_norm, norm_type="l2", aggregate_norm_fn=None, **kwargs + ): """Clips gradient norm.""" return utils.clip_grad_norm_( self.params, max_norm, norm_type, aggregate_norm_fn diff --git 
a/metaseq/optim/lr_scheduler/fixed_schedule.py b/metaseq/optim/lr_scheduler/fixed_schedule.py new file mode 100644 index 0000000..cea9100 --- /dev/null +++ b/metaseq/optim/lr_scheduler/fixed_schedule.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass, field +from typing import Optional, List +from omegaconf import II + +from metaseq.dataclass import MetaseqDataclass +from metaseq.optim.lr_scheduler import BaseLRScheduler, register_lr_scheduler + +# from fairseq.dataclass import FairseqDataclass +# from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler + + +@dataclass +class FixedLRScheduleConfig(MetaseqDataclass): + force_anneal: Optional[int] = field( + default=None, + metadata={"help": "force annealing at specified epoch"}, + ) + lr_shrink: float = field( + default=0.1, + metadata={"help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"}, + ) + warmup_updates: int = field( + default=0, + metadata={"help": "warmup the learning rate linearly for the first N updates"}, + ) + lr: List[float] = II("optimization.lr") + + +@register_lr_scheduler("fixed", dataclass=FixedLRScheduleConfig) +class FixedLRSchedule(BaseLRScheduler): + """Decay the LR on a fixed schedule.""" + + def __init__(self, cfg: FixedLRScheduleConfig, optimizer): + super().__init__(cfg, optimizer) + + self.lr = cfg.lr[0] + if cfg.warmup_updates > 0: + self.warmup_factor = 1.0 / cfg.warmup_updates + else: + self.warmup_factor = 1 + + def state_dict(self): + return {"lr": self.lr} + + def load_state_dict(self, state_dict): + if "lr" in state_dict: + self.lr = state_dict["lr"] + + def get_next_lr(self, epoch): + lrs = self.cfg.lr + if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal: + # use fixed LR schedule + next_lr = lrs[min(epoch - 1, len(lrs) - 1)] + else: + # annneal based on lr_shrink + next_lr = lrs[-1] * self.cfg.lr_shrink ** ( + epoch + 1 - self.cfg.force_anneal + ) + return next_lr + + def step_begin_epoch(self, epoch): + """Update the learning rate at the beginning of the given epoch.""" + self.lr = self.get_next_lr(epoch) + self.optimizer.set_lr(self.warmup_factor * self.lr) + return self.optimizer.get_lr() + + def step_update(self, num_updates): + """Update the learning rate after each update.""" + if self.cfg.warmup_updates > 0 and num_updates < self.cfg.warmup_updates: + self.warmup_factor = (num_updates + 1) / float(self.cfg.warmup_updates) + self.optimizer.set_lr(self.warmup_factor * self.lr) + else: + self.optimizer.set_lr(self.lr) + return self.optimizer.get_lr() diff --git a/metaseq/scripts/unify_tokenizer.py b/metaseq/scripts/unify_tokenizer.py new file mode 100644 index 0000000..c2457b0 --- /dev/null +++ b/metaseq/scripts/unify_tokenizer.py @@ -0,0 +1,11 @@ +from tokenizers import ByteLevelBPETokenizer +from fire import Fire + + +def main(vocab_file, merges_file, unified_path): + old_tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file) + old_tokenizer.save(unified_path) + + +if __name__ == "__main__": + Fire(main) diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py index 84154f6..634bdf8 100644 --- a/metaseq/tasks/streaming_language_modeling.py +++ b/metaseq/tasks/streaming_language_modeling.py @@ -31,7 +31,7 @@ from metaseq.tasks import LegacyTask, register_task from metaseq.data.document_to_sequence import 
DocumentToSequenceDataset try: - from tokenizers import ByteLevelBPETokenizer, Tokenizer + from tokenizers import Tokenizer has_hf_tokenizers = True except ImportError: @@ -51,12 +51,7 @@ class StreamingLanguageModelingConfig(MetaseqDataclass): hf_tokenizer: Optional[str] = field( default="", metadata={"help": "path to a HF tokenizer json file."} ) - # vocab_filename: Optional[str] = field( - # default="", metadata={"help": "path to bpe-vocab.json"} - # ) - # merges_filename: Optional[str] = field( - # default="", metadata={"help": "path to bpe-merges.txt"} - # ) + end_of_document_symbol: Optional[str] = field( default="</s>", metadata={"help": "symbol indicating an end-of-document"} ) diff --git a/tests/test_streaming_language_modeling_task.py b/tests/test_streaming_language_modeling_task.py index 9952fa4..e7be941 100644 --- a/tests/test_streaming_language_modeling_task.py +++ b/tests/test_streaming_language_modeling_task.py @@ -11,7 +11,11 @@ import unittest import torch from tests.utils import train_language_model -from cpu_tests.test_utils import write_dummy_jsonl_data_dir, write_dummy_bpe +from cpu_tests.test_utils import ( + write_dummy_jsonl_data_dir, + write_dummy_bpe, + write_dummy_bpe_unified, +) try: import tokenizers # noqa @@ -41,7 +45,8 @@ class TestReproducibility(unittest.TestCase): with tempfile.TemporaryDirectory(name) as data_dir: write_dummy_jsonl_data_dir(data_dir) - vocab, merges = write_dummy_bpe(data_dir) + # vocab, merges = write_dummy_bpe(data_dir) + tokenizer_path = write_dummy_bpe_unified(f"{data_dir}/tokenizer.json") # train epochs 1 and 2 together with self.assertLogs() as logs: @@ -49,10 +54,12 @@ class TestReproducibility(unittest.TestCase): data_dir=data_dir, arch="transformer_lm_gpt2_tiny", extra_flags=[ - "--vocab-filename", - vocab, - "--merges-filename", - merges, + # "--vocab-filename", + # vocab, + # "--merges-filename", + # merges, + "--hf-tokenizer", + tokenizer_path, "--dropout", "0.0", "--log-format", @@ -68,7 +75,7 @@ class TestReproducibility(unittest.TestCase): task="streaming_language_modeling", max_tokens=None, ) - train_log = get_last_log_stats_containing_string(logs.records, "train_loss") + train_log = get_last_log_stats_containing_string(logs.records, '"loss"') valid_log = get_last_log_stats_containing_string(logs.records, "valid_loss") # train epoch 2, resuming from previous checkpoint 1 @@ -81,10 +88,12 @@ class TestReproducibility(unittest.TestCase): data_dir=data_dir, arch="transformer_lm_gpt2_tiny", extra_flags=[ - "--vocab-filename", - vocab, - "--merges-filename", - merges, + # "--vocab-filename", + # vocab, + # "--merges-filename", + # merges, + "--hf-tokenizer", + tokenizer_path, "--dropout", "0.0", "--log-format", @@ -100,14 +109,12 @@ class TestReproducibility(unittest.TestCase): task="streaming_language_modeling", max_tokens=None, ) - train_res_log = get_last_log_stats_containing_string( - logs.records, "train_loss" - ) + train_res_log = get_last_log_stats_containing_string(logs.records, "loss") valid_res_log = get_last_log_stats_containing_string( logs.records, "valid_loss" ) - for k in ["train_loss", "train_ppl", "train_num_updates", "train_gnorm"]: + for k in ["loss", "ppl", "num_updates", "gnorm"]: self.assertAlmostEqual( float(train_log[k]), float(train_res_log[k]), delta=delta ) @@ -122,7 +129,10 @@ class TestReproducibility(unittest.TestCase): ) def test_reproducibility(self): - self._test_reproducibility("test_reproducibility") + self._test_reproducibility( + "test_reproducibility", + ["--save-interval-updates", "3"], + 
) @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_reproducibility_fp16(self): @@ -151,7 +161,7 @@ class TestReproducibility(unittest.TestCase): self._test_reproducibility( "test_mid_epoch_reproducibility", ["--save-interval-updates", "3"], - resume_checkpoint="checkpoint_1_3.pt", + resume_checkpoint="checkpoint_3.pt", max_epoch=1, ) -- GitLab From c5ea071ee2c82675ee363e39009fa1aef7cb797b Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Wed, 12 Oct 2022 15:21:27 +0000 Subject: [PATCH 03/14] [WIP] CircleCI debug --- metaseq/scripts/unify_tokenizer.py | 9 +++++++-- metaseq/tasks/streaming_language_modeling.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/metaseq/scripts/unify_tokenizer.py b/metaseq/scripts/unify_tokenizer.py index c2457b0..cd18cd3 100644 --- a/metaseq/scripts/unify_tokenizer.py +++ b/metaseq/scripts/unify_tokenizer.py @@ -1,10 +1,15 @@ from tokenizers import ByteLevelBPETokenizer from fire import Fire +import os -def main(vocab_file, merges_file, unified_path): +def main(path): + merges_file = os.path.join(path, "gpt2-merges.txt") + vocab_file = os.path.join(path, "gpt2-vocab.json") + unified_file = os.path.join(path, "gpt2-unified.json") + old_tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file) - old_tokenizer.save(unified_path) + old_tokenizer.save(unified_file) if __name__ == "__main__": diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py index 634bdf8..6c42212 100644 --- a/metaseq/tasks/streaming_language_modeling.py +++ b/metaseq/tasks/streaming_language_modeling.py @@ -121,7 +121,7 @@ class StreamingLanguageModelingTask(LegacyTask): if not has_hf_tokenizers: raise ImportError("Please install tokenizers with: pip install tokenizers") - + print(args.hf_tokenizer) # if args.hf_tokenizer: self.tokenizer = Tokenizer.from_file(args.hf_tokenizer) # else: -- GitLab From e3e04ed5cf2b72856a97150061b576c0c37c2bbc Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Wed, 12 Oct 2022 15:34:59 +0000 Subject: [PATCH 04/14] ditto --- metaseq/tasks/streaming_language_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py index 6c42212..6432e0f 100644 --- a/metaseq/tasks/streaming_language_modeling.py +++ b/metaseq/tasks/streaming_language_modeling.py @@ -121,7 +121,7 @@ class StreamingLanguageModelingTask(LegacyTask): if not has_hf_tokenizers: raise ImportError("Please install tokenizers with: pip install tokenizers") - print(args.hf_tokenizer) + logger.info(f"loading tokenizer: {args.hf_tokenizer}") # if args.hf_tokenizer: self.tokenizer = Tokenizer.from_file(args.hf_tokenizer) # else: -- GitLab From ede26170cda9d7dc370c80889d0136b713561fd3 Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Thu, 13 Oct 2022 07:23:11 +0000 Subject: [PATCH 05/14] Correct arg_overrides --- gpu_tests/test_hf_compatibility.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/gpu_tests/test_hf_compatibility.py b/gpu_tests/test_hf_compatibility.py index d655f2e..113fb06 100644 --- a/gpu_tests/test_hf_compatibility.py +++ b/gpu_tests/test_hf_compatibility.py @@ -106,15 +106,12 @@ def load_mp_model_and_run_eval(cfg: MetaseqConfig, **kwargs): class TestHFCompatibility(unittest.TestCase): def test_singleton_metaseq_hf_compatibility(self): model_path = os.path.join(os.path.dirname(__file__), 
"125m") - # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path) unified_file, tokenizer = setup_unified_tokenizer(model_path) checkpoint = checkpoint_utils.load_model_ensemble_and_task( [os.path.join(model_path, "restored.pt")], arg_overrides={ - # "vocab_filename": vocab_file, - # "merges_filename": merges_file, - "hf_tokenizer": unified_file, + "task": {"hf_tokenizer": unified_file}, }, ) @@ -149,14 +146,11 @@ class TestHFCompatibility(unittest.TestCase): # Verify that the generated logits match the consolidated model logits - # vocab_file, merges_file, tokenizer = setup_vocab_and_merges(model_path) unified_file, tokenizer = setup_unified_tokenizer(model_path) checkpoint = checkpoint_utils.load_model_ensemble_and_task( [os.path.join(model_path, "restored.pt")], arg_overrides={ - # "vocab_filename": vocab_file, - # "merges_filename": merges_file, - "hf_tokenizer": unified_file, + "task": {"hf_tokenizer": unified_file}, }, ) model = checkpoint[0][0].eval() -- GitLab From 6a5e6e627dea9c488afdb8903a6f593295a85c3c Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Thu, 13 Oct 2022 07:46:15 +0000 Subject: [PATCH 06/14] Add hf_tokenizer arg --- gpu_tests/test_hub_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu_tests/test_hub_utils.py b/gpu_tests/test_hub_utils.py index 2e15698..23b82a2 100644 --- a/gpu_tests/test_hub_utils.py +++ b/gpu_tests/test_hub_utils.py @@ -88,6 +88,7 @@ def generate_using_generator_hub_interface(cfg: MetaseqConfig, **kwargs): "merges_filename": os.path.join(kwargs["model_path"], "gpt2-merges.txt"), "bpe_vocab": os.path.join(kwargs["model_path"], "gpt2-vocab.json"), "vocab_filename": os.path.join(kwargs["model_path"], "gpt2-vocab.json"), + "hf_tokenizer": os.path.join(kwargs["model_path"], "gpt2-unified.json"), "bpe_add_prefix_space": False, "specify_arch": True, "tensor_parallel_init_model_on_gpu": True, -- GitLab From 3f29d9ee53a4a7742cb964790fb096c4cf7c457e Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Thu, 13 Oct 2022 15:28:14 +0000 Subject: [PATCH 07/14] Fix train integrity test --- gpu_tests/test_hub_utils.py | 6 +++++- metaseq/launcher/opt_baselines.py | 20 +++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/gpu_tests/test_hub_utils.py b/gpu_tests/test_hub_utils.py index 23b82a2..94e373b 100644 --- a/gpu_tests/test_hub_utils.py +++ b/gpu_tests/test_hub_utils.py @@ -88,7 +88,11 @@ def generate_using_generator_hub_interface(cfg: MetaseqConfig, **kwargs): "merges_filename": os.path.join(kwargs["model_path"], "gpt2-merges.txt"), "bpe_vocab": os.path.join(kwargs["model_path"], "gpt2-vocab.json"), "vocab_filename": os.path.join(kwargs["model_path"], "gpt2-vocab.json"), - "hf_tokenizer": os.path.join(kwargs["model_path"], "gpt2-unified.json"), + "task": { + "hf_tokenizer": os.path.join(kwargs["model_path"], "gpt2-unified.json"), + "vocab_filename": os.path.join(kwargs["model_path"], "gpt2-vocab.json"), + "merges_filename": os.path.join(kwargs["model_path"], "gpt2-merges.txt"), + }, "bpe_add_prefix_space": False, "specify_arch": True, "tensor_parallel_init_model_on_gpu": True, diff --git a/metaseq/launcher/opt_baselines.py b/metaseq/launcher/opt_baselines.py index 3da3c5a..b77220a 100644 --- a/metaseq/launcher/opt_baselines.py +++ b/metaseq/launcher/opt_baselines.py @@ -81,6 +81,11 @@ def get_grid(args): if args.circleci: data_loc_by_env = "./gpu_tests/circleci" valid_subsets = ["BookCorpusFair"] + import metaseq.scripts.unify_tokenizer as unify_tokenizer 
+ + # import metaseq.pdb; metaseq.pdb.set_trace_rank0() + # print("data loc:", data_loc_by_env) + unify_tokenizer.main(os.path.join(data_loc_by_env, "tokenizers")) args.data = os.path.join(data_loc_by_env, "corpus_dedup_10_10_1_0.05_exp29") if os.path.exists(args.data): DATA_ROOT = data_loc_by_env @@ -124,14 +129,19 @@ def get_grid(args): "none", save_dir_key=lambda val: f"bm_{val}" if not no_save_params else "", ), + # hyperparam( + # "--vocab-filename", + # os.path.join(DATA_ROOT, "tokenizers/gpt2-vocab.json"), + # save_dir_key=lambda _: "gpt2" if not no_save_params else "", + # ), + # hyperparam( + # "--merges-filename", os.path.join(DATA_ROOT, "tokenizers/gpt2-merges.txt") + # ), hyperparam( - "--vocab-filename", - os.path.join(DATA_ROOT, "tokenizers/gpt2-vocab.json"), + "--hf-tokenizer", + os.path.join(DATA_ROOT, "tokenizers/gpt2-unified.json"), save_dir_key=lambda _: "gpt2" if not no_save_params else "", ), - hyperparam( - "--merges-filename", os.path.join(DATA_ROOT, "tokenizers/gpt2-merges.txt") - ), ] # separate task config for dummy_lm -- GitLab From a49c8d6e36e56ddc93d0163b19ed0e232b250f6d Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Thu, 13 Oct 2022 15:39:59 +0000 Subject: [PATCH 08/14] Remove old comments --- metaseq/launcher/opt_baselines.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/metaseq/launcher/opt_baselines.py b/metaseq/launcher/opt_baselines.py index b77220a..ef67303 100644 --- a/metaseq/launcher/opt_baselines.py +++ b/metaseq/launcher/opt_baselines.py @@ -83,8 +83,6 @@ def get_grid(args): valid_subsets = ["BookCorpusFair"] import metaseq.scripts.unify_tokenizer as unify_tokenizer - # import metaseq.pdb; metaseq.pdb.set_trace_rank0() - # print("data loc:", data_loc_by_env) unify_tokenizer.main(os.path.join(data_loc_by_env, "tokenizers")) args.data = os.path.join(data_loc_by_env, "corpus_dedup_10_10_1_0.05_exp29") if os.path.exists(args.data): @@ -129,14 +127,6 @@ def get_grid(args): "none", save_dir_key=lambda val: f"bm_{val}" if not no_save_params else "", ), - # hyperparam( - # "--vocab-filename", - # os.path.join(DATA_ROOT, "tokenizers/gpt2-vocab.json"), - # save_dir_key=lambda _: "gpt2" if not no_save_params else "", - # ), - # hyperparam( - # "--merges-filename", os.path.join(DATA_ROOT, "tokenizers/gpt2-merges.txt") - # ), hyperparam( "--hf-tokenizer", os.path.join(DATA_ROOT, "tokenizers/gpt2-unified.json"), save_dir_key=lambda _: "gpt2" if not no_save_params else "", ), -- GitLab From eefc89d163858a266f754cec2909043bc4340fbe Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Mon, 17 Oct 2022 15:47:38 +0000 Subject: [PATCH 09/14] Raise error and give mitigation steps if hf_tokenizer not specified --- metaseq/scripts/unify_tokenizer.py | 6 ++++++ metaseq/tasks/streaming_language_modeling.py | 12 +++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/metaseq/scripts/unify_tokenizer.py b/metaseq/scripts/unify_tokenizer.py index cd18cd3..216d1a7 100644 --- a/metaseq/scripts/unify_tokenizer.py +++ b/metaseq/scripts/unify_tokenizer.py @@ -7,6 +7,12 @@ def main(path): merges_file = os.path.join(path, "gpt2-merges.txt") vocab_file = os.path.join(path, "gpt2-vocab.json") unified_file = os.path.join(path, "gpt2-unified.json") + if os.path.exists(unified_file): + return + if not (os.path.exists(merges_file) and os.path.exists(vocab_file)): + raise ValueError( + f"Path {path} is missing one or both of {merges_file} and {vocab_file}" + ) old_tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file) old_tokenizer.save(unified_file) diff --git
a/metaseq/tasks/streaming_language_modeling.py b/metaseq/tasks/streaming_language_modeling.py index 6432e0f..6646cb0 100644 --- a/metaseq/tasks/streaming_language_modeling.py +++ b/metaseq/tasks/streaming_language_modeling.py @@ -117,17 +117,19 @@ class StreamingLanguageModelingTask(LegacyTask): """ def __init__(self, args): + if not hasattr(args, "hf_tokenizer"): + msg = f"""`StreamingLanguageModelingTask` expects a unified tokenizer format, and not + the merges/vocab dual format from days of yore. You can convert the old format to the new by running + `python -m metaseq.scripts.unify_tokenizer <PATH_TO_TOKENIZERS>` + """ + raise ValueError(msg) super().__init__(args) if not has_hf_tokenizers: raise ImportError("Please install tokenizers with: pip install tokenizers") logger.info(f"loading tokenizer: {args.hf_tokenizer}") - # if args.hf_tokenizer: + self.tokenizer = Tokenizer.from_file(args.hf_tokenizer) - # else: - # self.tokenizer = ByteLevelBPETokenizer.from_file( - # args.vocab_filename, args.merges_filename - # ) if max(args.update_freq) > 1: raise NotImplementedError( -- GitLab From 99784dd1a0113656fa5ce5500dc17bc6e64e88a5 Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Mon, 17 Oct 2022 16:11:32 +0000 Subject: [PATCH 10/14] Fix sneaky cpu tests --- cpu_tests/test_streaming_language_modeling_task.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpu_tests/test_streaming_language_modeling_task.py b/cpu_tests/test_streaming_language_modeling_task.py index 7dd65de..4c0b5ab 100644 --- a/cpu_tests/test_streaming_language_modeling_task.py +++ b/cpu_tests/test_streaming_language_modeling_task.py @@ -14,6 +14,7 @@ from metaseq.tasks.streaming_language_modeling import StreamingLanguageModelingT from cpu_tests.test_utils import ( write_one_jsonl, write_dummy_bpe, + write_dummy_bpe_unified, ) from metaseq.dataclass.utils import convert_namespace_to_omegaconf @@ -34,7 +35,10 @@ class TestDatasetLoading(unittest.TestCase): shard_00 = os.path.join(train_dir, "00") shard_01 = os.path.join(train_dir, "01") - vocab_file, merges_file = write_dummy_bpe(data_dir) + # vocab_file, merges_file = write_dummy_bpe(data_dir) + tokenizer_path = write_dummy_bpe_unified( + os.path.join(data_dir, "unified.json") + ) # Create shard folders, and jsonl files os.makedirs(shard_00) @@ -62,10 +66,8 @@ class TestDatasetLoading(unittest.TestCase): "--task", "streaming_language_modeling", data_dir, - "--vocab-filename", - vocab_file, - "--merges-filename", - merges_file, + "--hf-tokenizer", + tokenizer_path, "--sample-break-mode", "complete", ], -- GitLab From eb800442f744bd36528f9f064491647cefef8dfc Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Mon, 17 Oct 2022 16:19:14 +0000 Subject: [PATCH 11/14] Appease the linter --- cpu_tests/test_streaming_language_modeling_task.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpu_tests/test_streaming_language_modeling_task.py b/cpu_tests/test_streaming_language_modeling_task.py index 4c0b5ab..e88200f 100644 --- a/cpu_tests/test_streaming_language_modeling_task.py +++ b/cpu_tests/test_streaming_language_modeling_task.py @@ -13,7 +13,6 @@ from metaseq import options from metaseq.tasks.streaming_language_modeling import StreamingLanguageModelingTask from cpu_tests.test_utils import ( write_one_jsonl, - write_dummy_bpe, write_dummy_bpe_unified, ) from metaseq.dataclass.utils import convert_namespace_to_omegaconf @@ -35,7 +34,6 @@ class TestDatasetLoading(unittest.TestCase): shard_00 = 
os.path.join(train_dir, "00") shard_01 = os.path.join(train_dir, "01") - # vocab_file, merges_file = write_dummy_bpe(data_dir) tokenizer_path = write_dummy_bpe_unified( os.path.join(data_dir, "unified.json") ) -- GitLab From b22625a7764cee63cf93eecca294a06113465709 Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Mon, 17 Oct 2022 16:34:59 +0000 Subject: [PATCH 12/14] Correctly appease the linter --- tests/test_streaming_language_modeling_task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_streaming_language_modeling_task.py b/tests/test_streaming_language_modeling_task.py index e7be941..f0e1deb 100644 --- a/tests/test_streaming_language_modeling_task.py +++ b/tests/test_streaming_language_modeling_task.py @@ -13,7 +13,6 @@ import torch from tests.utils import train_language_model from cpu_tests.test_utils import ( write_dummy_jsonl_data_dir, - write_dummy_bpe, write_dummy_bpe_unified, ) -- GitLab From 6034abe4a37b1096778ea157bfc29302bf668ccf Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Tue, 18 Oct 2022 09:58:55 +0000 Subject: [PATCH 13/14] remove comments --- tests/test_streaming_language_modeling_task.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_streaming_language_modeling_task.py b/tests/test_streaming_language_modeling_task.py index f0e1deb..e412e37 100644 --- a/tests/test_streaming_language_modeling_task.py +++ b/tests/test_streaming_language_modeling_task.py @@ -87,10 +87,6 @@ class TestReproducibility(unittest.TestCase): data_dir=data_dir, arch="transformer_lm_gpt2_tiny", extra_flags=[ - # "--vocab-filename", - # vocab, - # "--merges-filename", - # merges, "--hf-tokenizer", tokenizer_path, "--dropout", -- GitLab From 7cd1c29daaf1409e6e5cdc4e90a01edf518cf31f Mon Sep 17 00:00:00 2001 From: Andrew Poulton <awdpoulton@gmail.com> Date: Wed, 19 Oct 2022 16:54:25 +0000 Subject: [PATCH 14/14] A few tidyups per review --- metaseq/models/transformer_decoder.py | 4 +- metaseq/models/transformer_lm.py | 4 +- metaseq/optim/base_optimizer.py | 10 ++- metaseq/optim/lr_scheduler/__init__.py | 2 +- metaseq/optim/lr_scheduler/fixed_schedule.py | 79 -------------------- 5 files changed, 13 insertions(+), 86 deletions(-) delete mode 100644 metaseq/optim/lr_scheduler/fixed_schedule.py diff --git a/metaseq/models/transformer_decoder.py b/metaseq/models/transformer_decoder.py index 49b673e..56e88dd 100644 --- a/metaseq/models/transformer_decoder.py +++ b/metaseq/models/transformer_decoder.py @@ -119,8 +119,8 @@ class TransformerDecoder(IncrementalDecoder): if args.decoder_learned_pos and not self.use_alibi else None ) - if self.embed_positions is not None: - self.embed_positions.to(device).to(dtype) + + self.embed_positions.to(device).to(dtype) self.cross_self_attention = getattr(args, "cross_self_attention", False) diff --git a/metaseq/models/transformer_lm.py b/metaseq/models/transformer_lm.py index 988a94b..acd1c05 100644 --- a/metaseq/models/transformer_lm.py +++ b/metaseq/models/transformer_lm.py @@ -60,7 +60,7 @@ class TransformerLanguageModelConfig(MetaseqDataclass): default=False, metadata={"help": "share decoder input and output embeddings"} ) decoder_learned_pos: bool = field( - default=False, + default=True, metadata={"help": "use learned positional embeddings in the decoder"}, ) decoder_learned_sinusoidal: bool = field( @@ -209,7 +209,7 @@ def base_lm_architecture(args): args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) args.decoder_layers = getattr(args, "decoder_layers", 6) 
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True) args.decoder_learned_sinusoidal = getattr(args, "decoder_learned_sinusoidal", False) args.activation_fn = getattr(args, "activation_fn", "relu") diff --git a/metaseq/optim/base_optimizer.py b/metaseq/optim/base_optimizer.py index 49c461d..3affc66 100644 --- a/metaseq/optim/base_optimizer.py +++ b/metaseq/optim/base_optimizer.py @@ -109,9 +109,15 @@ class BaseOptimizer(object): p.grad.data.mul_(c) def clip_grad_norm( - self, max_norm, norm_type="l2", aggregate_norm_fn=None, **kwargs + self, + max_norm, + norm_type="l2", + aggregate_norm_fn=None, + skip_gradient_update_on_clip_norm=False, ): - """Clips gradient norm.""" + """Clips gradient norm. + `skip_gradient_update_on_clip_norm` arg included to preserve API + """ return utils.clip_grad_norm_( self.params, max_norm, norm_type, aggregate_norm_fn ) diff --git a/metaseq/optim/lr_scheduler/__init__.py b/metaseq/optim/lr_scheduler/__init__.py index 4327178..96d03f2 100644 --- a/metaseq/optim/lr_scheduler/__init__.py +++ b/metaseq/optim/lr_scheduler/__init__.py @@ -18,7 +18,7 @@ from omegaconf import DictConfig LR_SCHEDULER_REGISTRY, LR_SCHEDULER_DATACLASS_REGISTRY, ) = registry.setup_registry( - "--lr-scheduler", base_class=BaseLRScheduler, default="fixed" + "--lr-scheduler", base_class=BaseLRScheduler, default="inverse_sqrt" ) diff --git a/metaseq/optim/lr_scheduler/fixed_schedule.py b/metaseq/optim/lr_scheduler/fixed_schedule.py deleted file mode 100644 index cea9100..0000000 --- a/metaseq/optim/lr_scheduler/fixed_schedule.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass, field -from typing import Optional, List -from omegaconf import II - -from metaseq.dataclass import MetaseqDataclass -from metaseq.optim.lr_scheduler import BaseLRScheduler, register_lr_scheduler - -# from fairseq.dataclass import FairseqDataclass -# from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler - - -@dataclass -class FixedLRScheduleConfig(MetaseqDataclass): - force_anneal: Optional[int] = field( - default=None, - metadata={"help": "force annealing at specified epoch"}, - ) - lr_shrink: float = field( - default=0.1, - metadata={"help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"}, - ) - warmup_updates: int = field( - default=0, - metadata={"help": "warmup the learning rate linearly for the first N updates"}, - ) - lr: List[float] = II("optimization.lr") - - -@register_lr_scheduler("fixed", dataclass=FixedLRScheduleConfig) -class FixedLRSchedule(BaseLRScheduler): - """Decay the LR on a fixed schedule.""" - - def __init__(self, cfg: FixedLRScheduleConfig, optimizer): - super().__init__(cfg, optimizer) - - self.lr = cfg.lr[0] - if cfg.warmup_updates > 0: - self.warmup_factor = 1.0 / cfg.warmup_updates - else: - self.warmup_factor = 1 - - def state_dict(self): - return {"lr": self.lr} - - def load_state_dict(self, state_dict): - if "lr" in state_dict: - self.lr = state_dict["lr"] - - def get_next_lr(self, epoch): - lrs = self.cfg.lr - if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal: - # use fixed LR schedule - next_lr = lrs[min(epoch - 1, len(lrs) - 1)] - else: - # annneal based on lr_shrink - next_lr = lrs[-1] * self.cfg.lr_shrink ** ( - epoch + 1 - self.cfg.force_anneal - ) - return next_lr - - def step_begin_epoch(self, epoch): - """Update the learning rate at the beginning of the given epoch.""" - self.lr = self.get_next_lr(epoch) - self.optimizer.set_lr(self.warmup_factor * self.lr) - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - if self.cfg.warmup_updates > 0 and num_updates < self.cfg.warmup_updates: - self.warmup_factor = (num_updates + 1) / float(self.cfg.warmup_updates) - self.optimizer.set_lr(self.warmup_factor * self.lr) - else: - self.optimizer.set_lr(self.lr) - return self.optimizer.get_lr() -- GitLab
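For context, a minimal Python sketch of the single-file tokenizer workflow this series converges on: converting a legacy vocab/merges pair into one unified JSON (the step metaseq/scripts/unify_tokenizer.py automates) and then loading only that file, as the streaming LM task now does via --hf-tokenizer. The gpt2-* file names follow the patches; the sample text is an illustrative placeholder.

from tokenizers import ByteLevelBPETokenizer, Tokenizer

# One-off conversion: serialize the old two-file BPE tokenizer into a single
# HF-format JSON (what unify_tokenizer.py does for a tokenizer directory).
old_tokenizer = ByteLevelBPETokenizer("gpt2-vocab.json", "gpt2-merges.txt")
old_tokenizer.save("gpt2-unified.json")

# Training and tests then load just the unified file, matching the
# --hf-tokenizer argument consumed by StreamingLanguageModelingTask.
tokenizer = Tokenizer.from_file("gpt2-unified.json")
print(tokenizer.encode("hello world").ids)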