Administrator / metaseq / Merge requests / !598

add separate config option save locally

Merged: Administrator requested to merge ngoyal_add_separate_config_option_save_locally into bigbig (2 years ago)
Overview 7 · Commits 3 · Pipelines 0 · Changes 4

Created by: ngoyal2707

Comparing bigbig (base) and latest version 233d3525 (3 commits, 2 years ago)

4 files changed: +47 -12

metaseq/cli/train.py +27 -4
metaseq/dataclass/configs.py +7 -0
metaseq/checkpoint_utils.py +10 -5
metaseq/trainer.py +3 -3
metaseq/cli/train.py (+27 -4)

@@ -87,6 +87,16 @@ def main(cfg: DictConfig) -> None:
     ), "Must specify batch size either with --max-tokens or --batch-size"
     metrics.reset()
+    if cfg.checkpoint.local_save_interval_updates > 0:
+        assert (
+            cfg.checkpoint.save_interval_updates > 0
+        ), "local save must be used with --save-interval-updates > 0"
+        assert (
+            cfg.checkpoint.save_interval_updates
+            % cfg.checkpoint.local_save_interval_updates
+            == 0
+        ), "--save-interval-updates must be a multiple of --local-save-interval-updates"
     if cfg.common.log_file is not None:
         handler = logging.FileHandler(filename=cfg.common.log_file)
         logger.addHandler(handler)
@@ -334,7 +344,11 @@ def train(
             )
             continue
-        if distributed_utils.get_global_rank() == 0 and cfg.common.profile and i == 5:
+        if (
+            distributed_utils.get_global_rank() == 0
+            and cfg.common.profile
+            and i == 5
+        ):
             logger.info("STARTING PROFILER")
             with profiler.profile(
                 profile_memory=True, with_stack=True, record_shapes=True
@@ -407,6 +421,17 @@ def validate_and_save(
             f"num_updates: {num_updates} >= max_update: {max_update}"
         )
+    save_locally = (
+        cfg.checkpoint.local_save_interval_updates > 0
+        and num_updates > 0
+        and num_updates % cfg.checkpoint.local_save_interval_updates == 0
+    )
+    save_to_NFS = (
+        cfg.checkpoint.save_interval_updates > 0
+        and num_updates > 0
+        and num_updates % cfg.checkpoint.save_interval_updates == 0
+    )
     do_save = (
         (
             end_of_epoch
@@ -414,9 +439,7 @@ def validate_and_save(
             and epoch_itr.epoch % cfg.checkpoint.save_interval_epochs == 0
         )
         or (
-            cfg.checkpoint.save_interval_updates > 0
-            and num_updates > 0
-            and num_updates % cfg.checkpoint.save_interval_updates == 0
+            (save_locally or save_to_NFS)
             and num_updates >= cfg.dataset.validate_after_updates
             and was_successful_step
         )
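Taken together, the two validate_and_save hunks make do_save fire on either interval. Below is a minimal standalone sketch of the resulting decision logic, with cfg flattened into plain arguments; the function name and default interval values are illustrative, not part of the diff.

def should_save(
    num_updates: int,
    save_interval_updates: int = 100,       # --save-interval-updates (NFS)
    local_save_interval_updates: int = 25,  # --local-save-interval-updates (local SSD)
    validate_after_updates: int = 0,
    was_successful_step: bool = True,
) -> bool:
    # Mirrors the save_locally / save_to_NFS conditions added above.
    save_locally = (
        local_save_interval_updates > 0
        and num_updates > 0
        and num_updates % local_save_interval_updates == 0
    )
    save_to_NFS = (
        save_interval_updates > 0
        and num_updates > 0
        and num_updates % save_interval_updates == 0
    )
    return (
        (save_locally or save_to_NFS)
        and num_updates >= validate_after_updates
        and was_successful_step
    )

# With intervals 100/25, updates 25, 50, 75 trigger local-only saves and
# update 100 satisfies both conditions at once.
assert [n for n in range(1, 101) if should_save(n)] == [25, 50, 75, 100]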
metaseq/dataclass/configs.py (+7 -0)

@@ -505,6 +505,13 @@ class CheckpointConfig(MetaseqDataclass):
     save_interval_updates: int = field(
         default=0, metadata={"help": "save a checkpoint (and validate) every N updates"}
     )
+    local_save_interval_updates: int = field(
+        default=0,
+        metadata={
+            "help": "save a checkpoint (and validate) every N updates to local SSD. "
+            "Only applicable when copying to NFS asynchronously"
+        },
+    )
     save_last_checkpoint: bool = field(
         default=True,
         metadata={"help": "store a last checkpoint at the end of the training run."},
metaseq/checkpoint_utils.py (+10 -5)

@@ -63,11 +63,15 @@ def save_checkpoint(
         and epoch % cfg.save_interval_epochs == 0
     )
-    save_for_updates = (
-        not end_of_epoch
-        and cfg.save_interval_updates > 0
-        and updates % cfg.save_interval_updates == 0
-    )
+    save_locally = (
+        cfg.local_save_interval_updates > 0
+        and updates % cfg.local_save_interval_updates == 0
+    )
+    save_to_NFS = (
+        cfg.save_interval_updates > 0 and updates % cfg.save_interval_updates == 0
+    )
+    save_for_updates = not end_of_epoch and (save_to_NFS or save_locally)

     checkpoint_conds[f"checkpoint{epoch}{suffix}.pt"] = save_for_epoch
     checkpoint_conds[f"checkpoint_{updates}{suffix}.pt"] = save_for_updates
@@ -82,6 +86,7 @@ def save_checkpoint(
     checkpoints = [
         os.path.join(cfg.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond
     ]
+    if len(checkpoints) > 0:
         if PathManager.islink(checkpoints[0]):
             PathManager.rm(checkpoints[0])
@@ -90,7 +95,7 @@ def save_checkpoint(
             checkpoints[0],
             extra_state,
             training_finished=training_finished,
-            async_callback_fn=async_callback_fn,
+            async_callback_fn=async_callback_fn if save_to_NFS else None,
         )
         write_timer.stop()
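This one-line change is what the review discussion on metaseq/trainer.py below refers to: when a checkpoint fires only on the local interval, no NFS-copy callback is forwarded at all. A minimal sketch of the gating; save_checkpoint_sketch and copy_to_nfs are hypothetical names, not metaseq APIs:

from typing import Callable, Optional

def save_checkpoint_sketch(
    filename: str,
    save_to_NFS: bool,
    async_callback_fn: Optional[Callable[[str], None]] = None,
) -> None:
    # Mirrors `async_callback_fn=async_callback_fn if save_to_NFS else None`:
    # local-SSD-only checkpoints never schedule an asynchronous NFS copy.
    effective_cb = async_callback_fn if save_to_NFS else None
    print(f"{filename}: async NFS copy scheduled = {effective_cb is not None}")

def copy_to_nfs(path: str) -> None:  # hypothetical NFS-copy callback
    pass

save_checkpoint_sketch("checkpoint_25.pt", save_to_NFS=False, async_callback_fn=copy_to_nfs)
save_checkpoint_sketch("checkpoint_100.pt", save_to_NFS=True, async_callback_fn=copy_to_nfs)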
metaseq/trainer.py (+3 -3)

@@ -398,13 +398,13 @@ class Trainer(object):
         def perform_save():
             try:
                 logger.info(f"Beginning asynchronous torch.save to {filename}")
-                if async_callback_fn is not None:
-                    async_callback_fn(filename)
+                async_callback_fn(filename)
Comment by suchenzang (2 years ago):

Should we gate this behind save_to_NFS logic somewhere?

    Reply by ngoyal2707 (2 years ago):

    Similar to my answer on Stephen's comment: I am currently passing async_callback as None if save_to_NFS == False.
Comment by stephenroller (2 years ago):

Why do we no longer need the None protection here? I'm going to be annoyed if we get log spam that just says "Async save failed: NoneType error" every 25 updates.

    Reply by ngoyal2707 (2 years ago):

    Because the protection now happens when submitting to the thread pool executor, at line number 407. Earlier we would get these log lines even with non-async saving; it should be better now.

    The reason the earlier code was written this way is that, before Zach's async-saving fix, we were also doing torch.save inside the async function. Now that's outside, so we don't need to submit anything to the thread pool executor if the async callback is None.
logger.info(f"Asynchronous torch.save to {filename} complete.")
except Exception as e:
logger.exception(f"Asynchronous save failed: {e}")
torch.save(state_dict, filename)
self.async_checkpoint.submit(perform_save)
if async_callback_fn is not None:
self.async_checkpoint.submit(perform_save)
logger.info(f"Finished saving checkpoint to {filename}")
def load_checkpoint(
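For clarity, a self-contained sketch of the pattern the thread above settles on: torch.save stays synchronous, and the executor only receives work when there is actually a callback to run, so a None callback can no longer raise inside the worker thread. ThreadPoolExecutor stands in for self.async_checkpoint, and the save/log calls are simplified stand-ins:

from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Optional

def save_sketch(
    filename: str,
    state_dict: dict,
    async_callback_fn: Optional[Callable[[str], None]],
    executor: ThreadPoolExecutor,
) -> None:
    def perform_save():
        try:
            # Only the NFS-copy callback runs in the worker thread now.
            async_callback_fn(filename)
        except Exception as e:
            print(f"Asynchronous save failed: {e}")

    # torch.save(state_dict, filename) would happen here, synchronously,
    # in the caller's thread; it no longer lives inside perform_save.
    if async_callback_fn is not None:
        executor.submit(perform_save)  # the None protection moved to the submit site

with ThreadPoolExecutor(max_workers=1) as pool:
    save_sketch("ckpt.pt", {}, None, pool)            # nothing submitted, no log spam
    save_sketch("ckpt.pt", {}, lambda p: None, pool)  # callback runs asynchronously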
Assignees: none
Reviewers: none
Labels: cla signed
Milestone: none
Participants: 1 (Administrator)
Reference: root/metaseq!598
Source branch: ngoyal_add_separate_config_option_save_locally
