zaydzuhri committed
Commit 5b64e7c · verified · 1 Parent(s): 613202f

Add files using upload-large-folder tool

Files changed (50)
  1. fla/models/mamba/__pycache__/configuration_mamba.cpython-312.pyc +0 -0
  2. fla/modules/__pycache__/fused_linear_cross_entropy.cpython-312.pyc +0 -0
  3. logs/none_1_grtqk5/attempt_0/0/stderr.log +0 -0
  4. logs/none_1_grtqk5/attempt_0/1/stderr.log +0 -0
  5. logs/none_1_grtqk5/attempt_0/6/stderr.log +0 -0
  6. setup.py +51 -0
  7. torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc +0 -0
  8. torchtitan/components/optimizer.py +303 -0
  9. torchtitan/datasets/hf_datasets.py +173 -0
  10. torchtitan/datasets/tokenizer/tiktoken.py +190 -0
  11. torchtitan/distributed/__pycache__/pipeline.cpython-312.pyc +0 -0
  12. torchtitan/distributed/__pycache__/utils.cpython-312.pyc +0 -0
  13. torchtitan/experiments/deepseek_v3/LICENSE-CODE +21 -0
  14. torchtitan/experiments/deepseek_v3/README.md +40 -0
  15. torchtitan/experiments/deepseek_v3/generate.py +308 -0
  16. torchtitan/experiments/deepseek_v3/indices.py +195 -0
  17. torchtitan/experiments/deepseek_v3/symm_mem_recipes/__init__.py +11 -0
  18. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py +260 -0
  19. torchtitan/experiments/flux/README.md +23 -0
  20. torchtitan/experiments/flux/__init__.py +122 -0
  21. torchtitan/experiments/flux/dataset/tokenizer.py +64 -0
  22. torchtitan/experiments/flux/model/autoencoder.py +388 -0
  23. torchtitan/experiments/flux/model/hf_embedder.py +40 -0
  24. torchtitan/experiments/flux/model/math.py +38 -0
  25. torchtitan/experiments/flux/model/model.py +177 -0
  26. torchtitan/experiments/flux/scripts/download_autoencoder.py +61 -0
  27. torchtitan/experiments/flux/tests/test_generate_image.py +252 -0
  28. torchtitan/experiments/flux/train_configs/debug_model.toml +68 -0
  29. torchtitan/experiments/kernels/triton_mg_group_gemm/simpleMoE.py +885 -0
  30. torchtitan/experiments/llama4/README.md +29 -0
  31. torchtitan/experiments/llama4/__pycache__/__init__.cpython-312.pyc +0 -0
  32. torchtitan/experiments/llama4/model/__pycache__/moe.cpython-312.pyc +0 -0
  33. torchtitan/experiments/llama4/model/args.py +109 -0
  34. torchtitan/experiments/llama4/model/moe.py +228 -0
  35. torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.py +536 -0
  36. torchtitan/experiments/multimodal/__init__.py +37 -0
  37. torchtitan/experiments/multimodal/mm_collator.py +227 -0
  38. torchtitan/experiments/multimodal/requirements.txt +1 -0
  39. torchtitan/experiments/simple_fsdp/__pycache__/simple_fsdp.cpython-312.pyc +0 -0
  40. torchtitan/experiments/simple_fsdp/model.py +18 -0
  41. torchtitan/experiments/simple_fsdp/simple_fsdp.py +194 -0
  42. torchtitan/experiments/simple_fsdp/tests/__init__.py +5 -0
  43. torchtitan/models/__init__.py +10 -0
  44. torchtitan/models/__pycache__/__init__.cpython-312.pyc +0 -0
  45. torchtitan/models/__pycache__/norms.cpython-312.pyc +0 -0
  46. torchtitan/models/llama3/__init__.py +76 -0
  47. torchtitan/models/llama3/parallelize_llama.py +398 -0
  48. torchtitan/models/llama3/pipeline_llama.py +161 -0
  49. torchtitan/models/llama3/train_configs/llama3_405b.toml +63 -0
  50. torchtitan/tools/profiling.py +131 -0
fla/models/mamba/__pycache__/configuration_mamba.cpython-312.pyc ADDED
Binary file (7.06 kB).
 
fla/modules/__pycache__/fused_linear_cross_entropy.cpython-312.pyc ADDED
Binary file (20.6 kB).
 
logs/none_1_grtqk5/attempt_0/0/stderr.log ADDED
The diff for this file is too large to render.
 
logs/none_1_grtqk5/attempt_0/1/stderr.log ADDED
The diff for this file is too large to render.
 
logs/none_1_grtqk5/attempt_0/6/stderr.log ADDED
The diff for this file is too large to render.
 
setup.py ADDED
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-

import ast
import os
import re
from pathlib import Path

from setuptools import find_packages, setup

with open('README.md') as f:
    long_description = f.read()


def get_package_version():
    with open(Path(os.path.dirname(os.path.abspath(__file__))) / 'flame' / '__init__.py') as f:
        version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
    return ast.literal_eval(version_match.group(1))


setup(
    name='flame',
    version=get_package_version(),
    description='A minimal training framework for scaling FLA models',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Songlin Yang, Yu Zhang',
    author_email='yangsl66@mit.edu, yzhang.cs@outlook.com',
    url='https://github.com/fla-org/flame',
    packages=find_packages(),
    license='MIT',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Topic :: Scientific/Engineering :: Artificial Intelligence'
    ],
    python_requires='>=3.10',
    install_requires=[
        'torch==2.6',
        'torchdata',
        'transformers==4.51.3',
        'triton>=3.0',
        'datasets>=3.3.0',
        'einops',
        'ninja',
        'wandb',
        'tiktoken',
        'tensorboard',
        'python-dotenv'
    ],
)
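As a side note on `get_package_version()` above, here is a tiny standalone sketch (not part of this commit) of how the `__version__` line is parsed; the sample string is made up for illustration:

```python
import ast
import re

# A made-up stand-in for the contents of flame/__init__.py.
text = '__version__ = "0.1.0"\n'
match = re.search(r"^__version__\s*=\s*(.*)$", text, re.MULTILINE)
print(ast.literal_eval(match.group(1)))  # 0.1.0
```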
torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc ADDED
Binary file (7.71 kB).
 
torchtitan/components/optimizer.py ADDED
@@ -0,0 +1,303 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import functools
from typing import Any, Generic, Iterator, TypeVar

import torch
import torch.nn as nn
from torch.distributed.checkpoint.state_dict import (
    get_optimizer_state_dict,
    set_optimizer_state_dict,
    StateDictOptions,
)
from torch.distributed.checkpoint.stateful import Stateful
from torch.optim import Optimizer

from torchtitan.components.ft import FTManager, has_torchft
from torchtitan.config_manager import JobConfig

__all__ = [
    "OptimizersContainer",
    "build_optimizers",
]


if has_torchft:
    import torchft as ft


T = TypeVar("T", bound=Optimizer)


class OptimizersContainer(Optimizer, Stateful, Generic[T]):
    """A container for multiple optimizers.

    This class is used to wrap multiple optimizers into a single object that can be
    used to reduce the complexity of the training loop. This mimics the behavior of
    ``torch.optim.Optimizer``. This class currently only supports ``Adam`` and ``AdamW``.

    **Note**
    Users who want to customize the optimizer behavior can inherit from this class and
    extend the functionality as needed. The following methods must follow the same signature
    as ``torch.optim.Optimizer`` class: ``step()``, ``zero_grad()``, ``state_dict()``,
    ``load_state_dict()``.

    **Limitations**
    This class assumes that all the optimizers are the same type and have the same
    configurations. With this assumption, TorchTitan can support lr scheduler resharding
    (e.g., loading a checkpoint with a different number of GPUs and/or different
    parallelization strategy). Note that ``get_optimizer_state_dict`` already enables the
    resharding for the optimizer state but not for the lr scheduler state, hence the limitation.

    Args:
        model_parts (List[nn.Module]): List of model parts to be optimized.
        optimizer_kwargs (Dict[str, Any]): Keyword arguments for the optimizers.
        name (str): Name of the optimizers.
    """

    optimizers: list[T]
    model_parts: list[nn.Module]

    def __init__(
        self,
        model_parts: list[nn.Module],
        optimizer_cls: type[T],
        optimizer_kwargs: dict[str, Any],
    ) -> None:
        all_params = []
        self.optimizers = []
        self.model_parts = model_parts
        for model in self.model_parts:
            params = [p for p in model.parameters() if p.requires_grad]
            self.optimizers.append(optimizer_cls(params, **optimizer_kwargs))
            all_params.extend(params)
        self._validate_length(len(self.model_parts))
        self._post_init(all_params, optimizer_kwargs)

    def __iter__(self) -> Iterator[T]:
        return iter(self.optimizers)

    def __len__(self) -> int:
        return len(self.optimizers)

    def step(self, *args, **kwargs) -> None:
        for optimizer in self.optimizers:
            optimizer.step(*args, **kwargs)

    def zero_grad(self, *args, **kwargs) -> None:
        for optimizer in self.optimizers:
            optimizer.zero_grad(*args, **kwargs)

    def state_dict(self) -> dict[str, Any]:
        func = functools.partial(
            get_optimizer_state_dict,
            options=StateDictOptions(flatten_optimizer_state_dict=True),
        )
        return {
            k: v
            for sd in map(func, self.model_parts, self.optimizers)
            for k, v in sd.items()
        }

    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        func = functools.partial(
            set_optimizer_state_dict,
            optim_state_dict=state_dict,
            options=StateDictOptions(flatten_optimizer_state_dict=True),
        )
        list(map(func, self.model_parts, self.optimizers))

    def _validate_length(self, expected_length: int) -> None:
        assert expected_length == len(self.optimizers), (
            "Must pass one optimizer per model part or per param if "
            "using OptimizersInBackwardContainer."
        )

    def _post_init(
        self, all_params: list[nn.Parameter], optimizer_kwargs: dict[str, Any]
    ) -> None:
        # We need to call Optimizer.__init__() to initialize some necessary optimizer
        # functionality such as hooks.
        Optimizer.__init__(self, all_params, optimizer_kwargs)


class OptimizersInBackwardContainer(OptimizersContainer):
    """OptimizersContainer for executing ``optim.step()`` in backward pass.

    This class extends ``OptimizersContainer`` to support optimizer step in
    backward pass. ``step()`` and ``zero_grad()`` are no-op in this class.
    Instead, ``register_post_accumulate_grad_hook`` is used to register a hook to
    execute these methods when the gradient is accumulated.
    """

    def __init__(
        self,
        model_parts: list[nn.Module],
        optimizer_cls: type[T],
        optimizer_kwargs: dict[str, Any],
    ) -> None:
        all_params = []
        self.model_parts = model_parts

        optim_dict = {}
        for model in self.model_parts:
            for p in model.parameters():
                if p.requires_grad:
                    optim_dict[p] = optimizer_cls([p], **optimizer_kwargs)
                    all_params.append(p)

        def optim_hook(param) -> None:
            optim_dict[param].step()
            optim_dict[param].zero_grad()

        for model in self.model_parts:
            for param in model.parameters():
                if param.requires_grad:
                    param.register_post_accumulate_grad_hook(optim_hook)

        self.optimizers = list(optim_dict.values())

        self._validate_length(
            sum(len(list(model.parameters())) for model in self.model_parts)
        )
        self._post_init(all_params, optimizer_kwargs)

    def step(self) -> None:
        pass

    def zero_grad(self) -> None:
        pass


class FTOptimizersContainer(OptimizersContainer):
    def __init__(
        self,
        model_parts: list[nn.Module],
        optimizer_cls: type[T],
        optimizer_kwargs: dict[str, Any],
        ft_manager: "ft.Manager",
    ) -> None:
        super().__init__(model_parts, optimizer_cls, optimizer_kwargs)

        # Force to initialize the optimizer state so that `optim.step()`
        # won't be called by state_dict() and load_state_dict().
        _ = {
            k: v
            for sd in map(get_optimizer_state_dict, model_parts, self.optimizers)
            for k, v in sd.items()
        }
        self.cache_state_dict: dict[str, Any] = {}
        self._ft_optimizer = ft.Optimizer(ft_manager, self)
        self._call_from_ft: bool = False

    def init_cache_state_dict(self) -> None:
        self.cache_state_dict = super().state_dict()

    def state_dict(self) -> dict[str, Any]:
        return self.cache_state_dict

    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        # We have to invalidate the `cache_state_dict` because optimizer uses
        # assign instead of copy when doing `load_state_dict()`. Without
        # invalidating the `cache_state_dict`, there will be memory leakage.
        self.cache_state_dict = {}
        super().load_state_dict(state_dict)
        self.init_cache_state_dict()

    def step(self, *args, **kwargs) -> None:
        """Calling the correct step() depending on the caller.

        TorchFT's OptimizerWrapper.step() is designed to be called only once
        per train step per ft.Manager regardless of how many optimizers are used.
        Hence we will need to appropriately dispatch the call.
        """
        if self._call_from_ft:
            super().step(*args, **kwargs)
        else:
            self._call_from_ft = True
            self._ft_optimizer.step(*args, **kwargs)
            self._call_from_ft = False

    def zero_grad(self, *args, **kwargs) -> None:
        """Calling the correct zero_grad() depending on the caller.

        Check the comment in ``step()``.
        """
        if self._call_from_ft:
            super().zero_grad(*args, **kwargs)
        else:
            self._call_from_ft = True
            self._ft_optimizer.zero_grad(*args, **kwargs)
            self._call_from_ft = False


def build_optimizers(
    model_parts: list[nn.Module],
    job_config: JobConfig,
    ft_manager: FTManager,
) -> OptimizersContainer:
    """Create an OptimizersContainer for the given model parts and job config.

    This function creates an ``OptimizersContainer`` for the given model parts.
    ``job_config`` should define the correct optimizer name and parameters.
    This function currently supports creating ``OptimizersContainer`` and
    ``OptimizersInBackwardContainer``.

    **Note**
    Users who want to customize the optimizer behavior can create their own
    ``OptimizersContainer`` subclass and ``build_optimizers``. Passing the
    customized ``build_optimizers`` to ``TrainSpec`` will create the customized
    ``OptimizersContainer``.

    Args:
        model_parts (List[nn.Module]): List of model parts to be optimized.
        job_config (JobConfig): Job config containing the optimizer name and parameters.
    """
    optim_in_bwd = job_config.optimizer.early_step_in_backward
    if optim_in_bwd and job_config.parallelism.pipeline_parallel_degree > 1:
        raise NotImplementedError(
            "Optimizers in backward is not supported with pipeline parallelism."
        )
    name = job_config.optimizer.name
    lr = job_config.optimizer.lr
    eps = job_config.optimizer.eps

    optim_implementation = job_config.optimizer.implementation
    assert optim_implementation in ["fused", "foreach", "for-loop"]

    fused = optim_implementation == "fused"
    foreach = optim_implementation == "foreach"

    optimizer_kwargs = {
        "lr": lr,
        "eps": eps,
        "betas": (0.9, 0.95),
        "weight_decay": 0.1,
        "fused": fused,
        "foreach": foreach,
    }

    optimizer_classes = {
        "Adam": torch.optim.Adam,
        "AdamW": torch.optim.AdamW,
    }
    if name not in optimizer_classes:
        raise NotImplementedError(f"Optimizer {name} not added.")
    optimizer_cls = optimizer_classes[name]

    if optim_in_bwd and ft_manager.enabled:
        raise ValueError("TorchFT is not supported with optimizers in backward.")
    elif optim_in_bwd:
        return OptimizersInBackwardContainer(
            model_parts, optimizer_cls, optimizer_kwargs
        )
    elif ft_manager.enabled:
        return FTOptimizersContainer(
            model_parts, optimizer_cls, optimizer_kwargs, ft_manager.manager
        )
    else:
        return OptimizersContainer(model_parts, optimizer_cls, optimizer_kwargs)
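For reference, a minimal standalone sketch (plain PyTorch, illustrative names only, not part of this commit) of the container pattern used above: one optimizer per model part, with `step()`/`zero_grad()` simply fanned out to every wrapped optimizer.

```python
import torch
import torch.nn as nn

# Two toy "model parts", as a pipeline split might produce.
parts = [nn.Linear(8, 8), nn.Linear(8, 2)]

# One AdamW instance per part, mirroring OptimizersContainer.__init__.
opts = [torch.optim.AdamW(p.parameters(), lr=3e-4) for p in parts]

x = torch.randn(4, 8)
loss = parts[1](parts[0](x)).sum()
loss.backward()

# The container delegates the calls to every wrapped optimizer.
for opt in opts:
    opt.step()
for opt in opts:
    opt.zero_grad()
```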
torchtitan/datasets/hf_datasets.py ADDED
@@ -0,0 +1,173 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from typing import Any, Callable

import torch

from datasets import Dataset, load_dataset
from datasets.distributed import split_dataset_by_node
from torch.distributed.checkpoint.stateful import Stateful
from torch.utils.data import IterableDataset

from torchtitan.components.dataloader import ParallelAwareDataloader
from torchtitan.components.tokenizer import Tokenizer
from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import logger


def _load_c4_dataset(dataset_path: str):
    """Load C4 dataset with default configuration."""
    return load_dataset(dataset_path, name="en", split="train", streaming=True)


def _process_c4_text(sample: dict[str, Any]) -> str:
    """Process C4 dataset sample text."""
    return sample["text"]


@dataclass
class DatasetConfig:
    path: str
    loader: Callable
    text_processor: Callable


# Add your dataset here - more information at docs/datasets.md
DATASETS = {
    "c4": DatasetConfig(
        path="allenai/c4",
        loader=_load_c4_dataset,
        text_processor=_process_c4_text,
    ),
    "c4_test": DatasetConfig(
        path="tests/assets/c4_test",
        loader=lambda path: load_dataset(path, split="train"),
        text_processor=_process_c4_text,
    ),
}


def _validate_dataset(
    dataset_name: str, dataset_path: str | None = None
) -> tuple[str, Callable, Callable]:
    """Validate dataset name and path."""
    if dataset_name not in DATASETS:
        raise ValueError(
            f"Dataset {dataset_name} is not supported. "
            f"Supported datasets are: {list(DATASETS.keys())}"
        )

    config = DATASETS[dataset_name]
    path = dataset_path or config.path
    logger.info(f"Preparing {dataset_name} dataset from {path}")
    return path, config.loader, config.text_processor


class HuggingFaceDataset(IterableDataset, Stateful):
    def __init__(
        self,
        dataset_name: str,
        dataset_path: str | None,
        tokenizer: Tokenizer,
        seq_len: int = 2048,
        dp_rank: int = 0,
        dp_world_size: int = 1,
        infinite: bool = False,
    ) -> None:
        # Force lowercase for consistent comparison
        dataset_name = dataset_name.lower()

        path, dataset_loader, text_processor = _validate_dataset(
            dataset_name, dataset_path
        )
        ds = dataset_loader(path)

        self.dataset_name = dataset_name
        self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
        self._tokenizer = tokenizer
        self.seq_len = seq_len
        self.infinite = infinite
        self._text_processor = text_processor

        # Variables for checkpointing
        self._sample_idx = 0
        self._all_tokens: list[int] = []

    def _get_data_iter(self):
        if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
            return iter([])

        it = iter(self._data)
        for _ in range(self._sample_idx):
            next(it)
        return it

    def __iter__(self):
        max_buffer_token_len = 1 + self.seq_len

        while True:
            for sample in self._get_data_iter():
                # Use the dataset-specific text processor
                sample_text = self._text_processor(sample)
                sample_tokens = self._tokenizer.encode(sample_text, bos=True, eos=True)
                self._all_tokens.extend(sample_tokens)
                self._sample_idx += 1

                while len(self._all_tokens) >= max_buffer_token_len:
                    x = torch.LongTensor(self._all_tokens[:max_buffer_token_len])
                    # update tokens to the remaining tokens
                    self._all_tokens = self._all_tokens[max_buffer_token_len:]
                    input = x[:-1]
                    label = x[1:]
                    yield {"input": input}, label

            if not self.infinite:
                logger.warning(f"Dataset {self.dataset_name} has run out of data")
                break
            else:
                # Reset offset for the next iteration
                self._sample_idx = 0
                logger.warning(f"Dataset {self.dataset_name} is being re-looped")

    def load_state_dict(self, state_dict):
        self._sample_idx = state_dict["sample_idx"]
        self._all_tokens = state_dict["token_buffer"]

    def state_dict(self):
        return {"token_buffer": self._all_tokens, "sample_idx": self._sample_idx}


def build_hf_dataloader(
    dp_world_size: int,
    dp_rank: int,
    tokenizer: Tokenizer,
    job_config: JobConfig,
    infinite: bool = True,
) -> ParallelAwareDataloader:
    """Build a data loader for HuggingFace datasets."""
    dataset_name = job_config.training.dataset
    dataset_path = job_config.training.dataset_path
    batch_size = job_config.training.batch_size
    seq_len = job_config.training.seq_len

    hf_ds = HuggingFaceDataset(
        dataset_name=dataset_name,
        dataset_path=dataset_path,
        tokenizer=tokenizer,
        seq_len=seq_len,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        infinite=infinite,
    )

    return ParallelAwareDataloader(
        dataset=hf_ds,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        batch_size=batch_size,
    )
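A minimal standalone sketch (toy data, not part of this commit) of the token-packing loop in `HuggingFaceDataset.__iter__` above: tokens from streamed documents accumulate in a buffer, fixed `seq_len + 1` chunks are sliced off, and each chunk becomes a next-token (input, label) pair.

```python
import torch

seq_len = 4
buffer: list[int] = []

# Pretend these are tokenized documents streaming in.
for doc_tokens in [[1, 2, 3], [4, 5, 6, 7, 8], [9, 10, 11, 12]]:
    buffer.extend(doc_tokens)
    while len(buffer) >= seq_len + 1:
        chunk = torch.LongTensor(buffer[: seq_len + 1])
        buffer = buffer[seq_len + 1:]
        inputs, labels = chunk[:-1], chunk[1:]  # shifted by one token
        print(inputs.tolist(), labels.tolist())
```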
torchtitan/datasets/tokenizer/tiktoken.py ADDED
@@ -0,0 +1,190 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

import os
from collections.abc import Collection, Iterator, Sequence, Set as AbstractSet
from pathlib import Path
from typing import cast, Literal

import tiktoken
from tiktoken.load import load_tiktoken_bpe

from torchtitan.components.tokenizer import Tokenizer
from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import logger


class TikTokenizer(Tokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.

    Args:
        model_path (str): The path to the Tiktoken model file.
    """

    special_tokens: dict[str, int]

    num_reserved_special_tokens = 256

    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501, B950

    def __init__(self, model_path: str):
        super().__init__()
        assert os.path.exists(
            model_path
        ), f"The tokenizer path does not exist: {model_path}"
        assert os.path.isfile(model_path), model_path

        mergeable_ranks = load_tiktoken_bpe(model_path)
        num_base_tokens = len(mergeable_ranks)
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, self.num_reserved_special_tokens - 5)
        ]
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )

        self._n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
        self.pad_id: int = -1
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }
        logger.info(
            f"TikTokenizer built: #words {self.n_words}, BOS ID {self.bos_id}, EOS ID {self.eos_id}"
        )

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        allowed_special: Literal["all"] | AbstractSet[str] | None = None,
        disallowed_special: Literal["all"] | Collection[str] | None = None,
    ) -> list[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            s (str): The input string to be encoded.
            bos (bool): Whether to prepend the beginning-of-sequence token.
            eos (bool): Whether to append the end-of-sequence token.
            allowed_special ("all"|set[str]): special tokens allowed in the string.
            disallowed_special ("all"|set[str]): special tokens that raise an error when found in the string.

        Returns:
            list[int]: A list of token IDs.

        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will cause all text corresponding
          to special tokens to be encoded as special tokens.
        """
        assert type(s) is str
        allowed_special = allowed_special or set()
        disallowed_special = disallowed_special or ()

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: list[int] = []
        for substr in substrs:
            t.extend(
                self.model.encode(
                    substr,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: Sequence[int]) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            t (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
        return self.model.decode(cast(list[int], t))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]


def build_tiktoken_tokenizer(job_config: JobConfig) -> TikTokenizer:
    return TikTokenizer(job_config.model.tokenizer_path)
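To make the chunking behavior of `_split_whitespaces_or_nonwhitespaces` above concrete, here is a standalone copy of the same logic with a tiny cap (3 characters) so the splits are visible; the function name and inputs are illustrative only, not part of this commit.

```python
def split_ws_or_nonws(s: str, max_len: int):
    # Same logic as TikTokenizer._split_whitespaces_or_nonwhitespaces,
    # reproduced standalone for illustration.
    current_len = 0
    current_is_space = s[0].isspace() if s else False
    start = 0
    for i, ch in enumerate(s):
        is_space = ch.isspace()
        if current_is_space ^ is_space:
            current_len = 1
            current_is_space = is_space
        else:
            current_len += 1
            if current_len > max_len:
                yield s[start:i]
                start = i
                current_len = 1
    yield s[start:]

print(list(split_ws_or_nonws("aaaa   bbbbbb", 3)))
# ['aaa', 'a   bbb', 'bbb'] -- no chunk contains a same-class run longer than 3
```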
torchtitan/distributed/__pycache__/pipeline.cpython-312.pyc ADDED
Binary file (7.82 kB).
 
torchtitan/distributed/__pycache__/utils.cpython-312.pyc ADDED
Binary file (14.9 kB).
 
torchtitan/experiments/deepseek_v3/LICENSE-CODE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 DeepSeek

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
torchtitan/experiments/deepseek_v3/README.md ADDED
@@ -0,0 +1,40 @@
# Running DeepSeek in Titan (experimental)

This folder contains a DeepSeek model supporting v2 and v3, as well as kernels
and scripts needed to run it.

## Inference

### Prerequisites

You will need to download a DeepSeek model's weights if you want to run a
pre-trained checkpoint. We provide a script to download the weights from the
HuggingFace Model Hub:
```bash
python download.py [vX]
```
where `vX` can be `v2` or `v3`; both are supported. You may be required to create a
HuggingFace account and log in first.

### Running inference

The inference script is in `generate.py`. You can run it with the following
command:
```bash
torchrun --standalone --nproc-per-node 4 generate.py
```
This will run inference on the `DeepSeek-V2-Lite-Chat` model using 4 GPUs by
default.

Alternatively, you can run inference by using `bash inference.sh`, optionally
followed by your prompt.

## Training

The training script is in `train.py`. You can run it with the following command:
```bash
torchrun --standalone --nproc-per-node 8 train.py
```

This will run training on the `DeepSeek-V2-Lite-Chat` model using 8 GPUs by
default, with pipeline parallel, expert parallel, and data parallel enabled.
torchtitan/experiments/deepseek_v3/generate.py ADDED
@@ -0,0 +1,308 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# torchrun --standalone --nproc-per-node 4 generate.py

# use inference.sh "Your Question Here?" to run inference with a single prompt.

import sys
from dataclasses import dataclass

import torch
import torch.distributed as dist

from checkpoint import load_weights_from_hf
from model import DeepseekForCausalLM
from model_config import deepseek_config_registry
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe
from torchtitan.tools.utils import Color
from transformers import AutoTokenizer

# Uncomment the model you want to run.
model_id, mesh_shape = "deepseek-ai/DeepSeek-V2-Lite-Chat", (1, 4)
# model_id, mesh_shape = "deepseek-ai/deepseek-v3", (8, 4)


def colorize_chat(text, user_color=None, assistant_color=None, output_color=None):
    """Parse and colorize chat output with optional colors for each role."""
    lines = text.split("\n")
    result = []

    current_role = None
    current_content = []

    def _process_current_content():
        if not current_role or not current_content:
            return None

        content = "\n".join(current_content)
        if current_role == "output":
            return (
                f"Output: {output_color}{content}{color.reset}"
                if output_color
                else f"Output: {content}"
            )
        else:
            try:
                prefix, rest = current_content[0].split(":", 1)
                role_color = user_color if current_role == "user" else assistant_color
                if role_color:
                    formatted = f"{prefix}:{role_color}{rest}{color.reset}"
                    if len(current_content) > 1:
                        formatted += (
                            f"{role_color}\n"
                            + "\n".join(current_content[1:])
                            + f"{color.reset}"
                        )
                    return formatted
            except ValueError:
                pass
            return content

    for line in lines:
        if line.startswith("Output:"):
            if processed := _process_current_content():
                result.append(processed)
            current_role = "output"
            content = line[len("Output:") :].strip()
            if output_color:
                content = f"Output: {output_color}{content}{color.reset}"
            else:
                content = f"Output: {content}"
            result.append(content)
            current_content = []

        elif line.startswith("User:"):
            if processed := _process_current_content():
                result.append(processed)
            current_role = "user"
            current_content = [line]

        elif line.startswith("Assistant:"):
            if processed := _process_current_content():
                result.append(processed)
            current_role = "assistant"
            current_content = [line]

        else:
            if current_content:
                current_content.append(line)
            elif line.strip() and current_role is None:
                # Handle system message at the beginning
                current_role = "output"
                if output_color:
                    result.append(f"Output: {output_color}{line.strip()}{color.reset}")
                else:
                    result.append(f"Output: {line.strip()}")

    # Process the last segment
    if processed := _process_current_content():
        result.append(processed)

    return "\n".join(result)


color = Color()


@dataclass
class DistConfig:
    mesh: DeviceMesh
    pp_mesh: DeviceMesh
    ep_mesh: DeviceMesh
    pp_size: int
    ep_size: int
    ep_rank: int
    pp_rank: int
    device: torch.device


def create_model(dist_config: DistConfig):
    model_args = deepseek_config_registry[model_id]
    model_args.ep_size = dist_config.ep_size
    model_args.num_stages = dist_config.pp_size
    model_args.stage_idx = dist_config.pp_rank
    model_args.max_seq_len = 16384

    with dist_config.device, dist_config.mesh:
        model = DeepseekForCausalLM(model_args)
        load_weights_from_hf(model, model_id, dist_config.device)
        model.eval()
        model.setup_symm_mem(torch.bfloat16, dist_config.device)

    stage = PipelineStage(
        model,
        dist_config.pp_rank,
        dist_config.pp_size,
        dist_config.device,
        group=dist_config.pp_mesh.get_group(),
    )
    pp_schedule = ScheduleGPipe(stage, dist_config.pp_size)
    return model, pp_schedule


def create_dist_config(mesh: DeviceMesh):
    rank = dist.get_rank()
    device_count = torch.cuda.device_count()
    device = torch.device("cuda", rank % device_count)

    dist_config = DistConfig(
        mesh=mesh,
        pp_mesh=mesh["pp"],
        ep_mesh=mesh["ep"],
        pp_rank=mesh["pp"].get_local_rank(),
        pp_size=mesh["pp"].size(),
        ep_size=mesh["ep"].size(),
        ep_rank=mesh["ep"].get_local_rank(),
        device=device,
    )
    return dist_config


def decode(tokenizer, x):
    output = tokenizer.decode(x[0])
    # Clean up the output by removing special tokens
    bos = tokenizer.bos_token
    output = output.replace(bos, "")
    # Truncate at end of sentence token
    eos_token = tokenizer.eos_token
    if eos_token and eos_token in output:
        output = output.split(eos_token)[0]
    colored_output = colorize_chat(
        output,
        user_color=color.green,
        assistant_color=color.cyan,
        output_color=color.blue,
    )
    return colored_output


@torch.inference_mode()
def generate(
    model,
    pp_schedule,
    tokenizer,
    dist_config,
    messages: list[dict],
    n_tokens: int = 50,
):
    rank = dist.get_rank()
    device = dist_config.device
    x = tokenizer.apply_chat_template(
        [messages] * dist_config.pp_size,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    next_idx = x.shape[-1]
    x = torch.cat([x, torch.zeros(x.shape[0], n_tokens, dtype=torch.int64)], dim=-1)
    x = x.to(device)

    for _ in range(n_tokens):
        if dist_config.pp_size > 1:
            if dist_config.pp_rank == 0:
                pp_schedule.step(x)
                torch.distributed.broadcast(
                    x,
                    group=dist_config.pp_mesh.get_group(),
                    group_src=dist_config.pp_size - 1,
                )
            elif dist_config.pp_rank == dist_config.pp_size - 1:
                preds = pp_schedule.step()
                next_token = torch.argmax(preds[:, next_idx - 1], dim=-1)
                x[:, next_idx] = next_token
                torch.distributed.broadcast(
                    x,
                    group=dist_config.pp_mesh.get_group(),
                    group_src=dist_config.pp_size - 1,
                )
            else:
                pp_schedule.step()
                torch.distributed.broadcast(
                    x,
                    group=dist_config.pp_mesh.get_group(),
                    group_src=dist_config.pp_size - 1,
                )

            next_idx += 1
        else:
            preds = model(x)
            next_token = torch.argmax(preds[:, next_idx - 1], dim=-1)
            x[:, next_idx] = next_token
            next_idx += 1

    if rank == 0:
        colored_output = decode(tokenizer, x)
        print(f"Without CUDA Graph:\n{colored_output}")


@torch.inference_mode()
def generate_with_cuda_graph(
    model,
    tokenizer,
    dist_config,
    messages: list[dict],
    n_tokens: int = 10,
):
    rank = dist.get_rank()
    device = dist_config.device
    x = tokenizer.apply_chat_template(
        [messages] * dist_config.pp_size,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    next_idx = x.shape[-1]
    x = torch.cat([x, torch.zeros(x.shape[0], n_tokens, dtype=torch.int64)], dim=-1)
    x = x.to(device)

    torch.cuda.synchronize()

    # Create CUDA graph
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        preds = model(x)

    # Run CUDA graph
    for _ in range(n_tokens):
        g.replay()
        next_token = torch.argmax(preds[:, next_idx - 1], dim=-1)
        x[:, next_idx] = next_token
        next_idx += 1

    if rank == 0:
        colored_output = decode(tokenizer, x)
        print(f"With CUDA Graph:\n{colored_output}")


if __name__ == "__main__":
    # Get user prompt from command line arguments
    user_prompt = "What is 2+2?"  # Default prompt
    if len(sys.argv) > 1:
        user_prompt = sys.argv[1]

    mesh = dist.init_device_mesh("cuda", mesh_shape, mesh_dim_names=("pp", "ep"))
    rank = dist.get_rank()
    if rank == 0:
        print(
            f"{color.yellow}Running inference with {model_id} on {mesh_shape} mesh{color.reset}"
        )

    dist_config = create_dist_config(mesh)
    model, pp_schedule = create_model(dist_config)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]

    generate(model, pp_schedule, tokenizer, dist_config, messages)
    generate_with_cuda_graph(model, tokenizer, dist_config, messages)

    if rank == 0:
        print(f"\n{color.yellow}Closing inference mesh...{color.reset}")

    dist.destroy_process_group()
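The single-device branch of `generate()` above reduces to a greedy argmax over the logits at the last filled position. A toy standalone sketch (the `dummy_model` below is a made-up stand-in for `DeepseekForCausalLM`, not part of this commit):

```python
import torch

vocab_size, n_new = 16, 5

def dummy_model(tokens: torch.Tensor) -> torch.Tensor:
    # Stand-in for a causal LM: returns per-position logits.
    return torch.randn(tokens.shape[0], tokens.shape[1], vocab_size)

x = torch.randint(0, vocab_size, (1, 4))                       # prompt tokens
x = torch.cat([x, torch.zeros(1, n_new, dtype=torch.int64)], dim=-1)
next_idx = 4
for _ in range(n_new):
    preds = dummy_model(x)
    next_token = torch.argmax(preds[:, next_idx - 1], dim=-1)  # greedy pick
    x[:, next_idx] = next_token
    next_idx += 1
print(x)
```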
torchtitan/experiments/deepseek_v3/indices.py ADDED
@@ -0,0 +1,195 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
import triton
import triton.language as tl


__all__ = ["generate_permute_indices"]


@triton.jit
def fill_indices_kernel(
    tokens_per_expert_group_ptr,  # *Pointer* to first input vector.
    start_index_values_ptr,  # *Pointer* to second input vector.
    write_offsets_ptr,  # *Pointer* to third input vector.
    output_ptr,  # *Pointer* to output vector.
    experts_per_rank,  # Number of experts per rank.
    num_ranks,  # Number of expert ranks.
):
    # There are multiple 'programs' processing different data. We identify which program
    # we are here:
    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
    # The total number of programs in the launch grid.
    num_programs = tl.num_programs(axis=0)
    # We map the programs (blocks) to the experts.
    for expert_id in tl.range(pid, experts_per_rank, step=num_programs):
        # Read this expert's write offset.
        write_offset = tl.load(write_offsets_ptr + expert_id)
        # Loop over the ranks.
        for r in tl.range(num_ranks):
            # Slot in the tokens_per_expert_group array.
            i = r * experts_per_rank + expert_id
            start_index = tl.load(start_index_values_ptr + i)
            length = tl.load(tokens_per_expert_group_ptr + i)
            # Write the indices.
            for l in tl.range(length):
                val = start_index + l
                tl.store(output_ptr + write_offset + l, val)
            write_offset += length


def fill_indices(
    tokens_per_expert_group: torch.Tensor,
    start_index_values: torch.Tensor,
    write_offsets: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
):
    # We need to preallocate the output.
    permuted_indices = torch.full(
        (max_len,), -1, dtype=torch.int32, device=tokens_per_expert_group.device
    )
    # Analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
    # In this case, we use a 1D grid where the size is the number of blocks (TODO: bump this value).
    grid = lambda meta: (1,)
    # Each torch.tensor object is implicitly converted into a pointer to its first element.
    fill_indices_kernel[grid](
        tokens_per_expert_group,
        start_index_values,
        write_offsets,
        permuted_indices,
        experts_per_rank,
        num_ranks,
    )
    return permuted_indices


def fill_indices_cpu(
    tokens_per_expert_group: torch.Tensor,
    start_index_values: torch.Tensor,
    write_offsets: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
):
    # We need to preallocate the output.
    permuted_indices = torch.full((max_len,), -1, dtype=torch.int32)
    # Fill the permuted indices
    # For each local expert
    for e in range(experts_per_rank):
        write_start = write_offsets[e]
        # For each remote rank
        for r in range(num_ranks):
            i = r * experts_per_rank + e
            start_index = start_index_values[i]
            length = tokens_per_expert_group[i]
            # Fill in the indices
            permuted_indices[write_start : write_start + length] = torch.arange(
                start_index, start_index + length
            )
            write_start += length
    return permuted_indices


def generate_permute_indices(
    tokens_per_expert_group: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
    alignment: int,
    use_cpu: bool = False,
):
    # Prepare permutation indices and the number of tokens for each expert. The
    # permutation indices are the indices of the tokens for each expert. The
    # number of tokens for each expert is the sum of the number of tokens for
    # such experts from all ranks. This number is aligned to the provided
    # alignment requirement (usually comes from group gemm).

    # Args:
    #     tokens_per_expert_group: number of tokens for each expert from all ranks.
    #     experts_per_rank: number of experts per rank.
    #     num_ranks: number of ranks.
    #     max_len: maximum length of the output index vector. If greater than
    #         total number of tokens, the remaining indices are set to -1.
    #     alignment: alignment for each returned element in `m_sizes`.
    #     use_cpu: whether to use cpu or gpu.
    # Returns:
    #     permuted_indices: permutation indices.
    #     m_sizes: number of tokens for each expert.

    # `tokens_per_expert_group` is of shape (num_ranks * experts_per_rank,), for example:
    # From: |       rank 0      |       rank 1      |
    # To:   | E0 | E1 | E2 | E3 | E0 | E1 | E2 | E3 |
    #       | 4  | 2  | 1  | 3  | 1  | 2  | 3  | 4  |

    # Prefix sum to get the start index value of each expert
    start_index_values = (
        torch.cumsum(tokens_per_expert_group, 0) - tokens_per_expert_group
    )
    # Chunk sizes for each expert
    chunk_size_per_expert = tokens_per_expert_group.view(num_ranks, -1).sum(0)
    # Align the chunk sizes to the given alignment
    m_sizes = ((chunk_size_per_expert + alignment - 1) // alignment * alignment).to(
        torch.int32
    )
    # Perform another prefix sum to get the write offset of each expert in `permuted_indices`
    write_offsets = torch.cumsum(m_sizes, 0) - m_sizes
    # Select the method to fill the permuted indices
    fill_fn = fill_indices_cpu if use_cpu else fill_indices
    # Fill the permuted indices
    permuted_indices = fill_fn(
        tokens_per_expert_group,
        start_index_values,
        write_offsets,
        experts_per_rank,
        num_ranks,
        max_len,
    )
    return permuted_indices, m_sizes


# Below is for testing only


def test():
    device = torch.device("cuda", 0)
    experts_per_rank = 4
    num_ranks = 4
    tokens_per_expert_group = torch.full(
        (num_ranks * experts_per_rank,), 4, dtype=torch.int32, device=device
    )
    max_len = 128
    alignment = 32
    # Use the GPU kernel
    permuted_indices_gpu, m_sizes = generate_permute_indices(
        tokens_per_expert_group, experts_per_rank, num_ranks, max_len, alignment
    )
    # Use the CPU method
    permuted_indices_cpu, _ = generate_permute_indices(
        tokens_per_expert_group,
        experts_per_rank,
        num_ranks,
        max_len,
        alignment,
        use_cpu=True,
    )
    # Check that the results are the same
    assert torch.equal(permuted_indices_gpu.cpu(), permuted_indices_cpu)
    assert torch.equal(
        torch.remainder(m_sizes, alignment),
        torch.zeros(experts_per_rank, device=device),
    )
    # Print the results
    print(permuted_indices_gpu)
    print(m_sizes)
    print("Success")


if __name__ == "__main__":
    test()
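To illustrate the prefix-sum bookkeeping in `generate_permute_indices` above, here is a CPU-only sketch that evaluates it on the example from the comments (two ranks, four experts each); the alignment of 4 is chosen here for readability, whereas the file's own test uses 32.

```python
import torch

# | rank 0        | rank 1        |   (4 experts per rank)
tokens_per_expert_group = torch.tensor([4, 2, 1, 3, 1, 2, 3, 4])
alignment, num_ranks = 4, 2

start_index_values = torch.cumsum(tokens_per_expert_group, 0) - tokens_per_expert_group
chunk_size_per_expert = tokens_per_expert_group.view(num_ranks, -1).sum(0)
m_sizes = (chunk_size_per_expert + alignment - 1) // alignment * alignment
write_offsets = torch.cumsum(m_sizes, 0) - m_sizes

print(start_index_values.tolist())    # [0, 4, 6, 7, 10, 11, 13, 16]
print(chunk_size_per_expert.tolist())  # [5, 4, 4, 7]  tokens per expert, summed over ranks
print(m_sizes.tolist())                # [8, 4, 4, 8]  rounded up to the alignment
print(write_offsets.tolist())          # [0, 8, 12, 16]
```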
torchtitan/experiments/deepseek_v3/symm_mem_recipes/__init__.py ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .triton_on_device_all_to_all_v import OnDeviceAllToAllV

__all__ = [
    "OnDeviceAllToAllV",
]
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ import torch.distributed._symmetric_memory as symm_mem
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ from .triton_barrier import blockwise_barrier
14
+ from .triton_utils import sync_threads
15
+
16
+
17
+ @triton.jit
18
+ def _exchange_row_offsets(
19
+ split_sizes_ptrs,
20
+ rank: tl.constexpr,
21
+ world_size: tl.constexpr,
22
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
23
+ ):
24
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
25
+
26
+ # split_sizes_ptr for all ranks
27
+ # All these vector stacks into split_sizes_matrix
28
+ split_sizes_ptrs = split_sizes_ptrs.to(tl.pointer_type(tl.uint64))
29
+
30
+ # split_sizes_matrix[remote_rank, :]
31
+ input_split_sizes_ptr = tl.load(split_sizes_ptrs + remote_rank).to(
32
+ tl.pointer_type(tl.int64)
33
+ )
34
+
35
+ offsets_ = tl.arange(0, world_size)
36
+ input_split_sizes = tl.load(
37
+ input_split_sizes_ptr + offsets_, mask=offsets_ <= rank, other=0
38
+ )
39
+
40
+ num_rows = tl.load(input_split_sizes_ptr + rank)
41
+ input_row_offset = tl.sum(input_split_sizes) - num_rows
42
+
43
+ # split_sizes_matrix[:, rank]
44
+ output_split_sizes_ptrs = (
45
+ tl.load(split_sizes_ptrs + offsets_).to(tl.pointer_type(tl.int64)) + rank
46
+ )
47
+ output_split_sizes = tl.load(
48
+ output_split_sizes_ptrs, mask=offsets_ <= remote_rank, other=0
49
+ )
50
+ output_row_offset = tl.sum(output_split_sizes) - num_rows
51
+
52
+ return input_row_offset, output_row_offset, num_rows
53
+
54
+
55
+ @triton.jit
56
+ def on_device_all_to_all_v_kernel(
57
+ output_ptr,
58
+ output_splits_ptr,
59
+ input_ptrs,
60
+ input_splits_ptr,
61
+ signal_pad_ptrs,
62
+ dim: tl.constexpr, # Separate dim for easier vectorization
63
+ rank: tl.constexpr,
64
+ world_size: tl.constexpr,
65
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
66
+ UNROLL_FACTOR: tl.constexpr,
67
+ BLOCK_SIZE: tl.constexpr,
68
+ ):
69
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
70
+ sync_threads()
71
+
72
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
73
+ block_offset = tl.program_id(0) % BLOCKS_PER_REMOTE_RANK
74
+
75
+ input_row_offset, output_row_offset, num_rows = _exchange_row_offsets(
76
+ input_splits_ptr, rank, world_size, BLOCKS_PER_REMOTE_RANK
77
+ )
78
+
79
+ output_splits_ptr = output_splits_ptr.to(tl.pointer_type(tl.uint64))
80
+ if block_offset == 0:
81
+ # Update output_splits
82
+ tl.store(output_splits_ptr + remote_rank, num_rows)
83
+
84
+ input_ptr = (
85
+ tl.load(input_ptrs.to(tl.pointer_type(tl.uint64)) + remote_rank).to(
86
+ tl.pointer_type(tl.bfloat16)
87
+ )
88
+ + input_row_offset * dim
89
+ )
90
+ output_ptr = output_ptr + output_row_offset * dim
91
+
92
+ outer_loop_step = BLOCK_SIZE * UNROLL_FACTOR
93
+ outer_loop_iters_per_rank = tl.cdiv(
94
+ tl.cdiv(num_rows * dim, outer_loop_step), BLOCKS_PER_REMOTE_RANK
95
+ )
96
+ numel_per_rank = outer_loop_step * outer_loop_iters_per_rank
97
+ offset = numel_per_rank * block_offset
98
+ end = tl.minimum(numel_per_rank * (block_offset + 1), num_rows * dim)
99
+
100
+ unroll_region_size = (end - offset) // outer_loop_step * outer_loop_step
101
+ for i in tl.range(offset, offset + unroll_region_size, outer_loop_step):
102
+ datas = []
103
+ for j in tl.range(
104
+ i,
105
+ i + outer_loop_step,
106
+ BLOCK_SIZE,
107
+ loop_unroll_factor=UNROLL_FACTOR,
108
+ ):
109
+ offsets = j + tl.arange(0, BLOCK_SIZE)
110
+ data = tl.load(input_ptr + offsets)
111
+ tl.store(output_ptr + offsets, data)
112
+
113
+ offset += unroll_region_size
114
+ while offset < end:
115
+ offsets = offset + tl.arange(0, BLOCK_SIZE)
116
+ mask = offsets < num_rows * dim
117
+ data = tl.load(input_ptr + offsets, mask=mask)
118
+ tl.store(output_ptr + offsets, data, mask=mask)
119
+ offset += BLOCK_SIZE
120
+
121
+ sync_threads()
122
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
123
+ return
124
+
125
+
126
+ def _on_device_all_to_all_v(
127
+ output: torch.Tensor,
128
+ output_splits: torch.Tensor,
129
+ input: torch.Tensor,
130
+ input_splits: torch.Tensor,
131
+ group: dist.ProcessGroup = dist.group.WORLD,
132
+ BLOCKS_PER_REMOTE_RANK=8,
133
+ UNROLL_FACTOR: int = 8,
134
+ BLOCK_SIZE: int = 16384,
135
+ ):
136
+ assert output.dim() == 2, f"{output.shape}"
137
+ assert input.dim() == 2, f"{input.shape}"
138
+ assert output.shape[1] == input.shape[1]
139
+
140
+ dim = output.shape[1]
141
+ input_hdl = symm_mem.rendezvous(input, group=group)
142
+ input_splits_hdl = symm_mem.rendezvous(input_splits, group=group)
143
+
144
+ num_blocks = input_hdl.world_size * BLOCKS_PER_REMOTE_RANK
145
+ kernel = on_device_all_to_all_v_kernel[(num_blocks, 1, 1)](
146
+ output,
147
+ output_splits,
148
+ input_hdl.buffer_ptrs_dev,
149
+ input_splits_hdl.buffer_ptrs_dev,
150
+ input_hdl.signal_pad_ptrs_dev,
151
+ dim=dim,
152
+ rank=input_hdl.rank,
153
+ world_size=input_hdl.world_size,
154
+ BLOCKS_PER_REMOTE_RANK=BLOCKS_PER_REMOTE_RANK,
155
+ UNROLL_FACTOR=UNROLL_FACTOR,
156
+ BLOCK_SIZE=BLOCK_SIZE,
157
+ num_warps=16,
158
+ )
159
+ # log_triton_kernel(kernel)
160
+ return output
161
+
162
+
163
+ class OnDeviceAllToAllV(torch.autograd.Function):
164
+ # A symmetric memory holding the grad_output during backward
165
+ grad_output_buf = None
166
+ # A symmetric memory for exchanges split sizes during both forward and backward
167
+ splits_buf = None
168
+ # Maximum output length (need to be set before use of OnDeviceAllToAllV)
169
+ max_output_len = None
170
+
171
+ @staticmethod
172
+ def forward(
173
+ ctx,
174
+ input: torch.Tensor,
175
+ input_splits: torch.Tensor,
176
+ group: dist.ProcessGroup = dist.group.WORLD,
177
+ ):
178
+ """
179
+ Args:
180
+ input: input tensor with data for all ranks concatenated.
181
+ input_splits: input splits of shape (group.world_size,)
182
+ group: process group to scope the collective.
183
+ """
184
+ # Initialize input splits buffer (one time only)
185
+ if OnDeviceAllToAllV.splits_buf is None:
186
+ OnDeviceAllToAllV.splits_buf = symm_mem.empty(
187
+ *input_splits.shape,
188
+ dtype=input_splits.dtype,
189
+ device=input_splits.device,
190
+ )
191
+
192
+ if OnDeviceAllToAllV.max_output_len is None:
193
+ raise RuntimeError(
194
+ "Please set max output length via `OnDeviceAllToAllV.max_output_len = ...`"
195
+ )
196
+
197
+ # Allocate output buffer
198
+ output = input.new_empty(OnDeviceAllToAllV.max_output_len, *input.shape[1:])
199
+ # Allocate output splits tensor
200
+ output_splits = torch.empty_like(input_splits)
201
+ # Copy input splits to the buffer
202
+ OnDeviceAllToAllV.splits_buf.copy_(input_splits)
203
+
204
+ # Shuffle input to output
205
+ _on_device_all_to_all_v(
206
+ output, output_splits, input, OnDeviceAllToAllV.splits_buf, group=group
207
+ )
208
+
209
+ # Output splits in forward is the input splits in backward
210
+ ctx.save_for_backward(output_splits)
211
+ ctx.group = group
212
+ ctx.input_shape = input.shape
213
+ return output, output_splits
214
+
215
+ @staticmethod
216
+ def backward(ctx, grad_output, grad_splits):
217
+ """
218
+ Backward is implemented as a shuffle of the output's gradients to the input.
219
+ Args:
220
+ `grad_output`: output's gradients passed from the downstream.
221
+ `grad_splits`: unused.
222
+ """
223
+
224
+ # Initialize grad_output buffer (one time only)
225
+ if OnDeviceAllToAllV.grad_output_buf is None:
226
+ assert (
227
+ OnDeviceAllToAllV.max_output_len is not None
228
+ ), "`max_output_len` not set"
229
+ OnDeviceAllToAllV.grad_output_buf = symm_mem.empty(
230
+ OnDeviceAllToAllV.max_output_len,
231
+ *grad_output.shape[1:],
232
+ dtype=grad_output.dtype,
233
+ device=grad_output.device,
234
+ )
235
+
236
+ # TODO: is there a way to tell autograd to feed grad_output directly to
237
+ # our symm_mem buffer?
238
+ OnDeviceAllToAllV.grad_output_buf.narrow(0, 0, grad_output.shape[0]).copy_(
239
+ grad_output
240
+ )
241
+
242
+ # Size info
243
+ (grad_output_splits,) = ctx.saved_tensors
244
+ OnDeviceAllToAllV.splits_buf.copy_(grad_output_splits)
245
+ grad_input_splits = torch.empty_like(grad_output_splits) # unused
246
+ grad_input = grad_output.new_empty(*ctx.input_shape)
247
+
248
+ # Shuffle gradients back to the input
249
+ _on_device_all_to_all_v(
250
+ grad_input,
251
+ grad_input_splits,
252
+ OnDeviceAllToAllV.grad_output_buf,
253
+ OnDeviceAllToAllV.splits_buf,
254
+ group=ctx.group,
255
+ )
256
+ return grad_input, None, None
257
+
258
+
259
+ # Alias
260
+ on_device_all_to_all_v = OnDeviceAllToAllV.apply
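The autograd wrapper above requires `OnDeviceAllToAllV.max_output_len` to be set before the first call, and its forward pass rendezvous-es the `input` tensor, so that tensor has to live in symmetric memory. Below is a minimal, hypothetical usage sketch (not part of this commit): the even split sizes and shapes are illustrative assumptions, the import is indicated as a comment because it depends on the package layout, and it presumes `torch.distributed` and `torch.distributed._symmetric_memory` are initialized on every rank.

```python
import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem

# from symm_mem_recipes import OnDeviceAllToAllV, on_device_all_to_all_v  # assumed import path


def run_example(dim: int = 128, rows_per_rank: int = 1024):
    group = dist.group.WORLD
    world_size = dist.get_world_size(group)

    # Forward rendezvous-es `input`, so allocate it in symmetric memory.
    inp = symm_mem.empty(rows_per_rank, dim, dtype=torch.bfloat16, device="cuda")
    inp.normal_()

    # Rows this rank sends to each peer; an even split is assumed for illustration.
    input_splits = torch.full(
        (world_size,), rows_per_rank // world_size, dtype=torch.int64, device="cuda"
    )

    # Must be set once before the first call: upper bound on rows any rank receives.
    OnDeviceAllToAllV.max_output_len = rows_per_rank * world_size

    output, output_splits = on_device_all_to_all_v(inp, input_splits, group)
    return output, output_splits
```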
torchtitan/experiments/flux/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # FLUX model in torchtitan
2
+
3
+ ## Overview
4
+
5
+ ## Usage
6
+ First, download the autoencoder model from HuggingFace with your own access token:
7
+ ```bash
8
+ python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token <your_access_token>
9
+ ```
10
+ This step will download the autoencoder model from HuggingFace and save it to the `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors` file.
11
+
12
+ Run the following command to train the model on a single GPU:
13
+ ```bash
14
+ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml
15
+ ```
16
+
17
+ ## TODO
18
+ - [ ] Support for multiple GPUs is coming soon (FSDP, etc.)
19
+ - [ ] Implement CI test cases for the FLUX model and add more unit tests (e.g., for the preprocessor)
20
+ - [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc.)
21
+ - [ ] Support for distributed checkpointing and loading
22
+ - [ ] Implement init_weights() function to initialize the model weights
23
+ - [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
torchtitan/experiments/flux/__init__.py ADDED
@@ -0,0 +1,122 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
8
+
9
+ from torchtitan.components.lr_scheduler import build_lr_schedulers
10
+ from torchtitan.components.optimizer import build_optimizers
11
+ from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
12
+ from torchtitan.experiments.flux.loss import build_mse_loss
13
+ from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams
14
+ from torchtitan.experiments.flux.parallelize_flux import parallelize_flux
15
+ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
16
+
17
+ from .model.model import FluxModel, FluxModelArgs
18
+
19
+ __all__ = [
20
+ "FluxModelArgs",
21
+ "FluxModel",
22
+ "flux_configs",
23
+ "parallelize_flux",
24
+ ]
25
+
26
+
27
+ flux_configs = {
28
+ "flux-dev": FluxModelArgs(
29
+ in_channels=64,
30
+ out_channels=64,
31
+ vec_in_dim=768,
32
+ context_in_dim=512,
33
+ hidden_size=3072,
34
+ mlp_ratio=4.0,
35
+ num_heads=24,
36
+ depth=19,
37
+ depth_single_blocks=38,
38
+ axes_dim=(16, 56, 56),
39
+ theta=10_000,
40
+ qkv_bias=True,
41
+ guidance_embed=True,
42
+ autoencoder_params=AutoEncoderParams(
43
+ resolution=256,
44
+ in_channels=3,
45
+ ch=128,
46
+ out_ch=3,
47
+ ch_mult=(1, 2, 4, 4),
48
+ num_res_blocks=2,
49
+ z_channels=16,
50
+ scale_factor=0.3611,
51
+ shift_factor=0.1159,
52
+ ),
53
+ ),
54
+ "flux-schnell": FluxModelArgs(
55
+ in_channels=64,
56
+ out_channels=64,
57
+ vec_in_dim=768,
58
+ context_in_dim=4096,
59
+ hidden_size=3072,
60
+ mlp_ratio=4.0,
61
+ num_heads=24,
62
+ depth=19,
63
+ depth_single_blocks=38,
64
+ axes_dim=(16, 56, 56),
65
+ theta=10_000,
66
+ qkv_bias=True,
67
+ guidance_embed=False,
68
+ autoencoder_params=AutoEncoderParams(
69
+ resolution=256,
70
+ in_channels=3,
71
+ ch=128,
72
+ out_ch=3,
73
+ ch_mult=(1, 2, 4, 4),
74
+ num_res_blocks=2,
75
+ z_channels=16,
76
+ scale_factor=0.3611,
77
+ shift_factor=0.1159,
78
+ ),
79
+ ),
80
+ "flux-debug": FluxModelArgs(
81
+ in_channels=64,
82
+ out_channels=64,
83
+ vec_in_dim=768,
84
+ context_in_dim=512,
85
+ hidden_size=512,
86
+ mlp_ratio=4.0,
87
+ num_heads=4,
88
+ depth=2,
89
+ depth_single_blocks=2,
90
+ axes_dim=(16, 56, 56),
91
+ theta=10_000,
92
+ qkv_bias=True,
93
+ guidance_embed=True,
94
+ autoencoder_params=AutoEncoderParams(
95
+ resolution=256,
96
+ in_channels=3,
97
+ ch=128,
98
+ out_ch=3,
99
+ ch_mult=(1, 2, 4, 4),
100
+ num_res_blocks=2,
101
+ z_channels=16,
102
+ scale_factor=0.3611,
103
+ shift_factor=0.1159,
104
+ ),
105
+ ),
106
+ }
107
+
108
+
109
+ register_train_spec(
110
+ TrainSpec(
111
+ name="flux",
112
+ cls=FluxModel,
113
+ config=flux_configs,
114
+ parallelize_fn=parallelize_flux,
115
+ pipelining_fn=None,
116
+ build_optimizers_fn=build_optimizers,
117
+ build_lr_schedulers_fn=build_lr_schedulers,
118
+ build_dataloader_fn=build_flux_dataloader,
119
+ build_tokenizer_fn=None,
120
+ build_loss_fn=build_mse_loss,
121
+ )
122
+ )
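The registered configurations can be used directly to instantiate a model. A minimal sketch, assuming the flux experiment package is importable:

```python
# Sketch only: pick a registered config and build the model from it.
from torchtitan.experiments.flux import flux_configs
from torchtitan.experiments.flux.model.model import FluxModel

model_args = flux_configs["flux-debug"]          # small config for smoke tests
model = FluxModel.from_model_args(model_args)    # equivalent to FluxModel(model_args)
```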
torchtitan/experiments/flux/dataset/tokenizer.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
8
+ # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
9
+
10
+
11
+ from typing import List
12
+
13
+ from torchtitan.components.tokenizer import Tokenizer
14
+ from transformers import CLIPTokenizer, T5Tokenizer
15
+
16
+
17
+ class FluxTokenizer(Tokenizer):
18
+ """
19
+ Tokenizing and encoding/decoding text using the T5 or CLIP tokenizer.
20
+
21
+ Args:
22
+ model_path (str): Path to the tokenizer on Hugging Face.
23
+
24
+ """
25
+
26
+ def __init__(self, model_path: str = "t5-small", max_length: int = 77):
27
+ super().__init__()
28
+ self._n_words = 8 # TODO(jianiw): check
29
+ self._max_length = max_length
30
+
31
+ self.is_clip = model_path.startswith("openai")
32
+
33
+ if self.is_clip:
34
+ self._tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
35
+ model_path, max_length=max_length
36
+ )
37
+ else:
38
+ self._tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
39
+ model_path, max_length=max_length
40
+ )
41
+
42
+ def encode(
43
+ self,
44
+ s: str,
45
+ ) -> List[int]:
46
+ """
47
+ Encode the prompt text into tokens.
48
+ """
49
+ tokens = self._tokenizer(
50
+ s,
51
+ truncation=True,
52
+ max_length=self._max_length,
53
+ return_length=False,
54
+ return_overflowing_tokens=False,
55
+ padding="max_length",
56
+ return_tensors="pt", # return pytorch tensors, default return List[int]
57
+ )["input_ids"]
58
+ return tokens
59
+
60
+ def decode(self, t: List[int]) -> str:
61
+ """
62
+ Decode tokens back into text. This function is not expected to be called.
63
+ """
64
+ return self._tokenizer.decode(t)
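A short usage sketch for the tokenizer above; the model names mirror the ones used in the debug config and tests, and are assumptions about which checkpoints are available:

```python
# Sketch only: encode a prompt with the two tokenizers FLUX conditions on.
from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer

t5_tok = FluxTokenizer("google/t5-v1_1-small", max_length=512)
clip_tok = FluxTokenizer("openai/clip-vit-large-patch14", max_length=77)

prompt = "a photo of a forest with mist"
t5_tokens = t5_tok.encode(prompt)      # tensor of shape [1, 512] (padded to max_length)
clip_tokens = clip_tok.encode(prompt)  # tensor of shape [1, 77]
```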
torchtitan/experiments/flux/model/autoencoder.py ADDED
@@ -0,0 +1,388 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ from dataclasses import dataclass
9
+
10
+ import torch
11
+ from einops import rearrange
12
+ from safetensors.torch import load_file as load_sft
13
+ from torch import nn, Tensor
14
+
15
+
16
+ @dataclass
17
+ class AutoEncoderParams:
18
+ resolution: int = 256
19
+ in_channels: int = 3
20
+ ch: int = 128
21
+ out_ch: int = 3
22
+ ch_mult: tuple[int] = (1, 2, 4, 4)
23
+ num_res_blocks: int = 2
24
+ z_channels: int = 16
25
+ scale_factor: float = 0.3611
26
+ shift_factor: float = 0.1159
27
+
28
+
29
+ def swish(x: Tensor) -> Tensor:
30
+ return x * torch.sigmoid(x)
31
+
32
+
33
+ class AttnBlock(nn.Module):
34
+ def __init__(self, in_channels: int):
35
+ super().__init__()
36
+ self.in_channels = in_channels
37
+
38
+ self.norm = nn.GroupNorm(
39
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
40
+ )
41
+
42
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
43
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
44
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
45
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
46
+
47
+ def attention(self, h_: Tensor) -> Tensor:
48
+ h_ = self.norm(h_)
49
+ q = self.q(h_)
50
+ k = self.k(h_)
51
+ v = self.v(h_)
52
+
53
+ b, c, h, w = q.shape
54
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
55
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
56
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
57
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
58
+
59
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
60
+
61
+ def forward(self, x: Tensor) -> Tensor:
62
+ return x + self.proj_out(self.attention(x))
63
+
64
+
65
+ class ResnetBlock(nn.Module):
66
+ def __init__(self, in_channels: int, out_channels: int):
67
+ super().__init__()
68
+ self.in_channels = in_channels
69
+ out_channels = in_channels if out_channels is None else out_channels
70
+ self.out_channels = out_channels
71
+
72
+ self.norm1 = nn.GroupNorm(
73
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
74
+ )
75
+ self.conv1 = nn.Conv2d(
76
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
77
+ )
78
+ self.norm2 = nn.GroupNorm(
79
+ num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
80
+ )
81
+ self.conv2 = nn.Conv2d(
82
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
83
+ )
84
+ if self.in_channels != self.out_channels:
85
+ self.nin_shortcut = nn.Conv2d(
86
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
87
+ )
88
+
89
+ def forward(self, x):
90
+ h = x
91
+ h = self.norm1(h)
92
+ h = swish(h)
93
+ h = self.conv1(h)
94
+
95
+ h = self.norm2(h)
96
+ h = swish(h)
97
+ h = self.conv2(h)
98
+
99
+ if self.in_channels != self.out_channels:
100
+ x = self.nin_shortcut(x)
101
+
102
+ return x + h
103
+
104
+
105
+ class Downsample(nn.Module):
106
+ def __init__(self, in_channels: int):
107
+ super().__init__()
108
+ # no asymmetric padding in torch conv, must do it ourselves
109
+ self.conv = nn.Conv2d(
110
+ in_channels, in_channels, kernel_size=3, stride=2, padding=0
111
+ )
112
+
113
+ def forward(self, x: Tensor):
114
+ pad = (0, 1, 0, 1)
115
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
116
+ x = self.conv(x)
117
+ return x
118
+
119
+
120
+ class Upsample(nn.Module):
121
+ def __init__(self, in_channels: int):
122
+ super().__init__()
123
+ self.conv = nn.Conv2d(
124
+ in_channels, in_channels, kernel_size=3, stride=1, padding=1
125
+ )
126
+
127
+ def forward(self, x: Tensor):
128
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
129
+ x = self.conv(x)
130
+ return x
131
+
132
+
133
+ class Encoder(nn.Module):
134
+ def __init__(
135
+ self,
136
+ resolution: int,
137
+ in_channels: int,
138
+ ch: int,
139
+ ch_mult: list[int],
140
+ num_res_blocks: int,
141
+ z_channels: int,
142
+ ):
143
+ super().__init__()
144
+ self.ch = ch
145
+ self.num_resolutions = len(ch_mult)
146
+ self.num_res_blocks = num_res_blocks
147
+ self.resolution = resolution
148
+ self.in_channels = in_channels
149
+ # downsampling
150
+ self.conv_in = nn.Conv2d(
151
+ in_channels, self.ch, kernel_size=3, stride=1, padding=1
152
+ )
153
+
154
+ curr_res = resolution
155
+ in_ch_mult = (1,) + tuple(ch_mult)
156
+ self.in_ch_mult = in_ch_mult
157
+ self.down = nn.ModuleList()
158
+ block_in = self.ch
159
+ for i_level in range(self.num_resolutions):
160
+ block = nn.ModuleList()
161
+ attn = nn.ModuleList()
162
+ block_in = ch * in_ch_mult[i_level]
163
+ block_out = ch * ch_mult[i_level]
164
+ for _ in range(self.num_res_blocks):
165
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
166
+ block_in = block_out
167
+ down = nn.Module()
168
+ down.block = block
169
+ down.attn = attn
170
+ if i_level != self.num_resolutions - 1:
171
+ down.downsample = Downsample(block_in)
172
+ curr_res = curr_res // 2
173
+ self.down.append(down)
174
+
175
+ # middle
176
+ self.mid = nn.Module()
177
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
178
+ self.mid.attn_1 = AttnBlock(block_in)
179
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
180
+
181
+ # end
182
+ self.norm_out = nn.GroupNorm(
183
+ num_groups=32, num_channels=block_in, eps=1e-6, affine=True
184
+ )
185
+ self.conv_out = nn.Conv2d(
186
+ block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1
187
+ )
188
+
189
+ def forward(self, x: Tensor) -> Tensor:
190
+ # downsampling
191
+ hs = [self.conv_in(x)]
192
+ for i_level in range(self.num_resolutions):
193
+ for i_block in range(self.num_res_blocks):
194
+ h = self.down[i_level].block[i_block](hs[-1])
195
+ if len(self.down[i_level].attn) > 0:
196
+ h = self.down[i_level].attn[i_block](h)
197
+ hs.append(h)
198
+ if i_level != self.num_resolutions - 1:
199
+ hs.append(self.down[i_level].downsample(hs[-1]))
200
+
201
+ # middle
202
+ h = hs[-1]
203
+ h = self.mid.block_1(h)
204
+ h = self.mid.attn_1(h)
205
+ h = self.mid.block_2(h)
206
+ # end
207
+ h = self.norm_out(h)
208
+ h = swish(h)
209
+ h = self.conv_out(h)
210
+ return h
211
+
212
+
213
+ class Decoder(nn.Module):
214
+ def __init__(
215
+ self,
216
+ ch: int,
217
+ out_ch: int,
218
+ ch_mult: list[int],
219
+ num_res_blocks: int,
220
+ in_channels: int,
221
+ resolution: int,
222
+ z_channels: int,
223
+ ):
224
+ super().__init__()
225
+ self.ch = ch
226
+ self.num_resolutions = len(ch_mult)
227
+ self.num_res_blocks = num_res_blocks
228
+ self.resolution = resolution
229
+ self.in_channels = in_channels
230
+ self.ffactor = 2 ** (self.num_resolutions - 1)
231
+
232
+ # compute in_ch_mult, block_in and curr_res at lowest res
233
+ block_in = ch * ch_mult[self.num_resolutions - 1]
234
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
235
+ self.z_shape = (1, z_channels, curr_res, curr_res)
236
+
237
+ # z to block_in
238
+ self.conv_in = nn.Conv2d(
239
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
240
+ )
241
+
242
+ # middle
243
+ self.mid = nn.Module()
244
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
245
+ self.mid.attn_1 = AttnBlock(block_in)
246
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
247
+
248
+ # upsampling
249
+ self.up = nn.ModuleList()
250
+ for i_level in reversed(range(self.num_resolutions)):
251
+ block = nn.ModuleList()
252
+ attn = nn.ModuleList()
253
+ block_out = ch * ch_mult[i_level]
254
+ for _ in range(self.num_res_blocks + 1):
255
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
256
+ block_in = block_out
257
+ up = nn.Module()
258
+ up.block = block
259
+ up.attn = attn
260
+ if i_level != 0:
261
+ up.upsample = Upsample(block_in)
262
+ curr_res = curr_res * 2
263
+ self.up.insert(0, up) # prepend to get consistent order
264
+
265
+ # end
266
+ self.norm_out = nn.GroupNorm(
267
+ num_groups=32, num_channels=block_in, eps=1e-6, affine=True
268
+ )
269
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
270
+
271
+ def forward(self, z: Tensor) -> Tensor:
272
+ # get dtype for proper tracing
273
+ upscale_dtype = next(self.up.parameters()).dtype
274
+
275
+ # z to block_in
276
+ h = self.conv_in(z)
277
+
278
+ # middle
279
+ h = self.mid.block_1(h)
280
+ h = self.mid.attn_1(h)
281
+ h = self.mid.block_2(h)
282
+
283
+ # cast to proper dtype
284
+ h = h.to(upscale_dtype)
285
+ # upsampling
286
+ for i_level in reversed(range(self.num_resolutions)):
287
+ for i_block in range(self.num_res_blocks + 1):
288
+ h = self.up[i_level].block[i_block](h)
289
+ if len(self.up[i_level].attn) > 0:
290
+ h = self.up[i_level].attn[i_block](h)
291
+ if i_level != 0:
292
+ h = self.up[i_level].upsample(h)
293
+
294
+ # end
295
+ h = self.norm_out(h)
296
+ h = swish(h)
297
+ h = self.conv_out(h)
298
+ return h
299
+
300
+
301
+ class DiagonalGaussian(nn.Module):
302
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
303
+ super().__init__()
304
+ self.sample = sample
305
+ self.chunk_dim = chunk_dim
306
+
307
+ def forward(self, z: Tensor) -> Tensor:
308
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
309
+ if self.sample:
310
+ std = torch.exp(0.5 * logvar)
311
+ return mean + std * torch.randn_like(mean)
312
+ else:
313
+ return mean
314
+
315
+
316
+ class AutoEncoder(nn.Module):
317
+ def __init__(self, params: AutoEncoderParams):
318
+ super().__init__()
319
+ self.params = params
320
+ self.encoder = Encoder(
321
+ resolution=params.resolution,
322
+ in_channels=params.in_channels,
323
+ ch=params.ch,
324
+ ch_mult=params.ch_mult,
325
+ num_res_blocks=params.num_res_blocks,
326
+ z_channels=params.z_channels,
327
+ )
328
+ self.decoder = Decoder(
329
+ resolution=params.resolution,
330
+ in_channels=params.in_channels,
331
+ ch=params.ch,
332
+ out_ch=params.out_ch,
333
+ ch_mult=params.ch_mult,
334
+ num_res_blocks=params.num_res_blocks,
335
+ z_channels=params.z_channels,
336
+ )
337
+ self.reg = DiagonalGaussian()
338
+
339
+ self.scale_factor = params.scale_factor
340
+ self.shift_factor = params.shift_factor
341
+
342
+ def encode(self, x: Tensor) -> Tensor:
343
+ z = self.reg(self.encoder(x))
344
+ z = self.scale_factor * (z - self.shift_factor)
345
+ return z
346
+
347
+ def decode(self, z: Tensor) -> Tensor:
348
+ z = z / self.scale_factor + self.shift_factor
349
+ return self.decoder(z)
350
+
351
+ def forward(self, x: Tensor) -> Tensor:
352
+ return self.decode(self.encode(x))
353
+
354
+
355
+ def load_ae(
356
+ ckpt_path: str,
357
+ autoencoder_params: AutoEncoderParams,
358
+ device: str | torch.device = "cuda",
359
+ dtype=torch.bfloat16,
360
+ ) -> AutoEncoder:
361
+ """
362
+ Load the autoencoder from the given model name.
363
+ Args:
364
+ name (str): The name of the autoencoder.
365
+ device (str or torch.device): The device to load the autoencoder to.
366
+ Returns:
367
+ AutoEncoder: The loaded autoencoder.
368
+ """
369
+ # Loading the autoencoder
370
+ print("Init AE")
371
+ with torch.device(device):
372
+ ae = AutoEncoder(autoencoder_params)
373
+
374
+ if not os.path.exists(ckpt_path):
375
+ raise ValueError(
376
+ f"Autoencoder path {ckpt_path} does not exist. Please download it first."
377
+ )
378
+
379
+ if ckpt_path is not None:
380
+ sd = load_sft(ckpt_path, device=str(device))
381
+ missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
382
+ if len(missing) > 0:
383
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
384
+ if len(unexpected) > 0:
385
+ print(
386
+ f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected)
387
+ )
388
+ return ae.to(dtype=dtype)
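As a rough sketch of how `load_ae` and the encode/decode pair fit together; the checkpoint path is the one produced by the download script in this change, and the input tensor is a random placeholder:

```python
# Sketch only: round-trip an image tensor through the autoencoder.
import torch

from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams, load_ae

ae = load_ae(
    ckpt_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors",
    autoencoder_params=AutoEncoderParams(),
    device="cuda",
    dtype=torch.bfloat16,
)

img = torch.randn(1, 3, 256, 256, device="cuda", dtype=torch.bfloat16)
latent = ae.encode(img)    # [1, 16, 32, 32] with the default params (8x downsampling)
recon = ae.decode(latent)  # [1, 3, 256, 256]
```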
torchtitan/experiments/flux/model/hf_embedder.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from torch import nn, Tensor
8
+ from transformers import CLIPTextModel, T5EncoderModel
9
+
10
+
11
+ class FluxEmbedder(nn.Module):
12
+ def __init__(self, version: str, **hf_kwargs):
13
+ super().__init__()
14
+ self.is_clip = version.startswith("openai")
15
+ self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
16
+
17
+ if self.is_clip:
18
+ self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
19
+ version, **hf_kwargs
20
+ )
21
+ else:
22
+ self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
23
+ version, **hf_kwargs
24
+ )
25
+
26
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
27
+
28
+ def forward(self, batch_tokens: Tensor) -> Tensor:
29
+ """
30
+ batch_tokens: [bsz, embedding_length]
31
+
32
+ For T5 Encoder, embedding_length is 768
33
+ For CLIP, embedding_length is 256
34
+ """
35
+ outputs = self.hf_module(
36
+ input_ids=batch_tokens.to(self.hf_module.device),
37
+ attention_mask=None,
38
+ output_hidden_states=False,
39
+ )
40
+ return outputs[self.output_key]
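A usage sketch for the embedder; the model versions are the same ones referenced in the debug config, and the actual encoding calls are left commented because the token id tensors come from the tokenizer step:

```python
# Sketch only: construct CLIP and T5 text encoders for conditioning.
import torch

from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder

clip_encoder = FluxEmbedder(version="openai/clip-vit-large-patch14").to(
    "cuda", dtype=torch.bfloat16
)
t5_encoder = FluxEmbedder(version="google/t5-v1_1-small").to(
    "cuda", dtype=torch.bfloat16
)

# Given token id tensors from FluxTokenizer.encode():
# clip_encodings = clip_encoder(clip_tokens)  # pooled embedding per prompt
# t5_encodings = t5_encoder(t5_tokens)        # per-token hidden states
```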
torchtitan/experiments/flux/model/math.py ADDED
@@ -0,0 +1,38 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ from einops import rearrange
9
+ from torch import Tensor
10
+
11
+
12
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
13
+ q, k = apply_rope(q, k, pe)
14
+
15
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
16
+ x = rearrange(x, "B H L D -> B L (H D)")
17
+
18
+ return x
19
+
20
+
21
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
22
+ assert dim % 2 == 0
23
+ scale = torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim
24
+ omega = 1.0 / (theta**scale)
25
+ out = torch.einsum("...n,d->...nd", pos, omega)
26
+ out = torch.stack(
27
+ [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
28
+ )
29
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
30
+ return out.float()
31
+
32
+
33
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
34
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
35
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
36
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
37
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
38
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
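A hypothetical shape-check sketch for the helpers above; the batch/head/sequence sizes are arbitrary, and the extra head axis added to the rotary table mirrors how the model broadcasts it over attention heads:

```python
# Sketch only: RoPE table construction and rotary attention on random tensors.
import torch

from torchtitan.experiments.flux.model.math import attention, rope

B, H, L, D = 2, 4, 16, 32                                   # D must be even for rope()
pos = torch.arange(L, dtype=torch.float32).expand(B, L)    # [B, L] positions
pe = rope(pos, D, theta=10_000)                             # [B, L, D/2, 2, 2]
pe = pe.unsqueeze(1)                                        # add a head axis for broadcasting

q = torch.randn(B, H, L, D)
k = torch.randn(B, H, L, D)
v = torch.randn(B, H, L, D)
out = attention(q, k, v, pe)                                # [B, L, H*D]
```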
torchtitan/experiments/flux/model/model.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass, field
8
+
9
+ import torch
10
+
11
+ from torch import nn, Tensor
12
+ from torchtitan.components.tokenizer import Tokenizer
13
+ from torchtitan.config_manager import JobConfig
14
+
15
+ from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams
16
+ from torchtitan.experiments.flux.model.layers import (
17
+ DoubleStreamBlock,
18
+ EmbedND,
19
+ LastLayer,
20
+ MLPEmbedder,
21
+ SingleStreamBlock,
22
+ timestep_embedding,
23
+ )
24
+
25
+ from torchtitan.protocols.train_spec import BaseModelArgs, ModelProtocol
26
+ from torchtitan.tools.logging import logger
27
+
28
+
29
+ @dataclass
30
+ class FluxModelArgs(BaseModelArgs):
31
+ in_channels: int = 64
32
+ out_channels: int = 64
33
+ vec_in_dim: int = 768
34
+ context_in_dim: int = 512
35
+ hidden_size: int = 3072
36
+ mlp_ratio: float = 4.0
37
+ num_heads: int = 24
38
+ depth: int = 19
39
+ depth_single_blocks: int = 38
40
+ axes_dim: tuple = (16, 56, 56)
41
+ theta: int = 10_000
42
+ qkv_bias: bool = True
43
+ guidance_embed: bool = True
44
+ autoencoder_params: AutoEncoderParams = field(default_factory=AutoEncoderParams)
45
+
46
+ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
47
+ # context_in_dim is the same as the T5 embedding dimension
48
+ self.context_in_dim = job_config.encoder.max_t5_encoding_len
49
+
50
+ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
51
+ # TODO(jianiw): Add the number of flops for the autoencoder
52
+ nparams = sum(p.numel() for p in model.parameters())
53
+ logger.warning("FLUX model haven't implement get_nparams_and_flops() function")
54
+ return nparams, 1
55
+
56
+
57
+ class FluxModel(nn.Module, ModelProtocol):
58
+ """
59
+ Transformer model for flow matching on sequences.
60
+
61
+ Args:
63
+ model_args: FluxModelArgs.
64
+
65
+ Attributes:
66
+ model_args (FluxModelArgs): Model configuration arguments.
66
+ """
67
+
68
+ def __init__(self, model_args: FluxModelArgs):
69
+ super().__init__()
70
+
71
+ self.model_args = model_args
72
+ self.in_channels = model_args.in_channels
73
+ self.out_channels = model_args.out_channels
74
+ if model_args.hidden_size % model_args.num_heads != 0:
75
+ raise ValueError(
76
+ f"Hidden size {model_args.hidden_size} must be divisible by num_heads {model_args.num_heads}"
77
+ )
78
+ pe_dim = model_args.hidden_size // model_args.num_heads
79
+ if sum(model_args.axes_dim) != pe_dim:
80
+ raise ValueError(
81
+ f"Got {model_args.axes_dim} but expected positional dim {pe_dim}"
82
+ )
83
+ self.hidden_size = model_args.hidden_size
84
+ self.num_heads = model_args.num_heads
85
+ self.pe_embedder = EmbedND(
86
+ dim=pe_dim, theta=model_args.theta, axes_dim=model_args.axes_dim
87
+ )
88
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
89
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
90
+ self.vector_in = MLPEmbedder(model_args.vec_in_dim, self.hidden_size)
91
+ self.guidance_in = (
92
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
93
+ if model_args.guidance_embed
94
+ else nn.Identity()
95
+ )
96
+ self.txt_in = nn.Linear(model_args.context_in_dim, self.hidden_size)
97
+
98
+ self.double_blocks = nn.ModuleList(
99
+ [
100
+ DoubleStreamBlock(
101
+ self.hidden_size,
102
+ self.num_heads,
103
+ mlp_ratio=model_args.mlp_ratio,
104
+ qkv_bias=model_args.qkv_bias,
105
+ )
106
+ for _ in range(model_args.depth)
107
+ ]
108
+ )
109
+
110
+ self.single_blocks = nn.ModuleList(
111
+ [
112
+ SingleStreamBlock(
113
+ self.hidden_size, self.num_heads, mlp_ratio=model_args.mlp_ratio
114
+ )
115
+ for _ in range(model_args.depth_single_blocks)
116
+ ]
117
+ )
118
+
119
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
120
+
121
+ def init_weights(self, buffer_device=None):
122
+ # TODO(jianiw): replace placeholder with real weight init
123
+ for param in self.parameters():
124
+ param.data.uniform_(0, 0.1)
125
+
126
+ def forward(
127
+ self,
128
+ img: Tensor,
129
+ img_ids: Tensor,
130
+ txt: Tensor,
131
+ txt_ids: Tensor,
132
+ timesteps: Tensor,
133
+ y: Tensor,
134
+ guidance: Tensor | None = None,
135
+ ) -> Tensor:
136
+ if img.ndim != 3 or txt.ndim != 3:
137
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
138
+
139
+ # running on sequences img
140
+ img = self.img_in(img)
141
+ vec = self.time_in(timestep_embedding(timesteps, 256))
142
+ if self.model_args.guidance_embed:
143
+ if guidance is None:
144
+ raise ValueError(
145
+ "Didn't get guidance strength for guidance distilled model."
146
+ )
147
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
148
+ vec = vec + self.vector_in(y)
149
+ txt = self.txt_in(txt)
150
+
151
+ ids = torch.cat((txt_ids, img_ids), dim=1)
152
+ pe = self.pe_embedder(ids)
153
+
154
+ for block in self.double_blocks:
155
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
156
+
157
+ img = torch.cat((txt, img), 1)
158
+ for block in self.single_blocks:
159
+ img = block(img, vec=vec, pe=pe)
160
+ img = img[:, txt.shape[1] :, ...]
161
+
162
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
163
+ return img
164
+
165
+ @classmethod
166
+ def from_model_args(cls, model_args: FluxModelArgs) -> "FluxModel":
167
+ """
168
+ Initialize a Flux model from a FluxModelArgs object.
169
+
170
+ Args:
171
+ model_args (FluxModelArgs): Model configuration arguments.
172
+
173
+ Returns:
174
+ FluxModel: FluxModel model.
175
+
176
+ """
177
+ return cls(model_args)
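To make the expected input shapes concrete, here is a hypothetical forward-pass smoke test using the debug config; the batch size and sequence lengths are arbitrary, while the last dimensions follow in_channels=64, context_in_dim=512, vec_in_dim=768, and the 3-dim position ids:

```python
# Sketch only: run the debug-sized model on random inputs.
import torch

from torchtitan.experiments.flux import flux_configs
from torchtitan.experiments.flux.model.model import FluxModel

model = FluxModel(flux_configs["flux-debug"]).cuda()
bsz, img_seq, txt_seq = 2, 256, 64

pred = model(
    img=torch.randn(bsz, img_seq, 64, device="cuda"),
    img_ids=torch.zeros(bsz, img_seq, 3, device="cuda"),
    txt=torch.randn(bsz, txt_seq, 512, device="cuda"),
    txt_ids=torch.zeros(bsz, txt_seq, 3, device="cuda"),
    timesteps=torch.rand(bsz, device="cuda"),
    y=torch.randn(bsz, 768, device="cuda"),
    guidance=torch.full((bsz,), 3.5, device="cuda"),  # required since guidance_embed=True
)
# pred: [bsz, img_seq, 64]
```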
torchtitan/experiments/flux/scripts/download_autoencoder.py ADDED
@@ -0,0 +1,61 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Optional
8
+
9
+ from requests.exceptions import HTTPError
10
+
11
+
12
+ def hf_download(
13
+ repo_id: str, file_path: str, local_dir: str, hf_token: Optional[str] = None
14
+ ) -> None:
15
+ from huggingface_hub import hf_hub_download
16
+
17
+ try:
18
+ hf_hub_download(
19
+ repo_id=repo_id,
20
+ filename=file_path,
21
+ local_dir=local_dir,
22
+ local_dir_use_symlinks=False,
23
+ token=hf_token,
24
+ )
25
+ except HTTPError as e:
26
+ if e.response.status_code == 401:
27
+ print(
28
+ "You need to pass a valid `--hf_token=...` to download private checkpoints."
29
+ )
30
+ else:
31
+ raise e
32
+
33
+
34
+ if __name__ == "__main__":
35
+ import argparse
36
+
37
+ parser = argparse.ArgumentParser(description="Download tokenizer from HuggingFace.")
38
+ parser.add_argument(
39
+ "--repo_id",
40
+ type=str,
41
+ default="black-forest-labs/FLUX.1-dev",
42
+ help="Repository ID to download from. default to Flux-dev model",
43
+ )
44
+ parser.add_argument(
45
+ "--ae_path",
46
+ type=str,
47
+ default="ae.safetensors",
48
+ help="the autoencoder path relative to repo_id",
49
+ )
50
+ parser.add_argument(
51
+ "--hf_token", type=str, default=None, help="HuggingFace API token"
52
+ )
53
+ parser.add_argument(
54
+ "--local_dir",
55
+ type=str,
56
+ default="torchtitan/experiments/flux/assets/autoencoder/",
57
+ help="local directory to save the autoencoder",
58
+ )
59
+
60
+ args = parser.parse_args()
61
+ hf_download(args.repo_id, args.ae_path, args.local_dir, args.hf_token)
torchtitan/experiments/flux/tests/test_generate_image.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import os
9
+ import time
10
+ from typing import Callable
11
+
12
+ import torch
13
+ from einops import rearrange
14
+
15
+ from PIL import ExifTags, Image
16
+
17
+ from torch import Tensor
18
+
19
+ from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
20
+
21
+ from torchtitan.experiments.flux.model.autoencoder import (
22
+ AutoEncoder,
23
+ AutoEncoderParams,
24
+ load_ae,
25
+ )
26
+ from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder
27
+
28
+ from torchtitan.experiments.flux.model.model import FluxModel, FluxModelArgs
29
+ from torchtitan.experiments.flux.utils import (
30
+ create_position_encoding_for_latents,
31
+ generate_noise_latent,
32
+ pack_latents,
33
+ preprocess_flux_data,
34
+ unpack_latents,
35
+ )
36
+
37
+
38
+ def time_shift(mu: float, sigma: float, t: Tensor):
39
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
40
+
41
+
42
+ def get_lin_function(
43
+ x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
44
+ ) -> Callable[[float], float]:
45
+ m = (y2 - y1) / (x2 - x1)
46
+ b = y1 - m * x1
47
+ return lambda x: m * x + b
48
+
49
+
50
+ def get_schedule(
51
+ num_steps: int,
52
+ image_seq_len: int,
53
+ base_shift: float = 0.5,
54
+ max_shift: float = 1.15,
55
+ shift: bool = True,
56
+ ) -> list[float]:
57
+ # extra step for zero
58
+ timesteps = torch.linspace(1, 0, num_steps + 1)
59
+
60
+ # shifting the schedule to favor high timesteps for higher signal images
61
+ if shift:
62
+ # estimate mu based on linear estimation between two points
63
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
64
+ timesteps = time_shift(mu, 1.0, timesteps)
65
+
66
+ return timesteps.tolist()
67
+
68
+
69
+ class TestGenerateImage:
70
+ def test_generate_image(self):
71
+ """
72
+ Run a forward pass of flux model to generate an image.
73
+ """
74
+ name = "flux-dev"
75
+ img_width = 512
76
+ img_height = 512
77
+ seed = None
78
+ prompt = (
79
+ "a photo of a forest with mist swirling around the tree trunks. The word "
80
+ '"FLUX" is painted over it in big, red brush strokes with visible texture'
81
+ )
82
+ device = "cuda"
83
+ num_steps = None
84
+ loop = False
85
+ guidance = 3.5
86
+ output_dir = "output"
87
+ add_sampling_metadata = True
88
+
89
+ prompt = prompt.split("|")
90
+ if len(prompt) == 1:
91
+ prompt = prompt[0]
92
+ additional_prompts = None
93
+ else:
94
+ additional_prompts = prompt[1:]
95
+ prompt = prompt[0]
96
+
97
+ assert not (
98
+ (additional_prompts is not None) and loop
99
+ ), "Do not provide additional prompts and set loop to True"
100
+
101
+ torch_device = torch.device(device)
102
+ if num_steps is None:
103
+ num_steps = 30
104
+
105
+ # allow for packing and conversion to latent space
106
+ img_height = 16 * (img_height // 16)
107
+ img_width = 16 * (img_width // 16)
108
+
109
+ # init all components
110
+ model = FluxModel(FluxModelArgs()).to(device=torch_device, dtype=torch.bfloat16)
111
+
112
+ ae = load_ae(
113
+ ckpt_path="assets/autoencoder/ae.safetensors",
114
+ autoencoder_params=AutoEncoderParams(),
115
+ device=torch_device,
116
+ dtype=torch.bfloat16,
117
+ )
118
+ clip_tokenizer = FluxTokenizer(
119
+ model_path="openai/clip-vit-large-patch14", max_length=77
120
+ )
121
+ t5_tokenizer = FluxTokenizer(model_path="google/t5-v1_1-small", max_length=512)
122
+ clip_encoder = FluxEmbedder(version="openai/clip-vit-large-patch14").to(
123
+ torch_device, dtype=torch.bfloat16
124
+ )
125
+ t5_encoder = FluxEmbedder(version="google/t5-v1_1-small").to(
126
+ torch_device, dtype=torch.bfloat16
127
+ )
128
+
129
+ rng = torch.Generator(device="cpu")
130
+
131
+ if seed is None:
132
+ seed = rng.seed()
133
+ print(f"Generating with seed {seed}:\n{prompt}")
134
+ t0 = time.perf_counter()
135
+ output_name = os.path.join(output_dir, f"img_{seed}.jpg")
136
+
137
+ # Tokenize the prompt, on CPU
138
+ clip_tokens = clip_tokenizer.encode(prompt)
139
+ t5_tokens = t5_tokenizer.encode(prompt)
140
+
141
+ batch = preprocess_flux_data(
142
+ device=torch_device,
143
+ dtype=torch.bfloat16,
144
+ autoencoder=None,
145
+ clip_encoder=clip_encoder,
146
+ t5_encoder=t5_encoder,
147
+ batch={
148
+ "clip_tokens": clip_tokens,
149
+ "t5_tokens": t5_tokens,
150
+ },
151
+ )
152
+
153
+ img = self._generate_images(
154
+ device=torch_device,
155
+ dtype=torch.bfloat16,
156
+ model=model,
157
+ decoder=ae,
158
+ img_width=img_width,
159
+ img_height=img_height,
160
+ denoising_steps=num_steps,
161
+ seed=seed,
162
+ clip_encodings=batch["clip_encodings"],
163
+ t5_encodings=batch["t5_encodings"],
164
+ guidance=guidance,
165
+ )
166
+
167
+ if torch.cuda.is_available():
168
+ torch.cuda.synchronize()
169
+ t1 = time.perf_counter()
170
+
171
+ print(f"Done in {t1 - t0:.1f}s.")
172
+
173
+ self._save_image(name, output_name, img, add_sampling_metadata, prompt)
174
+
175
+ def _generate_images(
176
+ self,
177
+ device: torch.device,
178
+ dtype: torch.dtype,
179
+ model: FluxModel,
180
+ decoder: AutoEncoder,
181
+ # image params:
182
+ img_width: int,
183
+ img_height: int,
184
+ # sampling params:
185
+ denoising_steps: int,
186
+ seed: int,
187
+ clip_encodings: torch.Tensor,
188
+ t5_encodings: torch.Tensor,
189
+ guidance: float = 4.0,
190
+ ):
191
+
192
+ bsz = clip_encodings.shape[0]
193
+ latents = generate_noise_latent(bsz, img_height, img_width, device, dtype, seed)
194
+ _, latent_channels, latent_height, latent_width = latents.shape
195
+
196
+ # create denoising schedule
197
+ timesteps = get_schedule(denoising_steps, latent_channels, shift=True)
198
+
199
+ # create positional encodings
200
+ POSITION_DIM = 3 # constant for Flux flow model
201
+ latent_pos_enc = create_position_encoding_for_latents(
202
+ bsz, latent_height, latent_width, POSITION_DIM
203
+ ).to(latents)
204
+ text_pos_enc = torch.zeros(bsz, t5_encodings.shape[1], POSITION_DIM).to(latents)
205
+
206
+ # convert img-like latents into sequences of patches
207
+ latents = pack_latents(latents)
208
+
209
+ # this is ignored for schnell
210
+ guidance_vec = torch.full((bsz,), guidance, device=device, dtype=dtype)
211
+ for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
212
+ t_vec = torch.full((bsz,), t_curr, dtype=dtype, device=device)
213
+ pred = model(
214
+ img=latents,
215
+ img_ids=latent_pos_enc,
216
+ txt=t5_encodings,
217
+ txt_ids=text_pos_enc,
218
+ y=clip_encodings,
219
+ timesteps=t_vec,
220
+ guidance=guidance_vec,
221
+ )
222
+
223
+ latents = latents + (t_prev - t_curr) * pred
224
+
225
+ # convert sequences of patches into img-like latents
226
+ latents = unpack_latents(latents, latent_height, latent_width)
227
+
228
+ img = decoder.decode(latents)
229
+ return img
230
+
231
+ def _save_image(
232
+ self,
233
+ name: str,
234
+ output_name: str,
235
+ x: torch.Tensor,
236
+ add_sampling_metadata: bool,
237
+ prompt: str,
238
+ ):
239
+ print(f"Saving {output_name}")
240
+ # bring into PIL format and save
241
+ x = x.clamp(-1, 1)
242
+ x = rearrange(x[0], "c h w -> h w c")
243
+
244
+ img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
245
+
246
+ exif_data = Image.Exif()
247
+ exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
248
+ exif_data[ExifTags.Base.Make] = "Black Forest Labs"
249
+ exif_data[ExifTags.Base.Model] = name
250
+ if add_sampling_metadata:
251
+ exif_data[ExifTags.Base.ImageDescription] = prompt
252
+ img.save(output_name, exif=exif_data, quality=95, subsampling=0)
torchtitan/experiments/flux/train_configs/debug_model.toml ADDED
@@ -0,0 +1,68 @@
1
+
2
+ [job]
3
+ dump_folder = "./outputs"
4
+ description = "Flux debug model"
5
+ print_args = false
6
+ use_for_integration_test = true
7
+
8
+ [profiling]
9
+ enable_profiling = false
10
+ save_traces_folder = "profile_trace"
11
+ profile_freq = 10
12
+ enable_memory_snapshot = false
13
+ save_memory_snapshot_folder = "memory_snapshot"
14
+
15
+ [metrics]
16
+ log_freq = 1
17
+ disable_color_printing = false
18
+ enable_tensorboard = false
19
+ save_tb_folder = "tb"
20
+ enable_wandb = false
21
+
22
+ [model]
23
+ name = "flux"
24
+ flavor = "flux-debug"
25
+ norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
26
+ # test tokenizer.model, for debug purpose only
27
+ # tokenizer_path = "./tests/assets/test_tiktoken.model"
28
+ # converters = "float8"
29
+
30
+
31
+ [optimizer]
32
+ name = "AdamW"
33
+ lr = 8e-4
34
+ eps = 1e-8
35
+
36
+ [lr_scheduler]
37
+ warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
38
+ decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps
39
+ decay_type = "linear"
40
+ lr_min = 0.0
41
+
42
+ [training]
43
+ batch_size = 32
44
+ seq_len = 512
45
+ max_norm = 1.0 # grad norm clipping
46
+ steps = 10
47
+ compile = false
48
+ dataset = "cc12m"
49
+ guidance = 3.5
50
+ seed = 0
51
+
52
+ [encoder]
53
+ t5_encoder="google/t5-v1_1-small"
54
+ clip_encoder="openai/clip-vit-large-patch14"
55
+ max_t5_encoding_len=512
56
+ auto_encoder_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image
57
+
58
+ [parallelism]
59
+ data_parallel_replicate_degree = 1
60
+ data_parallel_shard_degree = 1
61
+ fsdp_reshard_after_forward = "default" # default / never / always
62
+ tensor_parallel_degree = 1
63
+ enable_async_tensor_parallel = false
64
+ pipeline_parallel_degree = 1
65
+ context_parallel_degree = 1
66
+
67
+ [experimental]
68
+ custom_args_module = "torchtitan.experiments.flux.flux_argparser"
torchtitan/experiments/kernels/triton_mg_group_gemm/simpleMoE.py ADDED
@@ -0,0 +1,885 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import logging
9
+ import math
10
+ import time
11
+
12
+ from typing import Dict, List, Tuple
13
+
14
+ # import numpy as np
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ import torch.optim as optim
19
+
20
+ # from torchao_pr.mg_grouped_gemm import mg_grouped_gemm
21
+
22
+ # Configure logging
23
+ logging.basicConfig(
24
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
25
+ )
26
+
27
+ # Try to import the optimized MG GEMM implementation
28
+ try:
29
+ from torchao_pr.mg_grouped_gemm import ( # grouped_gemm_backward,
30
+ grouped_gemm_forward,
31
+ )
32
+
33
+ has_mg_gemm = True
34
+ except ImportError:
35
+ logging.warning("MG GEMM implementation not found. Will use manual looping only.")
36
+ has_mg_gemm = False
37
+
38
+
39
+ class Router(nn.Module):
40
+ """
41
+ Router module that assigns tokens to experts.
42
+ """
43
+
44
+ def __init__(self, input_dim: int, num_experts: int, top_k: int = 2):
45
+ super().__init__()
46
+ self.input_dim = input_dim
47
+ self.num_experts = num_experts
48
+ self.top_k = top_k
49
+
50
+ # Routing layer
51
+ self.router = nn.Linear(input_dim, num_experts)
52
+
53
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
54
+ """
55
+ Route input tokens to experts.
56
+
57
+ Args:
58
+ x (torch.Tensor): Input tensor of shape (batch_size, seq_len, input_dim)
59
+
60
+ Returns:
61
+ Tuple containing:
62
+ - router_logits: Raw routing probabilities
63
+ - dispatch_tensor: One-hot tensor indicating expert assignment
64
+ - expert_indices: List of indices for each expert's tokens
65
+ """
66
+ batch_size, seq_len, _ = x.shape
67
+
68
+ # Flatten batch and sequence dimensions
69
+ x_flat = x.reshape(-1, self.input_dim) # (batch_size * seq_len, input_dim)
70
+
71
+ # Compute routing probabilities
72
+ router_logits = self.router(x_flat) # (batch_size * seq_len, num_experts)
73
+
74
+ # Apply softmax to get probabilities
75
+ router_probs = F.softmax(router_logits, dim=-1)
76
+
77
+ # Get top-k experts for each token
78
+ top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
79
+
80
+ # Normalize top-k probabilities
81
+ top_k_probs = top_k_probs / top_k_probs.sum(dim=-1, keepdim=True)
82
+
83
+ # Create dispatch tensor (one-hot representation of assignments)
84
+ dispatch_tensor = torch.zeros_like(router_probs)
85
+ token_indices = (
86
+ torch.arange(router_probs.size(0), device=router_probs.device)
87
+ .unsqueeze(1)
88
+ .expand(-1, self.top_k)
89
+ )
90
+ dispatch_tensor.scatter_(1, top_k_indices, top_k_probs) # .unsqueeze(-1))
91
+
92
+ # For each expert, get the indices of tokens routed to it
93
+ expert_indices = []
94
+ for expert_idx in range(self.num_experts):
95
+ # Get indices of tokens that have non-zero probability for this expert
96
+ indices = torch.nonzero(dispatch_tensor[:, expert_idx] > 0, as_tuple=True)[
97
+ 0
98
+ ]
99
+ expert_indices.append(indices)
100
+
101
+ return router_logits, dispatch_tensor, expert_indices
102
+
103
+
104
+ class Expert(nn.Module):
105
+ """
106
+ Individual expert module.
107
+ """
108
+
109
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
110
+ super().__init__()
111
+ self.fc1 = nn.Linear(input_dim, hidden_dim, bias=False)
112
+ self.activation = nn.GELU()
113
+ self.fc2 = nn.Linear(hidden_dim, output_dim, bias=False)
114
+
115
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
116
+ x = self.fc1(x)
117
+ x = self.activation(x)
118
+ x = self.fc2(x)
119
+ return x
120
+
121
+
122
+ class MixtureOfExperts(nn.Module):
123
+ """
124
+ Mixture of Experts layer with support for both manual looping and grouped GEMM.
125
+ """
126
+
127
+ def __init__(
128
+ self,
129
+ input_dim: int,
130
+ hidden_dim: int,
131
+ output_dim: int,
132
+ num_experts: int,
133
+ top_k: int = 2,
134
+ use_mg_gemm: bool = False,
135
+ ):
136
+ super().__init__()
137
+ self.input_dim = input_dim
138
+ self.hidden_dim = hidden_dim
139
+ self.output_dim = output_dim
140
+ self.num_experts = num_experts
141
+ self.top_k = top_k
142
+ self.use_mg_gemm = use_mg_gemm and has_mg_gemm
143
+
144
+ # Router
145
+ self.router = Router(input_dim, num_experts, top_k)
146
+
147
+ # Create expert modules
148
+ if self.use_mg_gemm:
149
+ # For MG GEMM, we need a single weight tensor for all experts
150
+ # First layer (input -> hidden)
151
+ self.expert_fc1_weight = nn.Parameter(
152
+ torch.randn(num_experts * hidden_dim, input_dim) / math.sqrt(input_dim)
153
+ )
154
+ # self.expert_fc1_bias = nn.Parameter(torch.zeros(num_experts * hidden_dim))
155
+
156
+ # Second layer (hidden -> output)
157
+ self.expert_fc2_weight = nn.Parameter(
158
+ torch.randn(num_experts * output_dim, hidden_dim)
159
+ / math.sqrt(hidden_dim)
160
+ )
161
+ # self.expert_fc2_bias = nn.Parameter(torch.zeros(num_experts * output_dim))
162
+ else:
163
+ # For manual looping, create separate experts
164
+ self.experts = nn.ModuleList(
165
+ [Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)]
166
+ )
167
+
168
+ def forward_manual_loop(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
169
+ """
170
+ Forward pass using manual looping over experts.
171
+ """
172
+ batch_size, seq_len, _ = x.shape
173
+ x_flat = x.reshape(-1, self.input_dim) # (batch_size * seq_len, input_dim)
174
+
175
+ # Get routing information
176
+ router_logits, dispatch_tensor, expert_indices = self.router(x)
177
+
178
+ # Initialize output tensor
179
+ final_output = torch.zeros(
180
+ batch_size * seq_len, self.output_dim, device=x.device
181
+ )
182
+
183
+ # Process each expert
184
+ for expert_idx, indices in enumerate(expert_indices):
185
+ if indices.numel() > 0:
186
+ # Get tokens routed to this expert
187
+ expert_inputs = x_flat[indices] # (num_tokens_for_expert, input_dim)
188
+
189
+ # Process tokens through expert
190
+ expert_outputs = self.experts[expert_idx](
191
+ expert_inputs
192
+ ) # (num_tokens_for_expert, output_dim)
193
+
194
+ # Scale outputs by router probabilities
195
+ scaled_outputs = expert_outputs * dispatch_tensor[
196
+ indices, expert_idx
197
+ ].unsqueeze(1)
198
+
199
+ # Add to final output
200
+ final_output.index_add_(0, indices, scaled_outputs)
201
+
202
+ # Reshape back to original dimensions
203
+ output = final_output.reshape(batch_size, seq_len, self.output_dim)
204
+
205
+ return output, router_logits
206
+
207
+ def forward_mg_gemm(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
208
+ batch_size, seq_len, _ = x.shape
209
+ x_flat = x.reshape(-1, self.input_dim) # (batch_size * seq_len, input_dim)
210
+ total_tokens = batch_size * seq_len
211
+
212
+ # Get routing information
213
+ router_logits, dispatch_tensor, expert_indices = self.router(x)
214
+
215
+ # Get token counts for each expert
216
+ token_counts = [indices.numel() for indices in expert_indices]
217
+ m_sizes = torch.tensor(token_counts, dtype=torch.int32, device=x.device)
218
+
219
+ print(f"Token counts per expert: {token_counts}")
220
+ print(f"m_sizes: {m_sizes}")
221
+
222
+ # Create the combined input tensor
223
+ combined_input = torch.zeros(sum(token_counts), self.input_dim, device=x.device)
224
+
225
+ start_idx = 0
226
+ for expert_idx, indices in enumerate(expert_indices):
227
+ if indices.numel() > 0:
228
+ end_idx = start_idx + indices.numel()
229
+ combined_input[start_idx:end_idx] = x_flat[indices]
230
+ start_idx = end_idx
231
+
232
+ print(f"combined_input shape: {combined_input.shape}")
233
+
234
+ # First layer: input -> hidden
235
+ fc1_weight_reshaped = self.expert_fc1_weight.reshape(
236
+ self.num_experts, self.hidden_dim, self.input_dim
237
+ )
238
+ fc1_weight_combined = fc1_weight_reshaped.reshape(-1, self.input_dim)
239
+
240
+ print(f"fc1_weight_combined shape: {fc1_weight_combined.shape}")
241
+
242
+ # Run the grouped GEMM
243
+ hidden_outputs = grouped_gemm_forward(
244
+ combined_input, fc1_weight_combined, m_sizes
245
+ )
246
+
247
+ print(f"hidden_outputs shape after first GEMM: {hidden_outputs.shape}")
248
+
249
+ # Apply activation
250
+ hidden_outputs = F.gelu(hidden_outputs)
251
+
252
+ print(f"hidden_outputs shape after activation: {hidden_outputs.shape}")
253
+
254
+ # Second layer: hidden -> output
255
+ # Reshape hidden_outputs to match expected dimensions
256
+ reshaped_hidden_outputs = []
257
+ start_idx = 0
258
+
259
+ for expert_idx, count in enumerate(token_counts):
260
+ if count > 0:
261
+ end_idx = start_idx + count
262
+ # Take this expert's outputs and reshape to [count, hidden_dim]
263
+ expert_output = hidden_outputs[
264
+ start_idx:end_idx,
265
+ expert_idx * self.hidden_dim : (expert_idx + 1) * self.hidden_dim,
266
+ ]
267
+ reshaped_hidden_outputs.append(expert_output)
268
+ start_idx = end_idx
269
+
270
+ # Concatenate all reshaped outputs
271
+ hidden_outputs = torch.cat(reshaped_hidden_outputs, dim=0)
272
+
273
+ # Reshape expert weights for second layer
274
+ fc2_weight_reshaped = self.expert_fc2_weight.reshape(
275
+ self.num_experts, self.output_dim, self.hidden_dim
276
+ )
277
+ fc2_weight_combined = fc2_weight_reshaped.reshape(-1, self.hidden_dim)
278
+
279
+ print(f"fc2_weight_combined shape: {fc2_weight_combined.shape}")
280
+
281
+ # Run the second grouped GEMM
282
+ expert_outputs_combined = grouped_gemm_forward(
283
+ hidden_outputs, fc2_weight_combined, m_sizes
284
+ )
285
+
286
+ # Initialize final output tensor with correct shape
287
+ final_output = torch.zeros(total_tokens, self.output_dim, device=x.device)
288
+
289
+ # Distribute the outputs back to the original token positions
290
+ start_idx = 0
291
+ for expert_idx, indices in enumerate(expert_indices):
292
+ if indices.numel() > 0:
293
+ end_idx = start_idx + indices.numel()
294
+ # Get this expert's outputs
295
+ expert_outputs = expert_outputs_combined[start_idx:end_idx]
296
+
297
+ print(
298
+ f"Expert {expert_idx} - indices shape: {indices.shape}, expert_outputs shape: {expert_outputs.shape}"
299
+ )
300
+
301
+ # Scale outputs by router probabilities
302
+ scaled_outputs = expert_outputs * dispatch_tensor[
303
+ indices, expert_idx
304
+ ].unsqueeze(1)
305
+
306
+ # Ensure dimensions match before using index_add_
307
+ if scaled_outputs.shape[1] != final_output.shape[1]:
308
+ # print(
309
+ # f"Reshaping: Dimension mismatch: scaled_outputs {scaled_outputs.shape}, final_output {final_output.shape}"
310
+ # )
311
+ # Reshape if needed - make sure output_dim is correct
312
+ scaled_outputs = scaled_outputs[:, : self.output_dim]
313
+
314
+ # Add to final output
315
+ final_output.index_add_(0, indices, scaled_outputs)
316
+
317
+ start_idx = end_idx
318
+
319
+ # Reshape back to original dimensions
320
+ output = final_output.reshape(batch_size, seq_len, self.output_dim)
321
+
322
+ return output, router_logits
323
+
324
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
325
+ if self.use_mg_gemm and has_mg_gemm:
326
+ return self.forward_mg_gemm(x)
327
+ else:
328
+ return self.forward_manual_loop(x)
329
+
330
+
331
+ class MoEModel(nn.Module):
332
+ """
333
+ Simple model using MoE layers.
334
+ """
335
+
336
+ def __init__(
337
+ self,
338
+ vocab_size: int,
339
+ embed_dim: int,
340
+ hidden_dim: int,
341
+ num_experts: int,
342
+ top_k: int = 2,
343
+ use_mg_gemm: bool = False,
344
+ ):
345
+ super().__init__()
346
+ self.embedding = nn.Embedding(vocab_size, embed_dim)
347
+ self.moe_layer = MixtureOfExperts(
348
+ input_dim=embed_dim,
349
+ hidden_dim=hidden_dim,
350
+ output_dim=embed_dim,
351
+ num_experts=num_experts,
352
+ top_k=top_k,
353
+ use_mg_gemm=use_mg_gemm,
354
+ )
355
+ self.output_layer = nn.Linear(embed_dim, vocab_size)
356
+
357
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
358
+ # x shape: (batch_size, seq_len)
359
+ embedded = self.embedding(x) # (batch_size, seq_len, embed_dim)
360
+ moe_output, router_logits = self.moe_layer(
361
+ embedded
362
+ ) # (batch_size, seq_len, embed_dim)
363
+ logits = self.output_layer(moe_output) # (batch_size, seq_len, vocab_size)
364
+ return logits, router_logits
365
+
366
+
367
+ def compute_load_balancing_loss(
368
+ router_logits: torch.Tensor, num_experts: int
369
+ ) -> torch.Tensor:
370
+ """
371
+ Compute the load balancing loss for MoE training.
372
+
373
+ Args:
374
+ router_logits (torch.Tensor): Router logits of shape (batch_size * seq_len, num_experts)
375
+ num_experts (int): Number of experts
376
+
377
+ Returns:
378
+ torch.Tensor: Load balancing loss
379
+ """
380
+ # Get router probabilities
381
+ router_probs = F.softmax(
382
+ router_logits, dim=-1
383
+ ) # (batch_size * seq_len, num_experts)
384
+
385
+ # Compute fraction of tokens routed to each expert
386
+ # Sum across the batch dimension and normalize
387
+ router_probs_sum = router_probs.sum(dim=0) # (num_experts,)
388
+ router_probs_sum = router_probs_sum / router_probs_sum.sum()
389
+
390
+ # Target mean probability per expert (reference value; not used directly below)
391
+ mean_prob = 1.0 / num_experts
392
+
393
+ # Penalize deviation from uniform routing: the loss is minimized (value 1.0)
394
+ # when every expert receives an equal 1/num_experts share of the probability mass
395
+ load_balancing_loss = num_experts * torch.sum(router_probs_sum * router_probs_sum)
396
+
397
+ return load_balancing_loss
398
+
399
+
400
+ def generate_sample_data(
401
+ batch_size: int, seq_len: int, vocab_size: int, device: str = "cuda"
402
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
403
+ """
404
+ Generate sample data for training.
405
+
406
+ Args:
407
+ batch_size (int): Batch size
408
+ seq_len (int): Sequence length
409
+ vocab_size (int): Vocabulary size
410
+ device (str): Device to use
411
+
412
+ Returns:
413
+ Tuple of input tokens and target tokens
414
+ """
415
+ # Generate random input tokens
416
+ inputs = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
417
+
418
+ # Generate random target tokens
419
+ targets = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
420
+
421
+ return inputs, targets
422
+
423
+
424
+ def train_epoch(
425
+ model: nn.Module,
426
+ optimizer: torch.optim.Optimizer,
427
+ batch_size: int,
428
+ seq_len: int,
429
+ vocab_size: int,
430
+ num_batches: int,
431
+ device: str,
432
+ load_balance_coef: float = 0.01,
433
+ ) -> Dict[str, float]:
434
+ """
435
+ Train the model for one epoch.
436
+
437
+ Args:
438
+ model (nn.Module): Model to train
439
+ optimizer (torch.optim.Optimizer): Optimizer
440
+ batch_size (int): Batch size
441
+ seq_len (int): Sequence length
442
+ vocab_size (int): Vocabulary size
443
+ num_batches (int): Number of batches per epoch
444
+ device (str): Device to use
445
+ load_balance_coef (float): Coefficient for load balancing loss
446
+
447
+ Returns:
448
+ Dict containing training metrics
449
+ """
450
+ model.train()
451
+ total_loss = 0.0
452
+ total_acc = 0.0
453
+ start_time = time.time()
454
+
455
+ for i in range(num_batches):
456
+ # Generate sample data
457
+ inputs, targets = generate_sample_data(batch_size, seq_len, vocab_size, device)
458
+
459
+ # Forward pass
460
+ optimizer.zero_grad()
461
+ logits, router_logits = model(inputs)
462
+
463
+ # Compute loss
464
+ # Reshape for cross entropy loss
465
+ logits_flat = logits.reshape(-1, vocab_size)
466
+ targets_flat = targets.reshape(-1)
467
+
468
+ # Cross entropy loss
469
+ ce_loss = F.cross_entropy(logits_flat, targets_flat)
470
+
471
+ # Load balancing loss
472
+ lb_loss = compute_load_balancing_loss(
473
+ router_logits, model.moe_layer.num_experts
474
+ )
475
+
476
+ # Combined loss
477
+ loss = ce_loss + load_balance_coef * lb_loss
478
+
479
+ # Backward pass
480
+ loss.backward()
481
+ optimizer.step()
482
+
483
+ # Compute accuracy
484
+ preds = logits_flat.argmax(dim=-1)
485
+ correct = (preds == targets_flat).float().sum()
486
+ acc = correct / (batch_size * seq_len)
487
+
488
+ # Accumulate metrics
489
+ total_loss += loss.item()
490
+ total_acc += acc.item()
491
+
492
+ # Log progress
493
+ if (i + 1) % 10 == 0:
494
+ logging.info(
495
+ f"Batch {i + 1}/{num_batches} | "
496
+ f"Loss: {loss.item():.4f} | "
497
+ f"CE Loss: {ce_loss.item():.4f} | "
498
+ f"LB Loss: {lb_loss.item():.4f} | "
499
+ f"Acc: {acc.item():.4f}"
500
+ )
501
+
502
+ # Compute average metrics
503
+ avg_loss = total_loss / num_batches
504
+ avg_acc = total_acc / num_batches
505
+ epoch_time = time.time() - start_time
506
+
507
+ return {"loss": avg_loss, "acc": avg_acc, "time": epoch_time}
508
+
509
+
510
+ def evaluate(
511
+ model: nn.Module,
512
+ batch_size: int,
513
+ seq_len: int,
514
+ vocab_size: int,
515
+ num_batches: int,
516
+ device: str,
517
+ ) -> Dict[str, float]:
518
+ """
519
+ Evaluate the model.
520
+
521
+ Args:
522
+ model (nn.Module): Model to evaluate
523
+ batch_size (int): Batch size
524
+ seq_len (int): Sequence length
525
+ vocab_size (int): Vocabulary size
526
+ num_batches (int): Number of batches for evaluation
527
+ device (str): Device to use
528
+
529
+ Returns:
530
+ Dict containing evaluation metrics
531
+ """
532
+ model.eval()
533
+ total_loss = 0.0
534
+ total_acc = 0.0
535
+
536
+ with torch.no_grad():
537
+ for i in range(num_batches):
538
+ # Generate sample data
539
+ inputs, targets = generate_sample_data(
540
+ batch_size, seq_len, vocab_size, device
541
+ )
542
+
543
+ # Forward pass
544
+ logits, router_logits = model(inputs)
545
+
546
+ # Compute loss
547
+ logits_flat = logits.reshape(-1, vocab_size)
548
+ targets_flat = targets.reshape(-1)
549
+
550
+ # Cross entropy loss
551
+ loss = F.cross_entropy(logits_flat, targets_flat)
552
+
553
+ # Compute accuracy
554
+ preds = logits_flat.argmax(dim=-1)
555
+ correct = (preds == targets_flat).float().sum()
556
+ acc = correct / (batch_size * seq_len)
557
+
558
+ # Accumulate metrics
559
+ total_loss += loss.item()
560
+ total_acc += acc.item()
561
+
562
+ # Compute average metrics
563
+ avg_loss = total_loss / num_batches
564
+ avg_acc = total_acc / num_batches
565
+
566
+ return {"loss": avg_loss, "acc": avg_acc}
567
+
568
+
569
+ def measure_performance(
570
+ model: nn.Module,
571
+ batch_size: int,
572
+ seq_len: int,
573
+ vocab_size: int,
574
+ num_batches: int,
575
+ device: str,
576
+ ) -> Dict[str, float]:
577
+ """
578
+ Measure forward and backward pass performance.
579
+
580
+ Args:
581
+ model (nn.Module): Model to evaluate
582
+ batch_size (int): Batch size
583
+ seq_len (int): Sequence length
584
+ vocab_size (int): Vocabulary size
585
+ num_batches (int): Number of batches for measurement
586
+ device (str): Device to use
587
+
588
+ Returns:
589
+ Dict containing performance metrics
590
+ """
591
+ model.train()
592
+
593
+ # Create dummy optimizer
594
+ optimizer = optim.Adam(model.parameters(), lr=0.001)
595
+
596
+ # Warmup
597
+ for _ in range(5):
598
+ inputs, targets = generate_sample_data(batch_size, seq_len, vocab_size, device)
599
+ logits, router_logits = model(inputs)
600
+ loss = F.cross_entropy(logits.reshape(-1, vocab_size), targets.reshape(-1))
601
+ loss.backward()
602
+ optimizer.zero_grad()
603
+
604
+ # Measure forward pass time
605
+ torch.cuda.synchronize()
606
+ forward_start = time.time()
607
+
608
+ for _ in range(num_batches):
609
+ inputs, targets = generate_sample_data(batch_size, seq_len, vocab_size, device)
610
+ with torch.no_grad():
611
+ logits, router_logits = model(inputs)
612
+
613
+ torch.cuda.synchronize()
614
+ forward_end = time.time()
615
+ forward_time = (forward_end - forward_start) / num_batches
616
+
617
+ # Measure backward pass time
618
+ torch.cuda.synchronize()
619
+ backward_start = time.time()
620
+
621
+ for _ in range(num_batches):
622
+ inputs, targets = generate_sample_data(batch_size, seq_len, vocab_size, device)
623
+ logits, router_logits = model(inputs)
624
+ loss = F.cross_entropy(logits.reshape(-1, vocab_size), targets.reshape(-1))
625
+ loss.backward()
626
+ optimizer.zero_grad()
627
+
628
+ torch.cuda.synchronize()
629
+ backward_end = time.time()
630
+ backward_time = (backward_end - backward_start) / num_batches
631
+
632
+ return {
633
+ "forward_time": forward_time * 1000, # Convert to ms
634
+ "backward_time": backward_time * 1000, # Convert to ms
635
+ "total_time": (forward_time + backward_time) * 1000, # Convert to ms
636
+ }
637
+
638
+
639
+ def compare_methods(args):
640
+ """
641
+ Compare manual looping and MG GEMM implementations.
642
+ """
643
+ device = torch.device(args.device)
644
+
645
+ # Create models
646
+ manual_model = MoEModel(
647
+ vocab_size=args.vocab_size,
648
+ embed_dim=args.embed_dim,
649
+ hidden_dim=args.hidden_dim,
650
+ num_experts=args.num_experts,
651
+ top_k=args.top_k,
652
+ use_mg_gemm=False,
653
+ ).to(device)
654
+
655
+ if has_mg_gemm:
656
+ mg_model = MoEModel(
657
+ vocab_size=args.vocab_size,
658
+ embed_dim=args.embed_dim,
659
+ hidden_dim=args.hidden_dim,
660
+ num_experts=args.num_experts,
661
+ top_k=args.top_k,
662
+ use_mg_gemm=True,
663
+ ).to(device)
664
+ else:
665
+ mg_model = None
666
+
667
+ # Measure performance
668
+ logging.info("Measuring performance of manual looping method...")
669
+ manual_perf = measure_performance(
670
+ manual_model,
671
+ args.batch_size,
672
+ args.seq_len,
673
+ args.vocab_size,
674
+ args.perf_batches,
675
+ device,
676
+ )
677
+
678
+ if mg_model is not None:
679
+ logging.info("Measuring performance of MG GEMM method...")
680
+ mg_perf = measure_performance(
681
+ mg_model,
682
+ args.batch_size,
683
+ args.seq_len,
684
+ args.vocab_size,
685
+ args.perf_batches,
686
+ device,
687
+ )
688
+ else:
689
+ mg_perf = {"forward_time": 0, "backward_time": 0, "total_time": 0}
690
+
691
+ # Log results
692
+ logging.info("\n===== Performance Comparison =====")
693
+ logging.info("Model Configuration:")
694
+ logging.info(f" - Batch Size: {args.batch_size}")
695
+ logging.info(f" - Sequence Length: {args.seq_len}")
696
+ logging.info(f" - Embed Dimension: {args.embed_dim}")
697
+ logging.info(f" - Hidden Dimension: {args.hidden_dim}")
698
+ logging.info(f" - Number of Experts: {args.num_experts}")
699
+ logging.info(f" - Top-K: {args.top_k}")
700
+ logging.info("")
701
+
702
+ logging.info("Manual Looping Method:")
703
+ logging.info(f" - Forward Time: {manual_perf['forward_time']:.2f} ms")
704
+ logging.info(f" - Backward Time: {manual_perf['backward_time']:.2f} ms")
705
+ logging.info(f" - Total Time: {manual_perf['total_time']:.2f} ms")
706
+ logging.info("")
707
+
708
+ if mg_model is not None:
709
+ logging.info("MG GEMM Method:")
710
+ logging.info(f" - Forward Time: {mg_perf['forward_time']:.2f} ms")
711
+ logging.info(f" - Backward Time: {mg_perf['backward_time']:.2f} ms")
712
+ logging.info(f" - Total Time: {mg_perf['total_time']:.2f} ms")
713
+ logging.info("")
714
+
715
+ # Calculate speedup
716
+ forward_speedup = (
717
+ manual_perf["forward_time"] / mg_perf["forward_time"]
718
+ if mg_perf["forward_time"] > 0
719
+ else 0
720
+ )
721
+ backward_speedup = (
722
+ manual_perf["backward_time"] / mg_perf["backward_time"]
723
+ if mg_perf["backward_time"] > 0
724
+ else 0
725
+ )
726
+ total_speedup = (
727
+ manual_perf["total_time"] / mg_perf["total_time"]
728
+ if mg_perf["total_time"] > 0
729
+ else 0
730
+ )
731
+
732
+ logging.info("Speedup (MG GEMM vs Manual):")
733
+ logging.info(f" - Forward Speedup: {forward_speedup:.2f}x")
734
+ logging.info(f" - Backward Speedup: {backward_speedup:.2f}x")
735
+ logging.info(f" - Total Speedup: {total_speedup:.2f}x")
736
+ else:
737
+ logging.info("MG GEMM method not available.")
738
+
739
+
740
+ def train_model(args):
741
+ """
742
+ Train an MoE model.
743
+ """
744
+ device = torch.device(args.device)
745
+
746
+ # Create model
747
+ model = MoEModel(
748
+ vocab_size=args.vocab_size,
749
+ embed_dim=args.embed_dim,
750
+ hidden_dim=args.hidden_dim,
751
+ num_experts=args.num_experts,
752
+ top_k=args.top_k,
753
+ use_mg_gemm=args.use_mg_gemm and has_mg_gemm,
754
+ ).to(device)
755
+
756
+ # Create optimizer
757
+ optimizer = optim.Adam(model.parameters(), lr=args.lr)
758
+
759
+ # Log model information
760
+ logging.info("Model configuration:")
761
+ logging.info(f" - Vocabulary Size: {args.vocab_size}")
762
+ logging.info(f" - Embedding Dimension: {args.embed_dim}")
763
+ logging.info(f" - Hidden Dimension: {args.hidden_dim}")
764
+ logging.info(f" - Number of Experts: {args.num_experts}")
765
+ logging.info(f" - Top-K: {args.top_k}")
766
+ logging.info(f" - Using MG GEMM: {args.use_mg_gemm and has_mg_gemm}")
767
+
768
+ # Training loop
769
+ for epoch in range(args.epochs):
770
+ logging.info(f"\nEpoch {epoch + 1}/{args.epochs}")
771
+
772
+ # Train
773
+ train_metrics = train_epoch(
774
+ model=model,
775
+ optimizer=optimizer,
776
+ batch_size=args.batch_size,
777
+ seq_len=args.seq_len,
778
+ vocab_size=args.vocab_size,
779
+ num_batches=args.train_batches,
780
+ device=device,
781
+ load_balance_coef=args.load_balance_coef,
782
+ )
783
+
784
+ # Evaluate
785
+ eval_metrics = evaluate(
786
+ model=model,
787
+ batch_size=args.batch_size,
788
+ seq_len=args.seq_len,
789
+ vocab_size=args.vocab_size,
790
+ num_batches=args.eval_batches,
791
+ device=device,
792
+ )
793
+
794
+ # Log metrics
795
+ logging.info(
796
+ f"Train Loss: {train_metrics['loss']:.4f} | Train Acc: {train_metrics['acc']:.4f}"
797
+ )
798
+ logging.info(
799
+ f"Eval Loss: {eval_metrics['loss']:.4f} | Eval Acc: {eval_metrics['acc']:.4f}"
800
+ )
801
+ logging.info(f"Epoch Time: {train_metrics['time']:.2f} seconds")
802
+
803
+
804
+ if __name__ == "__main__":
805
+ parser = argparse.ArgumentParser(description="Train MoE model")
806
+
807
+ # Model parameters
808
+ parser.add_argument("--vocab_size", type=int, default=10000, help="Vocabulary size")
809
+ parser.add_argument(
810
+ "--embed_dim", type=int, default=512, help="Embedding dimension"
811
+ )
812
+ parser.add_argument(
813
+ "--hidden_dim", type=int, default=1024, help="Hidden dimension in experts"
814
+ )
815
+ parser.add_argument("--num_experts", type=int, default=8, help="Number of experts")
816
+ parser.add_argument(
817
+ "--top_k", type=int, default=2, help="Top-k experts to route to"
818
+ )
819
+
820
+ # Training parameters
821
+ parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
822
+ parser.add_argument("--seq_len", type=int, default=128, help="Sequence length")
823
+ parser.add_argument("--epochs", type=int, default=3, help="Number of epochs")
824
+ parser.add_argument("--lr", type=float, default=0.001, help="Learning rate")
825
+ parser.add_argument(
826
+ "--train_batches",
827
+ type=int,
828
+ default=100,
829
+ help="Number of training batches per epoch",
830
+ )
831
+ parser.add_argument(
832
+ "--eval_batches", type=int, default=20, help="Number of evaluation batches"
833
+ )
834
+ parser.add_argument(
835
+ "--perf_batches",
836
+ type=int,
837
+ default=50,
838
+ help="Number of batches for performance testing",
839
+ )
840
+ parser.add_argument(
841
+ "--load_balance_coef",
842
+ type=float,
843
+ default=0.01,
844
+ help="Load balancing loss coefficient",
845
+ )
846
+
847
+ # Runtime parameters
848
+ parser.add_argument(
849
+ "--device",
850
+ type=str,
851
+ default="cuda" if torch.cuda.is_available() else "cpu",
852
+ help="Device to use (cuda or cpu)",
853
+ )
854
+ parser.add_argument(
855
+ "--use_mg_gemm",
856
+ action="store_true",
857
+ help="Use MG GEMM implementation if available",
858
+ )
859
+ parser.add_argument(
860
+ "--compare",
861
+ action="store_true",
862
+ help="Compare manual and MG GEMM implementations",
863
+ )
864
+ parser.add_argument("--train", action="store_true", help="Train the model")
865
+
866
+ args = parser.parse_args()
867
+
868
+ # Check for CUDA
869
+ if args.device == "cuda" and not torch.cuda.is_available():
870
+ logging.warning("CUDA not available, using CPU instead.")
871
+ args.device = "cpu"
872
+
873
+ # Log basic information
874
+ logging.info(f"PyTorch version: {torch.__version__}")
875
+ logging.info(f"Device: {args.device}")
876
+ logging.info(f"MG GEMM available: {has_mg_gemm}")
877
+
878
+ # Run the requested action
879
+ if args.compare:
880
+ compare_methods(args)
881
+ elif args.train:
882
+ train_model(args)
883
+ else:
884
+ # Default to comparison if no action specified
885
+ compare_methods(args)
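The load-balancing penalty computed by `compute_load_balancing_loss` above reduces to `num_experts * sum(p_bar_e ** 2)` over the normalized per-expert probability mass, which equals 1.0 for perfectly uniform routing and approaches `num_experts` when routing collapses onto a single expert. The following standalone sketch is illustrative only (it is not part of the uploaded files and only assumes the same formula as the script above):

```python
# Minimal sketch of the load-balancing term used in simpleMoE.py above.
# loss = num_experts * sum(p_bar ** 2), where p_bar is each expert's share of
# the total router probability mass: 1.0 for uniform routing, up to num_experts
# when all tokens collapse onto one expert.
import torch
import torch.nn.functional as F


def load_balancing_loss(router_logits: torch.Tensor, num_experts: int) -> torch.Tensor:
    probs = F.softmax(router_logits, dim=-1)      # (num_tokens, num_experts)
    per_expert = probs.sum(dim=0)                 # probability mass per expert
    per_expert = per_expert / per_expert.sum()    # normalize to a distribution
    return num_experts * torch.sum(per_expert * per_expert)


num_experts = 8
uniform_logits = torch.zeros(1024, num_experts)   # router is indifferent
skewed_logits = torch.zeros(1024, num_experts)
skewed_logits[:, 0] = 10.0                        # nearly everything goes to expert 0

print(load_balancing_loss(uniform_logits, num_experts))  # ~1.0
print(load_balancing_loss(skewed_logits, num_experts))   # close to 8.0
```

Adding this term with a small coefficient (the script's `--load_balance_coef`, default 0.01) therefore nudges the router toward spreading tokens evenly across experts.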
torchtitan/experiments/llama4/README.md ADDED
@@ -0,0 +1,29 @@
1
+ **The Llama 4 folder is still under development.**
2
+
3
+ #### Available features
4
+ - Llama 4 model definition (text-only), including the MoE architecture with token-choice routing
5
+ - Basic FSDP, TP, PP, CP support
6
+ - DCP checkpoint conversion scripts
7
+
8
+ #### Download Llama 4 tokenizer
9
+ ```bash
10
+ # Llama 4 tokenizer.model
11
+ python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E --tokenizer_path "" --hf_token=...
12
+ ```
13
+
14
+ #### To be added
15
+ - Modeling
16
+ - iRoPE implementation
17
+ - load balance loss for token-choice MoE
18
+ - alternative expert-choice MoE
19
+ - multimodal support
20
+ - Kernel integration
21
+ - efficient bfloat16 GroupedGEMM kernels (from PyTorch core)
22
+ - efficient float8 GroupedGEMM kernels (from torchao)
23
+ - Parallelism
24
+ - performant TP implementation and torch.compile support for MoE layers
25
+ - Context Parallel support for FlexAttention, iRoPE, and multimodal inputs
26
+ - Expert Parallel support
27
+ - Testing
28
+ - performance and loss convergence tests
29
+ - CI integration
torchtitan/experiments/llama4/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.66 kB). View file
 
torchtitan/experiments/llama4/model/__pycache__/moe.cpython-312.pyc ADDED
Binary file (10.5 kB). View file
 
torchtitan/experiments/llama4/model/args.py ADDED
@@ -0,0 +1,109 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Optional
10
+
11
+ from torch import nn
12
+ from torchtitan.components.tokenizer import Tokenizer
13
+ from torchtitan.config_manager import JobConfig
14
+
15
+ from torchtitan.protocols.train_spec import BaseModelArgs
16
+ from torchtitan.tools.logging import logger
17
+
18
+
19
+ @dataclass
20
+ class TransformerModelArgs(BaseModelArgs):
21
+ dim: int = 4096
22
+ n_layers: int = 32
23
+ n_heads: int = 32
24
+ n_kv_heads: Optional[int] = None
25
+ vocab_size: int = -1 # defined later by tokenizer
26
+ multiple_of: int = 256 # make SwiGLU hidden layer size a multiple of a large power of 2
27
+ ffn_dim_multiplier: Optional[float] = None
28
+ norm_eps: float = 1e-5
29
+ rope_theta: float = 10000
30
+
31
+ max_seq_len: int = 2048
32
+ # If `True`, then each transformer block init uses its layer ID, and if
33
+ # `False`, each uses the total number of transformer blocks
34
+ depth_init: bool = True
35
+ norm_type: str = "rmsnorm"
36
+
37
+ use_flex_attn: bool = False
38
+ attn_mask_type: str = "causal"
39
+ eos_id: int = 0
40
+
41
+ # MoE args
42
+ moe_enabled: bool = True
43
+ num_experts: int = 8
44
+ use_shared_expert: bool = True
45
+ auto_scale_hidden_dim: bool = True
46
+ # frequency of using MoE layer instead of feedforward layer in a transformer block
47
+ interleave_moe_layer_step: int = 2
48
+ # token-choice
49
+ top_k: int = 1
50
+
51
+ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
52
+ self.norm_type = job_config.model.norm_type
53
+ self.vocab_size = tokenizer.n_words
54
+ self.max_seq_len = job_config.training.seq_len
55
+ self.use_flex_attn = job_config.model.use_flex_attn
56
+
57
+ def get_nparams_and_flops(
58
+ self, model: nn.Module, seq_len: int
59
+ ) -> tuple[int, float]:
60
+ nparams_embedding = 0
61
+ nparams_moe_router = 0
62
+ nparams_shared_expert = 0
63
+ nparams_experts = 0
64
+ nparams_dense = 0
65
+
66
+ for name, p in model.named_parameters():
67
+ if "embedding" in name:
68
+ nparams_embedding += p.numel()
69
+ nparams_dense += p.numel()
70
+ elif "moe.shared_expert" in name:
71
+ nparams_shared_expert += p.numel()
72
+ elif "moe.router" in name:
73
+ nparams_moe_router += p.numel()
74
+ elif "moe.experts" in name:
75
+ nparams_experts += p.numel()
76
+ else:
77
+ nparams_dense += p.numel()
78
+
79
+ nparams_sparse = nparams_moe_router + nparams_shared_expert + nparams_experts
80
+ nparams = nparams_dense + nparams_sparse
81
+ nparams_sparse_active = (
82
+ nparams_moe_router
83
+ + nparams_shared_expert
84
+ + nparams_experts * self.top_k // self.num_experts
85
+ )
86
+
87
+ logger.info(
88
+ f"Total parameter count: dense {nparams_dense:,}, "
89
+ f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}"
90
+ )
91
+
92
+ l, h, q, t = (
93
+ self.n_layers,
94
+ self.n_heads,
95
+ self.dim // self.n_heads,
96
+ seq_len,
97
+ )
98
+ # Reasoning behind the factor of 12 for the self-attention part of the formula:
99
+ # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
100
+ # 2. the flash attention does 1 more matmul recomputation in the backward
101
+ # but recomputation should not be counted in calculating MFU (+0)
102
+ # 3. each matmul performs 1 multiplication and 1 addition (*2)
103
+ # 4. we follow the convention and do not account for sparsity in causal attention
104
+ num_flops_per_token = (
105
+ 6 * (nparams_dense - nparams_embedding + nparams_sparse_active)
106
+ + 12 * l * h * q * t
107
+ )
108
+
109
+ return nparams, num_flops_per_token
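To make the per-token FLOPs estimate in `get_nparams_and_flops` concrete, here is a hedged worked example with made-up parameter counts and a small hypothetical configuration (none of these numbers correspond to a real Llama 4 checkpoint); it only restates the formula above: 6 FLOPs per active non-embedding parameter plus `12 * n_layers * n_heads * head_dim * seq_len` for the attention matmuls.

```python
# Hedged worked example of the FLOPs-per-token estimate (hypothetical numbers).
n_layers, n_heads, dim, seq_len = 4, 8, 256, 2048
head_dim = dim // n_heads                 # q in the formula above

nparams_dense = 50_000_000                # assumed, includes the embedding
nparams_embedding = 10_000_000            # assumed
nparams_sparse_active = 5_000_000         # assumed router + shared expert + active experts

num_flops_per_token = (
    6 * (nparams_dense - nparams_embedding + nparams_sparse_active)   # 270,000,000
    + 12 * n_layers * n_heads * head_dim * seq_len                    #  25,165,824
)
print(f"{num_flops_per_token:,}")  # 295,165,824
```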
torchtitan/experiments/llama4/model/moe.py ADDED
@@ -0,0 +1,228 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+
11
+ from .args import TransformerModelArgs
12
+
13
+
14
+ class GroupedExperts(nn.Module):
15
+ def __init__(
16
+ self,
17
+ dim: int,
18
+ hidden_dim: int,
19
+ num_experts: int,
20
+ ):
21
+ super().__init__()
22
+ self.num_experts = num_experts
23
+ self.w1 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
24
+ self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
25
+ self.w3 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
26
+
27
+ def forward(
28
+ self,
29
+ x: torch.Tensor,
30
+ num_local_tokens_per_expert: torch.Tensor | None = None,
31
+ ) -> torch.Tensor:
32
+ if num_local_tokens_per_expert is not None:
33
+ # a tuple of tensors indexed by experts
34
+ # each with shape (tokens_per_expert(varying), dim)
35
+ x = torch.split(
36
+ x,
37
+ split_size_or_sections=num_local_tokens_per_expert.tolist(),
38
+ dim=0,
39
+ )
40
+ out_experts_splits = []
41
+ for expert_idx, x_expert in enumerate(x):
42
+ w1, w2, w3 = (
43
+ self.w1[expert_idx],
44
+ self.w2[expert_idx],
45
+ self.w3[expert_idx],
46
+ )
47
+ h = F.silu(torch.matmul(x_expert, w1))
48
+ h = h * torch.matmul(x_expert, w3)
49
+ h = torch.matmul(h, w2)
50
+ # h shape (tokens_per_expert(varying), dim)
51
+ out_experts_splits.append(h)
52
+ out = torch.cat(out_experts_splits, dim=0)
53
+
54
+ # TODO: optimize with GroupedGEMM
55
+ # https://github.com/pytorch/pytorch/pull/150374
56
+ # _grouped_mm requires shapes to be a multiple of 8
57
+ # offsets = torch.cumsum(num_local_tokens_per_expert, dim=0, dtype=torch.int32)
58
+ # h = F.silu(torch._grouped_mm(x, self.w1.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16))
59
+ # h = h * torch._grouped_mm(x, self.w3.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
60
+ # out = torch._grouped_mm(h, self.w2.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
61
+ else:
62
+ # x shape (num_experts, tokens_per_expert, dim)
63
+ h = F.silu(torch.bmm(x, self.w1))
64
+ h = h * torch.bmm(x, self.w3)
65
+ # out shape (num_experts, tokens_per_expert, dim)
66
+ out = torch.bmm(h, self.w2)
67
+ return out
68
+
69
+ def init_weights(self, init_std: float):
70
+ nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02)
71
+ nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std)
72
+ nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std)
73
+
74
+
75
+ class TokenChoiceTopKRouter(nn.Module):
76
+ """This class implements token-choice routing. In token-choice top-K routing, each token is
77
+ routed to top K experts based on the router scores.
78
+
79
+ Args:
80
+ gate (nn.Module): Gate module to calculate the scores, typically nn.Linear(dim, num_experts).
81
+ dim (int): Dimension of input tokens.
82
+ num_experts (int): Number of experts in each moe layer.
83
+ top_k (int): Number of experts each token will be routed to in token-choice routing.
84
+ use_sigmoid (bool): Whether to use sigmoid or softmax for router scores. Default is False.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ dim: int,
90
+ num_experts: int,
91
+ top_k: int,
92
+ use_sigmoid: bool = False,
93
+ ):
94
+ super().__init__()
95
+ self.gate = nn.Linear(dim, num_experts, bias=False)
96
+ self.num_experts = num_experts
97
+ self.top_k = top_k
98
+ self.use_sigmoid = use_sigmoid
99
+
100
+ def forward(
101
+ self, x: torch.Tensor
102
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
103
+ """
104
+ Args:
105
+ x (torch.Tensor): Input tensor with shape ``(bs*slen, dim)``.
106
+
107
+ Returns:
108
+ routed_input (torch.Tensor):
109
+ Tokens grouped together by experts indices with shape ``(bs*slen*top_k,)``.
110
+ token_indices (torch.Tensor):
111
+ Token indices for routed_input with shape ``(bs*slen*top_k,)``.
112
+ num_local_tokens_per_expert (torch.Tensor):
113
+ Number of tokens assigned to each expert with shape ``(num_experts,)``.
114
+ """
115
+ # scores shape (bs*slen, num_experts)
116
+ scores = self.gate(x)
117
+
118
+ # By default, sigmoid or softmax is performed in float32 to avoid loss explosion
119
+ if self.use_sigmoid:
120
+ scores = torch.sigmoid(scores.to(torch.float32)).to(x.dtype)
121
+ else:
122
+ scores = F.softmax(scores.to(torch.float32), dim=1).to(x.dtype)
123
+
124
+ # top scores shape (bs*slen, top_k)
125
+ top_scores, selected_experts_indices = torch.topk(scores, k=self.top_k, dim=1)
126
+ # top_scores /= top_scores.sum(dim=-1, keepdim=True).to(x.dtype)
127
+
128
+ # group tokens together by expert indices from 0 to num_experts and pass that to experts forward
129
+ num_local_tokens_per_expert = torch.histc(
130
+ selected_experts_indices.view(-1),
131
+ bins=self.num_experts,
132
+ min=0,
133
+ max=self.num_experts,
134
+ )
135
+ # token_indices_experts_sorted shape (bs*slen*top_k,)
136
+ token_indices_experts_sorted = torch.argsort(
137
+ selected_experts_indices.view(-1), stable=True
138
+ )
139
+ top_scores = top_scores.view(-1)[token_indices_experts_sorted]
140
+ token_indices_experts_sorted = token_indices_experts_sorted // self.top_k
141
+
142
+ return top_scores, token_indices_experts_sorted, num_local_tokens_per_expert
143
+
144
+ def init_weights(self, init_std: float):
145
+ nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std)
146
+
147
+
148
+ # TODO: implement load balancing auxiliary loss for token-choice routing
149
+ class MoE(nn.Module):
150
+ def __init__(self, model_args: TransformerModelArgs):
151
+ super().__init__()
152
+ dim = model_args.dim
153
+ hidden_dim = 4 * model_args.dim
154
+ ffn_dim_multiplier = model_args.ffn_dim_multiplier
155
+ hidden_dim = int(2 * hidden_dim / 3)
156
+ if ffn_dim_multiplier is not None:
157
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
158
+
159
+ num_experts = model_args.num_experts
160
+
161
+ hidden_dim_denom = 1
162
+ if model_args.auto_scale_hidden_dim:
163
+ hidden_dim_denom = model_args.top_k + int(model_args.use_shared_expert)
164
+
165
+ if model_args.auto_scale_hidden_dim:
166
+ hidden_dim = int(hidden_dim / hidden_dim_denom)
167
+ hidden_dim += -hidden_dim % model_args.multiple_of
168
+
169
+ self.experts = GroupedExperts(
170
+ dim=dim, hidden_dim=hidden_dim, num_experts=num_experts
171
+ )
172
+ self.router = TokenChoiceTopKRouter(
173
+ dim=dim, num_experts=num_experts, top_k=model_args.top_k
174
+ )
175
+ self.shared_expert = (
176
+ GroupedExperts(dim=dim, hidden_dim=hidden_dim, num_experts=1)
177
+ if model_args.use_shared_expert
178
+ else None
179
+ )
180
+
181
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
182
+ """
183
+ Args:
184
+ x (torch.Tensor): Input tensor with shape ``(bs, slen, dim)``.
185
+
186
+ Returns:
187
+ out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
188
+ """
189
+ bs, slen, dim = x.shape
190
+ # top_scores and selected_indices shape (bs*slen*top_k,)
191
+ # num_local_tokens_per_expert shape (num_experts,)
192
+ (
193
+ top_scores,
194
+ token_indices,
195
+ num_local_tokens_per_expert,
196
+ ) = self.router(x.reshape(bs * slen, dim))
197
+
198
+ # shape (bs*slen*top_k, dim)
199
+ token_indices = token_indices.reshape(-1, 1).expand(-1, dim)
200
+
201
+ # shape (bs*slen*top_k, dim)
202
+ routed_input = torch.gather(
203
+ x.view(-1, dim),
204
+ dim=0,
205
+ index=token_indices,
206
+ )
207
+ routed_input = routed_input * top_scores.reshape(-1, 1)
208
+
209
+ # shape (bs*slen*top_k, dim)
210
+ routed_output = self.experts(routed_input, num_local_tokens_per_expert)
211
+
212
+ # shared expert
213
+ if self.shared_expert is not None:
214
+ out = self.shared_expert(x.reshape(1, bs * slen, dim)).reshape(
215
+ bs * slen, dim
216
+ )
217
+ else:
218
+ out = torch.zeros_like(x.reshape(bs * slen, dim))
219
+
220
+ out = out.scatter_add(dim=0, index=token_indices, src=routed_output)
221
+ out = out.reshape(bs, slen, dim)
222
+ return out
223
+
224
+ def init_weights(self, init_std: float):
225
+ self.experts.init_weights(init_std)
226
+ self.router.init_weights(init_std)
227
+ if self.shared_expert is not None:
228
+ self.shared_expert.init_weights(init_std)
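The routing path above hinges on a small amount of index bookkeeping: `torch.histc` counts how many routed rows each expert receives, a stable `torch.argsort` groups the flattened expert choices by expert id, and integer division by `top_k` maps each routed row back to its source token so the MoE layer can `gather` inputs and `scatter_add` outputs. The standalone sketch below is illustrative only, with hypothetical shapes and hand-picked expert assignments (it is not the module itself and omits the score weighting):

```python
# Sketch of the token-choice index bookkeeping used by TokenChoiceTopKRouter/MoE.
import torch

num_tokens, num_experts, top_k, dim = 6, 4, 2, 3
x = torch.arange(num_tokens * dim, dtype=torch.float32).reshape(num_tokens, dim)

# Hypothetical router decision: each token picks top_k expert ids.
selected = torch.tensor([[0, 2], [1, 2], [0, 3], [2, 3], [1, 1], [0, 2]])

# Rows each expert will receive (histc expects a floating-point input here).
tokens_per_expert = torch.histc(
    selected.view(-1).float(), bins=num_experts, min=0, max=num_experts
)

order = torch.argsort(selected.view(-1), stable=True)  # routed rows grouped by expert
token_idx = order // top_k                             # source token of each routed row

routed_input = x[token_idx]                            # (num_tokens * top_k, dim)

# Scatter the routed rows back onto their source tokens (unweighted here).
out = torch.zeros_like(x)
out.scatter_add_(0, token_idx.unsqueeze(1).expand(-1, dim), routed_input)

print(tokens_per_expert)                 # tensor([3., 3., 4., 2.])
print(torch.allclose(out, top_k * x))    # True: every token was gathered top_k times
```

In the real layer the routed rows are weighted by the router scores before being scattered back, so the accumulated output is a score-weighted mixture rather than a plain sum.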
torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.py ADDED
@@ -0,0 +1,536 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import os
9
+ import time
10
+ from dataclasses import dataclass
11
+ from typing import Any, Optional
12
+
13
+ import torch
14
+ import torch.distributed as dist
15
+ from torch.distributed.tensor import DeviceMesh, distribute_tensor, DTensor, Shard
16
+ from torch.distributed.tensor._utils import compute_local_shape_and_global_offset
17
+ from torchtitan.components.checkpoint import MODEL
18
+ from torchtitan.config_manager import JobConfig
19
+ from torchtitan.tools.logging import init_logger, logger
20
+ from torchtitan.train import Trainer
21
+
22
+ # Sharding dims for MP checkpoints
23
+
24
+ column_parallel = [
25
+ "tok_embeddings",
26
+ "wq",
27
+ "wk",
28
+ "wv",
29
+ "wqkv",
30
+ "w_in_shared_FD",
31
+ "w_out_eF_D",
32
+ "w_swiglu_FD",
33
+ "output",
34
+ "_linear",
35
+ "c_fc",
36
+ "vision_projection",
37
+ ]
38
+
39
+ row_parallel = [
40
+ "wo",
41
+ "w_out_shared_DF",
42
+ "w_in_eD_F",
43
+ "moe_w_swiglu_eD_F",
44
+ "c_proj",
45
+ ]
46
+
47
+
48
+ def convert_to_titan_fqns(fqn: str) -> list[str]:
49
+ # From the stored checkpoint keys to TorchTitan keys.
50
+ if "wqkv" in fqn and "layer_norm_weight" not in fqn:
51
+ ret = []
52
+ for k in ("wq", "wk", "wv"):
53
+ ret.append(fqn.replace("wqkv", k))
54
+ return ret
55
+ return [fqn]
56
+
57
+
58
+ def get_shard_dim(fqn: str) -> Optional[int]:
59
+ if "bias" in fqn:
60
+ # Some bias params are still sharded
61
+ if "resblocks" in fqn:
62
+ for k in ("wq", "wk", "wv", "c_fc"):
63
+ if k in fqn:
64
+ return 0
65
+ return None
66
+ elif any([x in fqn for x in column_parallel]):
67
+ return 0
68
+ elif any([x in fqn for x in row_parallel]):
69
+ return 1
70
+ else:
71
+ return None
72
+
73
+
74
+ def split_fused_qkv(shards: list[torch.Tensor]) -> tuple[torch.Tensor, ...]:
75
+ qkvs = [torch.split(shard, [640, 128, 128]) for shard in shards]
76
+ q = torch.cat([qkv[0] for qkv in qkvs], dim=0)
77
+ k = torch.cat([qkv[1] for qkv in qkvs], dim=0)
78
+ v = torch.cat([qkv[2] for qkv in qkvs], dim=0)
79
+ return q, k, v
80
+
81
+
82
+ @dataclass
83
+ class _Assignment:
84
+ loader_id: int
85
+ filename: str
86
+ fqns: tuple[str, ...]
87
+ shapes: tuple[torch.Size, ...]
88
+ dtypes: tuple[torch.dtype, ...]
89
+
90
+
91
+ @dataclass
92
+ class _AssignmentRound:
93
+ loader_assignments: dict[int, _Assignment] # List of assignments for each loader
94
+
95
+
96
+ class CheckpointConverter:
97
+ TOTAL_SHARDS = 8
98
+
99
+ def __init__(
100
+ self,
101
+ process_group: dist.ProcessGroup,
102
+ path: str,
103
+ loader_every_n_ranks: int = 8,
104
+ ) -> None:
105
+ self.path = path
106
+ self.pg = process_group
107
+ self.my_rank = dist.get_rank(self.pg)
108
+ self.loader_every_n_ranks = loader_every_n_ranks
109
+ self.loader_id = self.my_rank // loader_every_n_ranks
110
+ self.should_load = (
111
+ self.my_rank % loader_every_n_ranks == 0
112
+ and self.loader_id < CheckpointConverter.TOTAL_SHARDS
113
+ )
114
+ self.total_loader = CheckpointConverter.TOTAL_SHARDS
115
+ self.titan_fqn_to_stored_fqn: dict[str, str] = {}
116
+ self.stored_fqn_to_titan_fqn: dict[str, list[str]] = {}
117
+ self.total_send_bytes = 0
118
+ self.total_recv_bytes = 0
119
+
120
+ def convert(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
121
+ begin = time.time()
122
+ self._load_metadata()
123
+ self._create_fqn_mappings(state_dict)
124
+ rounds = self._get_load_assignments(state_dict)
125
+
126
+ for assignments in rounds:
127
+ loader_assignments = assignments.loader_assignments
128
+ loaded_state_dict = None
129
+ # Let each loader load its own data and move it to its GPU.
130
+ for i in range(self.total_loader):
131
+ # This loader doesn't have any loading assignment for this round.
132
+ if i not in loader_assignments:
133
+ continue
134
+ # This rank is not the loader
135
+ if i != self.loader_id or not self.should_load:
136
+ continue
137
+ loaded_state_dict = self._load_round(loader_assignments[i])
138
+
139
+ results = []
140
+ for i in range(self.total_loader):
141
+ if i not in loader_assignments:
142
+ continue
143
+
144
+ if i == self.loader_id and self.should_load:
145
+ # This rank is the loader. It needs to send the loaded data to
146
+ # the other ranks.
147
+ assert loaded_state_dict is not None
148
+ results.append(
149
+ self._reshard_send(loader_assignments[i], loaded_state_dict)
150
+ )
151
+ else:
152
+ results.append(
153
+ self._reshard_receive(loader_assignments[i], state_dict)
154
+ )
155
+
156
+ self._reshard(results, state_dict)
157
+
158
+ torch.cuda.synchronize()
159
+ logger.info(f"Checkpoint conversion took {time.time() - begin:.2f} seconds.")
160
+ logger.info(f"Total send bytes: {self.total_send_bytes / 1e9:.2f} GB")
161
+ logger.info(f"Total recv bytes: {self.total_recv_bytes / 1e9:.2f} GB")
162
+ return state_dict
163
+
164
+ def _get_file_path(self, loader_id: int) -> str:
165
+ return os.path.join(self.path, f"consolidated.0{loader_id}.pth")
166
+
167
+ def _load_metadata(self) -> None:
168
+ if not self.should_load:
169
+ self.read_dict = {}
170
+ return
171
+ self.read_dict = torch.load(
172
+ self._get_file_path(self.loader_id),
173
+ mmap=True,
174
+ weights_only=False,
175
+ )
176
+
177
+ def _create_fqn_mappings(self, state_dict: dict[str, torch.Tensor]) -> None:
178
+ if not self.read_dict:
179
+ return
180
+
181
+ # Create the mapping from the stored checkpoint keys to TorchTitan keys.
182
+ for fqn in list(self.read_dict.keys()):
183
+ titan_fqns = convert_to_titan_fqns(fqn)
184
+ # We don't know how to process _extra_state
185
+ if "_extra_state" in fqn:
186
+ self.read_dict.pop(fqn)
187
+ continue
188
+
189
+ if titan_fqns[0] not in state_dict:
190
+ for titan_fqn in titan_fqns:
191
+ assert titan_fqn not in state_dict
192
+ self.read_dict.pop(fqn)
193
+ continue
194
+ self.stored_fqn_to_titan_fqn[fqn] = titan_fqns
195
+ for titan_fqn in titan_fqns:
196
+ self.titan_fqn_to_stored_fqn[titan_fqn] = fqn
197
+
198
+ assert set(state_dict.keys()) == set(self.titan_fqn_to_stored_fqn.keys()), (
199
+ set(state_dict.keys()) - set(self.titan_fqn_to_stored_fqn.keys()),
200
+ set(self.titan_fqn_to_stored_fqn.keys()) - set(state_dict.keys()),
201
+ )
202
+
203
+ def _get_load_assignments(
204
+ self, state_dict: dict[str, torch.Tensor]
205
+ ) -> list[_AssignmentRound]:
206
+ if self.my_rank == 0:
207
+ rounds: list[_AssignmentRound] = []
208
+ size = 0
209
+ fqns = []
210
+ shapes = []
211
+ dtypes = []
212
+
213
+ # All loaders must load all the FQNs because the checkpoint is purely TP sharded.
214
+ all_keys = list(self.read_dict.keys())
215
+ for fqn in all_keys:
216
+ fqns.append(fqn)
217
+ shapes.append(self.read_dict[fqn].shape)
218
+ dtypes.append(self.read_dict[fqn].dtype)
219
+ size += self.read_dict[fqn].numel() * self.read_dict[fqn].element_size()
220
+ if size < 1e9 and fqn != all_keys[-1]:
221
+ continue
222
+
223
+ logger.info(f"Adding {fqns} to round {len(rounds)}")
224
+ round_assignment = _AssignmentRound(loader_assignments={})
225
+ for loader_id in range(self.total_loader):
226
+ path = self._get_file_path(loader_id)
227
+ round_assignment.loader_assignments[loader_id] = _Assignment(
228
+ filename=path,
229
+ fqns=tuple(fqns),
230
+ shapes=tuple(shapes),
231
+ dtypes=tuple(dtypes),
232
+ loader_id=loader_id,
233
+ )
234
+ rounds.append(round_assignment)
235
+ size = 0
236
+ fqns.clear()
237
+ shapes.clear()
238
+ dtypes.clear()
239
+
240
+ object_list: list[Any] = [
241
+ rounds,
242
+ self.titan_fqn_to_stored_fqn,
243
+ self.stored_fqn_to_titan_fqn,
244
+ ]
245
+ else:
246
+ object_list = [None, None, None]
247
+
248
+ dist.broadcast_object_list(object_list, src=0, group=self.pg)
249
+ rounds = object_list[0]
250
+ self.titan_fqn_to_stored_fqn = object_list[1]
251
+ self.stored_fqn_to_titan_fqn = object_list[2]
252
+ return rounds
253
+
254
+ def _load_round(self, assignment: _Assignment) -> dict[str, torch.Tensor]:
255
+ ret = {}
256
+ assert self.read_dict
257
+ for fqn in assignment.fqns:
258
+ ret[fqn] = self.read_dict[fqn].to(device="cuda")
259
+ return ret
260
+
261
+ def _reshard_send(
262
+ self,
263
+ assignment: _Assignment,
264
+ loaded_state_dict: dict[str, torch.Tensor],
265
+ ) -> dict[str, torch.Tensor]:
266
+ flatten_tensors = [t.flatten() for t in loaded_state_dict.values()]
267
+ flatten_tensor = torch.concat(flatten_tensors)
268
+ assert self.loader_id == assignment.loader_id
269
+ rank = self.loader_id * self.loader_every_n_ranks
270
+ assert rank == self.my_rank
271
+ logger.info(f"Sending {assignment.filename} from {rank} {self.loader_id}")
272
+ logger.info(f"Sending {assignment.fqns}")
273
+ dist.broadcast(flatten_tensor, src=rank, group=self.pg)
274
+ self.total_send_bytes += flatten_tensor.numel() * flatten_tensor.element_size()
275
+ return loaded_state_dict
276
+
277
+ def _reshard_receive(
278
+ self, assignment: _Assignment, state_dict: dict[str, torch.Tensor]
279
+ ) -> dict[str, torch.Tensor]:
280
+ flatten_tensor = torch.empty(
281
+ sum(math.prod(s) for s, d in zip(assignment.shapes, assignment.dtypes)),
282
+ dtype=assignment.dtypes[0],
283
+ device="cuda",
284
+ )
285
+ rank = assignment.loader_id * self.loader_every_n_ranks
286
+ dist.broadcast(flatten_tensor, src=rank, group=self.pg)
287
+ self.total_recv_bytes += flatten_tensor.numel() * flatten_tensor.element_size()
288
+
289
+ ret: dict[str, torch.Tensor] = {}
290
+ loc = 0
291
+ for fqn, shape, dtype in zip(
292
+ assignment.fqns, assignment.shapes, assignment.dtypes
293
+ ):
294
+ n_ele = math.prod(shape)
295
+ ret[fqn] = flatten_tensor[loc : loc + n_ele].view(shape)
296
+ loc += n_ele
297
+ return ret
298
+
299
+ def _reshard(
300
+ self,
301
+ results: list[dict[str, torch.Tensor]],
302
+ state_dict: dict[str, torch.Tensor],
303
+ ) -> None:
304
+ def _inplace_copy(fqn: str, full_tensors: tuple[torch.Tensor, ...]):
305
+ titan_fqns = self.stored_fqn_to_titan_fqn[fqn]
306
+ assert len(titan_fqns) == len(full_tensors)
307
+ for titan_fqn, full_tensor in zip(titan_fqns, full_tensors):
308
+ dtensor = state_dict[titan_fqn]
309
+ logger.info(f"{titan_fqn} {full_tensor.sum()}")
310
+ assert isinstance(dtensor, DTensor)
311
+ shape, offset = compute_local_shape_and_global_offset(
312
+ full_tensor.shape, dtensor.device_mesh, dtensor.placements
313
+ )
314
+ slices = [
315
+ slice(cur_offset, cur_offset + cur_shape)
316
+ for cur_shape, cur_offset in zip(shape, offset)
317
+ ]
318
+ logger.info(
319
+ f"Copying {titan_fqn} with {slices=} {dtensor._local_tensor.shape=} "
320
+ f"{shape=} {offset=} {self.my_rank=} {dtensor.shape=} {full_tensor.shape=} "
321
+ f"{dtensor.placements=} {dtensor.device_mesh=} "
322
+ )
323
+ dtensor.to_local().copy_(full_tensor[slices])
324
+
325
+ def _concat_shards(fqn, shards: list[torch.Tensor]) -> tuple[torch.Tensor, ...]:
326
+ if "wqkv" in fqn:
327
+ if "layer_norm" in fqn:
328
+ return (shards[0],)
329
+ return split_fused_qkv(shards)
330
+
331
+ shard_dim = get_shard_dim(fqn)
332
+ if shard_dim is None:
333
+ return (shards[0],)
334
+ return (torch.cat(shards, dim=shard_dim),)
335
+
336
+ fqns = list(results[0].keys())
337
+ for result in results:
338
+ assert list(result.keys()) == fqns
339
+
340
+ for fqn in fqns:
341
+ full_tensors = _concat_shards(fqn, [result[fqn] for result in results])
342
+ _inplace_copy(fqn, full_tensors)
343
+
344
+
345
+ def _create_verified_state_dict(
346
+ pg: dist.ProcessGroup, mesh: DeviceMesh
347
+ ) -> dict[str, torch.Tensor]:
348
+ placements = [Shard(0)]
349
+ state_dict = {
350
+ "tok_embeddings.weight": torch.rand(
351
+ 25256 * 8, 5120, device="cuda", dtype=torch.bfloat16
352
+ ),
353
+ "layers.47.attention.wqkv.layer_norm_weight": torch.rand(
354
+ 5120, device="cuda", dtype=torch.bfloat16
355
+ ),
356
+ "layers.47.attention.wq.weight": torch.rand(
357
+ 640 * 8, 5120, device="cuda", dtype=torch.bfloat16
358
+ ),
359
+ "layers.47.attention.wk.weight": torch.rand(
360
+ 128 * 8, 5120, device="cuda", dtype=torch.bfloat16
361
+ ),
362
+ "layers.47.attention.wv.weight": torch.rand(
363
+ 128 * 8, 5120, device="cuda", dtype=torch.bfloat16
364
+ ),
365
+ "layers.47.attention.wo.weight": torch.rand(
366
+ 5120, 640 * 8, device="cuda", dtype=torch.bfloat16
367
+ ),
368
+ # "layers.47.feed_forward.router_DE": torch.rand(5120, 128, device="cuda", dtype=torch.bfloat16),
369
+ # "layers.47.feed_forward.running_gate_stats_3E": torch.rand(3, 128, device="cuda", dtype=torch.bfloat16),
370
+ # "layers.47.feed_forward.global_gate_stats_3E": torch.rand(3, 128, device="cuda", dtype=torch.bfloat16),
371
+ "layers.47.feed_forward.w_in_shared_FD.weight": torch.rand(
372
+ 1024 * 8, 5120, device="cuda", dtype=torch.bfloat16
373
+ ),
374
+ "layers.47.feed_forward.w_out_shared_DF.weight": torch.rand(
375
+ 5120, 1024 * 8, device="cuda", dtype=torch.bfloat16
376
+ ),
377
+ "layers.47.feed_forward.w_swiglu_FD.weight": torch.rand(
378
+ 1024 * 8, 5120, device="cuda", dtype=torch.bfloat16
379
+ ),
380
+ "layers.47.feed_forward.norm.weight": torch.rand(
381
+ 5120, device="cuda", dtype=torch.bfloat16
382
+ ),
383
+ "layers.47.feed_forward.experts.moe_w_in_eD_F": torch.rand(
384
+ 655360, 1024 * 8, device="cuda", dtype=torch.bfloat16
385
+ ),
386
+ "layers.47.feed_forward.experts.moe_w_out_eF_D": torch.rand(
387
+ 131072 * 8, 5120, device="cuda", dtype=torch.bfloat16
388
+ ),
389
+ "layers.47.feed_forward.experts.moe_w_swiglu_eD_F": torch.rand(
390
+ 655360, 1024 * 8, device="cuda", dtype=torch.bfloat16
391
+ ),
392
+ }
393
+ return {k: distribute_tensor(v, mesh, placements) for k, v in state_dict.items()}
394
+
395
+
396
+ def _verify_state_dict(
397
+ state_dict: dict[str, torch.Tensor], path: str, rank: int
398
+ ) -> None:
399
+ stored_state_dicts = [
400
+ torch.load(
401
+ os.path.join(path, f"consolidated.0{i}.pth"),
402
+ map_location="cpu",
403
+ weights_only=False,
404
+ mmap=True,
405
+ )
406
+ for i in range(8)
407
+ ]
408
+
409
+ def read_and_verify_tensor(fqn: str, dtensor: DTensor) -> None:
410
+ logger.info(f"Verifying {fqn} {dtensor.shape=} {dtensor.placements=} ")
411
+ shards = [stored_state_dicts[i][fqn] for i in range(8)]
412
+ full_tensor = dtensor.full_tensor()
413
+ logger.info(f"Gather {fqn} {full_tensor.shape} completely.")
414
+
415
+ if rank > 0:
416
+ return
417
+
418
+ if len(shards[0].shape) == 1:
419
+ assert full_tensor.shape == shards[0].shape, fqn
420
+ assert torch.allclose(shards[0].to(device="cuda"), full_tensor), fqn
421
+ return
422
+ elif shards[0].shape[0] == full_tensor.shape[0]:
423
+ concat_shards = torch.cat(shards, dim=1)
424
+ logger.info(f"Load {fqn} completely.")
425
+ elif shards[0].shape[1] == full_tensor.shape[1]:
426
+ concat_shards = torch.cat(shards, dim=0)
427
+ logger.info(f"Load {fqn} completely.")
428
+
429
+ concat_shards = concat_shards.to(device="cuda")
430
+ logger.info(f"Move to GPU {fqn} completely.")
431
+
432
+ assert concat_shards.shape == full_tensor.shape, fqn
433
+ assert concat_shards.dtype == full_tensor.dtype, fqn
434
+ assert concat_shards.device == full_tensor.device, fqn
435
+ assert torch.allclose(concat_shards, full_tensor), fqn
436
+
437
+ for k, v in state_dict.items():
438
+ if "wq" in k and "wqkv" not in k:
439
+ pass
440
+ elif "wk" in k:
441
+ pass
442
+ elif "wv" in k:
443
+ pass
444
+ else:
445
+ assert v is not None, k
446
+ read_and_verify_tensor(k, v)
447
+
448
+
449
+ if __name__ == "__main__":
450
+ init_logger()
451
+ config = JobConfig()
452
+ config.parser.add_argument(
453
+ "--checkpoint.convert_path",
454
+ type=str,
455
+ default="",
456
+ help="""Specify the path of the target checkpoint to convert.""",
457
+ )
458
+ config.parser.add_argument(
459
+ "--checkpoint.convert_load_every_n_ranks",
460
+ type=int,
461
+ default=8,
462
+ help="""
463
+ Specify the interval at which ranks are assigned to load checkpoints.
464
+
465
+ For example, if this number is 4, then ranks 0, 4, 8, ... will load the
466
+ checkpoint. Each loader is responsible for loading one file. If there
467
+ are more loaders than files, only the first few loaders will be assigned
468
+ to load the checkpoint. The default value is 8.
469
+ """,
470
+ )
471
+ config.parser.add_argument(
472
+ "--checkpoint.fake_model",
473
+ action="store_true",
474
+ help="""If true, the model will be fake.""",
475
+ )
476
+ config.parse_args()
477
+ assert config.checkpoint.convert_path != ""
478
+
479
+ trainer: Optional[Trainer] = None
480
+
481
+ try:
482
+ trainer = Trainer(config)
483
+ if os.path.exists(trainer.checkpointer.folder):
484
+ raise RuntimeError(
485
+ "The checkpoint folder already exists. Abort to avoid overwriting "
486
+ f"the checkpoint. {trainer.checkpointer.folder=}"
487
+ )
488
+ if config.checkpoint.fake_model:
489
+ state_dict = _create_verified_state_dict(
490
+ trainer.world_mesh.get_group(), trainer.world_mesh
491
+ )
492
+ else:
493
+ state_dict = trainer.checkpointer.states[MODEL].state_dict()
494
+
495
+ size = 0
496
+ for v in state_dict.values():
497
+ size += v.numel() * v.element_size()
498
+ logger.info(f"Total size of the model: {size / 1e9:.2f} GB")
499
+
500
+ # PP is not supported yet; we would need to iterate over the PP dimension and
501
+ # extract the corresponding state_dict and device_mesh.
502
+ if "freqs_cis" in state_dict:
503
+ state_dict.pop("freqs_cis")
504
+
505
+ state_dict = CheckpointConverter(
506
+ process_group=trainer.world_mesh.get_group(),
507
+ path=config.checkpoint.convert_path,
508
+ loader_every_n_ranks=config.checkpoint.convert_load_every_n_ranks,
509
+ ).convert(state_dict)
510
+
511
+ class DummyModel:
512
+ def __init__(self, state_dict: dict[str, torch.Tensor]) -> None:
513
+ self._state_dict = state_dict
514
+
515
+ def state_dict(self) -> dict[str, torch.Tensor]:
516
+ return self._state_dict
517
+
518
+ if config.checkpoint.fake_model:
519
+ begin = time.time()
520
+ _verify_state_dict(
521
+ state_dict,
522
+ config.checkpoint.convert_path,
523
+ trainer.world_mesh.get_rank(),
524
+ )
525
+ dist.barrier()
526
+ logger.info(f"Verified state_dict in {time.time() - begin:.2f} seconds.")
527
+ else:
528
+ # oh, this is pretty bad, when can we get rid of the freqs_cis issue?
529
+ state_dict["freqs_cis"] = None
530
+ trainer.checkpointer.states[MODEL] = DummyModel(state_dict)
531
+ trainer.checkpointer.model_weights_only = True
532
+ trainer.checkpointer.export_dtype = next(iter(state_dict.values())).dtype
533
+ trainer.checkpointer.save(curr_step=0, force=True)
534
+ time.sleep(2)
535
+ finally:
536
+ pass
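`_reshard_send` and `_reshard_receive` above rely on one invariant: the sender flattens and concatenates its tensors into a single buffer (which is what actually gets broadcast), and each receiver slices that buffer back apart using the `(fqn, shape, dtype)` metadata carried by the `_Assignment`. The sketch below shows only that pack/unpack round trip, without any `torch.distributed` calls and with made-up tensor names and shapes:

```python
# Sketch of the flatten/concatenate and slice-back bookkeeping used above.
import math
import torch

tensors = {
    "layers.0.attention.wq.weight": torch.randn(8, 4),   # hypothetical shapes
    "layers.0.attention.wk.weight": torch.randn(2, 4),
    "norm.weight": torch.randn(4),
}

# Sender side: one flat buffer per round (this is what dist.broadcast would carry).
flat = torch.cat([t.flatten() for t in tensors.values()])

# Receiver side: rebuild each tensor from the shared (fqn, shape) metadata.
rebuilt, loc = {}, 0
for fqn, t in tensors.items():
    n = math.prod(t.shape)
    rebuilt[fqn] = flat[loc : loc + n].view(t.shape)
    loc += n

for fqn, t in tensors.items():
    assert torch.equal(rebuilt[fqn], t), fqn
print("round trip ok")
```

The real implementation then copies each rebuilt full tensor into the local shard of the target `DTensor` via `compute_local_shape_and_global_offset`, which is the step that does the actual resharding.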
torchtitan/experiments/multimodal/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from mm_dataset import build_mm_dataloader
8
+
9
+ from torchtitan.components.loss import build_cross_entropy_loss
10
+ from torchtitan.components.lr_scheduler import build_lr_schedulers
11
+ from torchtitan.components.optimizer import build_optimizers
12
+ from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer
13
+ from torchtitan.models.llama3 import parallelize_llama, pipeline_llama
14
+ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
15
+
16
+ from .model import ModelArgs, MultimodalDecoder, VisionEncoder
17
+
18
+ __all__ = ["VisionEncoder", "ModelArgs", "MultimodalDecoder"]
19
+
20
+ llama4_mm_configs = {
21
+ # TODO: add configs for llama4 multimodal
22
+ }
23
+
24
+ register_train_spec(
25
+ TrainSpec(
26
+ name="llama4_multimodal",
27
+ cls=MultimodalDecoder,
28
+ config=llama4_mm_configs,
29
+ parallelize_fn=parallelize_llama,
30
+ pipelining_fn=pipeline_llama,
31
+ build_optimizers_fn=build_optimizers,
32
+ build_lr_schedulers_fn=build_lr_schedulers,
33
+ build_dataloader_fn=build_mm_dataloader,
34
+ build_tokenizer_fn=build_tiktoken_tokenizer,
35
+ build_loss_fn=build_cross_entropy_loss,
36
+ )
37
+ )
torchtitan/experiments/multimodal/mm_collator.py ADDED
@@ -0,0 +1,227 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
8
+
9
+ from dataclasses import dataclass
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+ from tokenizer.tiktoken import IGNORE_INDEX
16
+
17
+ from torch.nn.utils.rnn import pad_sequence
18
+
19
+
20
+ def padded_collate(
21
+ batch: List[Dict[str, List[int]]],
22
+ padding_idx: int = 0,
23
+ ignore_idx: int = -100,
24
+ ) -> Dict[str, torch.Tensor]:
25
+ """Pad a batch of sequences to the longest sequence length in the batch, and
26
+ convert integer lists to tensors.
27
+
28
+ Args:
29
+ batch (List[Dict[str, List[int]]]): A list of dictionaries containing input, label pairs.
30
+ padding_idx (int): Padding index for input ids. Defaults to 0.
31
+ ignore_idx (int): Padding index for labels. Defaults to -100.
32
+
33
+ Returns:
34
+ Dict[str, torch.Tensor]: Collated input and label tensors.
35
+
36
+ Example:
37
+ >>> token_pairs = [
38
+ >>> {"input_ids": [1, 2, 3], "labels": [4, 5, 6]},
39
+ >>> {"input_ids": [7,], "labels": [10,]},
40
+ >>> ]
41
+ >>> collated = padded_collate(
42
+ >>> batch=token_pairs,
43
+ >>> padding_idx=padding_idx,
44
+ >>> ignore_idx=ignore_idx,
45
+ >>> )
46
+ >>> collated["input_ids"]
47
+ >>> tensor([[1, 2, 3], [7, 0, 0]])
48
+ >>> collated["labels"]
49
+ >>> tensor([[4, 5, 6], [10, -100, -100]])
50
+ """
51
+ input_ids = pad_sequence(
52
+ [x["input_ids"] for x in batch],
53
+ batch_first=True,
54
+ padding_value=padding_idx,
55
+ )
56
+ labels = pad_sequence(
57
+ [x["labels"] for x in batch],
58
+ batch_first=True,
59
+ padding_value=ignore_idx,
60
+ )
61
+
62
+ input_ids_seq_len = input_ids.shape[-1]
63
+ labels_seq_len = labels.shape[-1]
64
+
65
+ # Hack to pad correctly and not use max_seq_len, which is costly
66
+ if input_ids_seq_len > labels_seq_len:
67
+ labels = F.pad(
68
+ labels, (0, input_ids_seq_len - labels_seq_len), value=ignore_idx
69
+ )
70
+ elif labels_seq_len > input_ids_seq_len:
71
+ input_ids = F.pad(
72
+ input_ids,
73
+ (0, labels_seq_len - input_ids_seq_len),
74
+ value=padding_idx,
75
+ )
76
+ return {"input_ids": input_ids, "labels": labels}
77
+
78
+
79
+ # NOTE Inspired from torchtune.data._collate.py
80
+ @dataclass
81
+ class MultiModalCollator:
82
+ padding_idx: int = 128004
83
+ ignore_idx: int = IGNORE_INDEX
84
+ pad_max_tiles: Optional[int] = None
85
+ pad_max_images: Optional[int] = None
86
+
87
+ def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
88
+ """Pad a batch of text sequences, tiled image tensors, aspect ratios,
89
+ and cross attention masks. This can be used for both training and inference.
90
+
91
+ ``batch`` is expected to be a list of sample dicts containing the following::
92
+ - "input_ids": List[int] of length text_seq_len, varies across samples
93
+ - "labels": List[int] of length text_seq_len, varies across samples
94
+ - "encoder_input": Dict[str, List[torch.Tensor]]
95
+ - "images": List[torch.Tensor], each with shape (n_tiles, c, h, w)
96
+ - "aspect_ratio": List[torch.Tensor], each with shape (2, ) to indicate h_ratio, w_ratio
97
+
98
+ Shape notation:
99
+ - c = channel dim
100
+ - h = height dim
101
+ - w = weight dim
102
+
103
+ Note:
104
+ For each element in the batch, ``len(images) == len(aspect_ratio)``.
105
+
106
+ This collater does the following:
107
+ (1) Pad text sequence and encoder mask to the longest sequence length in the batch
108
+ (2) Pad image tensors in the tile dimension with zeros to the largest number
109
+ of tiles in the batch
110
+ (3) Add empty images of zeros to samples up to max number of images in the batch
111
+ (4) Pad aspect ratios with (1,1) for all added padding images
112
+
113
+ Args:
114
+ batch (List[Dict[str, Any]]): A list of sample dicts containing input_ids,
115
+ labels, images, and aspect_ratio.
116
+ padding_idx (int): Padding index for input token ids. Defaults to 0.
117
+ ignore_idx (int): Padding index for labels. Defaults to -100.
118
+ pad_max_tiles (Optional[int]): Maximum number of tiles to pad to. If None, will pad to the largest number of tiles
119
+ in the batch. Defaults to None.
120
+ pad_max_images (Optional[int]): Maximum number of images to pad to. If None, will pad to the largest number of images
121
+ in the batch. Defaults to None.
122
+
123
+ Returns:
124
+ Dict[str, Tensor]: Collated tokens, labels, images, aspect_ratio tensors.
125
+ - tokens: Tensor of shape (bsz, max_seq_len)
126
+ - labels: Tensor of shape (bsz, max_seq_len)
127
+ - images: Tensor of shape (bsz, max_num_images, max_num_tiles, c, h, w)
128
+ - aspect_ratio: Tensor of shape (bsz, max_num_images, 2)
129
+
130
+ Example:
131
+ >>> image_id = 1
132
+ >>> tokens_per_tile = 5
133
+ >>> c, h, w = 1, 1, 1
134
+ >>> batch = [
135
+ ... {
136
+ ... "input_ids": [1, 2, 1, 3], "labels": [4, 5, 6, 7],
137
+ ... "encoder_input": {
138
+ ... # One image with two tiles, one image with three tiles
139
+ ... "images": [torch.ones(2, c, h, w), torch.ones(3, c, h, w)],
140
+ ... "aspect_ratio": [torch.tensor([1, 2]), torch.tensor([1, 3])],
141
+ ... },
142
+ ... },
143
+ ... {
144
+ ... "input_ids": [1, 4], "labels": [8, 9],
145
+ ... "encoder_input": {
146
+ ... # One image with four tiles
147
+ ... "images": [torch.ones(4, c, h, w)],
148
+ ... "aspect_ratio": [torch.tensor([2, 2])],
149
+ ... },
150
+ ... },
151
+ ... ]
152
+ ... collator = MultiModalCollator(pad_max_tiles=4)
153
+ >>> model_inputs = collator(batch=batch)
154
+ >>> print(model_inputs["input_ids"])
155
+ tensor([[1, 2, 1, 3],
156
+ [1, 4, 0, 0]])
157
+ >>> print(model_inputs["labels"])
158
+ tensor([[4, 5, 6, 7],
159
+ [8, 9, -100, -100]])
160
+ >>> print(model_inputs["encoder_input"]["images"].shape) # (bsz, max_num_images, max_num_tiles, c, h, w)
161
+ torch.Size([2, 2, 4, 1, 1, 1])
162
+ >>> print(model_inputs["encoder_input"]["aspect_ratio"].shape) # (bsz, max_num_images, 2)
163
+ torch.Size([2, 2, 2])
164
+ >>> print(model_inputs["encoder_input"]["images"][0, 0, ...]) # Image with two tiles got padded to four
165
+ tensor([[[[1.]]], [[[1.]]], [[[0.]]], [[[0.]]]])
166
+ >>> print(model_inputs["encoder_input"]["images"][0, 1, ...]) # Image with three tiles got padded to four
167
+ tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[0.]]]])
168
+ >>> print(model_inputs["encoder_input"]["images"][1, 0, ...]) # Image with four tiles did not get padded
169
+ tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[1.]]]])
170
+ >>> print(model_inputs["encoder_input"]["images"][1, 1, ...]) # Extra padding image was added to second sample
171
+ tensor([[[[0.]]], [[[0.]]], [[[0.]]], [[[0.]]]])
172
+ """
173
+ # Text tokens can be handled independently by existing collaters
174
+ text_only = [
175
+ {"input_ids": sample["input_ids"], "labels": sample["labels"]}
176
+ for sample in batch
177
+ ]
178
+ collated_text = padded_collate(text_only, self.padding_idx, self.ignore_idx)
179
+
180
+ if self.pad_max_tiles is None:
181
+ # Get max number of tiles in batch
182
+ max_num_tiles = max(sample["images_tiles"].shape[0] for sample in batch)
183
+ else:
184
+ max_num_tiles = self.pad_max_tiles
185
+
186
+ # Pad images and aspect ratios to max number of tiles
187
+ batch_images = []
188
+ batch_aspect_ratios = []
189
+
190
+ for sample in batch:
191
+ sample_images = []
192
+ for image in sample["encoder_input"]["images"]:
193
+ # Single image in each sample has shape (n_tiles, c, h, w)
194
+ n_tiles = image.shape[0]
195
+ # Single mask in each sample corresponds to a single image and has shape (text_seq_len, image_seq_len)
196
+ # where image_seq_len = n_tiles * tokens_per_tile
197
+ padding_tiles = max_num_tiles - n_tiles
198
+
199
+ # Image should now have shape (max_num_tiles, c, h, w)
200
+ padded_image = F.pad(
201
+ image, (0, 0, 0, 0, 0, 0, 0, padding_tiles), value=0
202
+ )
203
+
204
+ sample_images.append(padded_image)
205
+ # Stack multiple images and masks per sample in num_images dimension
206
+ batch_images.append(torch.stack(sample_images))
207
+ batch_aspect_ratios.append(
208
+ torch.stack(sample["encoder_input"]["aspect_ratio"])
209
+ )
210
+ # Finally, pad images, masks, aspect ratios to max number of images in batch
211
+ # (bsz, max_num_images, max_num_tiles, c, h, w)
212
+ collated_images = pad_sequence(batch_images, batch_first=True, padding_value=0)
213
+ # (bsz, max_num_images, 2)
214
+ collated_aspect_ratios = pad_sequence(
215
+ batch_aspect_ratios, batch_first=True, padding_value=1
216
+ )
217
+
218
+ batch_dict = {
219
+ "input_ids": collated_text["input_ids"],
220
+ "labels": collated_text["labels"],
221
+ "encoder_input": {
222
+ "images": collated_images,
223
+ "aspect_ratio": collated_aspect_ratios,
224
+ },
225
+ }
226
+
227
+ return batch_dict
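
A minimal usage sketch for the collator above (the dataset object `mm_dataset` is a placeholder; everything else uses only symbols from this file and torch):

    from torch.utils.data import DataLoader

    collator = MultiModalCollator(pad_max_tiles=4)
    loader = DataLoader(mm_dataset, batch_size=8, collate_fn=collator)  # mm_dataset: placeholder dataset
    for batch in loader:
        tokens = batch["input_ids"]                 # (bsz, max_seq_len)
        images = batch["encoder_input"]["images"]   # (bsz, max_num_images, max_num_tiles, c, h, w)
        break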
torchtitan/experiments/multimodal/requirements.txt ADDED
@@ -0,0 +1 @@
torchvision
torchtitan/experiments/simple_fsdp/__pycache__/simple_fsdp.cpython-312.pyc ADDED
Binary file (6.83 kB). View file
 
torchtitan/experiments/simple_fsdp/model.py ADDED
@@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from torchtitan.models.llama3 import Transformer, TransformerModelArgs
from .simple_fsdp import disable_data_parallel


class SimpleFSDPTransformer(Transformer):
    def __init__(self, model_args: TransformerModelArgs):
        super().__init__(model_args)
        self.init_weights()

    def init_weights(self, *args, **kwargs):
        with disable_data_parallel():
            super().init_weights(*args, **kwargs)
torchtitan/experiments/simple_fsdp/simple_fsdp.py ADDED
@@ -0,0 +1,194 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn

from torch.distributed._tensor import (
    distribute_tensor,
    DTensor,
    Partial,
    Replicate,
    Shard,
)
from torch.utils.checkpoint import (
    checkpoint,
    CheckpointPolicy,
    create_selective_checkpoint_contexts,
)


_active_parametrization = True


@contextmanager
def disable_data_parallel():
    global _active_parametrization
    try:
        _active_parametrization = False
        yield
    finally:
        _active_parametrization = True


@dataclass(frozen=True)
class MixedPrecisionPolicy:
    param_dtype: Optional[torch.dtype] = None
    reduce_dtype: Optional[torch.dtype] = None


def fsdp_policy():
    def _fsdp_recomp_policy():
        def _custom_policy(ctx, func, *args, **kwargs):
            to_recompute = func in {
                torch.ops._c10d_functional.all_gather_into_tensor.default,
                torch.ops._c10d_functional.wait_tensor.default,
                torch.ops.aten._to_copy.default,  # for dtype cast in FSDP
            }
            return (
                CheckpointPolicy.MUST_RECOMPUTE
                if to_recompute
                else CheckpointPolicy.MUST_SAVE
            )

        return _custom_policy

    return create_selective_checkpoint_contexts(_fsdp_recomp_policy())


class ReplicateComputation(torch.nn.Module):
    def __init__(self, device_mesh, param_sharding, mode, regional_ac, mp_policy):
        super().__init__()
        self.device_mesh = device_mesh
        self.param_sharding = param_sharding
        self.mode = mode
        self.compute_placements = [Replicate()] * self.device_mesh.ndim
        self.grad_placements = [Partial(reduce_op="avg")] * self.device_mesh.ndim
        self.regional_ac = regional_ac
        mp_policy = mp_policy or MixedPrecisionPolicy()
        self.param_dtype = mp_policy.param_dtype
        self.reduce_dtype = mp_policy.reduce_dtype

    def replicate_compute(self, x):
        # the data parallel runtime replicates parameters and does local compute;
        # the gradients are partial tensors that need to perform reduction
        # (i.e. DDP: allreduce, FSDP: reduce_scatter, HSDP: mix of both)

        # NOTE: specifying mixed precision is only available in pytorch_intern24
        # https://github.com/tianyu-l/pytorch_intern24/pull/20
        # support for FSDP + TP (assuming TP shards the inner-most dim)
        if self.mode == "fully_shard" and x._spec.mesh.ndim == 2:
            dp_placement, tp_placement = x._spec.placements
            dp_mesh, tp_mesh = self.device_mesh, x._spec.mesh["tp"]

            # re-wrap 2D DTensor to 1D DTensor on dp_mesh for efficient FSDP all-gather
            # TODO: we should consider merging this logic into DTensor redistribute API
            sharded_local_tensor = x.to_local()
            sharded_dtensor = DTensor.from_local(
                sharded_local_tensor, dp_mesh, self.param_sharding
            )

            # the actual FSDP all-gather on dp_mesh
            # TODO(ruisizhang123): enable mixed-precision training here
            # add the forward_dtype and backward_dtype back after landing changes in PyTorch DTensor
            replicated_dtensor = sharded_dtensor.redistribute(
                placements=self.compute_placements,
                # forward_dtype=self.param_dtype,
                # backward_dtype=self.reduce_dtype,
            )

            # re-wrap 1D all-gathered DTensor on dp_mesh to 1D DTensor on tp_mesh
            # TODO: DTensor should support this mesh collapsing operation
            replicated_local_tensor = replicated_dtensor.to_local(
                grad_placements=self.grad_placements
            )
            output = DTensor.from_local(
                replicated_local_tensor, tp_mesh, (tp_placement,)
            )
        else:
            output = x.redistribute(
                placements=self.compute_placements,
                # forward_dtype=self.param_dtype,
                # backward_dtype=self.reduce_dtype,
            ).to_local(grad_placements=self.grad_placements)

        return output

    def forward(self, x):
        global _active_parametrization
        # This should never be set to true during forward, only outside for model
        # inspection / debugging / initialization
        # model initialization can be done now through
        #     with disable_data_parallel():
        #         model.init_weights()
        if not _active_parametrization:
            return x

        if self.regional_ac and self.mode in ("fully_shard", "hybrid_shard"):
            # apply checkpointing to implement reshard_after_forward
            output = checkpoint(
                self.replicate_compute, x, use_reentrant=False, context_fn=fsdp_policy
            )
        else:
            output = self.replicate_compute(x)

        return output


def data_parallel(
    model,
    device_mesh,
    mode="replicate",
    ac_mode: str = "none",
    mp_policy: Optional[MixedPrecisionPolicy] = None,
):
    if mode == "replicate":
        param_sharding = (Replicate(),)
    elif mode == "fully_shard":
        param_sharding = (Shard(0),)
    elif mode == "hybrid_shard":
        # replicate inter-host, fully shard intra-host
        param_sharding = (Replicate(), Shard(0))
        assert (
            device_mesh.ndim == 2
        ), "hybrid sharded data parallel requires 2D DeviceMesh"
    else:
        raise ValueError(f"Unsupported mode {mode}")

    modules = list(model.modules())

    # apply regional ac (with fsdp_policy) if no global ac is to be applied
    regional_ac = ac_mode == "none"

    for mod in modules:
        params_dict = dict(mod.named_parameters(recurse=False))
        for p_name, p in params_dict.items():
            if p is not None and p.numel() > 0:
                mod.register_parameter(
                    p_name,
                    # NOTE: for 2D we need to distribute_tensor a DTensor
                    # which requires latest change in pytorch_intern24
                    # https://github.com/tianyu-l/pytorch_intern24/pull/25
                    nn.Parameter(distribute_tensor(p, device_mesh, param_sharding)),
                )
                nn.utils.parametrize.register_parametrization(
                    mod,
                    p_name,
                    ReplicateComputation(
                        device_mesh,
                        param_sharding,
                        mode,
                        regional_ac,
                        mp_policy=mp_policy,
                    ),
                    unsafe=True,
                )

    return model
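
For orientation, a minimal sketch of how `data_parallel` above might be applied; `my_model` is a placeholder nn.Module and torch.distributed is assumed to be initialized already (e.g. via torchrun):

    import torch
    from torch.distributed.device_mesh import init_device_mesh

    mesh = init_device_mesh("cuda", (torch.distributed.get_world_size(),))
    sharded = data_parallel(
        my_model,  # placeholder module
        mesh,
        mode="fully_shard",
        mp_policy=MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32),
    )
    with disable_data_parallel():
        pass  # weight init / inspection goes here, as SimpleFSDPTransformer.init_weights does above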
torchtitan/experiments/simple_fsdp/tests/__init__.py ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
torchtitan/models/__init__.py ADDED
@@ -0,0 +1,10 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# Import the built-in models here so that the corresponding register_model_spec()
# will be called.
import torchtitan.models.llama3  # noqa: F401
torchtitan/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (195 Bytes). View file
 
torchtitan/models/__pycache__/norms.cpython-312.pyc ADDED
Binary file (1.39 kB). View file
 
torchtitan/models/llama3/__init__.py ADDED
@@ -0,0 +1,76 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) Meta Platforms, Inc. All Rights Reserved.

from torchtitan.components.loss import build_cross_entropy_loss
from torchtitan.components.lr_scheduler import build_lr_schedulers
from torchtitan.components.optimizer import build_optimizers
from torchtitan.datasets.hf_datasets import build_hf_dataloader
from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer
from torchtitan.protocols.train_spec import register_train_spec, TrainSpec

from .model import Transformer, TransformerModelArgs
from .parallelize_llama import parallelize_llama
from .pipeline_llama import pipeline_llama

__all__ = [
    "parallelize_llama",
    "pipeline_llama",
    "TransformerModelArgs",
    "Transformer",
    "llama3_configs",
]


llama3_configs = {
    "debugmodel": TransformerModelArgs(
        dim=256, n_layers=8, n_heads=16, rope_theta=500000
    ),
    "8B": TransformerModelArgs(
        dim=4096,
        n_layers=32,
        n_heads=32,
        n_kv_heads=8,
        ffn_dim_multiplier=1.3,
        multiple_of=1024,
        rope_theta=500000,
    ),
    "70B": TransformerModelArgs(
        dim=8192,
        n_layers=80,
        n_heads=64,
        n_kv_heads=8,
        ffn_dim_multiplier=1.3,
        multiple_of=4096,
        rope_theta=500000,
    ),
    "405B": TransformerModelArgs(
        dim=16384,
        n_layers=126,
        n_heads=128,
        n_kv_heads=8,
        ffn_dim_multiplier=1.2,
        multiple_of=4096,
        rope_theta=500000,
    ),
}


register_train_spec(
    TrainSpec(
        name="llama3",
        cls=Transformer,
        config=llama3_configs,
        parallelize_fn=parallelize_llama,
        pipelining_fn=pipeline_llama,
        build_optimizers_fn=build_optimizers,
        build_lr_schedulers_fn=build_lr_schedulers,
        build_dataloader_fn=build_hf_dataloader,
        build_tokenizer_fn=build_tiktoken_tokenizer,
        build_loss_fn=build_cross_entropy_loss,
    )
)
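
For reference, the registered configs can be exercised directly; a small sketch (meta-device construction mirrors what the trainer normally does and is only an assumption here):

    import torch

    with torch.device("meta"):
        model = Transformer(llama3_configs["debugmodel"])
    print(sum(p.numel() for p in model.parameters()))  # parameter count of the debug model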
torchtitan/models/llama3/parallelize_llama.py ADDED
@@ -0,0 +1,398 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This file applies the PT-D parallelisms (except pipeline parallelism) and various
# training techniques (e.g. activation checkpointing and compile) to the Llama model.

from collections import defaultdict

import torch
import torch.nn as nn
from torch.distributed._composable.replicate import replicate
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    checkpoint_wrapper as ptd_checkpoint_wrapper,
)

from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
from torch.distributed.tensor import Replicate, Shard
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    parallelize_module,
    PrepareModuleInput,
    RowwiseParallel,
    SequenceParallel,
)

from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
from torchtitan.distributed import ParallelDims
from torchtitan.tools.logging import logger


def parallelize_llama(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
):
    """
    Apply tensor parallelism, activation checkpointing, torch.compile, and data
    parallelism to the model.

    NOTE: The passed-in model preferably should be on meta device. Otherwise,
    the model must fit on GPU or CPU memory.
    """

    if parallel_dims.tp_enabled:
        if (
            job_config.parallelism.enable_async_tensor_parallel
            and not job_config.training.compile
        ):
            raise RuntimeError("Async TP requires --training.compile")

        enable_float8_linear = "float8" in job_config.model.converters
        float8_is_rowwise = job_config.float8.recipe_name in (
            "rowwise",
            "rowwise_with_gw_hp",
        )

        # For now, float8 all-gather with TP is only supported for tensorwise
        # float8 scaling recipes. For rowwise recipes, we use regular TP and
        # all-gather happens in high precision.
        enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise

        apply_tp(
            model,
            world_mesh["tp"],
            loss_parallel=parallel_dims.loss_parallel_enabled,
            enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
            enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
        )

    if job_config.model.use_flex_attn:
        if job_config.activation_checkpoint.mode == "selective":
            raise ValueError(
                "FlexAttention is not compatible with selective AC yet. "
                "See https://github.com/pytorch/pytorch/issues/147879"
            )

        if parallel_dims.cp_enabled:
            raise ValueError(
                "FlexAttention is not compatible with CP yet. "
                "We are still working on this."
            )

    if job_config.activation_checkpoint.mode != "none":
        apply_ac(model, job_config.activation_checkpoint)

    # turn on per-TransformerBlock compile after AC wrapping and before FSDP
    if job_config.training.compile:
        apply_compile(model)

    if (
        parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
    ):  # apply FSDP or HSDP, potentially with Context Parallel
        if parallel_dims.dp_replicate_enabled:
            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
        else:
            dp_mesh_dim_names = ("dp_shard_cp",)

        apply_fsdp(
            model,
            world_mesh[tuple(dp_mesh_dim_names)],
            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
            pp_enabled=parallel_dims.pp_enabled,
            cpu_offload=job_config.training.enable_cpu_offload,
            reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
        )

        if parallel_dims.dp_replicate_enabled:
            logger.info("Applied HSDP to the model")
        else:
            logger.info("Applied FSDP to the model")

        if parallel_dims.cp_enabled:
            logger.info("Applied Context Parallel to the model")

        if job_config.training.enable_cpu_offload:
            logger.info("Applied CPU Offloading to the model")
    elif parallel_dims.dp_replicate_enabled:
        if world_mesh.ndim > 1:
            raise RuntimeError("DDP does not support > 1D parallelism")
        apply_ddp(
            model,
            world_mesh,
            enable_compile=job_config.training.compile,
            enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
        )

    return model


def apply_tp(
    model: nn.Module,
    tp_mesh: DeviceMesh,
    loss_parallel: bool,
    enable_float8_tensorwise_tp: bool,
    enable_async_tp: bool,
):
    """Apply tensor parallelism."""
    # 1. Parallelize the embedding and shard its outputs (which are the first
    #    transformer block's inputs)
    # 2. Parallelize the root norm layer over the sequence dim
    # 3. Parallelize the final linear output layer
    parallelize_module(
        model,
        tp_mesh,
        {
            "tok_embeddings": RowwiseParallel(
                input_layouts=Replicate(),
                output_layouts=Shard(1),
            ),
            "norm": SequenceParallel(),
            "output": ColwiseParallel(
                input_layouts=Shard(1),
                output_layouts=Shard(-1) if loss_parallel else Replicate(),
                use_local_output=not loss_parallel,
            ),
        },
    )

    # Parallel styles used for transformer block linear weights and their
    # inputs may be different for float8 linears with tensorwise scaling.
    if enable_float8_tensorwise_tp:
        # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
        from torchao.float8.float8_tensor_parallel import (
            Float8ColwiseParallel,
            Float8RowwiseParallel,
            PrepareFloat8ModuleInput,
        )

        rowwise_parallel, colwise_parallel, prepare_module_input = (
            Float8RowwiseParallel,
            Float8ColwiseParallel,
            PrepareFloat8ModuleInput,
        )
    else:
        rowwise_parallel, colwise_parallel, prepare_module_input = (
            RowwiseParallel,
            ColwiseParallel,
            PrepareModuleInput,
        )

    # Apply tensor + sequence parallelism to every transformer block
    # NOTE: At the cost of model code change, we can accelerate Sequence Parallel
    #       by folding (and unfolding) the batch dimension and the sequence dimension.
    #       Examples can be found at https://github.com/pytorch/torchtitan/pull/437
    for layer_id, transformer_block in model.layers.items():
        layer_plan = {
            "attention_norm": SequenceParallel(),
            "attention": prepare_module_input(
                input_layouts=(Shard(1), None),
                desired_input_layouts=(Replicate(), None),
            ),
            "attention.wq": colwise_parallel(),
            "attention.wk": colwise_parallel(),
            "attention.wv": colwise_parallel(),
            "attention.wo": rowwise_parallel(output_layouts=Shard(1)),
            "ffn_norm": SequenceParallel(),
            "feed_forward": prepare_module_input(
                input_layouts=(Shard(1),),
                desired_input_layouts=(Replicate(),),
            ),
            "feed_forward.w1": colwise_parallel(),
            "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)),
            "feed_forward.w3": colwise_parallel(),
        }

        parallelize_module(
            module=transformer_block,
            device_mesh=tp_mesh,
            parallelize_plan=layer_plan,
        )

    if enable_async_tp:
        from torch.distributed._symmetric_memory import enable_symm_mem_for_group

        torch._inductor.config._micro_pipeline_tp = True
        enable_symm_mem_for_group(tp_mesh.get_group().group_name)

    logger.info(
        f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}{'Async ' if enable_async_tp else ''}"
        "Tensor Parallelism to the model"
    )


# for selective op activation checkpointing
_save_list = {
    torch.ops.aten.mm.default,
    torch.ops.aten._scaled_dot_product_efficient_attention.default,
    torch.ops.aten._scaled_dot_product_flash_attention.default,
    # for low precision training, it's useful to always save
    # the result of max, since the absolute maximum is
    # used to compute the scaling factor for quantization.
    torch.ops.aten.max.default,
}


def _apply_ac_to_transformer_block(module: nn.Module, ac_config):
    valid_ac_modes = ("full", "selective")
    if ac_config.mode not in valid_ac_modes:
        raise ValueError(
            f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
        )

    if ac_config.mode == "full":
        return ptd_checkpoint_wrapper(module, preserve_rng_state=False)

    assert ac_config.mode == "selective", f"{ac_config.mode}"
    use_op_sac = ac_config.selective_ac_option == "op"
    use_layer_sac = ac_config.selective_ac_option.isdigit()
    if not use_op_sac and not use_layer_sac:
        raise ValueError(
            f"Invalid selective AC option: {ac_config.selective_ac_option}. "
            f"Valid options: 'op' or a positive int representing layer frequency"
        )
    if use_op_sac:
        from torch.utils.checkpoint import (
            CheckpointPolicy,
            create_selective_checkpoint_contexts,
        )

        def _get_custom_policy(meta):
            def _custom_policy(ctx, func, *args, **kwargs):
                mode = "recompute" if ctx.is_recompute else "forward"
                mm_count_key = f"{mode}_mm_count"
                if func == torch.ops.aten.mm.default:
                    meta[mm_count_key] += 1
                # Saves output of all compute ops, except every second mm
                to_save = func in _save_list and not (
                    func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
                )
                return (
                    CheckpointPolicy.MUST_SAVE
                    if to_save
                    else CheckpointPolicy.PREFER_RECOMPUTE
                )

            return _custom_policy

        def selective_checkpointing_context_fn():
            meta = defaultdict(int)
            return create_selective_checkpoint_contexts(_get_custom_policy(meta))

        return ptd_checkpoint_wrapper(
            module,
            context_fn=selective_checkpointing_context_fn,
            preserve_rng_state=False,
        )
    elif use_layer_sac:
        # Checkpoint every `ac_freq` of the modules passed to this function
        ac_freq = int(ac_config.selective_ac_option)
        ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0)
        ptd_checkpoint_wrapper._count += 1
        if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0:
            return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
        else:
            return module


def apply_ac(model: nn.Module, ac_config):
    """Apply activation checkpointing to the model."""
    for layer_id, transformer_block in model.layers.named_children():
        transformer_block = _apply_ac_to_transformer_block(transformer_block, ac_config)
        model.layers.register_module(layer_id, transformer_block)

    logger.info(f"Applied {ac_config.mode} activation checkpointing to the model")


def apply_compile(model: nn.Module):
    """
    Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
    repeated structure. Alternatively one can compile the whole model (after applying DP).
    """
    for layer_id, transformer_block in model.layers.named_children():
        transformer_block = torch.compile(transformer_block, fullgraph=True)
        model.layers.register_module(layer_id, transformer_block)

    logger.info("Compiling each TransformerBlock with torch.compile")


def apply_fsdp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    param_dtype: torch.dtype,
    reduce_dtype: torch.dtype,
    pp_enabled: bool,
    cpu_offload: bool = False,
    reshard_after_forward_policy: str = "default",
):
    """
    Apply data parallelism (via FSDP2) to the model.

    Args:
        model (nn.Module): The model to apply data parallelism to.
        dp_mesh (DeviceMesh): The device mesh to use for data parallelism.
        param_dtype (torch.dtype): The data type to use for model parameters.
        reduce_dtype (torch.dtype): The data type to use for reduction operations.
        pp_enabled (bool): Whether pipeline parallelism is enabled.
        cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False.
        reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default".
            Other options: "never", "always".
            - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios.
            - "always" will enable `reshard_after_forward` for all forward passes.
            - "never" will disable `reshard_after_forward` for all forward passes.

    """
    mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype)
    fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
    if cpu_offload:
        fsdp_config["offload_policy"] = CPUOffloadPolicy()

    for layer_id, transformer_block in model.layers.items():
        if reshard_after_forward_policy == "always":
            reshard_after_forward = True
        elif reshard_after_forward_policy == "never":
            reshard_after_forward = False
        elif reshard_after_forward_policy == "default":
            if pp_enabled:
                # For PP, do not reshard after forward to avoid per-microbatch
                # all-gathers, which can be expensive and non-overlapped
                reshard_after_forward = False
            else:
                # As an optimization, do not reshard after forward for the last
                # transformer block since FSDP would prefetch it immediately
                reshard_after_forward = int(layer_id) < len(model.layers) - 1
        else:
            raise ValueError(
                f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}."
            )
        fully_shard(
            transformer_block,
            **fsdp_config,
            reshard_after_forward=reshard_after_forward,
        )
    fully_shard(model, **fsdp_config, reshard_after_forward=not pp_enabled)


def apply_ddp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    enable_compile: bool,
    enable_compiled_autograd: bool,
):
    if enable_compile:
        if enable_compiled_autograd:
            torch._dynamo.config.optimize_ddp = (
                "python_reducer_without_compiled_forward"
            )
        else:
            torch._dynamo.config.optimize_ddp = "ddp_optimizer"

    replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)

    logger.info("Applied DDP to the model")
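
As a small illustration of the op-level selective AC policy above, the wrapper can be exercised on a toy two-matmul module; `SimpleNamespace` merely stands in for the real activation-checkpoint config object:

    from types import SimpleNamespace
    import torch

    toy = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Linear(16, 16))
    ac_cfg = SimpleNamespace(mode="selective", selective_ac_option="op")
    wrapped = _apply_ac_to_transformer_block(toy, ac_cfg)
    out = wrapped(torch.randn(2, 16, requires_grad=True))
    out.sum().backward()  # every second mm output is recomputed rather than saved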
torchtitan/models/llama3/pipeline_llama.py ADDED
@@ -0,0 +1,161 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This file applies the PT-D pipeline parallelism to the Llama model.

import copy

import torch.nn as nn
from torch.distributed import DeviceMesh
from torch.distributed.pipelining import PipelineStage
from torch.distributed.pipelining.schedules import (
    _PipelineSchedule,
    get_schedule_class,
    ScheduleZBVZeroBubble,
)

from torchtitan.components.loss import LossFunction
from torchtitan.config_manager import JobConfig
from torchtitan.distributed import ParallelDims
from torchtitan.distributed.pipeline import (
    build_pipeline_schedule,
    generate_split_points,
    stage_ids_this_rank,
)
from torchtitan.protocols.train_spec import DeviceType, ParallelizeFunction
from torchtitan.tools.logging import logger

from .model import TransformerModelArgs


def pipeline_llama(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
    device: DeviceType,
    model_config: TransformerModelArgs,
    parallelize_fn: ParallelizeFunction,
    loss_fn: LossFunction,
) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]:
    pp_mesh = world_mesh["pp"]

    stages, model_parts = pipeline_llama_manual_split(
        model, pp_mesh, parallel_dims, job_config, device, model_config
    )

    # For PP with looped schedules, each item in model_parts is one stage-model-chunk.
    # We need to iterate through model_parts to apply SPMD parallelisms, compilation,
    # optimizer, and checkpointing
    for i, m in enumerate(model_parts):
        # apply SPMD-style PT-D techniques
        m = parallelize_fn(m, world_mesh, parallel_dims, job_config)
        model_parts[i] = m
        # NOTE: this is to update the model in the stage
        #       in case the model is modified e.g. by torch.compile
        stages[i].submod = m

    pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn)

    # This is used in the train loop to determine whether to pass in the input_ids and labels
    has_first_stage = False
    has_last_stage = False
    for stage in stages:
        if stage.is_first:
            has_first_stage = True
        if stage.is_last:
            has_last_stage = True

    return pp_schedule, model_parts, has_first_stage, has_last_stage


def pipeline_llama_manual_split(
    whole_model: nn.Module,
    pp_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
    device: DeviceType,
    model_config: TransformerModelArgs,
) -> tuple[list[PipelineStage], list[nn.Module]]:
    """
    This API extracts one torch.nn.Module object for the part of the model configured to run inside this stage.

    It wraps the model chunk in a ManualPipelineStage object and returns both the stage and model objects.

    The stage object is used to create a pipeline schedule, and the model object can be used for applying SPMD
    parallelism.
    """
    pp_rank = pp_mesh.get_local_rank()
    pp_size = pp_mesh.size()
    parallelism_config = job_config.parallelism

    splits = parallelism_config.pipeline_parallel_split_points or generate_split_points(
        parallelism_config.pipeline_parallel_schedule,
        parallelism_config.pipeline_parallel_layers_per_stage,
        parallel_dims.pp,
        model_config.n_layers,
    )

    def _build_stage(
        stage_idx: int,
        start_layer: str | None,
        stop_layer: str | None,
        is_first: bool = False,
        is_last: bool = False,
    ) -> tuple[PipelineStage, nn.Module]:
        model = copy.deepcopy(whole_model)
        if not is_first:
            model.tok_embeddings = None

        drop_layers = start_layer is not None
        for name in list(model.layers.keys()):
            # we keep layers in a contiguous region between start (inclusive) and stop (exclusive)
            if f"layers.{name}" == start_layer:
                drop_layers = False
            if f"layers.{name}" == stop_layer:
                drop_layers = True
            if drop_layers:
                del model.layers[name]

        if not is_last:
            model.norm = None
            model.output = None

        stage = PipelineStage(
            model,
            stage_idx,
            num_stages,
            device,
            group=pp_mesh.get_group("pp"),
        )
        return stage, model

    num_stages = len(splits) + 1
    stage_idx = pp_rank

    stages = []
    models = []

    schedule_class = get_schedule_class(parallelism_config.pipeline_parallel_schedule)
    style = "v" if schedule_class == ScheduleZBVZeroBubble else "loop"

    for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style):
        start_layer = splits[stage_idx - 1] if stage_idx > 0 else None
        stop_layer = splits[stage_idx] if stage_idx < num_stages - 1 else None
        stage, model_chunk = _build_stage(
            stage_idx,
            start_layer,
            stop_layer,
            is_first=stage_idx == 0,
            is_last=stage_idx == num_stages - 1,
        )
        logger.info(
            f"PP rank {pp_rank} is building stage_idx {stage_idx}"
            f" with start_layer {start_layer}, stop_layer {stop_layer}"
        )
        stages.append(stage)
        models.append(model_chunk)
    return stages, models
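
The start/stop bookkeeping in `_build_stage` can be illustrated in isolation; a toy sketch with eight fake layers (split points chosen only for illustration):

    layers = {str(i): f"block{i}" for i in range(8)}
    start_layer, stop_layer = "layers.2", "layers.4"
    drop = start_layer is not None
    kept = {}
    for name in layers:
        if f"layers.{name}" == start_layer:
            drop = False
        if f"layers.{name}" == stop_layer:
            drop = True
        if not drop:
            kept[name] = layers[name]
    print(sorted(kept))  # ['2', '3'] -- layers in [start, stop) are kept for this stage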
torchtitan/models/llama3/train_configs/llama3_405b.toml ADDED
@@ -0,0 +1,63 @@
# torchtitan Config.toml
# NOTE: this toml config is a preset for 128 H100 GPUs.

[job]
dump_folder = "./outputs"
description = "Llama 3 405B training"

[profiling]
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100

[metrics]
log_freq = 10
enable_tensorboard = true
save_tb_folder = "tb"

[model]
name = "llama3"
flavor = "405B"
norm_type = "rmsnorm"  # layernorm / np_layernorm / rmsnorm
tokenizer_path = "./assets/tokenizer/original/tokenizer.model"
converters = "float8"

[optimizer]
name = "AdamW"
lr = 8e-5
eps = 1e-8

[lr_scheduler]
warmup_steps = 600  # lr scheduler warm up, normally 20% of the train steps

[training]
batch_size = 2
seq_len = 8192
max_norm = 1.0  # grad norm clipping
steps = 3000
compile = true
dataset = "c4"

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 8  # 8-way TP
enable_async_tensor_parallel = true
pipeline_parallel_degree = 1
context_parallel_degree = 1

[checkpoint]
enable_checkpoint = false
folder = "checkpoint"
interval = 500
model_weights_only = false
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[activation_checkpoint]
mode = 'full'  # ['none', 'selective', 'full']

[float8]
enable_fsdp_float8_all_gather = true
precompute_float8_dynamic_scale_for_fsdp = true
filter_fqns = "output"
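
The preset can be inspected programmatically; a small sketch using the standard-library TOML parser (Python 3.11+), with the path taken from this diff:

    import tomllib

    with open("torchtitan/models/llama3/train_configs/llama3_405b.toml", "rb") as f:
        cfg = tomllib.load(f)
    print(cfg["parallelism"]["tensor_parallel_degree"])     # 8
    print(cfg["float8"]["enable_fsdp_float8_all_gather"])   # True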
torchtitan/tools/profiling.py ADDED
@@ -0,0 +1,131 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
import os
import pickle
import time

import torch

from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import logger

# the number of warmup steps before the active step in each profiling cycle
WARMUP = 3

# how many memory allocation/free ops to record in memory snapshots
MEMORY_SNAPSHOT_MAX_ENTRIES = 100000


@contextlib.contextmanager
def maybe_enable_profiling(config: JobConfig, *, global_step: int = 0):
    # get user defined profiler settings
    enable_profiling = config.profiling.enable_profiling

    if enable_profiling:
        dump_dir = config.job.dump_folder
        save_trace_dir = config.profiling.save_traces_folder
        trace_dir = os.path.join(dump_dir, save_trace_dir)
        profile_freq = config.profiling.profile_freq

        rank = torch.distributed.get_rank()

        def trace_handler(prof):
            curr_trace_dir_name = "iteration_" + str(prof.step_num)
            curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name)
            if not os.path.exists(curr_trace_dir):
                os.makedirs(curr_trace_dir, exist_ok=True)

            logger.info(f"Dumping profiler traces at step {prof.step_num}")
            begin = time.monotonic()
            prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")
            logger.info(
                f"Finished dumping profiler traces in {time.monotonic() - begin:.2f} seconds"
            )

        logger.info(f"Profiling active. Traces will be saved at {trace_dir}")

        if not os.path.exists(trace_dir):
            os.makedirs(trace_dir, exist_ok=True)

        warmup, active = WARMUP, 1
        wait = profile_freq - (active + warmup)
        assert (
            wait >= 0
        ), "profile_freq must be greater than or equal to warmup + active"
        gpu_device_profiled = None
        if torch.cuda.is_available():
            gpu_device_profiled = torch.profiler.ProfilerActivity.CUDA
        elif torch.xpu.is_available():
            gpu_device_profiled = torch.profiler.ProfilerActivity.XPU
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                gpu_device_profiled,
            ],
            schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active),
            on_trace_ready=trace_handler,
            record_shapes=True,
        ) as torch_profiler:
            torch_profiler.step_num = global_step
            yield torch_profiler
    else:
        torch_profiler = contextlib.nullcontext()
        yield None


@contextlib.contextmanager
def maybe_enable_memory_snapshot(config: JobConfig, *, global_step: int = 0):
    enable_snapshot = config.profiling.enable_memory_snapshot
    if enable_snapshot:
        snapshot_folder = config.profiling.save_memory_snapshot_folder
        snapshot_dir = os.path.join(config.job.dump_folder, snapshot_folder)
        if not os.path.exists(snapshot_dir):
            os.makedirs(snapshot_dir, exist_ok=True)
        rank = torch.distributed.get_rank()

        class MemoryProfiler:
            def __init__(self, step_num: int, freq: int):
                torch.cuda.memory._record_memory_history(
                    max_entries=MEMORY_SNAPSHOT_MAX_ENTRIES
                )
                # when resuming training, we start from the last step
                self.step_num = step_num
                self.freq = freq

            def step(self, exit_ctx: bool = False):
                self.step_num += 1
                if not exit_ctx and self.step_num % self.freq != 0:
                    return
                if not exit_ctx:
                    curr_step = self.step_num
                    dir_name = f"iteration_{curr_step}"
                else:
                    # dump as iteration_0_exit if OOM at iter 1
                    curr_step = self.step_num - 1
                    dir_name = f"iteration_{curr_step}_exit"
                curr_snapshot_dir = os.path.join(snapshot_dir, dir_name)
                if not os.path.exists(curr_snapshot_dir):
                    os.makedirs(curr_snapshot_dir, exist_ok=True)
                logger.info(f"Dumping memory snapshot at step {curr_step}")
                begin = time.monotonic()
                with open(
                    f"{curr_snapshot_dir}/rank{rank}_memory_snapshot.pickle", "wb"
                ) as output:
                    pickle.dump(torch.cuda.memory._snapshot(), output)
                logger.info(
                    f"Finished dumping memory snapshot in {time.monotonic() - begin:.2f} seconds"
                )

        logger.info(f"Memory profiler active. Snapshot will be saved at {snapshot_dir}")
        profiler = MemoryProfiler(global_step, config.profiling.profile_freq)
        try:
            yield profiler
        except torch.OutOfMemoryError as e:
            profiler.step(exit_ctx=True)
    else:
        yield None
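
A sketch of how the profiling context above is typically driven from a training loop; `job_config` and `train_step` are placeholders:

    with maybe_enable_profiling(job_config, global_step=0) as torch_profiler:
        for _ in range(job_config.training.steps):
            train_step()
            if torch_profiler:
                torch_profiler.step()  # advances the wait/warmup/active schedule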