Repository: calculon-ai/calculon
Branch: main
Commit: caa4b11f8fe1
Files: 87
Total size: 249.7 KB

Directory structure:
gitextract_qq3udcpt/

├── .gitignore
├── LICENSE
├── Makefile
├── NOTICE
├── README.md
├── bin/
│   └── calculon
├── calculon/
│   ├── __init__.py
│   ├── command_line.py
│   ├── io.py
│   ├── llm/
│   │   ├── __init__.py
│   │   ├── all_executions.py
│   │   ├── layers.py
│   │   ├── llm.py
│   │   ├── optimal_execution.py
│   │   ├── parameter_calculator.py
│   │   ├── runner.py
│   │   └── validation.py
│   ├── memory.py
│   ├── network.py
│   ├── processor.py
│   ├── system.py
│   ├── util.py
│   └── version.py
├── examples/
│   └── 3072_t4_p64_d12_mbs4_full.json
├── models/
│   ├── anthropic-52B.json
│   ├── chinchilla.json
│   ├── gopher-280B.json
│   ├── gpt3-13B.json
│   ├── gpt3-175B.json
│   ├── lamda.json
│   ├── megatron-126M.json
│   ├── megatron-1T.json
│   ├── megatron-22B.json
│   ├── megatron-40B.json
│   ├── megatron-5B.json
│   ├── palm-540B.json
│   └── turing-530B.json
├── pylintrc
├── pyproject.toml
├── scripts/
│   ├── 3dplot.py
│   ├── find_huge.py
│   ├── heatmap.py
│   ├── install_hooks.sh
│   └── json_to_csv.py
├── setup.py
├── systems/
│   ├── a100_80e.json
│   ├── a100_80g.json
│   └── h100_80g_nvl8.json
├── test/
│   ├── __init__.py
│   ├── test.sh
│   └── test_json_write_read.py
└── validation/
    └── seqsel/
        ├── fig1/
        │   ├── gpt3-175B_none.json
        │   ├── gpt3-175B_seqsel.json
        │   ├── megatron-1T_none.json
        │   ├── megatron-1T_seqsel.json
        │   ├── megatron-22B_none.json
        │   ├── megatron-22B_seqsel.json
        │   ├── turing-530B_none.json
        │   └── turing-530B_seqsel.json
        ├── fig7/
        │   ├── gpt3-175B_full.json
        │   ├── gpt3-175B_none.json
        │   ├── gpt3-175B_sel.json
        │   ├── gpt3-175B_seq.json
        │   ├── gpt3-175B_seqsel.json
        │   ├── megatron-1T_full.json
        │   ├── megatron-1T_none.json
        │   ├── megatron-1T_sel.json
        │   ├── megatron-1T_seq.json
        │   ├── megatron-1T_seqsel.json
        │   ├── megatron-22B_full.json
        │   ├── megatron-22B_none.json
        │   ├── megatron-22B_sel.json
        │   ├── megatron-22B_seq.json
        │   ├── megatron-22B_seqsel.json
        │   ├── turing-530B_full.json
        │   ├── turing-530B_none.json
        │   ├── turing-530B_sel.json
        │   ├── turing-530B_seq.json
        │   └── turing-530B_seqsel.json
        └── tab5/
            ├── gpt3-175B_full.json
            ├── gpt3-175B_seqsel.json
            ├── megatron-1T_full.json
            ├── megatron-1T_seqsel.json
            ├── megatron-22B_full.json
            ├── megatron-22B_seqsel.json
            ├── turing-530B_full.json
            └── turing-530B_seqsel.json

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.DS_Store
*.py[cod]
*.log

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt
files.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [2022] [Michael Isaev, Nic McDonald]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: Makefile
================================================
# Disables all built-in suffix rules; every target below is an explicit command.
.SUFFIXES:
# None of these targets produce a file of the same name (note that a `test`
# directory exists), so all of them are declared phony.
.PHONY: help install uninstall clean lint test count

# Prints the list of available targets.
help:
	@echo "options are: install uninstall clean lint test count"

# Installs for the current user and records the installed files in files.txt.
install:
	python3 setup.py install --user --record files.txt

# Removes every path recorded in files.txt by a previous `make install`.
uninstall:
	cat files.txt | xargs rm -rf

# Removes build products and Python bytecode caches.
clean:
	rm -rf build dist calculon.egg-info calculon/*.pyc calculon/__pycache__ calculon/*/__pycache__ test/*.pyc test/__pycache__

# Runs pylint on the package without the full report.
lint:
	pylint -r n calculon

# Runs the unit tests, then the shell based tests.
# printf is used instead of `echo -e` because make runs recipes with /bin/sh,
# where `-e` is not portable and may be printed literally.
test:
	python3 -m unittest -v -f --buffer
	@printf 'Unit testing successful!\n\n\n'
	./test/test.sh

# Reports line counts, the number of files counted, and the number of commits.
count:
	@wc calculon/*.py test/*.py | sort -n -k1
	@echo "files : "$(shell echo calculon/*.py test/*.py | wc -w)
	@echo "commits : "$(shell git rev-list HEAD --count)


================================================
FILE: NOTICE
================================================
Calculon - Co-design for large scale parallel applications
Copyright 2022 Michael Isaev, Nic McDonald
All rights reserved.

================================================
FILE: README.md
================================================
[![DOI](https://zenodo.org/badge/660734586.svg)](https://zenodo.org/badge/latestdoi/660734586)
# Calculon - Co-design for large scale parallel applications

## Running

Run Calculon like this:
``` sh
$> PYTHONPATH=. ./bin/calculon <args>
```

Calculon is a hierarchical command line. To see the commands it accepts, use `--help` or `-h`:
``` sh
$> PYTHONPATH=. ./bin/calculon -h
```

You can also see how to use any command specifically by using `--help` or `-h` on the command:
``` sh
$> PYTHONPATH=. ./bin/calculon llm -h
```

## LLM Example

Run a single calculation for LLM (~1 sec):
``` sh
$> PYTHONPATH=. ./bin/calculon llm models/megatron-1T.json examples/3072_t4_p64_d12_mbs4_full.json systems/a100_80g.json -
```

Run a system execution optimizer for LLM (~1 min):
``` sh
$> PYTHONPATH=. ./bin/calculon llm-optimal-execution models/turing-530B.json 5128 2520 float16 systems/a100_80g.json output.json -m
```
`output.json` will contain the optimal way to run Turing-530B across 5128 A100 GPUs.

To store results from all successful runs from the same experiment, run a special system optimizer (~1 min):
``` sh
$> PYTHONPATH=. ./bin/calculon llm-all-executions models/turing-530B.json 5128 2520 float16 systems/a100_80g.json all_output.csv
```

## Testing and validation (optional)
To make sure that the current build is working, use

``` sh
$> make test
```
To validate Calculon performance modeling against Megatron run on NVIDIA's Selene A100-based supercomputer with results published in ["Sequence parallelism" paper](https://arxiv.org/abs/2205.05198), use

``` sh
$> PYTHONPATH=. ./bin/calculon llm-validation
```

## Publications

* Calculon: A Methodology and Tool for High-Level Co-Design of Systems and Large Language Models\
Mikhail Isaev, Nic McDonald, Larry Dennison, Richard Vuduc\
[Paper](https://dl.acm.org/doi/pdf/10.1145/3581784.3607102)

* Scaling Infrastructure to Support Multi-Trillion Parameter LLM Training\
Mikhail Isaev, Nic McDonald, Richard Vuduc\
[Paper](https://openreview.net/pdf?id=rqn2v1Ltgn0)


================================================
FILE: bin/calculon
================================================
#!/usr/bin/env python3

"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import argparse
import calculon
import logging
import sys


if __name__ == '__main__':
  # CLI inspired from: https://github.com/ssnetsim/ssplot/

  # Creates an argparser and subparsers.
  desc = 'Calculon: Co-design for large scale parallel applications'
  ap = argparse.ArgumentParser(description=desc)
  ap.add_argument('-l', '--log', default='-',
                  help='Sets the log file, or - for stdout (default)')
  ap.add_argument('-v', '--verbosity', default='INFO',
                  help='Sets the logging level (see logging docs)')
  sp = ap.add_subparsers(title='commands', dest='command',
                         description='commands available in Calculon',
                         help='the command')
  # A command is mandatory; without one there is no `func` to dispatch to.
  sp.required = True

  # Registers each command line interface.
  for cls in calculon.CommandLine.command_lines():
    cls.create_parser(sp)

  # Parses the args and creates the logger
  args = ap.parse_args()
  logger = logging.getLogger()
  if args.log == '-':
    handler = logging.StreamHandler(stream=sys.stdout)
  else:
    # FileHandler opens the file itself (truncating it, as before) and closes
    # it when the logging module shuts down, instead of leaving a bare file
    # object open for the lifetime of the process.
    handler = logging.FileHandler(args.log, mode='w')
  logger.addHandler(handler)
  # Level names are upper case in the logging module; normalizing here lets
  # `-v debug` work the same as `-v DEBUG`.
  logger.setLevel(args.verbosity.upper())

  # Calls the corresponding command function and uses its return value as the
  # process exit status.
  sys.exit(args.func(logger, args))


================================================
FILE: calculon/__init__.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

__version__ = '0.1.0'

# Imports of this module
from .command_line import CommandLine
from .io import *
from .system import System
from .util import *
from .version import Version

# Imports submodules
from .llm import *


================================================
FILE: calculon/command_line.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import copy

class CommandLine:
  """Defines the abstract interface definition for a command line interface.
  Inspired from: https://github.com/ssnetsim/ssplot/

  Implementations expose a `NAME` string and an `ALIASES` list, which are read
  by register() to keep track of every command name in use.
  """

  @staticmethod
  def create_parser(subparser):
    """
    This function adds a parser to the subparser object according to the
    specific command line interface implementation.
    """
    raise NotImplementedError('subclasses must override this')

  @staticmethod
  def run_command(logger, args):
    """
    This function is used to run the command if it is chosen at the command
    line. This function should be registered to the parser in create_parser().
    """
    raise NotImplementedError('subclasses must override this')

  # this is a mapping of all names (class -> [primary name, aliases...])
  _names = {}

  @staticmethod
  def register(cls):
    """
    Records a command line class under its primary name and aliases.

    Args:
      cls: the class to register; must define `NAME` and `ALIASES`.

    Raises:
      AssertionError: if any of the names is already used by a registered
        class (including an earlier registration of the same class).
    """
    # gather the primary name followed by the aliases
    all_names = [cls.NAME, *cls.ALIASES]

    # check current names against all new names. The registry is keyed by
    # class, so the names to compare against are its values, and strings must
    # be compared by value: identity only matches when two equal strings
    # happen to be the very same object.
    for new_name in all_names:
      for existing_names in CommandLine._names.values():
        assert new_name not in existing_names, f'{new_name} already exists'

    # add to map
    CommandLine._names[cls] = all_names

  @staticmethod
  def command_lines():
    """Returns the set of all registered command line classes."""
    return set(CommandLine._names.keys())

  @staticmethod
  def all_names():
    """
    Returns a shallow copy of the class -> names mapping. The mapping itself
    can be modified freely, but the name lists are shared with the registry.
    """
    return copy.copy(CommandLine._names)


================================================
FILE: calculon/io.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""
import gzip
import json
import numpy as np


class NpEncoder(json.JSONEncoder):
  """JSON encoder that turns NumPy scalars and arrays into native Python
  values so that they can be serialized by the `json` module."""

  def default(self, obj):
    """Converts `obj` when it is a NumPy integer, float, array, or boolean;
    anything else is passed to the base encoder, which raises TypeError."""
    conversions = (
      (np.integer, int),
      (np.floating, float),
      (np.ndarray, lambda array: array.tolist()),
      (np.bool_, bool),
    )
    for numpy_type, convert in conversions:
      if isinstance(obj, numpy_type):
        return convert(obj)
    return super().default(obj)

def is_json_extension(filename):
  """Returns True when `filename` ends in '.json' or '.json.gz'."""
  return filename.endswith(('.json', '.json.gz'))


def write_json_file(jdata, filename):
  """Writes `jdata` to `filename` as UTF-8 encoded JSON.

  A name ending in '.gz' is written gzip-compressed and without indentation;
  any other name is written as plain text with a 2-space indent. NumPy values
  are converted by NpEncoder. The name must pass is_json_extension().
  """
  assert is_json_extension(filename)
  if filename.endswith('.gz'):
    opener, indent = gzip.open, None
  else:
    opener, indent = open, 2
  with opener(filename, 'wb') as fd:
    text = json.dumps(jdata, indent=indent, cls=NpEncoder)
    fd.write(text.encode('utf-8'))


def read_json_file(filename):
  """Reads `filename` and returns the decoded JSON value.

  A name ending in '.gz' is read through gzip; the content is decoded as
  UTF-8. The name must pass is_json_extension().
  """
  assert is_json_extension(filename)
  if filename.endswith('.gz'):
    opener = gzip.open
  else:
    opener = open
  with opener(filename, 'rb') as fd:
    raw = fd.read()
  return json.loads(raw.decode('utf-8'))


================================================
FILE: calculon/llm/__init__.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

from .layers import *
from .llm import *

# Command lines
from .all_executions import AllExecutions
from .optimal_execution import OptimalExecution
from .parameter_calculator import ParameterCalculator
from .validation import Validation
from .runner import Runner


================================================
FILE: calculon/llm/all_executions.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import datetime
import gzip
import itertools
import logging
import math
import multiprocessing as mp
import os
import pandas
import psutil
import random

import calculon
from calculon.util import pick, arg_true_false_all
from calculon.llm import *


class AllExecutions(calculon.CommandLine):
  """
  Command line tool that enumerates every candidate LLM execution strategy
  for an application/system pair, evaluates each of them in parallel worker
  processes, and writes the ones that ran without an Llm.Error to a CSV file.
  """
  NAME = 'llm-all-executions'
  ALIASES = ['lae']

  @staticmethod
  def create_parser(subparser):
    """Registers this sub-command and its arguments on `subparser`."""
    sp = subparser.add_parser(
      AllExecutions.NAME, aliases=AllExecutions.ALIASES,
      help='run a search to find the optimal llm execution')
    sp.set_defaults(func=AllExecutions.run_command)
    sp.add_argument('-d', '--debug', action='store_true',
                    help='Loop over executions, don\'t run them')
    sp.add_argument('application', type=str,
                    help='File path to application configuration')
    sp.add_argument('num_procs', type=int,
                    help='Number of processors in execution')
    sp.add_argument('max_batch_size', type=int,
                    help='Maximum batch size, will be largest multiple of DP')
    sp.add_argument('datatype', type=str, choices=System.supported_datatypes(),
                    help='The datatype to use')
    sp.add_argument('system', type=str,
                    help='File path to system configuration')
    sp.add_argument('output', type=str,
                    help='File path to the output file'
                    " ('*.csv', '*.csv.gz')")
    sp.add_argument('-c', '--cpus', type=int, default=psutil.cpu_count(logical=False),
                    help='CPUs to use for parallelization')
    sp.add_argument('-n', '--noneok', action='store_true',
                    help='Don\'t give failure status when no good execution exists')
    sp.add_argument('-f', '--fused_activation', type=arg_true_false_all,
                    default='true', help='Mode of fused activation')

  @staticmethod
  def execution_fields():
    """
    Returns the names of the values in each tuple yielded by
    all_executions(), in the same order.
    """
    return (
      'num_procs', 'tensor_par', 'pipeline_par', 'data_par', 'tensor_par_net',
      'pipeline_par_net', 'data_par_net', 'batch_size', 'microbatch_size',
      'datatype', 'fused_activation', 'attention_type', 'activation_recompute',
      'pipeline_interleaving', 'optimizer_sharding', 'tensor_par_comm_type',
      'tensor_par_overlap', 'seq_par_ag_redo', 'data_par_overlap',
      'weight_offload', 'activations_offload', 'optimizer_offload', 'training')

  @staticmethod
  def get_batch_size(data_par, max_batch_size):
    """
    Returns the largest multiple of `data_par` that does not exceed
    `max_batch_size`, or None when no positive multiple fits (including a
    non-positive `data_par`, which used to loop forever).
    """
    if data_par <= 0 or data_par > max_batch_size:
      return None
    return (max_batch_size // data_par) * data_par

  @staticmethod
  def all_executions(app, syst, num_procs, max_batch_size, datatype, fused_activation):
    """
    Generates every execution tuple (see execution_fields()) for the given
    application and system.

    The tuples are produced in the same order as a fully nested loop over:
    tensor par, pipeline par, pipeline interleaving, activation recompute,
    optimizer sharding, tensor par comm type, seq par AG redo, data par
    overlap, tensor par overlap, weight offload, activations offload,
    optimizer offload, fused activation, microbatch size, and the tensor,
    pipeline and data parallel network ids.
    """
    has_mem2 = syst.mem2.capacity > 0
    num_nets = syst.num_networks
    recompute_modes = ['full', 'attn_only', 'none']
    comm_types = ['ar', 'p2p_rs_ag', 'rs_ag']
    # Offloading is only an option when a second memory tier exists
    mem2_offloads = pick(has_mem2, [True, False], [False])

    for tp in Llm.get_all_tensor_parallelisms(
        num_procs, app.hidden, app.attn_heads):
      tp_overlaps = pick(tp>1, ['none', 'ring', 'pipe'], ['none'])
      tp_nets = pick(tp>1, range(num_nets), [0])
      for pp in Llm.get_all_pipeline_parallelisms(
          num_procs, tp, app.num_blocks):
        dp = Llm.get_data_parallelism(num_procs, tp, pp)
        # The batch size depends on dp only, so compute it once per (tp, pp)
        batch_size = AllExecutions.get_batch_size(dp, max_batch_size)
        if batch_size is None:
          continue
        dp_flags = pick(dp>1, [True, False], [False])
        pp_nets = pick(pp>1, range(num_nets), [0])
        dp_nets = pick(dp>1, range(num_nets), [0])
        for ppint in Llm.get_valid_pipeline_interleavings(app.num_blocks, pp):
          # Invariant for all inner choices, materialize it once
          microbatch_sizes = list(Llm.get_valid_microbatch_sizes(
            app.seq_size, tp, dp, batch_size, pp))
          for activation_recompute in recompute_modes:
            if activation_recompute == 'full' or not has_mem2:
              activations_offloads = [False]
            else:
              activations_offloads = [True, False]
            for optimizer_sharding, tensor_par_comm_type in itertools.product(
                dp_flags, comm_types):
              can_redo = Llm.can_redo_ag(tensor_par_comm_type,
                                         activation_recompute)
              # itertools.product advances its right-most input fastest,
              # which matches the order of the equivalent nested loops
              inner = itertools.product(
                pick(can_redo, [True, False], [False]),  # seq_par_ag_redo
                dp_flags,                                # data_par_overlap
                tp_overlaps,                             # tensor_par_overlap
                mem2_offloads,                           # weight_offload
                activations_offloads,                    # activations_offload
                mem2_offloads,                           # optimizer_offload
                fused_activation,                        # fused_act
                microbatch_sizes,                        # microbatch_size
                tp_nets, pp_nets, dp_nets)               # tn, pn, dn
              for (seq_par_ag_redo, data_par_overlap, tensor_par_overlap,
                   weight_offload, activations_offload, optimizer_offload,
                   fused_act, microbatch_size, tn, pn, dn) in inner:
                yield (num_procs, tp, pp, dp, tn, pn, dn,
                       batch_size, microbatch_size, datatype,
                       fused_act, 'multihead', activation_recompute,
                       ppint, optimizer_sharding, tensor_par_comm_type,
                       tensor_par_overlap, seq_par_ag_redo,
                       data_par_overlap, weight_offload,
                       activations_offload, optimizer_offload,
                       True)

  @staticmethod
  def run_command(logger, args):
    """
    Entry point of the sub-command.

    Builds all executions, evaluates them in a process pool, logs statistics,
    and (unless --debug is given) writes the good executions to args.output.

    Returns 0 on success and -1 when no good execution exists and --noneok
    was not given.
    """
    assert args.output.endswith('.csv') or args.output.endswith('.csv.gz')

    app = Llm.Application(calculon.io.read_json_file(args.application))
    syst = System(calculon.io.read_json_file(args.system))

    executions = list(AllExecutions.all_executions(
      app, syst, args.num_procs, args.max_batch_size, args.datatype,
      args.fused_activation))
    random.shuffle(executions)
    exe_count = len(executions)
    logger.info(f'Total executions: {exe_count}')

    # psutil.cpu_count(logical=False) may return None, fall back gracefully
    cpus = max(1, args.cpus or os.cpu_count() or 1)
    # A step of 0 (no executions) would make range() raise ValueError
    step = max(1, math.ceil(exe_count / cpus))
    worker_args = [(app, syst, executions[index : index + step])
                   for index in range(0, exe_count, step)]
    del executions

    # Runs parallel searches
    # NOTE(review): the --debug help text says executions are not run, but the
    # search below still runs in debug mode; only the CSV output is skipped.
    start_time = datetime.datetime.now()
    with mp.Pool(cpus) as pool:
      goods = pool.starmap(AllExecutions.search, worker_args)
    end_time = datetime.datetime.now()
    good_count = sum(len(good) for good in goods)

    # Console statistics
    logger.info(f'Good executions: {good_count}')
    logger.info(f'Bad executions: {exe_count-good_count}')
    elapsed = (end_time - start_time).total_seconds()
    calc_rate = exe_count / elapsed if elapsed > 0 else float('inf')
    logger.info(f'Calculation rate: {calc_rate:.2f} calcs/sec')

    # Check if OK
    if good_count == 0:
      if not args.noneok:
        logger.fatal('No acceptable configurations found :(')
        return -1
      else:
        logger.info('No acceptable configurations found :(')

    if args.debug:
      return 0

    # Writes to CSV
    fields = Llm.Execution.fields() + Llm.get_stats_fields()
    # Any worker (including the first one) may have produced no rows, so
    # validate the column count against the first row that actually exists
    first_row = next(itertools.chain.from_iterable(goods), None)
    if first_row is not None:
      assert len(fields) == len(first_row)
    logger.info(f'Output: {args.output}')
    opener = gzip.open if args.output.endswith('.gz') else open
    with opener(args.output, 'wb') as fd:
      fd.write(bytes(','.join(fields) + '\n', 'utf-8'))
      for vals in itertools.chain.from_iterable(goods):
        fd.write(bytes(','.join(str(v) for v in vals) + '\n', 'utf-8'))

    return 0

  @staticmethod
  def search(app, syst, executions):
    """
    Worker body: compiles and runs a model for each execution tuple and
    returns a list of `execution + statistics` rows for those that did not
    raise Llm.Error. Failing executions are logged at debug level and dropped.
    """
    good = []
    for execution in executions:
      try:
        model = Llm(app, logging.Logger('sub'))
        model.compile(syst, Llm.Execution(*execution))
        model.run(syst)
        statistics = model.get_stats_values()
        good.append(execution + statistics)
      except Llm.Error as ex:
        logger = logging.getLogger()
        logger.debug(f'ERROR:{ex}\n')
    return good

  @staticmethod
  def update_list(current, candidate, quantity):
    """
    Adds `candidate` (a single item, or a list of items) to `current` in
    place. When `quantity` is positive the list is sorted in descending order
    of each item's first element and only the first `quantity` items are
    returned; otherwise the extended list is returned unsorted.
    """
    if not isinstance(candidate, list):
      current.append(candidate)
    else:
      current.extend(candidate)
    if quantity <= 0:
      return current  # don't sort and chop
    else:
      current.sort(reverse=True, key=lambda x: x[0])
      return current[:quantity]


# Register the command with calculon's command line at import time
calculon.CommandLine.register(AllExecutions)


================================================
FILE: calculon/llm/layers.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

from calculon import *


class Layer:
  """
  A single layer of a neural network. Has weights, activation space,
  gradients, and optimizer state associated with it. May invoke compute,
  memory access, or network operation.

  All sizes passed to the constructor are element counts; byte sizes are
  obtained by multiplying with `bytes_per_element`, which is 1 until
  set_bytes_per_element() is called.
  """

  def __init__(self, name, sys, fw_flops=0, agrad_flops=0, wgrad_flops=0,
               inputs_size=0, output_size=0, activation_space=0,
               activation_grads=0, weight_space=0, weight_grads=0,
               optim_space=0, needs_recompute=False, needs_recomm=False,
               activation_reused=False, activation_stored=True,
               output_stored=True):
    self.name = name
    self.sys = sys
    self.fw_flops = fw_flops
    self.agrad_flops = agrad_flops
    self.wgrad_flops = wgrad_flops
    self.inputs_size = inputs_size
    self.output_size = output_size
    # activations equal input size, we store them to compute Wgrad during BW
    self.activation_space = activation_space
    # activation grads equal output size and correspond grads w.r.t. the output
    self.activation_grads = activation_grads
    self.weight_space = weight_space
    self.weight_grads = weight_grads
    self.optim_space = optim_space
    self.optim_sharding_num_proc = 1

    # Add optimizations and parallelization split
    self.needs_recompute = needs_recompute
    self.needs_recomm = needs_recomm
    self.activation_reused=activation_reused
    self.activation_stored = activation_stored
    self.output_stored = output_stored
    # Before bytes_per_element set by SW config, we operate with just
    # parameter count, setting bytes_per_element to 1
    self.bytes_per_element = 1
    self.processing_time = None
    self.net_exposed_time = None

  def get_stats_json(self):
    """
    Returns a dict with per-stage flops, memory traffic, processing and
    communication statistics of this layer.

    The entries are evaluated in order; each stage's processing time is
    computed before the exposed communication time of that stage is read.
    """
    return {
      'name': self.name,
      'inputs_size': self.inputs_size,
      'outputs_size': self.output_size,
      'fw_flops': self.get_fw_flops(),
      'fw_mem_accessed': self.get_fw_mem_accessed(),
      'fw_arithmetic_intensity': self.get_fw_arithmetic_intensity(),
      'fw_processing_time': self.compute_processing_time('fw'),
      'baseblock_fw_tp_comm_tile': self.get_comm_tile('fw', baseblock=True),
      'edgeblock_fw_tp_comm_tile': self.get_comm_tile('fw', baseblock=False),
      'baseblock_fw_tp_comm_size': self.get_comm_bytes('fw', baseblock=True),
      'edgeblock_fw_tp_comm_size': self.get_comm_bytes('fw', baseblock=False),
      'baseblock_fw_tp_comm_time': self.compute_net_time('fw', baseblock=True),
      'edgeblock_fw_tp_comm_time': self.compute_net_time('fw',baseblock=False),
      'baseblock_fw_tp_comm_time_exposed': self.get_exposed_net_time(
        'fw', baseblock=True),
      'edgeblock_fw_tp_comm_time_exposed': self.get_exposed_net_time(
        'fw', baseblock=False),
      'agrad_flops': self.get_agrad_flops(),
      'agrad_mem_accessed': self.get_agrad_mem_accessed(),
      'agrad_arithmetic_intensity': self.get_agrad_arithmetic_intensity(),
      'agrad_processing_time': self.compute_processing_time('agrad'),
      'baseblock_bw_tp_comm_tile': self.get_comm_tile('agrad', baseblock=True),
      'edgeblock_bw_tp_comm_tile': self.get_comm_tile('agrad', baseblock=False),
      'baseblock_bw_tp_comm_size': self.get_comm_bytes('agrad', baseblock=True),
      'edgeblock_bw_tp_comm_size': self.get_comm_bytes('agrad', baseblock=False),
      'baseblock_bw_tp_comm_time': self.compute_net_time('agrad', baseblock=True),
      'edgeblock_bw_tp_comm_time': self.compute_net_time('agrad', baseblock=False),
      'baseblock_bw_tp_comm_time_exposed': self.get_exposed_net_time(
        'agrad', baseblock=True),
      'edgeblock_bw_tp_comm_time_exposed': self.get_exposed_net_time(
        'agrad', baseblock=False),
      'wgrad_flops': self.get_wgrad_flops(),
      'wgrad_mem_accessed': self.get_wgrad_mem_accessed(),
      'wgrad_arithmetic_intensity': self.get_wgrad_arithmetic_intensity(),
      'wgrad_processing_time': self.compute_processing_time('wgrad'),
      'baseblock_recomm_tile': self.get_comm_tile('wgrad', baseblock=True),
      'edgeblock_recomm_tile': self.get_comm_tile('wgrad', baseblock=False),
      'baseblock_recomm_size': self.get_comm_bytes('wgrad', baseblock=True),
      'edgeblock_recomm_size': self.get_comm_bytes('wgrad', baseblock=False),
      'baseblock_recomm_time': self.compute_net_time('wgrad', baseblock=True),
      'edgeblock_recomm_time': self.compute_net_time('wgrad', baseblock=False),
      'baseblock_recomm_time_exposed': self.get_exposed_net_time(
        'wgrad', baseblock=True),
      'edgeblock_recomm_time_exposed': self.get_exposed_net_time(
        'wgrad', baseblock=False),
      'optim_flops': self.get_optim_step_flops(),
      'optim_mem_accessed': self.get_optim_step_mem_accessed(),
      'optim_arithmetic_intensity': self.get_optim_step_arithmetic_intensity(),
      'optim_processing_time': self.compute_processing_time('optim'),
      'weight': self.get_weight(),
      'activation': self.get_activation(),
      'weight_grad': self.get_weight_grad(),
      'activation_grad': self.get_activation_grad(),
      'optimizer': self.get_optimizer()
    }

  def get_stats_str(self):
    """
    Returns a multi-line, human readable summary of the flops, accessed
    bytes and arithmetic intensity of every stage, followed by the memory
    footprint of weights, activations, gradients and optimizer state.
    """
    stats = "Operation {0}:\n{1} FW flops, {2} FW bytes accessed,".format(
      self.name,
      human_format(self.get_fw_flops(), 'flops'),
      human_format(self.get_fw_mem_accessed(), 'bytes'))
    stats += " FW AI: {0:.3f}\n".format(self.get_fw_arithmetic_intensity())
    # The getters are get_agrad_flops()/get_wgrad_flops(); the previously
    # used get_agrard_flops()/get_wgrard_flops() do not exist
    stats += "{0} BW Agrad flops, {1} BW Agrad bytes accessed,".format(
      human_format(self.get_agrad_flops(), 'flops'),
      human_format(self.get_agrad_mem_accessed(), 'bytes'))
    stats += " BW Agrad AI: {0:.3f}\n".format(
      self.get_agrad_arithmetic_intensity())
    stats += "{0} BW Wgrad flops, {1} BW Wgrad bytes accessed,".format(
      human_format(self.get_wgrad_flops(), 'flops'),
      human_format(self.get_wgrad_mem_accessed(), 'bytes'))
    stats += " BW Wgrad AI: {0:.3f}\n".format(
      self.get_wgrad_arithmetic_intensity())
    stats += "{0} Optim flops, {1} Optim bytes accessed,".format(
      human_format(self.get_optim_step_flops(), 'flops'),
      human_format(self.get_optim_step_mem_accessed(), 'bytes'))
    stats += " Optim AI: {0:.3f}\n".format(
      self.get_optim_step_arithmetic_intensity())
    stats += "W: {0}, Act: {1}, WGrad: {2}, AGrad: {3}, Optim: {4}".format(
      human_format(self.get_weight(), 'bytes'),
      human_format(self.get_activation(), 'bytes'),
      human_format(self.get_weight_grad(), 'bytes'),
      human_format(self.get_activation_grad(), 'bytes'),
      human_format(self.get_optimizer(), 'bytes'))
    return stats

  def set_bytes_per_element(self, bytes_per_element):
    """Sets the element width used to convert element counts to bytes."""
    self.bytes_per_element = bytes_per_element

  # Shard (distribute) optimizer and weight grads between data parallel nodes
  def shard_optimizer(self, num_procs):
    self.optim_sharding_num_proc = num_procs

  # getters that will be called from Llm model class, can be rewritten
  def get_fw_flops(self):
    return self.fw_flops

  def get_fw_mem_accessed(self):
    """Bytes touched in the forward pass: inputs, outputs and weights."""
    mem_accessed = self.inputs_size + self.output_size + self.weight_space
    mem_accessed *= self.bytes_per_element
    return mem_accessed

  def get_fw_arithmetic_intensity(self):
    """
    Forward flops per accessed byte; 0 when there are no flops and infinity
    when there are flops but no memory traffic.
    """
    if self.fw_flops == 0:
      return 0
    if self.get_fw_mem_accessed() == 0:
      return float('inf')
    return self.fw_flops / self.get_fw_mem_accessed()

  def get_recompute_flag(self):
    return self.needs_recompute

  def get_recomm_flag(self):
    return self.needs_recomm

  def reuses_activation(self):
    return self.activation_reused

  def stores_activation(self):
    return self.activation_stored

  def stores_output(self):
    return self.output_stored

  def get_agrad_flops(self):
    return self.agrad_flops

  def get_agrad_mem_accessed(self):
    """Bytes touched when computing activation gradients."""
    # activation grads equal output size and correspond grads w.r.t.
    # layer output; activations are equal to input size
    grad_mem = self.weight_space + (
      self.activation_space + self.activation_grads)
    grad_mem *= self.bytes_per_element
    return grad_mem

  def get_agrad_arithmetic_intensity(self):
    """Same convention as get_fw_arithmetic_intensity(), for Agrad."""
    if self.agrad_flops == 0:
      return 0
    if self.get_agrad_mem_accessed() == 0:
      return float('inf')
    return self.agrad_flops / self.get_agrad_mem_accessed()

  def get_wgrad_flops(self):
    return self.wgrad_flops

  def get_wgrad_mem_accessed(self):
    """
    Bytes touched when computing weight gradients; 0 for layers without
    weights (which must not declare any wgrad flops).
    """
    if self.weight_space == 0:
      assert self.wgrad_flops == 0, \
        f"Haven't expected to see wgrad flops in layer {self.name}"
      return 0
    # activation grads equal output size and correspond grads w.r.t.
    # layer output; activations are equal to input size
    grad_mem = self.weight_grads + (
      self.activation_space + self.activation_grads)
    grad_mem *= self.bytes_per_element
    return grad_mem

  def get_wgrad_arithmetic_intensity(self):
    """Same convention as get_fw_arithmetic_intensity(), for Wgrad."""
    if self.wgrad_flops == 0:
      return 0
    if self.get_wgrad_mem_accessed() == 0:
      return float('inf')
    return self.wgrad_flops / self.get_wgrad_mem_accessed()

  # We use Adam optimizer. The amount of flops is based on the number of
  # weight grads to accommodate for possible weight_grad sharding
  # among data parallel nodes
  def get_optim_step_flops(self):
    optim_flops = self.weight_grads / self.optim_sharding_num_proc * 11
    return optim_flops

  def get_optim_step_mem_accessed(self):
    return self.get_optimizer()

  def get_optim_step_arithmetic_intensity(self):
    """Same convention as get_fw_arithmetic_intensity(), for the optimizer."""
    if self.get_optim_step_flops() == 0:
      return 0
    if self.get_optim_step_mem_accessed() == 0:
      return float('inf')
    return self.get_optim_step_flops() / self.get_optim_step_mem_accessed()

  def get_weight(self):
    return self.weight_space * self.bytes_per_element

  def get_activation(self):
    return self.activation_space * self.bytes_per_element

  def get_output(self):
    return self.output_size * self.bytes_per_element

  def get_weight_grad(self, sharded=True):
    """
    Returns the size in bytes of the weight gradients.

    With `sharded` the gradients use `bytes_per_element` and are divided
    among the optimizer sharding processes; otherwise they are counted as
    4 bytes per element and not divided.
    """
    # Keep lower precision copy of grads for mem and net transfers
    grads = self.weight_grads
    if sharded:
      # We keep grads in lower precision for communication
      grads *= self.bytes_per_element
      grads /= self.optim_sharding_num_proc
    else:
      # otherwise keep grads in 32 bit for accumulation
      grads *= 4
    return grads

  def get_activation_grad(self):
    return self.activation_grads * self.bytes_per_element

  def get_optimizer(self):
    """Bytes of optimizer state held by one optimizer sharding process."""
    # Keep 32-bits master copy of weights, plus both moments (m,v)
    # master copy for grads is accounted for in get_weight_grad()
    moments_size = self.optim_space * 4
    if self.bytes_per_element < 4:
      master_copy_size = self.weight_space * 4
    else:
      master_copy_size = 0
    return (master_copy_size + moments_size) / self.optim_sharding_num_proc

  def set_processing_time(self, processing_time):
    self.processing_time = processing_time

  def get_processing_time(self):
    return self.processing_time

  def use_matrix_engine(self):
    """Whether flops run on the matrix engine; False unless overridden."""
    return False

  def get_comm_bytes(self, stage, baseblock=True):
    """Bytes communicated in `stage`; the base layer communicates nothing."""
    return 0

  def get_comm_tile(self, stage, baseblock=True):
    return self.get_comm_bytes(stage, baseblock)

  def compute_flops_time(self, stage):
    """
    Returns the time spent on the flops of `stage` ('fw', 'agrad', 'wgrad'
    or 'optim'). The matrix engine throughput is used only when
    use_matrix_engine() is true and the stage is not 'optim'.

    Raises Exception for any other stage name.
    """
    if stage == "fw":
      flops = self.get_fw_flops()
    elif stage == "agrad":
      flops = self.get_agrad_flops()
    elif stage == "wgrad":
      flops = self.get_wgrad_flops()
    elif stage == "optim":
      flops = self.get_optim_step_flops()
    else:
      raise Exception(f'Bad compute stage : {stage}')
    if self.use_matrix_engine() and stage != "optim":
      throughput = self.sys.get_matrix_throughput(flops)
    else:
      throughput = self.sys.get_vector_throughput(flops)
    return flops / throughput

  def compute_mem_time(self, stage):
    """
    Returns the time spent on memory accesses of `stage` using the mem1
    throughput of the system. Raises Exception for an unknown stage name.
    """
    if stage == "fw":
      mem = self.get_fw_mem_accessed()
    elif stage == "agrad":
      mem = self.get_agrad_mem_accessed()
    elif stage == "wgrad":
      mem = self.get_wgrad_mem_accessed()
    elif stage == "optim":
      mem = self.get_optim_step_mem_accessed()
    else:
      raise Exception(f'Bad compute stage : {stage}')
    return mem / self.sys.get_mem1_throughput(mem)

  def compute_net_time(self, stage, baseblock=True):
    """Network time of `stage`; the base layer has none."""
    return 0

  def get_exposed_net_time(self, stage, baseblock=True):
    """Non-overlapped network time of `stage`; the base layer has none."""
    return 0

  def get_required_bandwidth(self, stage, baseblock=True):
    return 0

  def compute_processing_time(self, stage):
    """
    Combines the flops and memory time of `stage` through the system model,
    stores the result in `self.processing_time` and returns it.
    """
    self.processing_time =  self.sys.get_processing_time(
      self.compute_flops_time(stage),
      self.compute_mem_time(stage)
    )
    return self.processing_time

# We can factor in all layer peculiarities and layer-wise optimizations by
# overriding parent class member functions when needed
class Linear(Layer):
  """
  Linear layer multiplying a (batch_seq x c_in) input by a (c_in x c_out)
  weight. Forward, Agrad and Wgrad each cost 2*batch_seq*c_in*c_out flops,
  and the flops run on the matrix engine.
  """

  def __init__(self, name, sys, batch_seq, c_in, c_out,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    # Every pass is a matmul of the same dimensions
    matmul_flops = 2*batch_seq*c_in*c_out
    input_elems = batch_seq*c_in
    output_elems = batch_seq*c_out
    weight_elems = c_in*c_out
    layer_args = dict(
      fw_flops=matmul_flops,
      agrad_flops=matmul_flops,
      wgrad_flops=matmul_flops,
      inputs_size=input_elems,
      output_size=output_elems,
      weight_space=weight_elems,
      weight_grads=weight_elems,
      # stored activations match the input, activation grads the output
      activation_space=input_elems,
      activation_grads=output_elems,
      optim_space=2*c_in*c_out,
      needs_recompute=needs_recompute,
      activation_reused=activation_reused,
      activation_stored=activation_stored,
      output_stored=output_stored)
    super().__init__(name, sys, **layer_args)

  def use_matrix_engine(self):
    """Linear layers are matmuls and use the matrix engine."""
    return True

class LinearOverlapped(Layer):
  """
  Linear layer split among `num_peers` tensor parallel peers whose
  communication is processed in `num_tiles` tiles so that it can overlap
  with compute.

  A non-conjugate layer splits c_out among the peers, a conjugate layer
  splits c_in. Depending on `tensor_par_comm_type` the layer is paired with
  AllGather/ReduceScatter ('rs_ag') or Identity/AllReduce communication.
  """

  def __init__(self, name, sys, batch_seq, c_in, c_out, tensor_par_comm_type,
               num_tiles, net_id, num_peers, conjugate=False,
               in_network_reduction=False, tp_overlap='pipe',
               needs_recompute=False, needs_recomm=False,
               activation_reused=False, activation_stored=True,
               output_stored=True):
    rows, cols_in, cols_out = batch_seq, c_in, c_out
    self.tensor_par_comm_type = tensor_par_comm_type
    self.num_tiles = num_tiles
    self.net = sys.get_network(net_id)
    self.num_peers = num_peers
    self.conjugate = conjugate
    self.in_network_reduction = in_network_reduction
    self.tp_overlap = tp_overlap
    self._processed_flag = False

    uses_rs_ag = self.tensor_par_comm_type == 'rs_ag'
    if conjugate:
      # ReduceScatter (rs_ag) or Identity case: input channels are split
      assert cols_in % self.num_peers == 0
      cols_in = cols_in // self.num_peers
      act_net_buffer = 0
      if uses_rs_ag:
        act_space = rows * cols_in
        act_grad_space = rows * cols_out // num_tiles
        act_grad_net_buffer = rows * cols_out // num_tiles
      else:
        act_space = 0
        act_grad_space = rows * cols_out
        act_grad_net_buffer = rows * cols_out
    else:
      # AllGather (rs_ag) or AllReduce case: output channels are split
      assert cols_out % self.num_peers == 0
      cols_out = cols_out // self.num_peers
      # Computed as in both original branches, but intentionally not added
      # to activation_space below
      act_net_buffer = rows * cols_in // num_tiles
      act_grad_net_buffer = 0
      if uses_rs_ag:
        act_space = rows * cols_in // num_tiles
        act_grad_space = rows * cols_out
      else:
        act_space = rows * cols_in
        act_grad_space = 0

    matmul_flops = 2*rows*cols_in*cols_out
    super().__init__(name,
                     sys,
                     fw_flops=matmul_flops,
                     agrad_flops=matmul_flops,
                     wgrad_flops=matmul_flops,
                     inputs_size=rows*cols_in,
                     output_size=rows*cols_out,
                     weight_space=cols_in*cols_out,
                     weight_grads=cols_in*cols_out,
                     activation_space=act_space,
                     activation_grads=act_grad_space + act_grad_net_buffer,
                     optim_space=2*cols_in*cols_out,
                     needs_recompute=needs_recompute,
                     needs_recomm=needs_recomm,
                     activation_reused=activation_reused,
                     activation_stored=activation_stored,
                     output_stored=output_stored)

  def _uses_split_comm(self, baseblock):
    # True when communication is split into ReduceScatter + AllGather:
    # always for 'rs_ag', and for 'p2p_rs_ag' only outside of base blocks
    if self.tensor_par_comm_type == 'rs_ag':
      return True
    return (self.tensor_par_comm_type == 'p2p_rs_ag') and not baseblock

  def use_matrix_engine(self):
    """The layer is a matmul and uses the matrix engine."""
    return True

  def get_comm_bytes(self, stage, baseblock=True):
    """
    Returns the number of bytes communicated in `stage`: 0 with a single
    peer, otherwise the input or output size depending on the communication
    pattern. Returns None for an unknown stage.
    """
    if self.num_peers == 1:
      return 0
    split_comm = self._uses_split_comm(baseblock)
    input_bytes = self.inputs_size * self.bytes_per_element
    output_bytes = self.output_size * self.bytes_per_element
    if stage == 'fw':
      if self.conjugate:
        # ReduceScatter or AllReduce on FW
        return output_bytes
      # AllGather on FW when split, Identity otherwise
      return input_bytes if split_comm else 0
    if stage == 'agrad':
      # Comm sizes during FW and BW pass are the same
      if not self.conjugate:
        # ReduceScatter or AllReduce on BW
        return input_bytes
      # AllGather on BW when split, Identity otherwise
      return output_bytes if split_comm else 0
    if stage == 'wgrad':
      # Re-communication repeats the forward transfer
      return self.get_comm_bytes('fw', baseblock) if self.needs_recomm else 0
    if stage == 'optim':
      return 0
    return None

  def get_comm_flops(self, stage, baseblock=True):
    """Number of communicated elements in `stage`."""
    return self.get_comm_bytes(stage, baseblock) / self.bytes_per_element

  def get_num_tiles(self):
    return self.num_tiles

  def get_comm_tile(self, stage, baseblock=True):
    """Bytes communicated per tile in `stage`."""
    return self.get_comm_bytes(stage, baseblock) / self.get_num_tiles()

  def compute_net_time(self, stage, baseblock=True):
    """
    Returns the network time of `stage`, including the time of the
    reduction flops when the reduction is not done in the network.
    Returns 0 with a single peer and None for an unknown stage.
    """
    if self.num_peers == 1:
      return 0
    split_comm = self._uses_split_comm(baseblock)

    # Time of the reduction executed on the processor; it belongs to the
    # direction that reduces (FW for conjugate layers, BW otherwise)
    if self.in_network_reduction:
      reduction_time = 0
    else:
      reduction_flops = self.get_comm_flops(stage, baseblock) * (
        self.num_peers - 1) / self.num_peers
      reduction_time = reduction_flops / self.sys.get_vector_throughput(
        reduction_flops)

    if self.conjugate:
      if split_comm:
        # ReduceScatter case
        fw_collective, bw_collective = 'reduce_scatter', 'all_gather'
      else:
        # AllReduce case
        fw_collective, bw_collective = 'all_reduce', None
      fw_reduction_time, bw_reduction_time = reduction_time, 0
    else:
      if split_comm:
        # AllGather case
        fw_collective, bw_collective = 'all_gather', 'reduce_scatter'
      else:
        # Identity case
        fw_collective, bw_collective = None, 'all_reduce'
      fw_reduction_time, bw_reduction_time = 0, reduction_time

    if stage == 'fw':
      if fw_collective is None:
        return 0
      transfer_time = self.net.time(
        fw_collective, self.get_comm_bytes(stage, baseblock), self.num_peers)
      return transfer_time + fw_reduction_time
    if stage == 'agrad':
      if bw_collective is None:
        return 0
      transfer_time = self.net.time(
        bw_collective, self.get_comm_bytes(stage, baseblock), self.num_peers)
      return transfer_time + bw_reduction_time
    if stage == 'wgrad':
      if not (self.needs_recomm and fw_collective):
        return 0
      # AllGather Redo (RS_AG only) or full recompute
      return self.net.time(
        fw_collective, self.get_comm_bytes(stage, baseblock), self.num_peers)
    if stage == 'optim':
      return 0
    return None

  def compute_processing_time(self, stage):
    """
    Computes the processing time of `stage` with tiled compute/communication
    overlap, stores it together with the exposed network time, and returns
    the processing time.
    """
    flops_t = self.compute_flops_time(stage)
    # Part of the processor is busy driving the network during overlap
    flops_t_slowed = flops_t / (1 - self.net.processor_usage)
    mem_t = self.compute_mem_time(stage)
    comm_t = self.compute_net_time(stage)
    base_t = self.sys.get_processing_time(flops_t, mem_t)
    if comm_t == 0:
      total_t = base_t
      exposed_t = 0
    else:
      base_t_slowed = self.sys.get_processing_time(flops_t_slowed, mem_t)
      # Tiled time computed as fraction of full time, to model high effective
      # throughput when processing many consecutive tiles
      tiles = self.num_tiles
      flops_tile_slowed = flops_t_slowed / tiles
      comm_tile = comm_t / tiles
      base_tile = base_t / tiles
      base_tile_slowed = base_t_slowed / tiles
      comm_excess = comm_tile - flops_tile_slowed
      # we have one exposed comm tile if tp_comm is not ring,
      # one exposed compute tile, and
      # (Proc - 1) overlapped tiles, where either compute or comm is exposed
      if comm_excess > 0:
        # Communication is longer than compute, the excess is exposed
        total_t = base_tile + (tiles - 1) * base_tile_slowed
        exposed_t = (tiles - 1) * comm_excess
      else:
        # Communication is hidden, but still slows compute down because
        # part of the compute resources orchestrates it
        total_t = base_tile + (tiles - 1) * base_tile + (
          tiles - 1) * comm_tile * self.net.processor_usage
        exposed_t = 0
      if self.tp_overlap == 'pipe':
        # Pipe overlap exposes one extra comm tile; ring overlap uses a
        # schedule that avoids sending it
        exposed_t += comm_tile
        total_t += comm_tile
    self.processing_time = total_t
    self.net_exposed_time = exposed_t
    self._processed_flag = True
    return self.processing_time

  def get_exposed_net_time(self, stage, baseblock=True):
    """
    Returns the exposed network time stored by the latest call to
    compute_processing_time(), which must have been called before.
    """
    assert self._processed_flag
    return self.net_exposed_time

  def get_required_bandwidth(self, stage, baseblock=True):
    """
    Returns the communication tile size of `stage` divided by the slowed
    down flops time of one tile. Requires a previous call to
    compute_processing_time().
    """
    assert self._processed_flag
    tile_bytes = self.get_comm_tile(stage, baseblock)
    flops_t = self.compute_flops_time(stage)
    flops_t_slowed = flops_t / (1 - self.net.processor_usage)
    flops_tile_slowed = flops_t_slowed / self.num_tiles
    return tile_bytes / flops_tile_slowed

class BatchMatMul(Layer):
  """Batched matrix multiplication layer.

  Models `batch` independent products of a (size_a x contraction_size)
  operand with a (contraction_size x size_b) operand.
  """

  def __init__(self, name, sys, batch, size_a, contraction_size, size_b,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    m, n, k = size_a, contraction_size, size_b
    # Element counts of the two operands together and of the product
    operand_elems = batch*(m*n+n*k)
    product_elems = batch*m*k
    layer_args = dict(
      fw_flops=batch*2*m*n*k,
      agrad_flops=batch*2*2*m*n*k,
      inputs_size=operand_elems,
      output_size=product_elems,
      activation_space=operand_elems,
      activation_grads=product_elems,
      needs_recompute=needs_recompute,
      activation_reused=activation_reused,
      activation_stored=activation_stored,
      output_stored=output_stored,
    )
    super().__init__(name, sys, **layer_args)

  def use_matrix_engine(self):
    """Always True for this layer."""
    return True

# https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html
# https://cthorey.github.io./blog/2016/backpropagation/
class LayerNorm(Layer):
  """Layer normalization over `act_size` activation elements.

  FLOP counts and activation sizes scale with `act_size`; weight, weight
  gradient and optimizer sizes scale with `hidden`.
  """

  def __init__(self, name, sys, act_size, hidden,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    # Per-element FLOP multipliers for the three compute stages
    flops = dict(
      fw_flops=9*act_size,
      agrad_flops=14*act_size,
      wgrad_flops=7*act_size,
    )
    # Input, output, stored activation and its gradient all have act_size
    activations = dict(
      inputs_size=act_size,
      output_size=act_size,
      activation_space=act_size,
      activation_grads=act_size,
    )
    weights = dict(
      weight_space=2*hidden,
      weight_grads=2*hidden,
      optim_space=2*2*hidden,
    )
    super().__init__(name, sys,
                     needs_recompute=needs_recompute,
                     activation_reused=activation_reused,
                     activation_stored=activation_stored,
                     output_stored=output_stored,
                     **flops, **activations, **weights)


class DropOut(Layer):
  """Dropout layer over `act_size` activation elements.

  The only activation DropOut keeps is its mask, a bool tensor that takes
  1 B per element, so the activation getters below return element counts
  without scaling by bytes_per_element.
  """

  def __init__(self, name, sys, act_size,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    # Every per-element quantity of this layer equals act_size
    per_element = dict.fromkeys(
      ('fw_flops', 'agrad_flops', 'inputs_size', 'output_size',
       'activation_space', 'activation_grads'),
      act_size)
    super().__init__(name, sys,
                     needs_recompute=needs_recompute,
                     activation_reused=activation_reused,
                     activation_stored=activation_stored,
                     output_stored=output_stored,
                     **per_element)

  def get_activation(self):
    """Return the mask size (1 B per element, hence unscaled)."""
    return self.activation_space

  def get_activation_grad(self):
    """Return the activation gradient size, unscaled like get_activation()."""
    return self.activation_grads

  def get_fw_mem_accessed(self):
    """Return input plus output bytes, plus the mask at 1 B per element."""
    data_bytes = (self.inputs_size + self.output_size) * self.bytes_per_element
    mask_bytes = self.activation_space
    return data_bytes + mask_bytes

  def get_agrad_mem_accessed(self):
    """Backward pass touches the same amount of memory as forward."""
    return self.get_fw_mem_accessed()


# https://mlfromscratch.com/activation-functions-explained/#/
class GeLU(Layer):
  """GeLU activation layer over `act_size` elements.

  When `fused` is set the layer runs right after the previous Linear layer
  and stores neither activations nor gradients.
  """

  def __init__(self, name, sys, act_size,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True,
               fused=False):
    self._fused = fused
    # A fused GeLU keeps nothing; otherwise one act_size tensor each for the
    # activation and its gradient
    stored_elems = 0 if fused else act_size
    super().__init__(
      name, sys,
      fw_flops=8*act_size,
      agrad_flops=13*act_size,
      inputs_size=act_size,
      output_size=act_size,
      activation_space=stored_elems,
      activation_grads=stored_elems,
      needs_recompute=needs_recompute,
      activation_reused=activation_reused,
      activation_stored=activation_stored,
      output_stored=output_stored)

  def get_agrad_mem_accessed(self):
    """Backward pass touches the same amount of memory as forward."""
    return self.get_fw_mem_accessed()


# https://automata88.medium.com/how-to-implement-the-softmax-derivative-independently-from-any-loss-function-ae6d44363a9d
class SoftMax(Layer):
  """SoftMax layer over `act_size` activation elements."""

  def __init__(self, name, sys, act_size,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    # Input, output, stored activation and gradient are all act_size elements
    sizes = dict.fromkeys(
      ('inputs_size', 'output_size', 'activation_space', 'activation_grads'),
      act_size)
    super().__init__(name, sys,
                     fw_flops=5*act_size,
                     agrad_flops=8*act_size,
                     needs_recompute=needs_recompute,
                     activation_reused=activation_reused,
                     activation_stored=activation_stored,
                     output_stored=output_stored,
                     **sizes)

  def get_agrad_mem_accessed(self):
    """Backward pass touches the same amount of memory as forward."""
    return self.get_fw_mem_accessed()


# https://explained.ai/matrix-calculus/#sec:1.4.2
class ElementWise(Layer):
  """Element-wise operation on two operands of possibly different sizes.

  The output has the size of the larger operand; both operands are counted
  as inputs and as stored activations.
  """

  def __init__(self, name, sys, operand1, operand2,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    result_elems = max(operand1, operand2)
    operand_elems = (operand1+operand2)
    super().__init__(
      name, sys,
      fw_flops=result_elems,
      agrad_flops=operand_elems,
      inputs_size=operand_elems,
      output_size=result_elems,
      activation_space=operand_elems,
      activation_grads=result_elems,
      needs_recompute=needs_recompute,
      activation_reused=activation_reused,
      activation_stored=activation_stored,
      output_stored=output_stored)


# Splits activation on the forward pass, sums gradients on the backward
class Fork(Layer):
  """Splits an activation between `num_users` consumers.

  The forward pass only hands the activation out; the backward pass sums
  the gradients coming back from every user.
  """

  def __init__(self, name, sys, act_size, num_users,
               needs_recompute=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    self.num_users = num_users
    # Gradients from num_users are accumulated in a single storage that is
    # already accounted for in the other layers, so report 0 here to avoid
    # double accounting.
    own_grad_elems = 0
    super().__init__(
      name, sys,
      inputs_size=act_size,
      agrad_flops=num_users*act_size,
      activation_space=act_size,
      activation_grads=own_grad_elems,
      needs_recompute=needs_recompute,
      activation_reused=activation_reused,
      activation_stored=activation_stored,
      output_stored=output_stored)

  def get_fw_mem_accessed(self):
    """Forward pass accesses no memory."""
    return 0

  def get_agrad_mem_accessed(self):
    """Return activation bytes scaled by (num_users + 1)."""
    activation_bytes = self.activation_space * self.bytes_per_element
    return activation_bytes * (self.num_users + 1)


class TPComm(Layer):
  """Tensor-parallel communication layer.

  In the default (non-conjugate) form the forward pass is an Identity or
  AllGather and the backward pass an AllReduce or ReduceScatter; with
  `conjugate=True` the two directions are swapped. The pair in use depends
  on `tensor_par_comm_type`: 'rs_ag' always uses the split
  ReduceScatter/AllGather pair, 'p2p_rs_ag' uses it only outside the base
  block, and any other type uses the AllReduce/Identity pair.
  """

  def __init__(self, name, sys, act_size, net_id, num_peers, tensor_par_comm_type,
               conjugate=False, in_network_reduction=False,
               needs_recomm=False, activation_reused=False,
               activation_stored=True, output_stored=True):
    """Set up the communication layer.

    Args:
      name, sys: forwarded to Layer.
      act_size: number of elements communicated (kept as `comm_size`).
      net_id: id of the network, resolved through sys.get_network().
      num_peers: number of participants; with 1 peer the layer has no
        FLOPs, no sizes and no communication.
      tensor_par_comm_type: communication scheme, compared against 'rs_ag'
        and 'p2p_rs_ag'.
      conjugate: swap the forward and backward collectives.
      in_network_reduction: when True no reduction FLOPs are charged.
      needs_recomm: forwarded to Layer; enables communication in the
        'wgrad' stage.
      activation_reused, activation_stored, output_stored: forwarded to
        Layer.
    """
    self.net = sys.get_network(net_id)
    self.num_peers = num_peers
    self.tensor_par_comm_type = tensor_par_comm_type
    self.comm_size = act_size
    self.conjugate = conjugate
    fw_flops = 0
    bw_flops = 0
    in_size = 0
    out_size = 0
    if self.num_peers != 1:
      in_size = act_size
      out_size = act_size
      if not in_network_reduction:
        # The reduction is charged to the pass that performs it: backward
        # for the default form, forward for the conjugate form.
        reduce_flops = act_size * (self.num_peers - 1) / self.num_peers
        if self.conjugate:
          fw_flops = reduce_flops
        else:
          bw_flops = reduce_flops
    super().__init__(name,
                     sys,
                     fw_flops=fw_flops,
                     agrad_flops=bw_flops,
                     inputs_size=in_size,
                     output_size=out_size,
                     activation_space=in_size,
                     activation_grads=out_size,
                     needs_recomm=needs_recomm,
                     activation_reused=activation_reused,
                     activation_stored=activation_stored,
                     output_stored=output_stored)

  def _uses_split_comm(self, baseblock):
    """True when the ReduceScatter/AllGather pair is used instead of
    AllReduce/Identity: always for 'rs_ag', and for 'p2p_rs_ag' only
    outside the base block."""
    return (self.tensor_par_comm_type == 'rs_ag') or (
      (self.tensor_par_comm_type == 'p2p_rs_ag') and not baseblock)

  def get_activation(self):
    """Return the activation bytes kept by this layer."""
    if self.tensor_par_comm_type == 'rs_ag':
      return self.activation_space * self.bytes_per_element / self.num_peers
    if self.conjugate:
      return self.activation_space * self.bytes_per_element
    # Identity
    return 0

  def get_fw_mem_accessed(self):
    """Return forward memory traffic; 0 when the forward pass is Identity."""
    if self.tensor_par_comm_type != 'rs_ag' and not self.conjugate:
      # Identity
      return 0
    return super().get_fw_mem_accessed()

  def get_activation_grad(self):
    """Return the activation gradient bytes kept by this layer."""
    if self.tensor_par_comm_type == 'rs_ag':
      return self.activation_space * self.bytes_per_element / self.num_peers
    if not self.conjugate:
      return self.activation_grads * self.bytes_per_element
    # Identity
    return 0

  def get_agrad_mem_accessed(self):
    """Return backward memory traffic; 0 when the backward pass is Identity."""
    if self.tensor_par_comm_type != 'rs_ag' and self.conjugate:
      # Identity
      return 0
    return super().get_agrad_mem_accessed()

  def get_comm_bytes(self, stage, baseblock=True):
    """Return the bytes communicated in `stage` ('fw', 'agrad', 'wgrad' or
    anything else, which yields 0)."""
    if self.num_peers == 1:
      return 0
    split_comm = self._uses_split_comm(baseblock)
    if (not split_comm and (self.conjugate and stage == 'agrad' or
        not self.conjugate and stage == 'fw')):
      # Identity FW or AllReduce BW
      return 0
    if stage == 'fw' or stage == 'agrad':
      return self.comm_size * self.bytes_per_element
    if stage == 'wgrad' and self.needs_recomm and (
        split_comm or self.conjugate):
      # with AG Redo, we need recomm both on FW pass (not self.conjugate)
      # and BW pass (self.conjugate)
      return self.comm_size * self.bytes_per_element
    # optim and wgrad stage has no comm if no ag_redo flag for RS_AG
    return 0

  def compute_net_time(self, stage, baseblock=True):
    """Return the network time of `stage` plus the layer's own processing
    time (taken from the base class).

    Raises:
      Exception: if `stage` is not one of 'fw', 'agrad', 'wgrad', 'optim'.
    """
    if self.num_peers == 1:
      return 0
    split_comm = self._uses_split_comm(baseblock)
    net_compute_time = super().compute_processing_time(stage)
    comm_bytes = self.get_comm_bytes(stage, baseblock)
    if split_comm:
      if self.conjugate:
        # ReduceScatter case
        fw_op, bw_op = 'reduce_scatter', 'all_gather'
      else:
        # AllGather case
        fw_op, bw_op = 'all_gather', 'reduce_scatter'
      fw_net_time = self.net.time(fw_op, comm_bytes, self.num_peers)
      bw_net_time = self.net.time(bw_op, comm_bytes, self.num_peers)
    elif self.conjugate:
      fw_net_time = self.net.time('all_reduce', comm_bytes, self.num_peers)
      bw_net_time = 0
    else:
      fw_net_time = 0
      bw_net_time = self.net.time('all_reduce', comm_bytes, self.num_peers)
    if stage == 'fw':
      return fw_net_time + net_compute_time
    elif stage == 'agrad':
      return bw_net_time + net_compute_time
    elif stage == 'wgrad':
      # with AG Redo, we need recomm both on FW pass (not self.conjugate)
      # and BW pass (self.conjugate)
      if self.needs_recomm:
        return fw_net_time + net_compute_time
      else:
        return 0
    elif stage == 'optim':
      return 0
    else:
      raise Exception(f'Bad compute stage : {stage}')

  def get_exposed_net_time(self, stage, baseblock=True):
    """Return the exposed network time of `stage`.

    No overlap is modeled in this layer, so the whole network time from
    compute_net_time() is reported as exposed; it is recomputed on every
    call rather than read from a cached value.
    """
    return self.compute_net_time(stage, baseblock)

  def compute_processing_time(self, stage):
    """Always 0: this layer's time is reported through compute_net_time(),
    which reaches the base-class processing time via super()."""
    return 0


================================================
FILE: calculon/llm/llm.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

from calculon import *
from .layers import *


class Llm:
  """
  This implements the transformer with tensor, pipeline, and data parallelism.
  Using it follows this pattern:
  1. Initialize the model with certain model parameters
  2. Compile it with certain optimizations and parallelization strategies
  3. Run on particular hardware system
  """

  class Application:
    """Specifies the application configuration."""

    def __init__(self, cfg):
      """Copy the model dimensions out of the `cfg` mapping.

      `cfg` must provide 'hidden', 'feedforward', 'seq_size', 'attn_heads',
      'attn_size' and 'num_blocks'; a missing key raises KeyError.
      """
      self.cfg = cfg
      for key in ('hidden', 'feedforward', 'seq_size', 'attn_heads',
                  'attn_size', 'num_blocks'):
        setattr(self, key, cfg[key])

    def num_parameters(self):
      """Return the total parameter count of the model."""
      # https://cs.stanford.edu/~matei/papers/2021/sc_megatron_lm.pdf
      # Equation 2
      mlp_weights = 2 * self.hidden * self.feedforward
      attn_weights = 4 * self.hidden * self.attn_heads * self.attn_size
      mlp_biases = self.hidden + self.feedforward
      attn_biases = 3 * self.attn_heads * self.attn_size + self.hidden
      layer_norm = 2 * 2 * self.hidden
      per_block = (mlp_weights + attn_weights + mlp_biases + attn_biases +
                   layer_norm)
      # Embeddings are counted once, outside the repeated blocks
      embeddings = (51200 + self.seq_size) * self.hidden
      return per_block * self.num_blocks + embeddings

  class Execution:
    """Specifies the execution configuration."""

    @staticmethod
    def fields():
      """Return the configuration field names in constructor-argument order."""
      return (
        'num_procs', 'tensor_par', 'pipeline_par', 'data_par', 'tensor_par_net',
        'pipeline_par_net', 'data_par_net', 'batch_size', 'microbatch_size',
        'datatype', 'fused_activation', 'attention_type', 'activation_recompute',
        'pipeline_interleaving', 'optimizer_sharding', 'tensor_par_comm_type',
        'tensor_par_overlap', 'seq_par_ag_redo', 'data_par_overlap',
        'weight_offload', 'activations_offload', 'optimizer_offload', 'training')

    @staticmethod
    def from_json(cfg):
      """Build an Execution from a dict holding exactly the keys of fields()."""
      names = Llm.Execution.fields()
      assert set(cfg.keys()) == set(names)
      return Llm.Execution(*[cfg[key] for key in names])

    def __init__(self, num_procs, tensor_par, pipeline_par, data_par,
                 tensor_par_net, pipeline_par_net, data_par_net,
                 batch_size, microbatch_size, datatype,
                 fused_activation, attention_type, activation_recompute,
                 pipeline_interleaving, optimizer_sharding,
                 tensor_par_comm_type, tensor_par_overlap,
                 seq_par_ag_redo, data_par_overlap, weight_offload,
                 activations_offload, optimizer_offload, training):
      """Store the execution settings, asserting that they are consistent.

      Raises AssertionError on the first inconsistent setting.
      """
      self.training = training

      # Processor counts: every degree is positive and their product is the
      # total number of processors
      self.num_procs = num_procs
      assert self.num_procs > 0
      self.tensor_par = tensor_par
      assert self.tensor_par > 0
      self.pipeline_par = pipeline_par
      assert self.pipeline_par > 0
      self.data_par = data_par
      assert self.data_par > 0
      par_product = self.tensor_par * self.pipeline_par * self.data_par
      assert self.num_procs == par_product, \
        'tensor * pipeline * data parallelism != num_procs'

      # Network assignment of each parallelism dimension
      self.tensor_par_net = tensor_par_net
      self.pipeline_par_net = pipeline_par_net
      self.data_par_net = data_par_net

      # Batching: the global batch splits evenly over data parallelism and
      # the local batch splits evenly into microbatches
      self.global_batch_size = batch_size
      assert self.global_batch_size > 0
      self.microbatch_size = microbatch_size
      assert self.microbatch_size > 0
      assert self.global_batch_size % self.data_par == 0
      self._local_batch_size = self.global_batch_size // self.data_par
      assert self._local_batch_size % self.microbatch_size == 0
      self._num_microbatches = self._local_batch_size // self.microbatch_size

      self.datatype = datatype
      self.fused_activation = fused_activation
      self.attention_type = attention_type
      assert self.attention_type in ('multihead', 'multiquery')

      self.activation_recompute = activation_recompute
      assert self.activation_recompute in ('full', 'attn_only', 'none')
      if self.activation_recompute in ('full', 'attn_only'):
        assert self.training, "We only perform recompute during training"

      self.pipeline_interleaving = pipeline_interleaving
      assert self.pipeline_interleaving > 0, \
        f'Bad pipeline interleaving of {self.pipeline_interleaving}'
      if self.pipeline_par == 1:
        assert self.pipeline_interleaving == 1, \
        f'Bad pipeline interleaving of {self.pipeline_interleaving} with PP=1'

      self.optimizer_sharding = optimizer_sharding
      if self.optimizer_sharding:
        assert self.data_par > 1, "We perform optimizer sharding with DP > 1"

      # Tensor-parallel communication scheme and its overlap mode
      self.tensor_par_comm_type = tensor_par_comm_type
      self.in_network_reduction = False
      assert self.tensor_par_comm_type in ('ar', 'p2p_rs_ag', 'rs_ag')
      self.tensor_par_overlap = tensor_par_overlap
      assert self.tensor_par_overlap in ('none', 'ring', 'pipe')
      if self.tensor_par_overlap != 'none':
        assert self.tensor_par > 1, "We perform TP comm overlap with TP > 1"

      # Sequence parallelism is implied by the 'rs_ag' communication type
      self._sequence_par = self.tensor_par_comm_type == 'rs_ag'
      self.seq_par_ag_redo = seq_par_ag_redo
      if self.seq_par_ag_redo:
        assert self.tensor_par_comm_type == 'rs_ag', "We only redo AG comm"
        assert self._sequence_par, "We only redo AG with sequence parallelism"
        assert self.activation_recompute != 'full', \
          "We assume no extra AG with full recompute"
      self._pipeline_par_rs_ag = \
        self.tensor_par_comm_type in ('p2p_rs_ag', 'rs_ag')

      self.data_par_overlap = data_par_overlap
      if self.data_par_overlap:
        assert self.training, "We only perform DP comm overlap during training"
        assert self.data_par > 1, "We perform DP comm overlap with DP > 1"

      # Offloading switches
      self.weight_offload = weight_offload
      self.activations_offload = activations_offload
      self.optimizer_offload = optimizer_offload
      if self.optimizer_offload:
        assert self.training, \
          "We only perform optimizer offloading during training"

    def get_json(self):
      """Return the settings as a dict keyed by fields(), in that order."""
      names = Llm.Execution.fields()
      # Listed in the same order as fields(); note that 'batch_size' is
      # stored on the instance as global_batch_size
      settings = [
        self.num_procs,
        self.tensor_par,
        self.pipeline_par,
        self.data_par,
        self.tensor_par_net,
        self.pipeline_par_net,
        self.data_par_net,
        self.global_batch_size,
        self.microbatch_size,
        self.datatype,
        self.fused_activation,
        self.attention_type,
        self.activation_recompute,
        self.pipeline_interleaving,
        self.optimizer_sharding,
        self.tensor_par_comm_type,
        self.tensor_par_overlap,
        self.seq_par_ag_redo,
        self.data_par_overlap,
        self.weight_offload,
        self.activations_offload,
        self.optimizer_offload,
        self.training,
      ]
      assert len(names) == len(settings)
      return dict(zip(names, settings))

    def get_peers_json(self):
      """Return, per processor id, its peers along each parallelism dimension.

      Ids are laid out with tensor parallelism innermost, then pipeline,
      then data parallelism. An entry has a 'tensor' list, a 'pipeline' id
      (the next pipeline stage, wrapping around) and a 'data' list, each
      present only when that dimension is larger than 1.
      """
      tp, pp, dp = self.tensor_par, self.pipeline_par, self.data_par

      def proc_id(d, p, t):
        # Flatten (data, pipeline, tensor) coordinates into a processor id
        return d * tp * pp + p * tp + t

      peers = {}
      for d in range(dp):
        for p in range(pp):
          for t in range(tp):
            entry = {}
            if tp > 1:
              # tensor parallelism peers
              entry['tensor'] = [proc_id(d, p, t2) for t2 in range(tp)]
            if pp > 1:
              # pipeline parallelism peer
              entry['pipeline'] = proc_id(d, (p + 1) % pp, t)
            if dp > 1:
              # data parallelism peers
              entry['data'] = [proc_id(d2, p, t) for d2 in range(dp)]
            peers[proc_id(d, p, t)] = entry
      return peers


  # This is used for errors where the user may not be fully aware of
  # limitations. Use it like this:
  #   raise self.Error(f'Foo bar {num1} is not {num2}')
  class Error(Exception):
    """Error for cases where the user may not be fully aware of limitations."""

  @staticmethod
  def _factors(x):
    for cand in range(1, x + 1):
      if x % cand == 0:
        yield cand

  @staticmethod
  def get_all_tensor_parallelisms(num_procs, hidden, attn_heads):
    for cand in Llm._factors(num_procs):
      if hidden % cand == 0 and attn_heads % cand == 0:
        yield cand

  @staticmethod
  def get_all_pipeline_parallelisms(num_procs, tensor_par, num_blocks):
    assert num_procs % tensor_par == 0
    max_pp = min(num_procs // tensor_par, num_blocks)
    for cand in Llm._factors(max_pp):
      if (num_procs % (tensor_par * cand) == 0 and
          num_blocks % cand == 0):
        yield cand

  @staticmethod
  def get_data_parallelism(num_procs, tensor_par, pipeline_par):
    assert num_procs % (tensor_par * pipeline_par) == 0, \
      f'np={num_procs} tp={tensor_par} pp={pipeline_par}'
    return num_procs // (tensor_par * pipeline_par)

  @staticmethod
  def get_valid_pipeline_interleavings(num_blocks, pipeline_par):
    assert num_blocks % pipeline_par == 0
    if pipeline_par == 1:
      yield 1
    else:
      max_ppint = num_blocks // pipeline_par
      yield from Llm._factors(max_ppint)

  @staticmethod
  def get_valid_microbatch_sizes(
      seq_size, tensor_par, data_par, global_batch_size, pipeline_par):
    assert global_batch_size % data_par == 0
    local_batch_size = global_batch_size // data_par
    for cand in Llm._factors(local_batch_size):
      batch_seq = cand * seq_size
      if batch_seq % tensor_par == 0:
        yield cand

  @staticmethod
  def can_redo_ag(tensor_par_comm_type, activation_recompute):
    return tensor_par_comm_type == 'rs_ag' and activation_recompute != 'full'

  def __init__(self, app, log):
    assert isinstance(app, self.Application)
    self.app = app
    self.log = log

    # Set during compile
    self.exe = None

    # Set during run
    self.sys = None

    # State of calling compile() and run()
    self._compiled = False
    self._executed = False

    # Holds the layers in a single block
    self._llm_block = []

    # A chunk is a set of blocks for microbatch before passing to the next
    # processor in the pipeline. Each chunk is modeled as a base
    # block that is repeated N-1 times and followed by 1 edge block.
    # Recommunication time is the same in both base and edge blocks.
    self._blocks_per_proc = None
    self._bubble_reduction_blocks = None
    self._blocks_per_chunk = None
    self._chunks_per_proc = None
    self._baseblocks_per_chunk = None
    self._edgeblocks_per_chunk = None

    # Misc compilation values
    self._bytes_per_element = None
    self._batch_seq = None
    self._batch_seq_par = None
    self._activation_size = None
    self._seq_par_activation_size = None

    # Assignments to specific networks
    self._tp_net = None
    self._pp_net = None
    self._dp_net = None

    # metrics collected after run for each microbatch
    self._block_fw_flops = None
    self._block_fw_flops_time = None
    self._block_fw_mem_accessed = None
    self._block_fw_mem_time = None
    self._block_fw_time = None
    self._block_re_flops = None
    self._block_re_flops_time = None
    self._block_re_mem_accessed = None
    self._block_re_mem_time = None
    self._block_re_time = None
    self._block_agrad_flops = None
    self._block_agrad_flops_time = None
    self._block_agrad_mem_accessed = None
    self._block_agrad_mem_time = None
    self._block_agrad_time = None
    self._block_wgrad_flops = None
    self._block_wgrad_flops_time = None
    self._block_wgrad_mem_accessed = None
    self._block_wgrad_mem_time = None
    self._block_wgrad_time = None
    self._block_optim_flops = None
    self._block_optim_flops_time = None
    self._block_optim_mem_accessed = None
    self._block_optim_mem_time = None
    self._block_optim_time = None

    self._baseblock_fw_tp_size = None
    self._edgeblock_fw_tp_size = None
    self._baseblock_agrad_tp_size = None
    self._edgeblock_agrad_tp_size = None
    self._baseblock_recomm_size = None
    self._edgeblock_recomm_size = None
    self._block_fw_pp_size = None
    self._block_bw_pp_size = None
    self._block_dp_size = None
    self._baseblock_fw_time_no_offload = None
    self._edgeblock_fw_time_no_offload = None
    self._baseblock_bw_time_no_offload = None
    self._edgeblock_bw_time_no_offload = None
    self._baseblock_fw_offload_overhead = None
    self._edgeblock_fw_offload_overhead = None
    self._baseblock_bw_offload_overhead = None
    self._edgeblock_bw_offload_overhead = None
    self._baseblock_fw_time = None
    self._edgeblock_fw_time = None
    self._baseblock_bw_time = None
    self._edgeblock_bw_time = None
    self._block_dp_time = None
    self._tp_bw_overlap_req = None
    self._dp_bw_overlap_req_chunk = None
    self._dp_bw_overlap_req_tail = None

    self._block_weight_space = None
    self._block_act_working_space = None
    self._block_act_storage_space = None
    self._block_act_checkpoint_size = None
    self._block_weight_grad_space = None
    self._block_weight_grad_space_no_sharding = None
    self._block_act_grad_space = None
    self._block_optimizer_space = None

    # Top level memory usage stats
    self._weight_space = None
    self._act_space = None
    self._act_checkpoint_size = None
    self._weight_grad_space = None
    self._act_grad_space = None
    self._optimizer_space = None

    # Top level throughput stats
    self._fw_flops = None
    self._fw_flops_time = None
    self._fw_mem_accessed = None
    self._fw_mem_time = None
    self._fw_time = None
    self._baseblock_fw_tp_time = None
    self._edgeblock_fw_tp_time = None
    self._baseblock_fw_tp_time_exposed = None
    self._edgeblock_fw_tp_time_exposed = None
    self._re_flops = None
    self._re_flops_time = None
    self._re_mem_accessed = None
    self._re_mem_time = None
    self._re_time = None
    self._baseblock_recomm_time = None
    self._edgeblock_recomm_time = None
    self._baseblock_recomm_time_exposed = None
    self._edgeblock_recomm_time_exposed = None
    self._agrad_flops = None
    self._agrad_flops_time = None
    self._agrad_mem_accessed = None
    self._agrad_mem_time = None
    self._baseblock_agrad_tp_time = None
    self._edgeblock_agrad_tp_time = None
    self._baseblock_agrad_tp_time_exposed = None
    self._edgeblock_agrad_tp_time_exposed = None
    self._agrad_time = None
    self._wgrad_flops = None
    self._wgrad_flops_time = None
    self._wgrad_mem_accessed = None
    self._wgrad_mem_time = None
    self._wgrad_time = None
    self._optim_flops = None
    self._optim_flops_time = None
    self._optim_mem_accessed = None
    self._optim_mem_time = None
    self._optim_time = None

    # Top level network stats
    self._tp_comm_time_exposed = None
    self._tp_comm_time_link = None
    self._recomm_time_exposed = None
    self._recomm_time_link = None
    self._pp_comm_time_exposed = None
    self._pp_comm_time_link = None
    self._dp_comm_time_exposed = None
    self._dp_comm_time_link = None
    self._bubble_time = None

  @staticmethod
  def get_stats_fields():
    return (
      'block_fw_flops',
      'block_fw_flops_time',
      'block_fw_mem_accessed',
      'block_fw_mem_time',
      'block_fw_time',
      'baseblock_fw_tp_time',
      'edgeblock_fw_tp_time',
      'baseblock_fw_tp_time_exposed',
      'edgeblock_fw_tp_time_exposed',
      'block_re_flops',
      'block_re_flops_time',
      'block_re_mem_accessed',
      'block_re_mem_time',
      'block_re_time',
      'baseblock_recomm_time',
      'edgeblock_recomm_time',
      'baseblock_recomm_time_exposed',
      'edgeblock_recomm_time_exposed',
      'block_agrad_flops',
      'block_agrad_flops_time',
      'block_agrad_mem_accessed',
      'block_agrad_mem_time',
      'block_agrad_time',
      'baseblock_agrad_tp_time',
      'edgeblock_agrad_tp_time',
      'baseblock_agrad_tp_time_exposed',
      'edgeblock_agrad_tp_time_exposed',
      'block_wgrad_flops',
      'block_wgrad_flops_time',
      'block_wgrad_mem_accessed',
      'block_wgrad_mem_time',
      'block_wgrad_time',
      'block_optim_flops',
      'block_optim_flops_time',
      'block_optim_mem_accessed',
      'block_optim_mem_time',
      'block_optim_time',

      'baseblock_fw_tp_size',
      'edgeblock_fw_tp_size',
      'baseblock_bw_tp_size',
      'edgeblock_bw_tp_size',
      'baseblock_recomm_size',
      'edgeblock_recomm_size',
      'block_fw_pp_size',
      'block_bw_pp_size',
      'block_dp_size',
      'tp_bw_overlap_req',
      'dp_bw_overlap_req_chunk',
      'dp_bw_overlap_req_tail',

      'block_weight_space',
      'block_act_working_space',
      'block_act_storage_space',
      'block_act_checkpoint_size',
      'block_weight_grad_space',
      'block_weight_grad_space_no_sharding',
      'block_act_grad_space',
      'block_optimizer_space',

      'weight_space_with_offload',
      'act_space_with_offload',
      'act_checkpoint_size_with_offload',
      'act_grad_space_with_offload',
      'weight_grad_space_with_offload',
      'optimizer_space_with_offload',

      'weight_space',
      'act_space',
      'act_checkpoint_size',
      'act_grad_space',
      'weight_grad_space',
      'optimizer_space',

      'fw_time',
      'bw_time',
      'optim_step_time',
      'recompute_time',
      'recomm_link_time',
      'recomm_exposed_time',
      'bubble_time',
      'tp_comm_link_time',
      'pp_comm_link_time',
      'dp_comm_link_time',
      'tp_comm_exposed_time',
      'pp_comm_exposed_time',
      'dp_comm_exposed_time',
      'fw_offload_exposed_time',
      'bw_offload_exposed_time',
      'total_time',
      'act_offload_bw_req',
      'weight_offload_bw_req',
      'optim_offload_bw_req',
      'offload_mem_bw_req',
      'proc_mem_tier1_cap_req',
      'proc_mem_tier2_cap_req',
      'useful_flops',
      'compute_efficiency',
      'system_efficiency',
      'total_efficiency',
      'sample_rate')

  def get_stats_values(self):
    assert self._executed
    return (
      self._block_fw_flops,
      self._block_fw_flops_time,
      self._block_fw_mem_accessed,
      self._block_fw_mem_time,
      self._block_fw_time,
      self._baseblock_fw_tp_time,
      self._edgeblock_fw_tp_time,
      self._baseblock_fw_tp_time_exposed,
      self._edgeblock_fw_tp_time_exposed,
      self._block_re_flops,
      self._block_re_flops_time,
      self._block_re_mem_accessed,
      self._block_re_mem_time,
      self._block_re_time,
      self._baseblock_recomm_time,
      self._edgeblock_recomm_time,
      self._baseblock_recomm_time_exposed,
      self._edgeblock_recomm_time_exposed,
      self._block_agrad_flops,
      self._block_agrad_flops_time,
      self._block_agrad_mem_accessed,
      self._block_agrad_mem_time,
      self._block_agrad_time,
      self._baseblock_agrad_tp_time,
      self._edgeblock_agrad_tp_time,
      self._baseblock_agrad_tp_time_exposed,
      self._edgeblock_agrad_tp_time_exposed,
      self._block_wgrad_flops,
      self._block_wgrad_flops_time,
      self._block_wgrad_mem_accessed,
      self._block_wgrad_mem_time,
      self._block_wgrad_time,
      self._block_optim_flops,
      self._block_optim_flops_time,
      self._block_optim_mem_accessed,
      self._block_optim_mem_time,
      self._block_optim_time,

      self._baseblock_fw_tp_size,
      self._edgeblock_fw_tp_size,
      self._baseblock_agrad_tp_size,
      self._edgeblock_agrad_tp_size,
      self._baseblock_recomm_size,
      self._edgeblock_recomm_size,
      self._block_fw_pp_size,
      self._block_bw_pp_size,
      self._block_dp_size,
      self._tp_bw_overlap_req,
      self._dp_bw_overlap_req_chunk,
      self._dp_bw_overlap_req_tail,

      self._block_weight_space,
      self._block_act_working_space,
      self._block_act_storage_space,
      self._block_act_checkpoint_size,
      self._block_weight_grad_space,
      self._block_weight_grad_space_no_sharding,
      self._block_act_grad_space,
      self._block_optimizer_space,

      self.get_weight_space_min(),
      self.get_act_space_min(),
      self.get_act_checkpoint_size_min(),
      self.get_act_grad_space_min(),
      self.get_weight_grad_space_min(),
      self.get_optimizer_space_min(),

      self.get_weight_space(),
      self.get_act_space(),
      self.get_act_checkpoint_size(),
      self.get_act_grad_space(),
      self.get_weight_grad_space(),
      self.get_optimizer_space(),

      self.get_fw_time(),
      self.get_bw_time(),
      self.get_optim_step_time(),
      self.get_recompute_time(),
      self.get_recomm_link_time(),
      self.get_recomm_exposed_time(),
      self.get_bubble_time(),
      self.get_tp_comm_link_time(),
      self.get_pp_comm_link_time(),
      self.get_dp_comm_link_time(),
      self.get_tp_comm_exposed_time(),
      self.get_pp_comm_exposed_time(),
      self.get_dp_comm_exposed_time(),
      self.get_fw_offload_overhead(),
      self.get_bw_offload_overhead(),
      self.get_total_time(),
      self.get_act_offload_bw_req(),
      self.get_weight_offload_bw_req(),
      self.get_optim_offload_bw_req(),
      self.get_offload_mem_bw_req(),
      self.get_mem_tier1_cap_req(),
      self.get_mem_tier2_cap_req(),
      self.get_useful_flops(),
      self.get_compute_efficiency(),
      self.get_system_efficiency(),
      self.get_total_efficiency(),
      self.get_sample_rate())

  def get_stats_json(self, include_layers):
    assert self._executed
    keys = Llm.get_stats_fields()
    values = self.get_stats_values()
    assert len(keys) == len(values), f'{len(keys)} {len(values)}'
    j = dict(zip(keys, values))
    if include_layers:
      j['layers'] = []
      for layer in self._llm_block:
        j['layers'].append(layer.get_stats_json())
    return j

  def _build_attn_block(self):
    """Appends the layers of the attention sub-block to self._llm_block.

    Layers are appended in execution order: Fork, LayerNorm, the Q/K/V
    projections (with their TP communication either as a separate TPComm
    layer or fused into LinearOverlapped), the core attention (QK^T, SoftMax,
    DropOut, attention over V), the output projection (same two variants),
    DropOut and the residual ElementWise add.

    Raises:
      AssertionError: if hidden, feedforward or attn_heads is not evenly
        divisible by tensor_par.
      self.Error: if self.exe.attention_type is neither 'multihead' nor
        'multiquery'.
    """
    # True only when the whole block is recomputed
    recompute_flag = self.exe.activation_recompute == "full"
    # True when at least the core attention is recomputed
    recompute_attn_flag = self.exe.activation_recompute in \
      ["full", "attn_only"]
    # NOTE(review): _build_mlp_block derives its flag from
    # `recompute_flag or seq_par_ag_redo`; here "attn_only" also enables it.
    # Confirm that the difference is intentional.
    recompute_ag_flag = recompute_attn_flag or self.exe.seq_par_ag_redo

    assert self.app.hidden % self.exe.tensor_par == 0, (
      f"We should split hidden={self.app.hidden} between"
      f" {self.exe.tensor_par} TP partitions evenly")
    assert self.app.feedforward % self.exe.tensor_par == 0, (
      f"We should split feedforward={self.app.feedforward} between"
      f" {self.exe.tensor_par} TP partitions evenly")
    assert self.app.attn_heads % self.exe.tensor_par == 0, (
      f"We should split {self.app.attn_heads} attn_heads between"
      f" {self.exe.tensor_par} TP partitions evenly")

    self._llm_block.append(Fork(
      "AttnBlock_Fork",
      self.sys,
      pick(self.exe._sequence_par, self._seq_par_activation_size,
           self._activation_size),
      2,
      needs_recompute=recompute_flag,
      # We account this activation when consider Residual and LayerNorm
      activation_stored=True))
    self._llm_block.append(LayerNorm(
      "AttnBlock_LayerNorm",
      self.sys,
      pick(self.exe._sequence_par, self._seq_par_activation_size,
           self._activation_size),
      self.app.hidden,
      needs_recompute=recompute_flag,
      # Activation is stored in Fork instead
      activation_stored=False,
      activation_reused=True))
    if self.exe.tensor_par_overlap == 'none':
      # TP communication is modeled as its own layer, followed by separate
      # Q, K and V projections
      self._llm_block.append(TPComm(
        "AttnBlock_F",
        self.sys,
        self._activation_size,
        self.exe.tensor_par_net,
        self.exe.tensor_par,
        # We only compute flops/mem analyzing this layers, comm analyzed later
        # This is conservative estimate that does not consider p2p_rs_ag
        # because we don't differentiate between edge and middle blocks here
        tensor_par_comm_type=self.exe.tensor_par_comm_type,
        conjugate=False,
        in_network_reduction=self.exe.in_network_reduction,
        needs_recomm=recompute_ag_flag))
      self._llm_block.append(Fork(
        "AttnBlock_Multihead_Fork",
        self.sys,
        self._activation_size,
        3,
        needs_recompute=recompute_ag_flag,
        # With seq_par, we use activations from Comm layers to reflect that
        # they're split, otherwise we keep full size activations
        activation_stored=(not recompute_ag_flag)))
      self._llm_block.append(Linear(
        "AttnBlock_Query",
        self.sys,
        self._batch_seq,
        self.app.hidden,
        self.app.attn_heads * self.app.attn_size // self.exe.tensor_par,
        needs_recompute=recompute_flag,
        # Activation is stored in Fork instead,
        activation_stored=False,
        activation_reused=True))
      if self.exe.attention_type == 'multihead':
        self._llm_block.append(Linear(
          "AttnBlock_Key",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_heads * self.app.attn_size // self.exe.tensor_par,
          needs_recompute=recompute_flag,
          # Activation is stored in Fork instead,
          activation_stored=False,
          activation_reused=True))
        self._llm_block.append(Linear(
          "AttnBlock_Value",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_heads * self.app.attn_size // self.exe.tensor_par,
          needs_recompute=recompute_flag,
          # Activation is stored in Fork instead,
          activation_stored=False,
          activation_reused=True))
      elif self.exe.attention_type == 'multiquery':
        # Multiquery attention uses the same K, V for all "heads" resulting in
        # smaller Wk and Wv, less matmul, faster inference
        self._llm_block.append(Linear(
          "AttnBlock_Key",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_size,
          needs_recompute=recompute_flag,
          # Activation is stored in Fork instead,
          activation_stored=False,
          activation_reused=True))
        self._llm_block.append(Linear(
          "AttnBlock_Value",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_size,
          needs_recompute=recompute_flag,
          # Activation is stored in Fork instead,
          activation_stored=False,
          activation_reused=True))
      else:
        raise self.Error('Wrong attention type', self.exe.attention_type)
    else:
      # TP communication is fused with the projection in LinearOverlapped
      if self.exe.attention_type == 'multihead':
        self._llm_block.append(LinearOverlapped(
          "AttnBlock_QKV_AG",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_heads * self.app.attn_size *3,          # Q, K, V
          self.exe.tensor_par_comm_type,
          self.exe.tensor_par,
          self.exe.tensor_par_net,
          self.exe.tensor_par,
          conjugate=False,
          tp_overlap=self.exe.tensor_par_overlap,
          needs_recompute=recompute_flag,
          needs_recomm=recompute_ag_flag))
      elif self.exe.attention_type == 'multiquery':
        self._llm_block.append(LinearOverlapped(
          "AttnBlock_Query_AG",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_heads * self.app.attn_size,
          self.exe.tensor_par_comm_type,
          self.exe.tensor_par,
          self.exe.tensor_par_net,
          self.exe.tensor_par,
          conjugate=False,
          tp_overlap=self.exe.tensor_par_overlap,
          needs_recompute=recompute_flag,
          needs_recomm=recompute_ag_flag))
        self._llm_block.append(Fork(
          "AttnBlock_KV_Fork",
          self.sys,
          self._activation_size,
          2,
          needs_recompute=recompute_ag_flag,
          # With seq_par, we use activations from Comm layers to reflect that
          # they're split, otherwise we keep full size activations
          activation_stored=(not recompute_ag_flag)))
        self._llm_block.append(Linear(
          "AttnBlock_Key",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_size,
          needs_recompute=recompute_flag,
          # Activation is stored in Fork instead,
          activation_stored=False,
          activation_reused=True))
        self._llm_block.append(Linear(
          "AttnBlock_Value",
          self.sys,
          self._batch_seq,
          self.app.hidden,
          self.app.attn_size,
          needs_recompute=recompute_flag,
          # Activation is stored in Fork instead,
          activation_stored=False,
          activation_reused=True))
      else:
        raise self.Error('Wrong attention type', self.exe.attention_type)
    # Core attention: QK^T, SoftMax, DropOut and attention over V. All four
    # layers follow recompute_attn_flag so that "attn_only" covers the whole
    # core attention.
    self._llm_block.append(BatchMatMul(
      "AttnBlock_Multihead_Key_Query",
      self.sys,
      self.exe.microbatch_size * self.app.attn_heads // self.exe.tensor_par,
      self.app.seq_size,
      self.app.attn_size,
      self.app.seq_size,
      needs_recompute=recompute_attn_flag,
      output_stored=(not recompute_attn_flag)))
    self._llm_block.append(SoftMax(
      "AttnBlock_Multihead_SoftMax",
      self.sys,
      self.app.attn_heads // self.exe.tensor_par * \
        self.app.seq_size**2 * self.exe.microbatch_size,
      needs_recompute=recompute_attn_flag,
      output_stored=(not recompute_attn_flag)))
    self._llm_block.append(DropOut(
      "AttnBlock_Multihead_DropOut",
      self.sys,
      self.app.attn_heads // self.exe.tensor_par * \
        self.app.seq_size**2 * self.exe.microbatch_size,
      needs_recompute=recompute_attn_flag,
      activation_stored=(not recompute_attn_flag)))
    self._llm_block.append(BatchMatMul(
      "AttnBlock_Multihead_Attn",
      self.sys,
      self.exe.microbatch_size * self.app.attn_heads // self.exe.tensor_par,
      self.app.seq_size,
      self.app.seq_size,
      self.app.attn_heads * self.app.attn_size // self.app.attn_heads,
      # Attention over V is part of the core attention, so it is recomputed
      # together with QK^T/SoftMax/DropOut (also under "attn_only")
      needs_recompute=recompute_attn_flag))
    if self.exe.tensor_par_overlap == 'none':
      self._llm_block.append(Linear(
        "AttnBlock_MLP",
        self.sys,
        self._batch_seq,
        self.app.attn_heads * self.app.attn_size // self.exe.tensor_par,
        self.app.hidden,
        needs_recompute=recompute_flag))
      self._llm_block.append(TPComm(
        "AttnBlock_G",
        self.sys,
        self._activation_size,
        self.exe.tensor_par_net,
        self.exe.tensor_par,
        # We only compute flops/mem analyzing this layers, comm analyzed later
        # This is conservative estimate that does not consider p2p_rs_ag
        # because we don't differentiate between edge and middle blocks here
        tensor_par_comm_type=self.exe.tensor_par_comm_type,
        conjugate=True,
        in_network_reduction=self.exe.in_network_reduction,
        needs_recomm=recompute_flag,
        # We don't store input to RS/AR
        activation_stored=False))
    else:
      self._llm_block.append(LinearOverlapped(
        "AttnBlock_MLP_RS",
        self.sys,
        self._batch_seq,
        self.app.attn_heads * self.app.attn_size,
        self.app.hidden,
        self.exe.tensor_par_comm_type,
        self.exe.tensor_par,
        self.exe.tensor_par_net,
        self.exe.tensor_par,
        conjugate=True,
        tp_overlap=self.exe.tensor_par_overlap,
        needs_recompute=recompute_flag,
        needs_recomm=recompute_flag))
    self._llm_block.append(DropOut(
      "AttnBlock_DropOut",
      self.sys,
      pick(self.exe._sequence_par, self._seq_par_activation_size,
           self._activation_size),
      needs_recompute=recompute_flag))
    self._llm_block.append(ElementWise(
      "AttnBlock_Residual",
      self.sys,
      pick(self.exe._sequence_par, self._seq_par_activation_size,
           self._activation_size),
      pick(self.exe._sequence_par, self._seq_par_activation_size,
           self._activation_size),
      needs_recompute=recompute_flag,
      # Activation is stored in Fork instead
      activation_stored=False,
      activation_reused=True))

  def _build_mlp_block(self):
    """Appends the layers of the MLP sub-block to self._llm_block.

    Layers are appended in execution order: Fork, LayerNorm, the first linear
    layer (with its TP communication either as a separate TPComm layer or
    fused into LinearOverlapped), GeLU, the second linear layer (same two
    variants), DropOut and the residual ElementWise add.
    """
    exe = self.exe
    app = self.app
    # True only when the whole block is recomputed
    full_recompute = exe.activation_recompute == "full"
    redo_allgather = full_recompute or exe.seq_par_ag_redo
    # Without overlap, TP communication is modeled as its own TPComm layer
    separate_tp_comm = exe.tensor_par_overlap == 'none'
    # Activation size used by the layers at the edges of the sub-block
    edge_act_size = pick(exe._sequence_par, self._seq_par_activation_size,
                         self._activation_size)
    add_layer = self._llm_block.append

    add_layer(Fork(
      "MlpBlock_Fork",
      self.sys,
      edge_act_size,
      2,
      needs_recompute=full_recompute,
      # We account this activation when consider Residual and LayerNorm
      activation_stored=True))
    add_layer(LayerNorm(
      "MlpBlock_LayerNorm",
      self.sys,
      edge_act_size,
      app.hidden,
      needs_recompute=full_recompute,
      # Activation is stored in Fork instead
      activation_stored=False,
      activation_reused=True))

    # First linear layer: hidden -> feedforward
    if separate_tp_comm:
      # We only do compute/mem analyzing this layers, comm analyzed later
      # We keep extra mem buffer for comm, consider full tensor mem access
      # to be consistent with how much data comm moves/touches
      # This is conservative estimate that does not consider p2p_rs_ag
      # because we don't differentiate between edge and middle blocks here
      add_layer(TPComm(
        "MlpBlock_F",
        self.sys,
        self._activation_size,
        exe.tensor_par_net,
        exe.tensor_par,
        tensor_par_comm_type=exe.tensor_par_comm_type,
        conjugate=False,
        in_network_reduction=exe.in_network_reduction,
        needs_recomm=redo_allgather))
      add_layer(Linear(
        "MlpBlock_Mlp1",
        self.sys,
        self._batch_seq,
        app.hidden,
        app.feedforward // exe.tensor_par,
        needs_recompute=full_recompute,
        # With seq_par, we use activations from Comm layers to reflect that
        # they're split, otherwise we keep full size activations
        activation_stored=(not redo_allgather)))
    else:
      add_layer(LinearOverlapped(
        "MlpBlock_Mlp1_AG",
        self.sys,
        self._batch_seq,
        app.hidden,
        app.feedforward,
        exe.tensor_par_comm_type,
        exe.tensor_par,
        exe.tensor_par_net,
        exe.tensor_par,
        conjugate=False,
        tp_overlap=exe.tensor_par_overlap,
        needs_recompute=full_recompute,
        needs_recomm=redo_allgather))

    add_layer(GeLU(
      "MlpBlock_GeLU",
      self.sys,
      app.feedforward * self._batch_seq // exe.tensor_par,
      needs_recompute=full_recompute,
      fused=exe.fused_activation))

    # Second linear layer: feedforward -> hidden
    if separate_tp_comm:
      add_layer(Linear(
        "MlpBlock_Mlp2",
        self.sys,
        self._batch_seq,
        app.feedforward // exe.tensor_par,
        app.hidden,
        needs_recompute=full_recompute))
      # We only compute flops/mem analyzing this layers, comm analyzed later
      # This is conservative estimate that does not consider p2p_rs_ag
      # because we don't differentiate between edge and middle blocks here
      add_layer(TPComm(
        "MlpBlock_G",
        self.sys,
        self._activation_size,
        exe.tensor_par_net,
        exe.tensor_par,
        tensor_par_comm_type=exe.tensor_par_comm_type,
        conjugate=True,
        in_network_reduction=exe.in_network_reduction,
        needs_recomm=full_recompute,
        # We don't store input to RS/AR
        activation_stored=False))
    else:
      add_layer(LinearOverlapped(
        "MlpBlock_Mlp2_RS",
        self.sys,
        self._batch_seq,
        app.feedforward,
        app.hidden,
        exe.tensor_par_comm_type,
        exe.tensor_par,
        exe.tensor_par_net,
        exe.tensor_par,
        conjugate=True,
        tp_overlap=exe.tensor_par_overlap,
        needs_recompute=full_recompute,
        needs_recomm=full_recompute))

    add_layer(DropOut(
      "MlpBlock_DropOut",
      self.sys,
      edge_act_size,
      needs_recompute=full_recompute))
    add_layer(ElementWise(
      "MlpBlock_Residual",
      self.sys,
      edge_act_size,
      edge_act_size,
      needs_recompute=full_recompute,
      # Activation is stored in Fork instead
      activation_stored=False,
      activation_reused=True))

  def compile(self, sys, exe):
    """Binds the model to a system and an execution strategy and builds the
    per-block layer list.

    Computes the block/chunk partitioning across pipeline stages, the basic
    activation sizes, then builds the attention and MLP layers and configures
    each of them (element size, optional optimizer sharding).

    Args:
      sys: the System the model runs on.
      exe: the self.Execution describing the execution strategy.

    Raises:
      self.Error: if pipeline interleaving is larger than, or not a factor
        of, the number of blocks per processor, or if offloading is requested
        with fewer than 3 blocks per processor. _check_network_assignments()
        may raise it as well.
      AssertionError: if already compiled, if the argument types are wrong,
        or if an internal consistency check fails.
    """
    assert not self._compiled
    assert isinstance(exe, self.Execution)
    self.exe = exe
    assert isinstance(sys, System)
    self.sys = sys
    self._check_network_assignments()

    self.sys.set_datatype(self.exe.datatype)

    # If we have number of blocks not divisible by PP, we can allocate the
    # remainder of the blocks on the first num_block % PP Procs and block
    # "bubbles" on the last PP - (num_block % PP) Procs. To reflect that,
    # we round up blocks_per_proc. We report time for Proc0. In that case
    # its bubble time is `PP - (num_block % PP)` blocks shorter
    self._blocks_per_proc = self.app.num_blocks // self.exe.pipeline_par
    if self.app.num_blocks % self.exe.pipeline_par != 0:
      self._blocks_per_proc += 1
      self._bubble_reduction_blocks = self.exe.pipeline_par - (
        self.app.num_blocks % self.exe.pipeline_par)
    else:
      self._bubble_reduction_blocks = 0
    if self.exe.pipeline_interleaving > self._blocks_per_proc:
      raise self.Error('Pipeline interleaving must be less than or equal to '
                       'the number of blocks per processor')
    if self._blocks_per_proc % self.exe.pipeline_interleaving != 0:
      raise self.Error('Pipeline interleaving must be a factor value of the '
                       'number of blocks per processor')
    self._bytes_per_element = System.TypeSizes[self.exe.datatype]

    # Checks that enough blocks per processor exist if offloading is being
    # performed
    if (self.exe.weight_offload or self.exe.activations_offload or
        self.exe.optimizer_offload) and (self._blocks_per_proc <= 2):
      raise self.Error('Offloading requires each processor to handle at least'
                       ' 3 blocks')

    # A chunk is a set of blocks for microbatch before passing to the next
    # processor in the pipeline. Each chunk is modeled as a base
    # block that is repeated N-1 times and followed by 1 edge block.
    # Recommunication time is the same in both base and edge blocks.
    self._blocks_per_chunk = \
      self._blocks_per_proc // self.exe.pipeline_interleaving
    # The message must be an f-string, otherwise the placeholder is printed
    # literally instead of the block count
    assert self._blocks_per_proc % self._blocks_per_chunk == 0, \
      f"PP interleaving should evenly divide {self._blocks_per_proc} blocks"
    self._chunks_per_proc = self._blocks_per_proc // self._blocks_per_chunk
    assert self._chunks_per_proc == self.exe.pipeline_interleaving, \
      "Number of chunks should be equal to pipeline_interleaving"
    self._baseblocks_per_chunk = self._blocks_per_chunk - 1
    self._edgeblocks_per_chunk = 1

    # Build model during the compilation step
    self._batch_seq = self.exe.microbatch_size * self.app.seq_size
    self._activation_size = self._batch_seq * self.app.hidden
    self._batch_seq_par = self._batch_seq // self.exe.tensor_par
    if self.exe._sequence_par or self.exe._pipeline_par_rs_ag:
      assert self._batch_seq % self.exe.tensor_par == 0, (
        f"We should split batch_seq={self._batch_seq} between"
        f" {self.exe.tensor_par} TP partitions evenly")
    self._seq_par_activation_size = self._batch_seq_par * self.app.hidden
    self._build_attn_block()
    self._build_mlp_block()
    for layer in self._llm_block:
      layer.set_bytes_per_element(self._bytes_per_element)
      if self.exe.optimizer_sharding:
        layer.shard_optimizer(self.exe.data_par)
    self._compiled = True

  def _check_network_assignments(self):
    used = [False] * self.sys.num_networks
    size = [1] * self.sys.num_networks

    assert self.exe.tensor_par_net < self.sys.num_networks
    assert self.exe.pipeline_par_net < self.sys.num_networks
    assert self.exe.data_par_net < self.sys.num_networks

    if self.exe.tensor_par > 1:
      used[self.exe.tensor_par_net] = True
      size[self.exe.tensor_par_net] *= self.exe.tensor_par
    self._tp_net = self.sys.get_network(self.exe.tensor_par_net)

    if self.exe.pipeline_par > 1:
      used[self.exe.pipeline_par_net] = True
      size[self.exe.pipeline_par_net] *= self.exe.pipeline_par
    self._pp_net = self.sys.get_network(self.exe.pipeline_par_net)

    if self.exe.data_par > 1:
      used[self.exe.data_par_net] = True
      size[self.exe.data_par_net] *= self.exe.data_par
    self._dp_net = self.sys.get_network(self.exe.data_par_net)

    for tier_used, tier_size, tier in zip(
        used, size, range(self.sys.num_networks)):
      if tier_used:
        if tier_size > self.sys.get_network(tier).size:
          raise self.Error(f'Network tier{tier} isn\'t big enough')
        if (self.sys.get_network(tier).must_be_filled and
            self.sys.get_network(tier).size % tier_size != 0):
          raise self.Error(f'Network tier{tier} isn\'t fully used')

  def _compute_block_stats(self):
    """
    This function computes the statistics for one microbatch on a single block.
    This only computes flops, flop time, and communication sizes. Since
    tensor and pipeline parallelism cause different communication operations to
    occur at the full batch level, the communication times are computed later.
    """
    if self.exe.training and self.exe.activation_recompute == "full":
      self._block_act_checkpoint_size = \
        self._activation_size * self._bytes_per_element
    else:
      self._block_act_checkpoint_size = 0

    # Initializes values to zero for accumulation in layer loop
    self._block_fw_flops = 0
    self._block_fw_flops_time = 0
    self._block_fw_mem_accessed = 0
    self._block_fw_mem_time = 0
    self._block_fw_time = 0
    self._baseblock_fw_tp_size = 0
    self._edgeblock_fw_tp_size = 0
    self._baseblock_fw_tp_time = 0
    self._edgeblock_fw_tp_time = 0
    self._baseblock_fw_tp_time_exposed = 0
    self._edgeblock_fw_tp_time_exposed = 0
    self._block_weight_space = 0
    self._block_act_working_space = 0
    self._block_act_storage_space = 0
    # We use this block for self.exe.training, but initialize anyway
    self._block_re_flops = 0
    self._block_re_flops_time = 0
    self._block_re_mem_accessed = 0
    self._block_re_mem_time = 0
    self._block_re_time = 0
    self._baseblock_recomm_size = 0
    self._edgeblock_recomm_size = 0
    self._baseblock_recomm_time = 0
    self._edgeblock_recomm_time = 0
    self._baseblock_recomm_time_exposed = 0
    self._edgeblock_recomm_time_exposed = 0
    self._block_agrad_flops = 0
    self._block_agrad_flops_time = 0
    self._block_agrad_mem_accessed = 0
    self._block_agrad_mem_time = 0
    self._block_agrad_time = 0
    self._baseblock_agrad_tp_size = 0
    self._edgeblock_agrad_tp_size = 0
    self._baseblock_agrad_tp_time = 0
    self._edgeblock_agrad_tp_time = 0
    self._baseblock_agrad_tp_time_exposed = 0
    self._edgeblock_agrad_tp_time_exposed = 0
    self._block_wgrad_flops = 0
    self._block_wgrad_flops_time = 0
    self._block_wgrad_mem_accessed = 0
    self._block_wgrad_mem_time = 0
    self._block_wgrad_time = 0
    self._block_optim_flops = 0
    self._block_optim_flops_time = 0
    self._block_optim_mem_accessed = 0
    self._block_optim_mem_time = 0
    self._block_optim_time = 0
    self._block_weight_grad_space = 0
    self._block_weight_grad_space_no_sharding = 0
    self._block_act_grad_space = 0
    self._block_optimizer_space = 0
    self._tp_bw_overlap_req = 0

    prev_layer_recompute = False
    for layer in self._llm_block:
      # Add flops/bytes/times per layer
      self._block_fw_flops += layer.get_fw_flops()
      self._block_fw_flops_time += layer.compute_flops_time("fw")
      self._block_fw_mem_accessed += layer.get_fw_mem_accessed()
      self._block_fw_mem_time += layer.compute_mem_time("fw")
      self._block_fw_time += layer.compute_processing_time("fw")
      self._baseblock_fw_tp_size += layer.get_comm_bytes("fw",
        baseblock=True)
      self._edgeblock_fw_tp_size += layer.get_comm_bytes("fw",
        baseblock=False)
      self._baseblock_fw_tp_time += layer.compute_net_time("fw",
        baseblock=True)
      self._edgeblock_fw_tp_time += layer.compute_net_time("fw",
        baseblock=False)
      self._baseblock_fw_tp_time_exposed += layer.get_exposed_net_time("fw",
        baseblock=True)
      self._edgeblock_fw_tp_time_exposed += layer.get_exposed_net_time("fw",
        baseblock=False)
      self._tp_bw_overlap_req = max(self._tp_bw_overlap_req,
        layer.get_required_bandwidth("fw", baseblock=True))
      self._tp_bw_overlap_req = max(self._tp_bw_overlap_req,
        layer.get_required_bandwidth("fw", baseblock=False))
      if self.exe.training:
        if layer.get_recompute_flag():
          self._block_re_flops += self._block_fw_flops
          self._block_re_flops_time += self._block_fw_flops_time
          self._block_re_mem_accessed += self._block_fw_mem_accessed
          self._block_re_mem_time += self._block_fw_mem_time
          self._block_re_time += layer.compute_processing_time("fw")
        if layer.get_recomm_flag():
          self._baseblock_recomm_size += layer.get_comm_bytes("wgrad",
            baseblock=True)
          self._edgeblock_recomm_size += layer.get_comm_bytes("wgrad",
            baseblock=False)
          self._baseblock_recomm_time += layer.compute_net_time("wgrad",
            baseblock=True)
          self._edgeblock_recomm_time += layer.compute_net_time("wgrad",
            baseblock=False)
          self._baseblock_recomm_time_exposed += layer.get_exposed_net_time(
            "wgrad", baseblock=True)
          self._edgeblock_recomm_time_exposed += layer.get_exposed_net_time(
            "wgrad", baseblock=False)
        self._block_agrad_flops += layer.get_agrad_flops()
        self._block_agrad_flops_time += layer.compute_flops_time("agrad")
        self._block_agrad_mem_accessed += layer.get_agrad_mem_accessed()
        self._block_agrad_mem_time += layer.compute_mem_time("agrad")
        self._block_agrad_time += layer.compute_processing_time("agrad")
        self._baseblock_agrad_tp_size += layer.get_comm_bytes("agrad",
          baseblock=True)
        self._edgeblock_agrad_tp_size += layer.get_comm_bytes("agrad",
          baseblock=False)
        self._baseblock_agrad_tp_time += layer.compute_net_time("agrad",
          baseblock=True)
        self._edgeblock_agrad_tp_time += layer.compute_net_time("agrad",
          baseblock=False)
        self._baseblock_agrad_tp_time_exposed += layer.get_exposed_net_time(
          "agrad", baseblock=True)
        self._edgeblock_agrad_tp_time_exposed += layer.get_exposed_net_time(
          "agrad", baseblock=False)
        self._tp_bw_overlap_req = max(self._tp_bw_overlap_req,
          layer.get_required_bandwidth("agrad", baseblock=True))
        self._tp_bw_overlap_req = max(self._tp_bw_overlap_req,
          layer.get_required_bandwidth("agrad", baseblock=False))
        self._block_wgrad_flops += layer.get_wgrad_flops()
        self._block_wgrad_flops_time += layer.compute_flops_time("wgrad")
        self._block_wgrad_mem_accessed += layer.get_wgrad_mem_accessed()
        self._block_wgrad_mem_time += layer.compute_mem_time("wgrad")
        self._block_wgrad_time += layer.compute_processing_time("wgrad")
        self._block_optim_flops += layer.get_optim_step_flops()
        self._block_optim_flops_time += layer.compute_flops_time("optim")
        self._block_optim_mem_accessed += layer.get_optim_step_mem_accessed()
        self._block_optim_mem_time += layer.compute_mem_time("optim")
        self._block_optim_time += layer.compute_processing_time("optim")

      # Accumulate space requirements per block
      self._block_weight_space += layer.get_weight()
      if not layer.reuses_activation():
        self._block_act_working_space += layer.get_activation()
      self._block_act_storage_space += layer.get_activation()
      if self.exe.training:
        if not layer.stores_output():
          self._block_act_storage_space -= layer.get_output()
        if not layer.stores_activation():
          self._block_act_storage_space -= layer.get_activation()
        self._block_weight_grad_space += layer.get_weight_grad()
        self._block_weight_grad_space_no_sharding += layer.get_weight_grad(
          sharded=False)
        self._block_act_grad_space += layer.get_activation_grad()
        self._block_optimizer_space += layer.get_optimizer()

      self.log.debug("%s %s %s", layer.name, 'Recompute flag:',
                     str(layer.get_recompute_flag()))
      self.log.debug("%s %s %s", layer.name, 'Recomm flag:',
                     str(layer.get_recomm_flag()))
      self.log.debug("%s %s %s", layer.name, 'Stores activation:',
                     str(layer.stores_activation()))
      self.log.debug("%s %s %s", layer.name, 'Reuses activation:',
                     str(layer.reuses_activation()))
      self.log.debug("%s %s %s", layer.name, 'Stores output:',
                     str(layer.stores_output()))
      self.log.debug("%s %s %s", layer.name, 'FW flops:',
                     human_format(layer.get_fw_flops(), 'flops'))
      self.log.debug("%s %s %s", layer.name, 'FW num inputs:',
                     human_format(layer.inputs_size, 'base2'))
      self.log.debug("%s %s %s", layer.name, 'FW num output:',
                     human_format(layer.output_size, 'base2'))
      self.log.debug("%s %s %s", layer.name, 'FW num weights:',
                     human_format(layer.weight_space, 'base2'))
      self.log.debug("%s %s %s", layer.name, 'FW mem:',
                     human_format(layer.get_fw_mem_accessed(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'FW baseblock comm tile size:',
                     human_format(layer.get_comm_tile("fw", baseblock=True),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'FW edgeblock comm tile size:',
                     human_format(layer.get_comm_tile("fw", baseblock=False),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'FW baseblock comm size:',
                     human_format(layer.get_comm_bytes("fw", baseblock=True),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'FW edgeblock comm size:',
                     human_format(layer.get_comm_bytes("fw", baseblock=False),
                     'bytes'))
      self.log.debug("%s %s %.3e", layer.name, 'FW net link time:',
                     layer.compute_net_time("fw"))
      self.log.debug("%s %s %.3e", layer.name, 'FW net exposed time:',
                     layer.get_exposed_net_time("fw"))
      self.log.debug("%s %s %.3e", layer.name, 'FW time:',
                     layer.compute_processing_time("fw"))
      self.log.debug("%s %s %s", layer.name, 'BW flops:',
                     human_format(
                      layer.get_agrad_flops() + layer.get_wgrad_flops(),
                      'flops'))
      self.log.debug("%s %s %s", layer.name, 'BW num Wgrads:',
                     human_format(layer.weight_grads, 'base2'))
      self.log.debug("%s %s %s", layer.name, 'BW num Agrads:',
                     human_format(layer.activation_grads, 'base2'))
      self.log.debug("%s %s %s", layer.name, 'BW num Igrads:',
                     human_format(layer.inputs_size, 'base2'))
      self.log.debug("%s %s %s", layer.name, 'BW mem:',
                     human_format(
                      layer.get_agrad_mem_accessed() +
                      layer.get_wgrad_mem_accessed(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'BW baseblock comm tile size:',
                     human_format(layer.get_comm_tile("agrad", baseblock=True),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'BW edgeblock comm tile size:',
                     human_format(layer.get_comm_tile("agrad", baseblock=False),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'BW baseblock comm size:',
                     human_format(layer.get_comm_bytes("agrad", baseblock=True),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'BW edgeblock comm size:',
                     human_format(layer.get_comm_bytes("agrad", baseblock=False),
                     'bytes'))
      self.log.debug("%s %s %.3e", layer.name, 'BW net link time:',
                     layer.compute_net_time("agrad"))
      self.log.debug("%s %s %.3e", layer.name, 'BW net exposed time:',
                     layer.get_exposed_net_time("agrad"))
      self.log.debug("%s %s %.3e", layer.name, 'BW time:',
                     layer.compute_processing_time("agrad") +
                     layer.compute_processing_time("wgrad"))
      self.log.debug("%s %s %s", layer.name, 'Recomm baseblock comm tile size:',
                     human_format(layer.get_comm_tile("wgrad", baseblock=True),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Recomm edgeblock comm tile size:',
                     human_format(layer.get_comm_tile("wgrad", baseblock=False),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Recomm baseblock comm size:',
                     human_format(layer.get_comm_bytes("wgrad", baseblock=True),
                     'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Recomm edgeblock comm size:',
                     human_format(layer.get_comm_bytes("wgrad", baseblock=False),
                     'bytes'))
      self.log.debug("%s %s %.3e", layer.name, 'Recomm net link time:',
                     layer.compute_net_time("wgrad"))
      self.log.debug("%s %s %.3e", layer.name, 'Recomm net exposed time:',
                     layer.get_exposed_net_time("wgrad"))
      self.log.debug("%s %s %s", layer.name, 'Optim flops:',
                     human_format(layer.get_optim_step_flops(), 'flops'))
      self.log.debug("%s %s %s", layer.name, 'BW Optimizer size:',
                     human_format(layer.get_optimizer(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Optim mem:',
                     human_format(layer.get_optim_step_mem_accessed(), 'bytes'))
      self.log.debug("%s %s %.3e", layer.name, 'Optim time:',
                     layer.compute_processing_time("optim"))
      self.log.debug("%s %s %.3e", layer.name, 'Recompute:',
                     layer.get_recompute_flag())
      self.log.debug("%s %s %s", layer.name, 'Recompute mem saving:',
                     human_format(layer.stores_output() * \
                       layer.get_output(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Weight:',
                     human_format(layer.get_weight(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Act:',
                     human_format(layer.get_activation(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Weight grad:',
                     human_format(layer.get_weight_grad(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Act grad:',
                     human_format(layer.get_activation_grad(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Optim:',
                     human_format(layer.get_optimizer(), 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Incremental Weight:',
                     human_format(self._block_weight_space, 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Incremental Act Working space:',
                     human_format(self._block_act_working_space, 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Incremental Act Storage space:',
                     human_format(self._block_act_storage_space, 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Incremental Weight grad:',
                     human_format(self._block_weight_grad_space, 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Incremental Act grad:',
                     human_format(self._block_act_grad_space, 'bytes'))
      self.log.debug("%s %s %s", layer.name, 'Incremental Optim:',
                     human_format(self._block_optimizer_space, 'bytes'))
      prev_layer_recompute = layer.get_recompute_flag()
    if self.exe.activation_recompute == 'full':
      self._block_act_storage_space = 0

    # Sets the PP communication operation size
    if self.exe.pipeline_par > 1:
      if self.exe._pipeline_par_rs_ag:
        self._block_fw_pp_size = self._seq_par_activation_size * \
          self._bytes_per_element
      else:
        self._block_fw_pp_size = self._activation_size * \
          self._bytes_per_element
    else:
      self._block_fw_pp_size = 0

    # When training, BW sizes for TP and PP are same as FW
    if self.exe.training:
      self._block_bw_pp_size = self._block_fw_pp_size
    else:
      self._block_bw_pp_size = 0

    self.log.debug("%s %s", 'TP comm FW baseblock size:',
                   human_format(self._baseblock_fw_tp_size, 'bytes'))
    self.log.debug("%s %s", 'TP comm FW edgeblock size:',
                   human_format(self._edgeblock_fw_tp_size, 'bytes'))
    self.log.debug("%s %s", 'PP comm FW size:',
                   human_format(self._block_fw_pp_size, 'bytes'))
    self.log.debug("%s %s", 'TP comm BW baseblock size:',
                   human_format(self._baseblock_agrad_tp_size, 'bytes'))
    self.log.debug("%s %s", 'TP comm BW edgeblock size:',
                   human_format(self._edgeblock_agrad_tp_size, 'bytes'))
    self.log.debug("%s %s", 'PP comm BW size:',
                   human_format(self._block_bw_pp_size, 'bytes'))
    self.log.debug("%s %s", 'TP recomm baseblock size:',
                   human_format(self._baseblock_recomm_size, 'bytes'))
    self.log.debug("%s %s", 'TP recomm edgeblock size:',
                   human_format(self._edgeblock_recomm_size, 'bytes'))
    self.log.debug("%s %s", 'TP comm required bandwidth for tiled overlap:',
                   human_format(self._tp_bw_overlap_req, 'bandwidth'))

  def _compute_batch_stats(self):
    """
    This function computes the statistics for a full batch. This uses the per
    microbatch per block statistics from the prior function (see above).
    """
    # Total stats for compute and memory
    mult = self._blocks_per_proc * self.exe._num_microbatches
    self._fw_flops = mult * self._block_fw_flops
    self._fw_flops_time = mult * self._block_fw_flops_time
    self._fw_mem_accessed = mult * self._block_fw_mem_accessed
    self._fw_mem_time = mult * self._block_fw_mem_time
    self._fw_time = mult * self._block_fw_time
    self._re_flops = mult * self._block_re_flops
    self._re_flops_time = mult * self._block_re_flops_time
    self._re_mem_accessed = mult * self._block_re_mem_accessed
    self._re_mem_time = mult * self._block_re_mem_time
    self._re_time = mult * self._block_re_time
    self._agrad_flops = mult * self._block_agrad_flops
    self._agrad_flops_time = mult * self._block_agrad_flops_time
    self._agrad_mem_accessed = mult * self._block_agrad_mem_accessed
    self._agrad_mem_time = mult * self._block_agrad_mem_time
    self._agrad_time = mult * self._block_agrad_time
    self._wgrad_flops = mult * self._block_wgrad_flops
    self._wgrad_flops_time = mult * self._block_wgrad_flops_time
    self._wgrad_mem_accessed = mult * self._block_wgrad_mem_accessed
    self._wgrad_mem_time = mult * self._block_wgrad_mem_time
    self._wgrad_time = mult * self._block_wgrad_time
    self._optim_flops = self._blocks_per_proc * self._block_optim_flops
    self._optim_flops_time = self._blocks_per_proc * self._block_optim_flops_time
    self._optim_mem_accessed = self._blocks_per_proc * self._block_optim_mem_accessed
    self._optim_mem_time = self._blocks_per_proc * self._block_optim_mem_time
    self._optim_time = self._blocks_per_proc * self._block_optim_time

    # These TP numbers are for total times for all blocks in all chunks
    tp_fw_comm_time = self.exe._num_microbatches * self._chunks_per_proc * (
      (self._baseblocks_per_chunk * self._baseblock_fw_tp_time) +
      (self._edgeblocks_per_chunk * self._edgeblock_fw_tp_time))
    tp_fw_comm_time_exposed = \
      self.exe._num_microbatches * self._chunks_per_proc * (
        (self._baseblocks_per_chunk * self._baseblock_fw_tp_time_exposed) +
        (self._edgeblocks_per_chunk * self._edgeblock_fw_tp_time_exposed))
    tp_bw_comm_time = self.exe._num_microbatches * self._chunks_per_proc * (
      self._baseblocks_per_chunk * self._baseblock_agrad_tp_time +
      self._edgeblocks_per_chunk * self._edgeblock_agrad_tp_time)
    tp_bw_comm_time_exposed = \
      self.exe._num_microbatches * self._chunks_per_proc * (
        self._baseblocks_per_chunk * self._baseblock_agrad_tp_time_exposed +
        self._edgeblocks_per_chunk * self._edgeblock_agrad_tp_time_exposed)
    tp_recomm_time = self.exe._num_microbatches * self._chunks_per_proc * (
      (self._baseblocks_per_chunk * self._baseblock_recomm_time) +
      (self._edgeblocks_per_chunk * self._edgeblock_recomm_time))
    tp_recomm_time_exposed = \
      self.exe._num_microbatches * self._chunks_per_proc * (
        (self._baseblocks_per_chunk * self._baseblock_recomm_time_exposed) +
        (self._edgeblocks_per_chunk * self._edgeblock_recomm_time_exposed))

    # Per chunk PP comm time
    chunk_fw_pp_time = self._pp_net.time('p2p', self._block_fw_pp_size, 2)
    chunk_bw_pp_time = self._pp_net.time('p2p', self._block_bw_pp_size, 2)

    # Determines number of times PP causes pipeline p2p communications per
    # chunk during the forward and backward pass (equal to chunks per proc)
    if self.exe.pipeline_par > 1:
      num_fw_pp_p2ps = self._chunks_per_proc
      if self.exe.training:
        num_bw_pp_p2ps = self._chunks_per_proc
      else:
        num_bw_pp_p2ps = 0
    else:
      num_fw_pp_p2ps = 0
      num_bw_pp_p2ps = 0

    # These PP numbers are for total times for all blocks and all microbatches
    pp_fw_comm_time = self.exe._num_microbatches * num_fw_pp_p2ps * \
      chunk_fw_pp_time
    pp_bw_comm_time = self.exe._num_microbatches * num_bw_pp_p2ps * \
      chunk_bw_pp_time

    # Aggregates metrics
    self._tp_comm_time_link = tp_fw_comm_time + tp_bw_comm_time
    self._tp_comm_time_exposed = (tp_fw_comm_time_exposed +
      tp_bw_comm_time_exposed)
    self._recomm_time_link = tp_recomm_time
    self._recomm_time_exposed = tp_recomm_time_exposed
    self._pp_comm_time_link = pp_fw_comm_time + pp_bw_comm_time
    self._pp_comm_time_exposed = self._pp_comm_time_link

    self.log.debug("%s %s", 'TP comm baseblock FW time:',
      self._baseblock_fw_tp_time)
    self.log.debug("%s %s", 'TP comm edgeblock FW time:',
      self._edgeblock_fw_tp_time)
    self.log.debug("%s %s", 'TP comm FW time:', tp_fw_comm_time)
    self.log.debug("%s %s", 'TP comm baseblock FW exposed time:',
      self._baseblock_fw_tp_time_exposed)
    self.log.debug("%s %s", 'TP comm edgeblock FW exposed time:',
      self._edgeblock_fw_tp_time_exposed)
    self.log.debug("%s %s", 'TP comm FW exposed time:', tp_fw_comm_time_exposed)
    self.log.debug("%s %s", 'TP comm baseblock BW time:',
      self._baseblock_agrad_tp_time)
    self.log.debug("%s %s", 'TP comm edgeblock BW time:',
      self._edgeblock_agrad_tp_time)
    self.log.debug("%s %s", 'TP comm BW time:', tp_bw_comm_time)
    self.log.debug("%s %s", 'TP comm baseblock BW exposed time:',
      self._baseblock_agrad_tp_time_exposed)
    self.log.debug("%s %s", 'TP comm edgeblock BW exposed time:',
      self._edgeblock_agrad_tp_time_exposed)
    self.log.debug("%s %s", 'TP comm BW exposed time:',
      tp_bw_comm_time_exposed)
    self.log.debug("%s %s", 'PP comm chunk FW time:', chunk_fw_pp_time)
    self.log.debug("%s %s", 'PP comm chunk BW time:', chunk_bw_pp_time)
    self.log.debug("%s %s", 'PP comm FW time:', pp_fw_comm_time)
    self.log.debug("%s %s", 'PP comm BW time:', pp_bw_comm_time)

    # Bubble forms between i-th microbatch FW and BW passes on the 1st GPU.
    # With no interleaving between blocks, it includes
    # L/gpu x microbatch_time x (p-1) x Tcycle, where cycle includes both
    # FW and BW passes, TP and PP communication for FW and BW passes
    # With full interleaving, we only need microbatch_time x (p-1) x Tcycle time
    self._baseblock_fw_time_no_offload = (
      self._block_fw_time + self._baseblock_fw_tp_time_exposed)
    self._edgeblock_fw_time_no_offload = (
      self._block_fw_time + self._edgeblock_fw_tp_time_exposed +
      chunk_fw_pp_time)
    self._baseblock_fw_offload_overhead = max(
      0, self.get_fw_offload_time() + self._block_fw_mem_time -
      self._baseblock_fw_time_no_offload)
    self._edgeblock_fw_offload_overhead = max(
      0, self.get_fw_offload_time() + self._block_fw_mem_time -
      self._edgeblock_fw_time_no_offload)
    self._baseblock_fw_time = (
      self._baseblock_fw_time_no_offload + self._baseblock_fw_offload_overhead)
    self._edgeblock_fw_time = (
      self._edgeblock_fw_time_no_offload + self._edgeblock_fw_offload_overhead)
    # When we consider block BW time, we do not add optimizer step to it
    # because we have optimizer only for last microbatches, while offloading
    # works during the whole backward pass.
    # Optimizer step is overall a memory-bound streaming task, so it is
    # reasonable to not overlap offloading with the optimizer step
    self._baseblock_bw_time_no_offload = (
      self._block_re_time + self._baseblock_recomm_time_exposed +
      self._block_agrad_time + self._block_wgrad_time +
      self._baseblock_agrad_tp_time_exposed)
    self._edgeblock_bw_time_no_offload = (
      self._block_re_time + self._edgeblock_recomm_time_exposed +
      self._block_agrad_time + self._block_wgrad_time +
      self._edgeblock_agrad_tp_time_exposed + chunk_bw_pp_time)
    self._baseblock_bw_offload_overhead = max(
      0, self.get_bw_offload_time() + self._block_agrad_mem_time +
      self._block_wgrad_mem_time -
      self._baseblock_bw_time_no_offload)
    self._edgeblock_bw_offload_overhead = max(
      0, self.get_bw_offload_time() + self._block_agrad_mem_time +
      self._block_wgrad_mem_time -
      self._edgeblock_bw_time_no_offload)
    self._baseblock_bw_time = (
      self._baseblock_bw_time_no_offload + self._baseblock_bw_offload_overhead)
    self._edgeblock_bw_time = (
      self._edgeblock_bw_time_no_offload + self._edgeblock_bw_offload_overhead)
    chunk_fw_time = (
      (self._baseblocks_per_chunk * self._baseblock_fw_time) +
      (self._edgeblocks_per_chunk * self._edgeblock_fw_time))
    chunk_bw_time = (
      (self._baseblocks_per_chunk * self._baseblock_bw_time) +
      (self._edgeblocks_per_chunk * self._edgeblock_bw_time))
    # Can't overlap DP comm with mem accesses, but can overlap with offload
    baseblock_dp_overlap_time = self._baseblock_bw_time - (
      self._block_agrad_mem_time + self._block_wgrad_mem_time +
      self._block_re_mem_time)
    edgeblock_dp_overlap_time = self._edgeblock_bw_time - (
      self._block_agrad_mem_time + self._block_wgrad_mem_time +
      self._block_re_mem_time)
    block_dp_compute_time = (
      self._block_agrad_flops_time + self._block_wgrad_flops_time +
      self._block_re_flops_time)
    if not self.exe.optimizer_sharding:
      # If optimizer is not sharded, we can overlap optimizer step with
      # communication, except for memory access time
      baseblock_dp_overlap_time += (
        self._block_optim_time - self._block_optim_mem_time)
      edgeblock_dp_overlap_time += (
        self._block_optim_time - self._block_optim_mem_time)
      block_dp_compute_time += self._block_optim_flops_time
    if self._dp_net == self._tp_net:
      # Can't overlap DP with TP if in the same network
      baseblock_dp_overlap_time -= (
        self._baseblock_recomm_time + self._baseblock_agrad_tp_time)
      edgeblock_dp_overlap_time -= (
        self._edgeblock_recomm_time + self._edgeblock_agrad_tp_time)
    chunk_dp_overlap_time = (
      self._baseblocks_per_chunk * baseblock_dp_overlap_time +
      self._edgeblocks_per_chunk * edgeblock_dp_overlap_time)
    chunk_dp_compute_time = self._blocks_per_chunk * block_dp_compute_time
    chunk_time = chunk_fw_time + chunk_bw_time
    # Block bubbles appear due to uneven division of blocks by pipeline stages
    # and result in the schedule bubble being shortened by the missing edge
    # blocks on the later pipeline stages (missing block case)
    if self._baseblocks_per_chunk > 0:
      # We cut last block of chunk, which is half-edge (has PP comm in the end)
      bubble_reduction_time = self._bubble_reduction_blocks * (
        self._baseblock_fw_time + self._edgeblock_fw_time +
        self._baseblock_bw_time + self._edgeblock_bw_time) / 2
    else:
      # If chunk doesn't have base blocks, we cut edge block
      bubble_reduction_time = self._bubble_reduction_blocks * (
        self._edgeblock_fw_time + self._edgeblock_bw_time)
    # With PP interleaving we assume that we move through every chunk at least
    # PP mini batches. If num_microbatches < PP, then we have extra bubbles
    # (missing microbatches case). We have the bubbles in the last microbatches
    # of every overlappable chunk (all but last chunks). Size of bubbles is
    # equal to microbatch_shortage, same number of microbatches will be missing
    # in the last chunk
    chunks_in_bubble = self.exe.pipeline_par - 1
    num_overlappable_chunks = self.exe.pipeline_interleaving - 1
    microbatch_shortage = self.exe.pipeline_par - (
      self.exe._num_microbatches % self.exe.pipeline_par)
    if self.exe._num_microbatches % self.exe.pipeline_par != 0:
      extra_interleaving_bubbles = num_overlappable_chunks * \
        microbatch_shortage
    else:
      extra_interleaving_bubbles = 0
    self._bubble_time = chunks_in_bubble * chunk_time + (
      extra_interleaving_bubbles * chunk_time - bubble_reduction_time)

    self.log.debug("%s %s", 'Block FW time:', self._block_fw_time)
    self.log.debug("%s %s", 'Baseblock FW time:', self._baseblock_fw_time)
    self.log.debug("%s %s", 'With FW offload overhead time:',
      self._baseblock_fw_offload_overhead)
    self.log.debug("%s %s", 'Edgeblock FW time:', self._edgeblock_fw_time)
    self.log.debug("%s %s", 'With FW offload overhead time:',
      self._edgeblock_fw_offload_overhead)
    self.log.debug("%s %s", 'Baseblock REcomm exposed time:',
      self._baseblock_recomm_time_exposed)
    self.log.debug("%s %s", 'Edgeblock REcomm exposed time:',
      self._edgeblock_recomm_time_exposed)
    self.log.debug("%s %s", 'Block RE time:', self._block_re_time)
    self.log.debug("%s %s", 'Block BW Agrad time:', self._block_agrad_time)
    self.log.debug("%s %s", 'Block BW Wgrad time:', self._block_wgrad_time)
    self.log.debug("%s %s", 'Block optim time:', self._block_optim_time)
    self.log.debug("%s %s", 'Baseblock BW time:', self._baseblock_bw_time)
    self.log.debug("%s %s", 'With BW offload overhead time:',
      self._baseblock_bw_offload_overhead)
    self.log.debug("%s %s", 'Edgeblock BW time:', self._edgeblock_bw_time)
    self.log.debug("%s %s", 'With BW offload overhead time:',
      self._edgeblock_bw_offload_overhead)

    # Determines how long it takes to perform the DP per block
    # This assumes no DP communication overlap (will be adjusted later).
    if self.exe.data_par > 1 and self.exe.training:
      self._block_dp_size = self._block_weight_space
      if self.exe.optimizer_sharding:
        # When performing optimizer sharding, the communication time is a
        # reduce-scatter plus an all-gather.
        self._block_dp_time = (
          self._dp_net.time(
            'reduce_scatter', self._block_dp_size, self.exe.data_par) +
          self._dp_net.time(
            'all_gather', self._block_dp_size, self.exe.data_par))
      else:
        # When not performing optimizer sharding, the communication time is a
        # single all-reduce.
        self._block_dp_time = self._dp_net.time(
          'all_reduce', self._block_dp_size, self.exe.data_par)
    else:
      self._block_dp_size = 0
      self._block_dp_time = 0
    self.log.debug('DP block comm size: %s',
                   human_format(self._block_dp_size, 'bytes'))
    self.log.debug('DP block comm time (no overlap): %.3e',
                   self._block_dp_time)

    # DP overlap happens if DP time for a previous block(s) is lower than
    # microbatch BW pass time for next pack of consecutive blocks
    # If no interleaving, we move a single microbatch through each block
    # and need to overlap DP during a single block single microbatch time
    # In case of full interleaving, we propagate p microbatches through each
    # block and need to overlap DP comm with p-1 microbatches over a block
    # In a mixed case, we can overlap DP communication of several chunks, e.g.
    # non-interleaved blocks (L/gpu / interleaving_factor) over BW pass of
    # p-1 microbatches through the same amount of blocks if memory capacity is
    # enough, or perform offload/prefetch after each block-microbatch
    # For simplicity we count only bandwidth-optimal case
    # Note that uneven extra PP bubbles won't affect overlapping
    if self.exe.data_par > 1 and self.exe.training:
      if self.exe.data_par_overlap:
        # we can evenly overlap all the chunks except for the last one
        # in the last chunk we can overlap only all blocks except for the last
        num_overlappable_chunks = self.exe.pipeline_interleaving - 1
        last_chunk_overlap_size = self._blocks_per_chunk - 1
        # We can overlap DP with BW pass, overlapping AR for previous layer
        # with BW for current, except when optimizer sharded. We can't overlap
        # during optimizer step as we RS grads before step and AG weights after
        # Overlappable chunks have overlap size equal to
        # blocks_per_chunk * num_microbatches
        # In case of 1F1B schedule, num_microbatches == pipeline_par
        overlap_window = self.exe.pipeline_par * chunk_dp_overlap_time
        overlap_compute = self.exe.pipeline_par * chunk_dp_compute_time
        chunk_dp_time = self._blocks_per_chunk * self._block_dp_time
        # We may have PP and DP comm colliding if DP comm takes longer than
        # a single chunk BW time. We can't collide more PP than microbatches
        if self._dp_net == self._pp_net:
          if self.exe._num_microbatches % self.exe.pipeline_par != 0:
            num_overlapped_pp = min(
              chunk_dp_time // chunk_bw_time,
              self.exe._num_microbatches % self.exe.pipeline_par)
          else:
            num_overlapped_pp = min(
              chunk_dp_time // chunk_bw_time,
              self.exe.pipeline_par)
        else:
          # if PP and DP on different networks, overlapping is fine
          num_overlapped_pp = 0
        # we add DP/PP collision time and compute slowdown due to overlap
        overlap_inflection = chunk_dp_time - (overlap_window -
          num_overlapped_pp * chunk_bw_pp_time) + overlap_compute * \
          self._dp_net.processor_usage
        if overlap_inflection > 0:
          # Tcomm is larger than compute, excess is exposed
          overlappable_chunks_exposed_time = num_overlappable_chunks * \
            overlap_inflection
        else:
          # Tcomm is smaller than compute and hidden, but it contributes to
          # compute slowdown due to part of compute resources orchestrating comm
          overlappable_chunks_exposed_time = num_overlappable_chunks * \
            chunk_dp_time * self._dp_net.processor_usage
        # Compute minimal bandwidth required for DP comm overlap of all chunks
        # but the last one.
        chunk_overlap_time = overlap_window + overlap_compute * \
          self._dp_net.processor_usage
        if self._dp_net == self._pp_net:
          chunk_overlap_time -= chunk_bw_pp_time
        chunk_overlap_time *= num_overlappable_chunks
        if chunk_overlap_time > 0:
          self._dp_bw_overlap_req_chunk = self._blocks_per_chunk * \
            self._block_dp_size / chunk_overlap_time
          if self.exe.optimizer_sharding:
            self._dp_bw_overlap_req_chunk *= (
              self._dp_net._ops["reduce_scatter"].scalar +
              self._dp_net._ops["all_gather"].scalar)
          else:
            self._dp_bw_overlap_req_chunk *= self._dp_net._ops["all_reduce"].scalar
        else:
          self._dp_bw_overlap_req_chunk = 0
        # in the last chunk, we overlap DP comm over last edge block and all
        # middle blocks, so we subtract the time of the first edge block
        if self._baseblocks_per_chunk > 0:
          last_chunk_window = chunk_dp_overlap_time - chunk_bw_pp_time - (
            self._baseblock_bw_time + self._edgeblock_bw_time) / 2
          if not self.exe.optimizer_sharding:
            # If optimizer is not sharded, we can overlap optimizer step with
            # communication, except for memory access time
            last_chunk_window += (
              self._block_optim_time - self._block_optim_mem_time)
        else:
          # if there are no base blocks, we only have a single edge block
          # and last chunk is completely not overlappable
          last_chunk_window = 0
        last_chunk_inflection = (
          last_chunk_overlap_size * self._block_dp_time) + (
            block_dp_compute_time * self._dp_net.processor_usage -
            last_chunk_window)
        if last_chunk_inflection > 0:
          # Tcomm is larger than compute, excess is exposed
          last_chunk_exposed_time = last_chunk_inflection
        else:
          # Tcomm is smaller than compute and hidden, but it contributes to
          # compute slowdown due to part of compute resources orchestrating comm
          last_chunk_exposed_time = last_chunk_overlap_size * \
            self._block_dp_time * self._dp_net.processor_usage
        exposed_time = \
          overlappable_chunks_exposed_time + last_chunk_exposed_time
        # Compute minimal bandwidth required for DP comm overlap of last chunk
        tail_overlap_time = last_chunk_window + last_chunk_overlap_size * \
          self._block_dp_time * self._dp_net.processor_usage
        if tail_overlap_time > 0:
          self._dp_bw_overlap_req_tail = self._blocks_per_chunk * \
          self._block_dp_size / tail_overlap_time
          if self.exe.optimizer_sharding:
            self._dp_bw_overlap_req_tail *= (
              self._dp_net._ops["reduce_scatter"].scalar +
              self._dp_net._ops["all_gather"].scalar)
          else:
            self._dp_bw_overlap_req_tail *= self._dp_net._ops["all_reduce"].scalar
        else:
          self._dp_bw_overlap_req_tail = 0
        self._dp_comm_time_exposed = self._block_dp_time + exposed_time
        self._dp_comm_time_link = self._blocks_per_proc * self._block_dp_time
        self.log.debug('Blocks per chunk: %d', self._blocks_per_chunk)
        self.log.debug('Num overlappable chunks: %d', num_overlappable_chunks)
        self.log.debug('Last chunk size: %d', last_chunk_overlap_size)
        self.log.debug('Chunk exposed time: %.3e', max(0, \
          chunk_dp_time + num_overlapped_pp * chunk_bw_pp_time - \
          overlap_window))
        self.log.debug('Last chunk exposed time: %.3e', last_chunk_exposed_time)
      else:
        self._dp_comm_time_exposed = self._blocks_per_proc * self._block_dp_time
        self._dp_comm_time_link = self._dp_comm_time_exposed
        self._dp_bw_overlap_req_chunk = 0
        self._dp_bw_overlap_req_tail = 0
    else:
      self._dp_comm_time_exposed = 0
      self._dp_comm_time_link = 0
      self._dp_bw_overlap_req_chunk = 0
      self._dp_bw_overlap_req_tail = 0
    self.log.debug('Chunk FW time: %.3e', chunk_fw_time)
    self.log.debug('Chunk BW time: %.3e', chunk_bw_time)
    self.log.debug('Chunk BW time for DP overlap: %.3e', chunk_dp_overlap_time)
    self.log.debug('DP comm time exposed: %.3e', self._dp_comm_time_exposed)
    self.log.debug('DP comm time on the link: %.3e',
                   self._dp_comm_time_link)
    self.log.debug('DP comm required bandwidth for overlapped chunks: %s',
                   human_format(self._dp_bw_overlap_req_chunk, "bandwidth"))
    self.log.debug('DP comm required bandwidth for the last chunk: %s',
                   human_format(self._dp_bw_overlap_req_tail, "bandwidth"))

    # memory capacity stats
    self._weight_space = self._block_weight_space * self._blocks_per_proc
    # account for activation recomputation
    # for full recompute we keep single block's activations
    # (no scaling by L/gpu)
    if self.exe.training:
      # With 1F1B schedule we only keep `pipeline_par` microbatches
      # If num_microbatches < PP, we keep num_microbatches for all PP stages
      if self.exe._num_microbatches < self.exe.pipeline_par:
        mem_microbatches = self.exe._num_microbatches
      else:
        mem_microbatches = self.exe.pipeline_par
      if self.exe.activation_recompute == "full":
        assert self._block_act_storage_space == 0, \
          "We expect with full act recomputation we recompute ALL activations"
        self._act_space = self._block_act_working_space
        # We would need to store checkpoints for all microbatches before we
        # compute BW pass with regular schedule, but we ONLY use 1F1B schedule
        self._act_checkpoint_size = self._blocks_per_proc * \
          self._block_act_checkpoint_size
        # Keep activation checkpoints for all pipeline stages for PP
        if self.exe.pipeline_interleaving > 1:
          self._act_checkpoint_size *= mem_microbatches * (
            1 + (self.exe.pipeline_par - 1) / (self.exe.pipeline_interleaving *
                                               self.exe.pipeline_par))
        else:
          assert self.exe.pipeline_interleaving == 1
          self._act_checkpoint_size *= mem_microbatches
      else:
        # Without full recompute, we don't need checkpoints
        self._act_checkpoint_size = 0
        # Without full recompute, we keep activations for all blocks on the GPU,
        # one activation for working block, and activation for other blocks for
        # all pipeline stages w.r.t. interleaved 1F1B schedule
        if self.exe.pipeline_interleaving > 1:
          pp_microbatch_factor = mem_microbatches * (
            1 + (self.exe.pipeline_par - 1) / (self.exe.pipeline_interleaving *
                                               self.exe.pipeline_par))
        else:
          assert self.exe.pipeline_interleaving == 1
          pp_microbatch_factor = mem_microbatches
        self._act_space = self._block_act_working_space + \
          self._block_act_storage_space * (
            self._blocks_per_proc * pp_microbatch_factor - 1)
      # Only need activation grads for a single block
      self._act_grad_space = self._block_act_grad_space
    else:
      self._act_space = self._block_act_working_space
      self._act_checkpoint_size = 0
      self._act_grad_space = 0

    # Optimizer split  already accounted for during block compilation
    # We should keep non-sharded weight grad for a current block for AllReduce
    # and one that we currently compute, so 2x total
    # We only need a single no sharded weight grad copy for before reduction
    if self.exe.training:
      if self._blocks_per_proc == 1:
        self._weight_grad_space = self._block_weight_grad_space_no_sharding
      else:
        self._weight_grad_space = \
          self._block_weight_grad_space_no_sharding + \
          self._block_weight_grad_space * (self._blocks_per_proc - 1)
      self._optimizer_space = \
        self._block_optimizer_space * self._blocks_per_proc
    else:
      self._weight_grad_space = 0
      self._optimizer_space = 0

  def _check_mem_caps(self):
    if self.get_mem_tier1_cap_req() > self.sys.mem1.capacity:
      raise self.Error(f'Mem tier1 needs '
                       f'{human_format(self.get_mem_tier1_cap_req(), "bytes")} '
                       f'but only has '
                       f'{human_format(self.sys.mem1.capacity, "bytes")}')
    if self.get_mem_tier2_cap_req() > self.sys.mem2.capacity:
      raise self.Error(f'Mem tier2 needs '
                       f'{human_format(self.get_mem_tier2_cap_req(), "bytes")} '
                       f'but only has '
                       f'{human_format(self.sys.mem2.capacity, "bytes")}')

  def _misc_sanity_checks(self):
    if self.exe.tensor_par == 1:
      assert self.get_tp_comm_exposed_time() == 0
      assert self.get_tp_comm_link_time() == 0
    if self.exe.pipeline_par == 1:
      assert self.get_pp_comm_exposed_time() == 0
      assert self.get_pp_comm_link_time() == 0
    if self.exe.data_par == 1:
      assert self.get_dp_comm_exposed_time() == 0
      assert self.get_dp_comm_link_time() == 0

    assert self._fw_flops >= self._block_fw_flops
    assert self._fw_flops_time >= self._block_fw_flops_time
    assert self._fw_mem_accessed >= self._block_fw_mem_accessed
    assert self._fw_mem_time >= self._block_fw_mem_time
    assert self._fw_time >= self._block_fw_time
    assert self._re_flops >= self._block_re_flops
    assert self._re_flops_time >= self._block_re_flops_time
    assert self._re_mem_accessed >= self._block_re_mem_accessed
    assert self._re_mem_time >= self._block_re_mem_time
    assert self._re_time >= self._block_re_time
    assert self._agrad_flops >= self._block_agrad_flops
    assert self._agrad_flops_time >= self._block_agrad_flops_time
    assert self._agrad_mem_accessed >= self._block_agrad_mem_accessed
    assert self._agrad_mem_time >= self._block_agrad_mem_time
    assert self._agrad_time >= self._block_agrad_time
    assert self._wgrad_flops >= self._block_wgrad_flops
    assert self._wgrad_flops_time >= self._block_wgrad_flops_time
    assert self._wgrad_mem_accessed >= self._block_wgrad_mem_accessed
    assert self._wgrad_mem_time >= self._block_wgrad_mem_time
    assert self._wgrad_time >= self._block_wgrad_time
    assert self._optim_flops >= self._block_optim_flops
    assert self._optim_flops_time >= self._block_optim_flops_time
    assert self._optim_mem_accessed >= self._block_optim_mem_accessed
    assert self._optim_mem_time >= self._block_optim_mem_time
    assert self._optim_time >= self._block_optim_time
    assert self._weight_space >= self._block_weight_space
    assert self._act_space >= self._block_act_working_space
    assert self._act_checkpoint_size >= self._block_act_checkpoint_size
    assert self._weight_grad_space >= self._block_weight_grad_space_no_sharding
    assert self._act_grad_space == self._block_act_grad_space
    assert self._optimizer_space >= self._block_optimizer_space

    if not self.exe.training:
      # when not training (inference), backward is not performed and DP has no
      # communication overhead
      assert self.get_bw_time() == 0
      assert self.get_optim_step_time() == 0
      assert self.get_bw_offload_time() == 0
      assert self.get_recompute_time() == 0
      assert self.get_act_checkpoint_size() == 0
      assert self.get_dp_comm_exposed_time() == 0
      assert self.get_dp_comm_link_time() == 0
    else:
      # when training, backward is performed
      assert self.get_bw_time() > 0
      assert self.get_optim_step_time() > 0
      if self.exe.activation_recompute == 'full':
        assert self.get_recompute_time() > 0
        assert self.get_act_checkpoint_size() > 0
      elif self.exe.activation_recompute == 'attn_only':
        assert self.get_recompute_time() > 0
        assert self.get_act_checkpoint_size() == 0
      else:
        if not self.exe.seq_par_ag_redo:
          assert self.get_recompute_time() == 0
        assert self.get_act_checkpoint_size() == 0


  def run(self, sys):
    """Computes all statistics for the compiled model and validates them.

    Must be called after `self.compile()` and at most once. The stages are
    executed in order: block statistics, batch statistics, memory capacity
    check, sanity checks. `self._executed` is set only if every stage passes.

    Args:
      sys: must be a `System` instance (only its type is checked here).
    """
    assert self._compiled, "You must first call self.compile()"
    assert not self._executed
    assert isinstance(sys, System)
    stages = (
      self._compute_block_stats,
      self._compute_batch_stats,
      self._check_mem_caps,
      self._misc_sanity_checks,
    )
    for stage in stages:
      stage()
    self._executed = True

  def _get_fw_offload_size(self):
    if self.exe.weight_offload:
      weight_offload_size = self._block_weight_space
    else:
      weight_offload_size = 0
    if self.exe.activations_offload:
      if self.exe.activation_recompute != 'full':
        act_offload_size = self._block_act_storage_space
      else:
        act_offload_size = self._block_act_checkpoint_size
    else:
      act_offload_size = 0
    return max(weight_offload_size, act_offload_size)

  def _get_bw_offload_size(self):
    bw_offload_size = 0
    if self.exe.training:
      if self.exe.weight_offload:
        bw_offload_size += self._block_weight_space
      if self.exe.activations_offload:
        if self.exe.activation_recompute != 'full':
          bw_offload_size += self._block_act_storage_space
        else:
          bw_offload_size += self._block_act_checkpoint_size
      if self.exe.optimizer_offload:
        bw_offload_size += self._block_optimizer_space
    return bw_offload_size

  def get_fw_time(self):
    """Returns the batch forward time (`self._fw_time`)."""
    fw_time = self._fw_time
    return fw_time

  def get_fw_offload_time(self):
    """Returns the system's offload time for the forward offload size."""
    fw_size = self._get_fw_offload_size()
    return self.sys.compute_offload_time(fw_size)

  def get_fw_offload_overhead(self):
    """Returns the forward offload overhead accumulated over the batch.

    The per-chunk overhead (base blocks plus edge blocks, each weighted by
    its per-block forward offload overhead) is scaled by the number of
    microbatches and the number of chunks per processor.
    """
    repeats = self.exe._num_microbatches * self._chunks_per_proc
    per_chunk = \
      (self._baseblocks_per_chunk * self._baseblock_fw_offload_overhead) + \
      (self._edgeblocks_per_chunk * self._edgeblock_fw_offload_overhead)
    return repeats * per_chunk

  def get_bw_time(self):
    """Returns the batch backward time: activation grad + weight grad time."""
    agrad_time = self._agrad_time
    wgrad_time = self._wgrad_time
    return agrad_time + wgrad_time

  def get_optim_step_time(self):
    """Returns the batch optimizer step time (`self._optim_time`)."""
    optim_time = self._optim_time
    return optim_time

  def get_bw_offload_time(self):
    """Returns the system's offload time for the backward offload size.

    Returns 0 when not training.
    """
    if not self.exe.training:
      return 0
    bw_size = self._get_bw_offload_size()
    return self.sys.compute_offload_time(bw_size)

  def get_bw_offload_overhead(self):
    """Returns the backward offload overhead accumulated over the batch.

    Mirrors `get_fw_offload_overhead` using the backward per-block offload
    overheads. Returns 0 when not training.
    """
    if not self.exe.training:
      return 0
    repeats = self.exe._num_microbatches * self._chunks_per_proc
    per_chunk = \
      (self._baseblocks_per_chunk * self._baseblock_bw_offload_overhead) + \
      (self._edgeblocks_per_chunk * self._edgeblock_bw_offload_overhead)
    return repeats * per_chunk

  def get_recompute_time(self):
    """Returns the batch recompute time (`self._re_time`)."""
    recompute_time = self._re_time
    return recompute_time

  def get_recomm_exposed_time(self):
    """Returns `self._recomm_time_exposed` when training, otherwise 0."""
    return self._recomm_time_exposed if self.exe.training else 0

  def get_recomm_link_time(self):
    """Returns `self._recomm_time_link` when training, otherwise 0."""
    return self._recomm_time_link if self.exe.training else 0

  def get_bubble_time(self):
    """Returns the batch bubble time (`self._bubble_time`)."""
    bubble_time = self._bubble_time
    return bubble_time

  def get_tp_comm_exposed_time(self):
    """Returns the exposed TP communication time for the batch."""
    exposed = self._tp_comm_time_exposed
    return exposed

  def get_pp_comm_exposed_time(self):
    """Returns the exposed PP communication time for the batch."""
    exposed = self._pp_comm_time_exposed
    return exposed

  def get_dp_comm_exposed_time(self):
    """Returns the exposed DP communication time when training, else 0."""
    return self._dp_comm_time_exposed if self.exe.training else 0

  def get_tp_comm_link_time(self):
    """Returns the TP communication time spent on the link for the batch."""
    on_link = self._tp_comm_time_link
    return on_link

  def get_pp_comm_link_time(self):
    """Returns the PP communication time spent on the link for the batch."""
    on_link = self._pp_comm_time_link
    return on_link

  def get_dp_comm_link_time(self):
    """Returns the DP communication time on the link when training, else 0."""
    return self._dp_comm_time_link if self.exe.training else 0

  def get_dp_comm_net_time(self):
    """Returns the per-block DP time scaled by the blocks per processor.

    Returns 0 when not training.
    """
    if not self.exe.training:
      return 0
    return self._blocks_per_proc * self._block_dp_time

  def get_total_time(self):
    """Returns the total batch time.

    The total is the forward time plus, in this order: backward, optimizer
    step, FW/BW offload overheads, recompute, exposed recommunication,
    bubble, and exposed TP/PP/DP communication times.
    """
    total = self.get_fw_time()
    # Accumulate strictly left to right so the result matches a plain chain
    # of `+=` operations
    for component in (
        self.get_bw_time,
        self.get_optim_step_time,
        self.get_fw_offload_overhead,
        self.get_bw_offload_overhead,
        self.get_recompute_time,
        self.get_recomm_exposed_time,
        self.get_bubble_time,
        self.get_tp_comm_exposed_time,
        self.get_pp_comm_exposed_time,
        self.get_dp_comm_exposed_time):
      total += component()
    return total

  def get_useful_flops(self):
    """Returns the FLOPs summed over every entry of `self._llm_block`.

    Forward FLOPs are always counted; when training, the activation grad,
    weight grad and optimizer step FLOPs of each entry are added as well.
    """
    layers = self._llm_block
    flops = sum(layer.get_fw_flops() for layer in layers)
    if self.exe.training:
      flops += sum(
        layer.get_agrad_flops() + layer.get_wgrad_flops() +
        layer.get_optim_step_flops() for layer in layers)
    return flops

  def get_compute_efficiency(self):
    """Returns the ratio of ideal compute time to modeled compute time.

    The ideal time is the useful FLOPs, scaled by blocks per processor and
    number of microbatches, divided by the matrix unit's FLOP rate for the
    execution datatype. The modeled time is forward + backward + optimizer
    step time.
    """
    useful_flops = self.get_useful_flops()
    modeled_time = self.get_fw_time() + self.get_bw_time() + \
      self.get_optim_step_time()
    peak_rate = self.sys.matrix.flops(self.exe.datatype)
    ideal_time = self._blocks_per_proc * self.exe._num_microbatches * \
      useful_flops / peak_rate
    return ideal_time / modeled_time

  def get_system_efficiency(self):
    """Returns the fraction of the total time spent in compute.

    Compute time is forward + backward + optimizer step time.
    """
    busy_time = self.get_fw_time() + self.get_bw_time() + \
      self.get_optim_step_time()
    wall_time = self.get_total_time()
    return busy_time / wall_time

  def get_total_efficiency(self):
    """Returns the ratio of ideal compute time to the total batch time.

    The ideal time is computed exactly as in `get_compute_efficiency`.
    """
    useful_flops = self.get_useful_flops()
    peak_rate = self.sys.matrix.flops(self.exe.datatype)
    ideal_time = self._blocks_per_proc * self.exe._num_microbatches * \
      useful_flops / peak_rate
    return ideal_time / self.get_total_time()

  def get_weight_space_min(self):
    """Returns twice the weight space of a single block."""
    resident_blocks = 2
    return self._block_weight_space * resident_blocks

  def get_weight_space(self):
    """Returns the weight space of all blocks held by a processor."""
    weight_space = self._weight_space
    return weight_space

  def get_act_space_min(self):
    """Returns the minimum activation space.

    With full recompute only a block's working space is counted; otherwise
    a block's stored activations are added to it.
    """
    if self.exe.activation_recompute == 'full':
      return self._block_act_working_space
    return self._block_act_working_space + self._block_act_storage_space

  def get_act_space(self):
    """Returns the activation space (`self._act_space`)."""
    act_space = self._act_space
    return act_space

  def get_act_checkpoint_size_min(self):
    """Returns the minimum activation checkpoint space.

    Returns twice a block's checkpoint size when training with full
    activation recompute, and 0 in every other case. The inference case
    previously fell through and returned None, which broke numeric
    accumulation in callers such as `_get_mem_cap_reqs`.
    """
    if self.exe.training and self.exe.activation_recompute == 'full':
      return self._block_act_checkpoint_size * 2
    return 0

  def get_act_checkpoint_size(self):
    """Returns the activation checkpoint space.

    Non-zero only when training with full activation recompute, in which
    case `self._act_checkpoint_size` is returned.
    """
    uses_checkpoints = self.exe.training and \
      not self.exe.activation_recompute != 'full'
    if uses_checkpoints:
      return self._act_checkpoint_size
    return 0

  def get_weight_grad_space_min(self):
    """Returns the minimum weight gradient space, or 0 when not training."""
    if not self.exe.training:
      return 0
    # One non-sharded set of weight grads is kept after compute and before
    # reduction, plus one sharded set for offloading
    unsharded = self._block_weight_grad_space_no_sharding
    sharded = self._block_weight_grad_space
    return unsharded + sharded

  def get_weight_grad_space(self):
    """Returns `self._weight_grad_space` when training, otherwise 0."""
    return self._weight_grad_space if self.exe.training else 0

  def get_act_grad_space_min(self):
    """Returns the minimum activation gradient space.

    It is identical to the value reported by `get_act_grad_space`.
    """
    minimum = self.get_act_grad_space()
    return minimum

  def get_act_grad_space(self):
    """Returns `self._act_grad_space` when training, otherwise 0.

    An unreachable trailing `return` of the optimizer space that followed
    the exhaustive if/else was removed; it could never execute.
    """
    if self.exe.training:
      return self._act_grad_space
    return 0

  def get_optimizer_space_min(self):
    """Returns twice a block's optimizer space when training, otherwise 0."""
    if not self.exe.training:
      return 0
    return self._block_optimizer_space * 2

  def get_optimizer_space(self):
    """Returns `self._optimizer_space` when training, otherwise 0."""
    return self._optimizer_space if self.exe.training else 0

  def _get_mem_cap_reqs(self):
    tier1 = 0
    tier2 = 0
    if self.exe.weight_offload:
      tier1 += self.get_weight_space_min()
      tier2 += self.get_weight_space()
    else:
      tier1 += self.get_weight_space()
    if self.exe.activations_offload:
      if self.exe.activation_recompute != 'full':
        tier1 += self.get_act_space_min()
        tier2 += self.get_act_space()
      else:
        tier1 += self.get_act_space_min()
        tier1 += self.get_act_checkpoint_size_min()
        tier2 += self.get_act_checkpoint_size()
    else:
      tier1 += self.get_act_space()
      tier1 += self.get_act_checkpoint_size()
    if self.exe.optimizer_offload:
      # We keep one set of non-sharded weight grads after compute before
      # reduction, and one sharded set for offloading
      tier1 += self.get_weight_grad_space_min()
      tier1 += self.get_optimizer_space_min()
      tier2 += self._block_weight_grad_space * self._blocks_per_proc
      tier2 += self.get_optimizer_space()
    else:
      tier1 += self.get_weight_grad_space() + \
        self.get_optimizer_space()
    tier1 += self.get_act_grad_space()
    return tier1, tier2

  def get_mem_tier1_cap_req(self):
    """Returns the tier1 part of the memory capacity requirements."""
    tier1, _ = self._get_mem_cap_reqs()
    return tier1

  def get_mem_tier2_cap_req(self):
    """Returns the tier2 part of the memory capacity requirements."""
    _, tier2 = self._get_mem_cap_reqs()
    return tier2

  def get_act_offload_bw_req(self):
    """Returns the bandwidth required to offload a block's activations.

    The offloaded size is a block's activation checkpoint with full
    recompute, otherwise a block's stored activations. It is divided by the
    shortest available window: the base/edge block forward time without
    offload minus the block's forward memory time.
    """
    # We should be able to offload (write) activation during FW pass and
    # prefetch it (read) during BW pass for block (i-1)
    # After BW pass activations are discarded
    if self.exe.activation_recompute == 'full':
      payload = self._block_act_checkpoint_size
    else:
      payload = self._block_act_storage_space
    base_window = self._baseblock_fw_time_no_offload - self._block_fw_mem_time
    edge_window = self._edgeblock_fw_time_no_offload - self._block_fw_mem_time
    return payload / min(base_window, edge_window)

  def get_weight_offload_bw_req(self):
    """Returns the bandwidth required to offload a block's weights.

    A block's weight space is divided by the shortest available window: the
    base/edge block forward time without offload minus the block's forward
    memory time.
    """
    # We should be able to offload (write) and prefetch (read) weights both
    # during FW and BW passes for blocks (i-1) / (i+1).
    # We always keep weights, they cannot be discarded
    base_window = self._baseblock_fw_time_no_offload - self._block_fw_mem_time
    edge_window = self._edgeblock_fw_time_no_offload - self._block_fw_mem_time
    return self._block_weight_space / min(base_window, edge_window)

  def get_optim_offload_bw_req(self):
    """Returns the bandwidth required to offload optimizer related state.

    A block's weight grad plus optimizer space is divided by the shortest
    available window: the base/edge block backward time without offload
    minus the block's activation-grad and weight-grad memory times. Returns
    0 when not training.
    """
    # We should be able to offload (write) weight grads and optimizer state
    # and prefetch (read) optimizer state during BW passes for blocks
    # (i-1) / (i+1).
    if not self.exe.training:
      return 0
    base_window = self._baseblock_bw_time_no_offload - (
      self._block_agrad_mem_time + self._block_wgrad_mem_time)
    edge_window = self._edgeblock_bw_time_no_offload - (
      self._block_agrad_mem_time + self._block_wgrad_mem_time)
    payload = self._block_weight_grad_space + self._block_optimizer_space
    return payload / min(base_window, edge_window)

  def get_offload_mem_bw_req(self):
    """Returns the overall bandwidth required for offloading.

    The forward requirement is the forward offload size over the shortest
    forward window; when training, the result is the larger of that and the
    analogous backward requirement.
    """
    fw_window = min(
      self._baseblock_fw_time_no_offload - self._block_fw_mem_time,
      self._edgeblock_fw_time_no_offload - self._block_fw_mem_time)
    if not self.exe.training:
      return self._get_fw_offload_size() / fw_window
    bw_mem_time = self._block_agrad_mem_time + self._block_wgrad_mem_time
    bw_window = min(
      self._baseblock_bw_time_no_offload - bw_mem_time,
      self._edgeblock_bw_time_no_offload - bw_mem_time)
    fw_required = self._get_fw_offload_size() / fw_window
    bw_required = self._get_bw_offload_size() / bw_window
    return max(fw_required, bw_required)

  def get_sample_rate(self):
    """Returns the global batch size divided by the total batch time."""
    samples = self.exe.global_batch_size
    return samples / self.get_total_time()

  def display_stats(self):
    """Logs a multi-line, human-readable summary of the run at INFO level.

    The report starts with a separator line, followed by the application and
    execution configuration, memory footprints, batch timings, offload
    bandwidth requirements, memory capacity requirements, efficiencies and
    the sample rate.
    """
    parts = ["=" * 80 + "\n"]

    # Application / execution / system description
    parts.append(f"blocks={self.app.num_blocks}, ")
    parts.append(
      f"hidden={self.app.hidden}, feedforward={self.app.feedforward}\n")
    parts.append(f"num attn heads: {self.app.attn_heads}, ")
    parts.append(f"attn_size={self.app.attn_size}\n")
    parts.append(f"Run on {self.exe.num_procs} processors with:\n")
    parts.append(f"TP={self.exe.tensor_par}\n")
    parts.append(f"PP={self.exe.pipeline_par}\n")
    parts.append(f"DP={self.exe.data_par}\n")
    parts.append(f"Blocks per processor: {self._blocks_per_proc}\n")
    parts.append(f"Execution: {self.exe.get_json()};\n")
    parts.append(f"System: {self.sys.cfg};\n")

    # Memory footprints, formatted as bytes
    for label, getter in (
        ("Weights", self.get_weight_space),
        ("Act", self.get_act_space),
        ("Act CP", self.get_act_checkpoint_size),
        ("Act grad", self.get_act_grad_space),
        ("Weight grad", self.get_weight_grad_space),
        ("Optim space", self.get_optimizer_space)):
      parts.append(f"{label}: {human_format(getter(), 'bytes')};\n")

    # Batch level timings
    for label, getter in (
        ("Batch FW time", self.get_fw_time),
        ("Batch BW time", self.get_bw_time),
        ("Batch optim time", self.get_optim_step_time),
        ("Batch FW offload overhead", self.get_fw_offload_overhead),
        ("Batch BW offload overhead", self.get_bw_offload_overhead),
        ("Batch recompute overhead", self.get_recompute_time),
        ("Batch recomm overhead", self.get_recomm_exposed_time),
        ("Batch bubble overhead", self.get_bubble_time),
        ("Batch TP comm overhead", self.get_tp_comm_exposed_time),
        ("Batch PP comm overhead", self.get_pp_comm_exposed_time),
        ("Batch DP comm overhead", self.get_dp_comm_exposed_time),
        ("Batch TP comm time on link", self.get_tp_comm_link_time),
        ("Batch PP comm time on link", self.get_pp_comm_link_time),
        ("Batch DP comm time on link", self.get_dp_comm_link_time),
        ("Batch total time", self.get_total_time)):
      parts.append(f"{label}: {getter():.4f};\n")

    # Offload bandwidth and memory capacity requirements
    for label, getter, unit in (
        ("Activation offload required BW", self.get_act_offload_bw_req,
         'bandwidth'),
        ("Weight offload required BW", self.get_weight_offload_bw_req,
         'bandwidth'),
        ("Optimizer offload required BW", self.get_optim_offload_bw_req,
         'bandwidth'),
        ("Total offload required BW", self.get_offload_mem_bw_req,
         'bandwidth'),
        ("Mem tier1 capacity requirement", self.get_mem_tier1_cap_req,
         'bytes'),
        ("Mem tier2 capacity requirement", self.get_mem_tier2_cap_req,
         'bytes'),
        ("Mem tier2 BW for offload", self.get_offload_mem_bw_req,
         'bandwidth')):
      parts.append(f"{label}: {human_format(getter(), unit)};\n")

    # Efficiencies are reported as percentages
    for label, getter in (
        ("Compute efficiency", self.get_compute_efficiency),
        ("System efficiency", self.get_system_efficiency),
        ("Total efficiency", self.get_total_efficiency)):
      parts.append(f"{label}: {getter()*100:.2f}%;\n")

    parts.append(f"Sample rate: {self.get_sample_rate():.2f};\n")
    self.log.info("".join(parts))


================================================
FILE: calculon/llm/optimal_execution.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import datetime
import gzip
import logging
import multiprocessing as mp
import psutil
import os

import calculon
from calculon.util import pick, arg_true_false_all
from calculon.llm import *


class OptimalExecution(calculon.CommandLine):
  """Command line sub-command that searches for the best LLM executions.

  The search space (parallelism split, recompute mode, sharding, overlap,
  offload, microbatch size, network assignment, ...) is enumerated, each
  candidate is evaluated with `Llm`, and the `top_n` candidates with the
  highest sample rate are written to a JSON or CSV output file.
  """
  NAME = 'llm-optimal-execution'
  ALIASES = ['loe']

  @staticmethod
  def create_parser(subparser):
    """Registers this command's argument parser on `subparser`."""
    sp = subparser.add_parser(
      OptimalExecution.NAME, aliases=OptimalExecution.ALIASES,
      help='run a search to find the optimal llm execution')
    sp.set_defaults(func=OptimalExecution.run_command)
    sp.add_argument('-d', '--debug', action='store_true',
                    help='Loop over executions, don\'t run them')
    sp.add_argument('application', type=str,
                    help='File path to application configuration')
    sp.add_argument('num_procs', type=int,
                    help='Number of processors in execution')
    sp.add_argument('max_batch_size', type=int,
                    help='Maximum batch size, will be largest multiple of DP')
    sp.add_argument('datatype', type=str, choices=System.supported_datatypes(),
                    help='The datatype to use')
    sp.add_argument('system', type=str,
                    help='File path to system configuration')
    sp.add_argument('output', type=str,
                    help='File path to the output file'
                    " ('*.csv', '*.csv.gz', '*.json', '*.json.gz')")
    sp.add_argument('-c', '--cpus', type=int, default=psutil.cpu_count(logical=False),
                    help='CPUs to use for parallelization')
    sp.add_argument('-n', '--noneok', action='store_true',
                    help='Don\'t give failure status when no good execution exists')
    sp.add_argument('-m', '--mbs-break', action='store_true',
                    help='Search across MBS and break earlier when possible')
    sp.add_argument('-t', '--top-n', type=int, default=1,
                    help='Number of best outputs')
    sp.add_argument('-l', '--layers', action='store_true',
                    help='Include layers information in output stats file')
    sp.add_argument('-f', '--fused_activation', type=arg_true_false_all,
                    default='true', help='Mode of fused activation')
    sp.add_argument('--no-tp-overlap', action='store_true',
                    help='Don\'t allow TP overlap')
    sp.add_argument('--no-dp-overlap', action='store_true',
                    help='Don\'t allow DP overlap')

  @staticmethod
  def run_command(logger, args):
    """Runs the parallel search and writes the best executions to a file.

    Args:
      logger: logger used for progress and result messages.
      args: parsed arguments as declared in `create_parser`.

    Returns:
      0 on success (including `--debug` runs and `--noneok` runs without any
      result), -1 when no acceptable configuration exists and `--noneok` was
      not given.
    """
    assert args.top_n > 0, 'top-n must be > 0'

    app = Llm.Application(calculon.io.read_json_file(args.application))
    syst = System(calculon.io.read_json_file(args.system))

    # One parameter tuple per independent sub-search; each is handed to
    # `search` in a worker process
    params = []
    for tp in Llm.get_all_tensor_parallelisms(
        args.num_procs, app.hidden, app.attn_heads):
      for pp in Llm.get_all_pipeline_parallelisms(
          args.num_procs, tp, app.num_blocks):
        dp = Llm.get_data_parallelism(args.num_procs, tp, pp)
        for ppint in Llm.get_valid_pipeline_interleavings(app.num_blocks, pp):
          batch_size = OptimalExecution.get_batch_size(dp, args.max_batch_size)
          if batch_size is None:
            continue
          for activation_recompute in ['full', 'attn_only', 'none']:
            for optimizer_sharding in pick(dp>1, [True, False], [False]):
              for tensor_par_comm_type in ['ar', 'p2p_rs_ag', 'rs_ag']:
                params.append(
                  (args.debug, args.top_n, args.layers, args.num_procs,
                   args.max_batch_size, args.datatype, app, syst, tp, pp, dp,
                   ppint, batch_size, activation_recompute, optimizer_sharding,
                   tensor_par_comm_type, args.fused_activation, args.mbs_break,
                   not args.no_tp_overlap, not args.no_dp_overlap))

    # Runs parallel searches
    start_time = datetime.datetime.now()
    with mp.Pool(args.cpus) as pool:
      searches = pool.starmap(OptimalExecution.search, params)
    end_time = datetime.datetime.now()

    # Combines parallel search result into one data structure
    best = []
    exe_count = 0
    good_exe_count = 0
    bad_exe_count = 0
    for cbest, ec, gec, bec, tp, pp in searches:
      best = OptimalExecution.update_list(best, cbest, args.top_n)
      exe_count += ec
      good_exe_count += gec
      bad_exe_count += bec

    logger.info(f'Total executions: {exe_count}')
    logger.info(f'Good executions: {good_exe_count}')
    logger.info(f'Bad executions: {bad_exe_count}')
    calc_rate = exe_count / (end_time - start_time).total_seconds()
    logger.info(f'Calculation rate: {calc_rate:.2f} calcs/sec')
    if args.debug:
      return 0

    if len(best) == 0:
      if not args.noneok:
        logger.fatal('No acceptable configurations found :(')
        return -1
      else:
        logger.info('No acceptable configurations found :(')
    else:
      logger.info(f'Best sample rate: {best[0][0]}')

    # Results are keyed by rank (0 is the best)
    output = {}
    for index, run in enumerate(best):
      _, execution, stats = run
      output[index] = {
        'execution': execution,
        'stats': stats
      }

    if calculon.io.is_json_extension(args.output):
      logger.info(f'Output: {args.output}')
      calculon.io.write_json_file(output, args.output)
    elif args.output.endswith('.csv') or args.output.endswith('.csv.gz'):
      logger.info(f'Output: {args.output}')
      rows = []
      # With --noneok there may be no result at all. The column names are
      # derived from the first result, so in that case an empty file is
      # written instead of failing on `output[0]`.
      if output:
        exe_keys = list(output[0]['execution'].keys())
        stats_keys = list(output[0]['stats'].keys())
        rows.append(f',{",".join(exe_keys)},{",".join(stats_keys)}\n')
        for index in sorted(output.keys()):
          fields = [f'{index}']
          fields.extend(
            f'{output[index]["execution"][key]}' for key in exe_keys)
          fields.extend(
            f'{output[index]["stats"][key]}' for key in stats_keys)
          rows.append(','.join(fields) + '\n')
      opener = gzip.open if args.output.endswith('.gz') else open
      with opener(args.output, 'wb') as fd:
        for row in rows:
          fd.write(bytes(row, 'utf-8'))
    else:
      assert False, f'Unknown file type: {args.output}'

    return 0

  @staticmethod
  def get_batch_size(data_par, max_batch_size):
    """Returns the largest multiple of `data_par` that is <= `max_batch_size`.

    Returns None when even a single multiple does not fit, i.e. when
    `data_par > max_batch_size`. `data_par` is expected to be a positive
    integer.
    """
    if data_par > max_batch_size:
      return None
    return (max_batch_size // data_par) * data_par

  @staticmethod
  def search(debug, top_n, layers, num_procs, max_batch_size, datatype,
             app, syst, tp, pp, dp, ppint, batch_size, activation_recompute,
             optimizer_sharding, tensor_par_comm_type, fused_acts, mbs_break,
             allow_tp_overlap, allow_dp_overlap):
    """Evaluates every execution of one sub-space of the search.

    The arguments fix the parallelism split, batch size, recompute mode,
    optimizer sharding and TP communication type; this function enumerates
    the remaining options (AG redo, DP/TP overlap, offloads, fused
    activation, microbatch size and network assignment).

    With `debug` set, executions are only counted, not evaluated. With
    `mbs_break` set, the microbatch size loop stops at the first microbatch
    size that yields no good execution.

    Returns:
      `(best, exe_count, good_exe_count, bad_exe_count, tp, pp)` where `best`
      holds at most `top_n` `(sample_rate, exe_json, stats)` tuples sorted by
      decreasing sample rate.
    """
    num_nets = syst.num_networks

    best = []
    exe_count = 0
    good_exe_count = 0
    bad_exe_count = 0

    has_mem2 = syst.mem2.capacity > 0

    can_redo = Llm.can_redo_ag(tensor_par_comm_type,
                               activation_recompute)

    # The option lists only depend on the arguments, so they are built once
    # instead of on every loop iteration
    seq_par_ag_redos = pick(can_redo, [True, False], [False])
    data_par_overlaps = pick(dp>1 and allow_dp_overlap, [True, False],
                             [False])
    tensor_par_overlaps = pick(tp>1 and allow_tp_overlap,
                               ['none', 'ring', 'pipe'], ['none'])
    weight_offloads = pick(has_mem2, [True, False], [False])
    if activation_recompute == 'full' or not has_mem2:
      activations_offloads = [False]
    else:
      activations_offloads = [True, False]
    optimizer_offloads = pick(has_mem2, [True, False], [False])
    tensor_par_nets = pick(tp>1, range(num_nets), [0])
    pipeline_par_nets = pick(pp>1, range(num_nets), [0])
    data_par_nets = pick(dp>1, range(num_nets), [0])

    for seq_par_ag_redo in seq_par_ag_redos:
      for data_par_overlap in data_par_overlaps:
        for tensor_par_overlap in tensor_par_overlaps:
          for weight_offload in weight_offloads:
            for activations_offload in activations_offloads:
              for optimizer_offload in optimizer_offloads:
                for fused_act in fused_acts:
                  # Re-evaluated on every pass on purpose: the helper's
                  # result is not known to be re-iterable
                  for microbatch_size in Llm.get_valid_microbatch_sizes(
                      app.seq_size, tp, dp, batch_size, pp):
                    mbs_break_good = good_exe_count
                    for tn in tensor_par_nets:
                      for pn in pipeline_par_nets:
                        for dn in data_par_nets:
                          exe_count += 1
                          exe_json = {
                            'num_procs': num_procs,
                            'tensor_par': tp,
                            'pipeline_par': pp,
                            'data_par': dp,
                            'tensor_par_net': tn,
                            'pipeline_par_net': pn,
                            'data_par_net': dn,
                            'batch_size': batch_size,
                            'microbatch_size': microbatch_size,
                            'datatype': datatype,
                            'fused_activation': fused_act,
                            'attention_type': 'multihead',
                            'activation_recompute': activation_recompute,
                            'pipeline_interleaving': ppint,
                            'optimizer_sharding': optimizer_sharding,
                            'tensor_par_comm_type': tensor_par_comm_type,
                            'tensor_par_overlap': tensor_par_overlap,
                            'seq_par_ag_redo': seq_par_ag_redo,
                            'data_par_overlap': data_par_overlap,
                            'weight_offload': weight_offload,
                            'activations_offload': activations_offload,
                            'optimizer_offload': optimizer_offload,
                            'training': True
                          }

                          if not debug:
                            try:
                              logger = logging.Logger('sub')
                              model = Llm(app, logger)
                              model.compile(
                                syst,
                                Llm.Execution.from_json(exe_json))
                              model.run(syst)
                              stats = model.get_stats_json(layers)
                              good_exe_count += 1
                              curr = (stats['sample_rate'], exe_json, stats)
                              best = OptimalExecution.update_list(best, curr,
                                                                  top_n)
                            except Llm.Error as ex:
                              # An infeasible execution is expected during
                              # the search; it is only counted and logged
                              logger = logging.getLogger()
                              logger.debug(f'JSON:{exe_json}\nERROR:{ex}\n')
                              bad_exe_count += 1
                    # No good execution for this microbatch size: stop
                    # trying the remaining microbatch sizes
                    if mbs_break and good_exe_count == mbs_break_good:
                      break
    return (best, exe_count, good_exe_count, bad_exe_count, tp, pp)

  @staticmethod
  def update_list(current, candidate, quantity):
    """Merges `candidate` into `current` and keeps the best `quantity` items.

    `candidate` is either a single entry or a list of entries; entries are
    ranked by their first element in decreasing order. Note that `current`
    is extended and sorted in place; the returned list is a truncated copy.
    """
    if not isinstance(candidate, list):
      current.append(candidate)
    else:
      current.extend(candidate)
    current.sort(reverse=True, key=lambda x: x[0])
    return current[:quantity]


# Register the command class with calculon's CommandLine at import time
# (presumably this is what exposes it as a CLI sub-command - confirm in
# calculon/command_line.py)
calculon.CommandLine.register(OptimalExecution)


================================================
FILE: calculon/llm/parameter_calculator.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import os

import calculon
from calculon.llm import *

class ParameterCalculator(calculon.CommandLine):
  """Command that logs the parameter count of an LLM application file."""
  NAME = 'llm-parameter-calculator'
  ALIASES = ['lpc']

  @staticmethod
  def create_parser(subparser):
    """Adds this command's sub-parser and arguments to `subparser`."""
    sp = subparser.add_parser(ParameterCalculator.NAME,
                              aliases=ParameterCalculator.ALIASES,
                              help='calculate the parameter count of an llm')
    sp.set_defaults(func=ParameterCalculator.run_command)
    sp.add_argument('application', type=str,
                    help='File path to application configuration')
    sp.add_argument('-a', '--alignment', type=int, default=13,
                    help='Alignment spaces')

  @staticmethod
  def run_command(logger, args):
    """Logs '<application name> -> <parameter count>'.

    Args:
      logger : logger used to emit the result (via logger.info)
      args   : parsed arguments; reads args.application and args.alignment

    Returns:
      int : 0 on success, -1 when the application is rejected with Llm.Error
    """
    app_json = calculon.io.read_json_file(args.application)

    try:
      app = Llm.Application(app_json)
    except Llm.Error as error:
      print(f'ERROR: {error}')
      return -1

    # The application is named after its file, minus directory and extension
    app_name, _ = os.path.splitext(os.path.basename(args.application))

    # ljust() pads the name to the alignment width and leaves names that are
    # already wider (or a non-positive width) untouched
    logger.info(f'{app_name.ljust(args.alignment)}'
                ' -> '
                f'{human_format(app.num_parameters())}')
    return 0


# Hands the command class to calculon.CommandLine.register (defined outside
# this file) -- presumably so the command-line front end can dispatch to it.
calculon.CommandLine.register(ParameterCalculator)


================================================
FILE: calculon/llm/runner.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import calculon
from calculon.llm import *

class Runner(calculon.CommandLine):
  """Command that runs one LLM calculation and reports its statistics."""
  NAME = 'llm'
  ALIASES = []

  @staticmethod
  def create_parser(subparser):
    """Adds this command's sub-parser and arguments to `subparser`."""
    sp = subparser.add_parser(Runner.NAME, aliases=Runner.ALIASES,
                              help='run a single llm calculation')
    sp.set_defaults(func=Runner.run_command)
    sp.add_argument('application', type=str,
                    help='File path to application configuration')
    sp.add_argument('execution', type=str,
                    help='File path to execution configuration')
    sp.add_argument('system', type=str,
                    help='File path to system configuration')
    sp.add_argument('stats', type=str,
                    help='File path to stats output ("-" for stdout)')
    sp.add_argument('-p', '--peers', type=str, default=None,
                    help='File path to write out peers file')
    sp.add_argument('-l', '--layers', action='store_true',
                    help='Include layers information in output stats file')

  @staticmethod
  def run_command(logger, args):
    """Compiles and runs the model, then emits stats (and optionally peers).

    Args:
      logger : logger handed to the Llm model
      args   : parsed arguments; reads application, execution, system, stats,
               peers and layers

    Returns:
      int : 0 on success, -1 on an unusable stats destination or an Llm.Error
    """
    # Reject an unusable stats destination before doing any work, and do it
    # with a real check rather than an assert (asserts vanish under -O).
    to_stdout = args.stats == '-'
    if not to_stdout and not calculon.is_json_extension(args.stats):
      print(f'ERROR: unknown stats extension: {args.stats}')
      return -1

    app_json = calculon.io.read_json_file(args.application)
    exe_json = calculon.io.read_json_file(args.execution)
    sys_json = calculon.io.read_json_file(args.system)

    try:
      # Building the application/execution can raise Llm.Error as well, so it
      # gets the same friendly error report as compile() and run().
      app = Llm.Application(app_json)
      exe = Llm.Execution.from_json(exe_json)
      syst = System(sys_json)
      model = Llm(app, logger)
      model.compile(syst, exe)
      model.run(syst)
    except Llm.Error as error:
      print(f'ERROR: {error}')
      return -1

    if to_stdout:
      model.display_stats()
    else:
      calculon.write_json_file(model.get_stats_json(args.layers), args.stats)

    if args.peers:
      calculon.write_json_file(exe.get_peers_json(), args.peers)

    return 0


# Hands the command class to calculon.CommandLine.register (defined outside
# this file) -- presumably so the command-line front end can dispatch to it.
calculon.CommandLine.register(Runner)


================================================
FILE: calculon/llm/validation.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import logging
import math
import os

import calculon
from calculon.util import pick
from calculon.llm import *


class Validation(calculon.CommandLine):
  """Command that compares calculated results against profiled values.

  The profiled values come from https://arxiv.org/pdf/2205.05198.pdf
  (Figure 1, Figure 7 and Table 5). Each experiment prints a CSV-style table
  followed by the average and maximum absolute percentage error.
  """
  NAME = 'llm-validation'
  ALIASES = ['lv']

  # Models covered by every experiment, in table-row order
  kModels = ['megatron-22B', 'gpt3-175B', 'turing-530B', 'megatron-1T']

  @staticmethod
  def create_parser(subparser):
    """Adds this command's sub-parser and arguments to `subparser`."""
    sp = subparser.add_parser(
      Validation.NAME, aliases=Validation.ALIASES,
      help='run a validation of llm execution')
    sp.set_defaults(func=Validation.run_command)
    sp.add_argument('-b', '--base_dir', default='.',
                    help='Base directory')
    sp.add_argument('-v', '--verbose', action='store_true',
                    help='Show verbose output while running')

  @staticmethod
  def run_command(logger, args):
    """Runs every experiment in order.

    Returns:
      int : -1 as soon as an experiment returns something other than None,
            0 when all experiments completed
    """
    funcs = (
      Validation.seqsel_fig1,
      Validation.seqsel_fig7,
      Validation.seqsel_tab5,
    )
    for func in funcs:
      if args.verbose:
        print(f'\n\nNow running test: {func.__name__}')
      if func(logger, args) is not None:
        return -1
    return 0

  @staticmethod
  def _load_system(args, filename):
    """Loads <base_dir>/systems/<filename> as a System."""
    syst_file = os.path.join(args.base_dir, 'systems', filename)
    return System(calculon.io.read_json_file(syst_file))

  @staticmethod
  def _run_case(logger, args, syst, experiment, model, mode):
    """Runs one model/mode pair of an experiment and returns its stats dict.

    The application is read from <base_dir>/models/<model>.json and the
    execution from <base_dir>/validation/seqsel/<experiment>/<model>_<mode>.json
    """
    if args.verbose:
      print(f'Analyzing {model} {mode}')
    app_file = os.path.join(args.base_dir, 'models', f'{model}.json')
    exe_file = os.path.join(args.base_dir, 'validation', 'seqsel', experiment,
                            f'{model}_{mode}.json')
    app = Llm.Application(calculon.read_json_file(app_file))
    exe = Llm.Execution.from_json(calculon.read_json_file(exe_file))
    mt = Llm(app, logger)
    mt.compile(syst, exe)
    mt.run(syst)
    return mt.get_stats_json(False)

  @staticmethod
  def _print_table(models, modes, get_cell):
    """Prints one row per model followed by the Ave/Max error footer.

    Args:
      models (list)  : row labels, in print order
      modes (list)   : column groups, in print order
      get_cell       : callable (model, mode) -> (delta_percent, cell_text)
    """
    max_error = 0
    abs_error = 0
    for model in models:
      print(f'{model},', end='')
      for mode in modes:
        delta, text = get_cell(model, mode)
        max_error = max(max_error, math.fabs(delta))
        abs_error += math.fabs(delta)
        print(text, end='')
      print()
    ave_error = abs_error / (len(models) * len(modes))
    print(f'Ave,,{ave_error:.2f}%')
    print(f'Max,,{max_error:.2f}%')
    print(',')

  @staticmethod
  def seqsel_fig1(logger, args):
    """Validates parameter/optimizer and activation memory (Figure 1)."""
    kModels = Validation.kModels
    kModes = ['none', 'seqsel']
    # These profiled values are reported here:
    # https://arxiv.org/pdf/2205.05198.pdf
    # Figure 1
    kProfile = {
      'megatron-22B': {
        'none': {'par_opt': 45.5625, 'act': 59.25},
        'seqsel': {'par_opt': 45.5625, 'act': 9.5625}
      },
      'gpt3-175B': {
        'none': {'par_opt': 45.5625, 'act': 66.84375},
        'seqsel': {'par_opt': 45.5625, 'act': 12.3515625}
      },
      'turing-530B': {
        'none': {'par_opt': 31.640625, 'act': 114.0234375},
        'seqsel': {'par_opt': 31.640625, 'act': 23.076171875}
      },
      'megatron-1T': {
        'none': {'par_opt': 32.958984375, 'act': 131.25},
        'seqsel': {'par_opt': 32.958984375, 'act': 26.5625}
      }
    }

    # All cases are run before anything is printed
    syst = Validation._load_system(args, 'a100_80e.json')
    actual_gib = {}
    for model in kModels:
      actual_gib[model] = {}
      for mode in kModes:
        stats = Validation._run_case(logger, args, syst, 'fig1', model, mode)
        par_opt = (stats['weight_space'] + stats['weight_grad_space'] +
                   stats['optimizer_space']) / (1024**3)
        act = stats['act_space'] / (1024**3)
        actual_gib[model][mode] = {'par_opt': par_opt, 'act': act}

    def make_cell(key):
      # Builds the cell callback for one of the two tables
      def get_cell(model, mode):
        p = kProfile[model][mode][key]
        a = actual_gib[model][mode][key]
        d = 100*(1-a/p)
        return d, f'|,{p},{a:.2f},{d:.2f}%,'
      return get_cell

    print('*Params & Opt,|,none,,,|,seqsel,,,')
    print('Model,|,Profile,Calc,Delta,|,Profile,Calc,Delta,')
    Validation._print_table(kModels, kModes, make_cell('par_opt'))

    print('*Activations,|,none,,,|,seqsel,,,')
    print('Model,|,Profile,Calc,Delta,|,Profile,Calc,Delta,')
    Validation._print_table(kModels, kModes, make_cell('act'))

  @staticmethod
  def seqsel_fig7(logger, args):
    """Validates activation memory relative to the 'none' mode (Figure 7)."""
    kModels = Validation.kModels
    kModes = ['none', 'seq', 'sel', 'seqsel', 'full']
    # These profiled values are reported here:
    # https://arxiv.org/pdf/2205.05198.pdf
    # Figure 7
    kProfile = {
      'megatron-22B': {
        'none': 100.00,
        'seq': 66.84,
        'sel': 49.42,
        'seqsel': 16.18,
        'full': 7.64
      },
      'gpt3-175B': {
        'none': 100.00,
        'seq': 62.04,
        'sel': 56.53,
        'seqsel': 18.49,
        'full': 8.71
      },
      'turing-530B': {
        'none': 100.00,
        'seq': 58.31,
        'sel': 62.04,
        'seqsel': 20.27,
        'full': 9.42
      },
      'megatron-1T': {
        'none': 100.00,
        'seq': 58.31,
        'sel': 62.04,
        'seqsel': 20.27,
        'full': 9.42
      }
    }

    syst = Validation._load_system(args, 'a100_80e.json')
    raw = {}
    for model in kModels:
      raw[model] = {}
      for mode in kModes:
        stats = Validation._run_case(logger, args, syst, 'fig7', model, mode)
        raw[model][mode] = stats['act_space'] + stats['act_checkpoint_size']

    # Expresses every mode as a percentage of the same model's 'none' mode
    rel = {}
    for model in kModels:
      rel[model] = {}
      for mode in kModes:
        rel[model][mode] = raw[model][mode] / raw[model]['none'] * 100

    def get_cell(model, mode):
      p = kProfile[model][mode]
      a = rel[model][mode]
      d = 100*(1-a/p)
      return d, f'|,{p}%,{a:.2f}%,{d:.2f}%,'

    print('Activations,|,none,,,|,seq,,,|,sel,,,|,seqsel,,,|,full,,,')
    print('Model,|,Profile,Calc,Delta,|,Profile,Calc,Delta,|'
          ',Profile,Calc,Delta,|,Profile,Calc,Delta,|,Profile,Calc,Delta,')
    Validation._print_table(kModels, kModes, get_cell)

  @staticmethod
  def seqsel_tab5(logger, args):
    """Validates end-to-end time and reports required memory (Table 5)."""
    kModels = Validation.kModels
    kModes = ['full', 'seqsel']
    # These profiled values are reported here:
    # https://arxiv.org/pdf/2205.05198.pdf
    # Table 5
    kProfile = {
      'megatron-22B': {
        'full': 1.42,
        'seqsel': 1.10
      },
      'gpt3-175B': {
        'full': 18.13,
        'seqsel': 13.75
      },
      'turing-530B': {
        'full': 49.05,
        'seqsel': 37.83
      },
      'megatron-1T': {
        'full': 94.42,
        'seqsel': 71.49
      }
    }

    syst = Validation._load_system(args, 'a100_80g.json')
    data = {}
    for model in kModels:
      data[model] = {}
      for mode in kModes:
        stats = Validation._run_case(logger, args, syst, 'tab5', model, mode)
        data[model][mode] = {
          'actual_time': stats["total_time"],
          'memory_req': stats["proc_mem_tier1_cap_req"]
        }

    def get_cell(model, mode):
      p = kProfile[model][mode]
      a = data[model][mode]['actual_time']
      d = 100*(1-a/p)
      m = data[model][mode]['memory_req'] / (1024**3)
      return d, f'|,{p},{a:.2f},{d:.2f}%,{m:.2f},'

    print('End-to-end,|,full,,,,|,seqsel,,,,')
    print('Model,|,Profile,Calc,Delta,GiB,|,Profile,Calc,Delta,GiB,')
    Validation._print_table(kModels, kModes, get_cell)

# Hands the command class to calculon.CommandLine.register (defined outside
# this file) -- presumably so the command-line front end can dispatch to it.
calculon.CommandLine.register(Validation)


================================================
FILE: calculon/memory.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

class Memory:
  """Configuration for a memory.

  Reads these keys from `cfg`:
    'GiB'           : capacity, multiplied by 1024**3
    'GBps'          : bandwidth, multiplied by 1e9
    'MB_efficiency' : sequence of (megabytes, efficiency) pairs; megabytes is
                      multiplied by 1e6 and efficiency must be in (0, 1]
  """

  def __init__(self, cfg):
    self._capacity = cfg['GiB'] * 1024**3
    self._bandwidth = cfg['GBps'] * 1e9
    self._efficiency = []
    for mbytes, eff in cfg['MB_efficiency']:
      assert 0 < eff <= 1.0
      # Thresholds are kept in the order they were configured
      self._efficiency.append((mbytes * 1e6, eff))

  @property
  def capacity(self):
    """Capacity computed from cfg['GiB']."""
    return self._capacity

  @property
  def bandwidth(self):
    """Bandwidth computed from cfg['GBps']."""
    return self._bandwidth

  def efficiency(self, op_bytes):
    """Returns the efficiency of the first threshold `op_bytes` reaches.

    Thresholds are scanned in configuration order and the first one with
    op_bytes >= threshold wins.

    Raises:
      AssertionError : no threshold is <= op_bytes
    """
    for threshold, eff in self._efficiency:
      if op_bytes >= threshold:
        return eff
    # Raised explicitly (rather than `assert False`) so the failure is still
    # reported when Python runs with assertions disabled (-O).
    raise AssertionError(f'OP bytes {op_bytes} wasn\'t covered')

  def throughput(self, op_bytes):
    """Returns bandwidth scaled by the efficiency for `op_bytes`."""
    return self._bandwidth * self.efficiency(op_bytes)


================================================
FILE: calculon/network.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""


class Network:
  """Configuration for a network."""

  # Exact set of keys a network configuration must contain
  kKeys = {'bandwidth', 'efficiency', 'size', 'latency', 'ops',
           'must_be_filled', 'processor_usage'}
  # Every supported operation, and the subset that are collectives
  kNetOps = {'p2p', 'reduce_scatter', 'all_gather', 'all_reduce'}
  kCollectives = {'reduce_scatter', 'all_gather', 'all_reduce'}

  class Op:
    """Size-scaling parameters of one network operation."""

    def __init__(self, scalar, offset):
      self.scalar = scalar
      self.offset = offset

  @staticmethod
  def _parse_op(op, scalar, offset):
    """Validates one op's parameters and wraps them in a Network.Op.

    Collectives must supply an offset; other ops must not, and get offset 0.
    """
    assert op in Network.kNetOps, f'Invalid network op: {op}'
    assert scalar > 0.0, f'Invalid network scalar for {op}: {scalar}'
    if op not in Network.kCollectives:
      assert offset is None, f'Can\'t give offset for {op}'
      return Network.Op(scalar, 0)
    assert offset is not None, f'Must give offset for {op}'
    return Network.Op(scalar, offset)

  def __init__(self, cfg):
    assert Network.kKeys == set(cfg.keys())
    self._bw = cfg['bandwidth'] * 1e9  # Specified in GB/s
    assert self._bw > 0
    self._eff = cfg['efficiency']
    assert 0 < self._eff <= 1.0
    self._size = cfg['size']
    assert self._size >= 0
    self._latency = cfg['latency']
    # Each entry of cfg['ops'] is indexed as [scalar, offset]
    op_table = cfg['ops']
    self._ops = {
      name: Network._parse_op(name, op_table[name][0], op_table[name][1])
      for name in op_table
    }
    assert set(self._ops.keys()) == Network.kNetOps
    self._must_be_filled = cfg['must_be_filled']
    self._proc_usage = cfg['processor_usage']
    assert 0.0 <= self._proc_usage < 1.0

  @property
  def size(self):
    return self._size

  @property
  def must_be_filled(self):
    return self._must_be_filled

  @property
  def processor_usage(self):
    return self._proc_usage

  def time(self, op, op_size, comm_size):
    """ Computes the time taken for a network operation.

    Args:
      op (str)        : operation name
      op_size (int)   : operation size in bytes
      comm_size (int) : number of participants in operation

    Returns:
      time (float)    : time needed for operation
    """
    if op in Network.kCollectives:
      assert comm_size >= 2
    else:
      assert comm_size == 2
    assert op in Network.kNetOps
    assert op_size >= 0

    params = self._ops[op]

    # Applies the op's scalar to the requested size
    scaled = op_size * params.scalar

    # Adds `offset` per-participant chunks of the scaled size
    chunk = 1 / comm_size * scaled
    total = scaled + chunk * params.offset

    # Latency plus transfer time at the efficiency-derated bandwidth
    return self._latency + total / (self._bw * self._eff)


================================================
FILE: calculon/processor.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

class Processor:
  """Configuration for a processing engine.

  `cfg` maps a datatype name to a dict with:
    'tflops'            : peak rate, multiplied by 1e12
    'gflops_efficiency' : sequence of (gigaflops, efficiency) pairs whose
                          thresholds must be strictly decreasing and whose
                          efficiencies must be in (0, 1]
  """

  def __init__(self, cfg):
    self._datatypes = {}
    for datatype, spec in cfg.items():
      curve = []
      last = None
      for gflops, eff in spec['gflops_efficiency']:
        flops = gflops * 1e9
        assert 0 < eff <= 1.0
        # Compare against None explicitly: a previous threshold of 0 is falsy
        # and would otherwise switch the ordering check off.
        if last is not None:
          assert flops < last
        last = flops
        curve.append((flops, eff))
      self._datatypes[datatype] = {
        'flops': spec['tflops'] * 1e12,
        'efficiency': curve
      }

  def flops(self, datatype):
    """Returns the peak rate configured for `datatype`."""
    return self._datatypes[datatype]['flops']

  def efficiency(self, datatype, op_flops):
    """Returns the efficiency of the first threshold `op_flops` reaches.

    Raises:
      AssertionError : no threshold of the curve is <= op_flops
    """
    for flops, eff in self._datatypes[datatype]['efficiency']:
      if op_flops >= flops:
        return eff
    # Raised explicitly (rather than `assert False`) so the failure is still
    # reported when Python runs with assertions disabled (-O).
    raise AssertionError(
      f'{op_flops} wasn\'t covered in {datatype} efficiency curve')

  def throughput(self, datatype, op_flops):
    """Returns the peak rate scaled by the efficiency for `op_flops`."""
    assert datatype in self._datatypes, f'Unsupported type: {datatype}'
    return self.flops(datatype) * self.efficiency(datatype, op_flops)


================================================
FILE: calculon/system.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

from .memory import *
from .network import *
from .processor import *

class System:
  """Configuration for a system."""

  # Supported datatype names and the size recorded for each
  TypeSizes = dict(float8=1, float16=2, float32=4, bfloat16=2)

  @staticmethod
  def supported_datatypes():
    """Returns the names of all datatypes listed in TypeSizes."""
    return [name for name in System.TypeSizes]

  def __init__(self, cfg):
    self.cfg = cfg

    # Compute engines; the active datatype is chosen later via set_datatype()
    self.matrix = Processor(cfg['matrix'])
    self.vector = Processor(cfg['vector'])
    self.datatype = None

    # Two memories, built from cfg['mem1'] and cfg['mem2']
    self.mem1 = Memory(cfg['mem1'])
    self.mem2 = Memory(cfg['mem2'])

    self.proc_mode = cfg['processing_mode']
    assert self.proc_mode in ('roofline', 'no_overlap')

    self.networks = []
    for net_cfg in cfg['networks']:
      self.networks.append(Network(net_cfg))

  @property
  def num_networks(self):
    return len(self.networks)

  def get_network(self, tier):
    """Returns the network stored at index `tier`."""
    assert tier < self.num_networks, f'Bad network tier ID: {tier}'
    return self.networks[tier]

  def set_datatype(self, datatype):
    """Selects the datatype used by the throughput queries."""
    assert datatype in System.TypeSizes, f'Unsupported data type: {datatype}'
    self.datatype = datatype

  def get_matrix_throughput(self, flops):
    engine = self.matrix
    return engine.throughput(self.datatype, flops)

  def get_vector_throughput(self, flops):
    engine = self.vector
    return engine.throughput(self.datatype, flops)

  def get_mem1_throughput(self, size):
    memory = self.mem1
    return memory.throughput(size)

  def get_mem2_throughput(self, size):
    memory = self.mem2
    return memory.throughput(size)

  def compute_offload_time(self, size):
    """Returns `size` divided by the mem2 throughput for that size."""
    rate = self.mem2.throughput(size)
    return size / rate

  def get_processing_time(self, flops_time, mem_time):
    """Combines compute and memory time according to the processing mode.

    'roofline' takes the larger of the two, 'no_overlap' adds them; any other
    mode yields None.
    """
    if self.proc_mode == 'roofline':
      return max(flops_time, mem_time)
    if self.proc_mode == 'no_overlap':
      return flops_time + mem_time
    return None


================================================
FILE: calculon/util.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import argparse


def human_format(value, v_type='base10', precision=3):
  """Formats a number with a magnitude prefix and a type-specific suffix.

  Args:
    value            : number to format, or None
    v_type (str)     : one of 'base10', 'base2', 'bytes', 'bandwidth',
                       'flops' or 'throughput'; selects the step between
                       prefixes (1000 or 1024) and the suffix
    precision (int)  : digits after the decimal point

  Returns:
    str : '<number> <prefix><suffix>', or 'n/a <suffix>' when value is None

  Raises:
    ValueError : v_type is not one of the supported names
  """
  # v_type -> (step between prefixes, suffix)
  formats = {
    'base10': (1000, ''),
    'base2': (1024, ''),
    'bytes': (1024, 'iB'),
    'bandwidth': (1000, 'B/s'),
    'flops': (1000, 'Ops'),
    'throughput': (1000, 'Op/s'),
  }
  if v_type not in formats:
    raise ValueError(
      f"Type value should be 'base10', 'base2', 'bytes', 'flops', "
      f"'bandwidth', or 'throughput'. You gave {v_type}")
  step, suffix = formats[v_type]
  labels = ['', 'k', 'M', 'G', 'T', 'P', 'E']

  if value is None:
    return "n/a {0}{1}".format(labels[0], suffix)

  abs_value = abs(value)
  index = 0
  # Stop at the largest prefix: values beyond it are shown as a big number of
  # 'E' units instead of indexing past the end of `labels`.
  while abs_value >= step and index < len(labels) - 1:
    abs_value /= step
    index += 1
  sign = 1 if value >= 0 else -1
  return "{0:.{1}f} {2}{3}".format(sign * abs_value, precision, labels[index],
                                   suffix)


def pick(en, a, b):
  """Returns `a` when `en` is truthy, otherwise `b`."""
  return a if en else b


def arg_true_false_all(arg):
  """Converts a true/false/all argument into the list of booleans it selects.

  Returns:
    list : [True], [False], or [False, True] for the "all" spellings

  Raises:
    argparse.ArgumentTypeError : `arg` is not a recognized spelling
  """
  # (accepted spellings, resulting selection), checked in this order
  choices = (
    (('t', 'true', 'T', 'True', '1'), [True]),
    (('f', 'false', 'F', 'False', '0'), [False]),
    (('both', 'all', '*'), [False, True]),
  )
  for spellings, selection in choices:
    if arg in spellings:
      return selection
  raise argparse.ArgumentTypeError(f'Invalid true/false/all: {arg}')


================================================
FILE: calculon/version.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import calculon

class Version(calculon.CommandLine):
  """Command that logs the calculon version."""
  NAME = 'version'
  ALIASES = ['v']

  @staticmethod
  def create_parser(subparser):
    """Adds this command's sub-parser to `subparser`."""
    parser = subparser.add_parser(
      Version.NAME,
      aliases=Version.ALIASES,
      help='show the version and exit')
    parser.set_defaults(func=Version.run_command)

  @staticmethod
  def run_command(logger, args):
    """Logs calculon.__version__ through `logger`; `args` is not used."""
    # version is specified in __init__.py
    version = calculon.__version__
    logger.info(version)


# Hands the command class to calculon.CommandLine.register (defined outside
# this file) -- presumably so the command-line front end can dispatch to it.
calculon.CommandLine.register(Version)


================================================
FILE: examples/3072_t4_p64_d12_mbs4_full.json
================================================
{
  "num_procs": 3072,
  "tensor_par": 4,
  "pipeline_par": 64,
  "data_par": 12,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 3072,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": true,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 1,
  "optimizer_sharding": true,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: models/anthropic-52B.json
================================================
{
  "hidden": 8192,
  "feedforward": 32768,
  "seq_size": 8192,
  "attn_heads": 64,
  "attn_size": 128,
  "num_blocks": 64
}


================================================
FILE: models/chinchilla.json
================================================
{
  "hidden": 8192,
  "feedforward": 32768,
  "seq_size": 2048,
  "attn_heads": 64,
  "attn_size": 128,
  "num_blocks": 80
}


================================================
FILE: models/gopher-280B.json
================================================
{
  "hidden": 16384,
  "feedforward": 65536,
  "seq_size": 2048,
  "attn_heads": 128,
  "attn_size": 128,
  "num_blocks": 80
}


================================================
FILE: models/gpt3-13B.json
================================================
{
  "hidden": 5140,
  "feedforward": 20560,
  "seq_size": 2048,
  "attn_heads": 40,
  "attn_size": 128,
  "num_blocks": 40
}


================================================
FILE: models/gpt3-175B.json
================================================
{
  "hidden": 12288,
  "feedforward": 49152,
  "seq_size": 2048,
  "attn_heads": 96,
  "attn_size": 128,
  "num_blocks": 96
}


================================================
FILE: models/lamda.json
================================================
{
  "hidden": 8192,
  "feedforward": 65536,
  "seq_size": 2048,
  "attn_heads": 128,
  "attn_size": 128,
  "num_blocks": 64
}


================================================
FILE: models/megatron-126M.json
================================================
{
  "hidden": 768,
  "feedforward": 3072,
  "seq_size": 2048,
  "attn_heads": 16,
  "attn_size": 48,
  "num_blocks": 12
}


================================================
FILE: models/megatron-1T.json
================================================
{
  "hidden": 25600,
  "feedforward": 102400,
  "seq_size": 2048,
  "attn_heads": 160,
  "attn_size": 160,
  "num_blocks": 128
}


================================================
FILE: models/megatron-22B.json
================================================
{
  "hidden": 6144,
  "feedforward": 24576,
  "seq_size": 2048,
  "attn_heads": 64,
  "attn_size": 96,
  "num_blocks": 48
}


================================================
FILE: models/megatron-40B.json
================================================
{
  "hidden": 8192,
  "feedforward": 32768,
  "seq_size": 2048,
  "attn_heads": 64,
  "attn_size": 128,
  "num_blocks": 48
}


================================================
FILE: models/megatron-5B.json
================================================
{
  "hidden": 4096,
  "feedforward": 16384,
  "seq_size": 2048,
  "attn_heads": 32,
  "attn_size": 128,
  "num_blocks": 24
}


================================================
FILE: models/palm-540B.json
================================================
{
  "hidden": 18432,
  "feedforward": 73728,
  "seq_size": 2048,
  "attn_heads": 48,
  "attn_size": 256,
  "num_blocks": 118
}


================================================
FILE: models/turing-530B.json
================================================
{
  "hidden": 20480,
  "feedforward": 81920,
  "seq_size": 2048,
  "attn_heads": 128,
  "attn_size": 160,
  "num_blocks": 105
}


================================================
FILE: pylintrc
================================================
[MESSAGES CONTROL]
disable=locally-disabled,
	too-many-branches,
	too-many-instance-attributes,
	too-many-return-statements,
	duplicate-code,
	too-many-arguments,
	no-method-argument

[FORMAT]
indent-string='  '
indent-after-paren=2

[DESIGN]
min-public-methods=0
max-public-methods=9999

================================================
FILE: pyproject.toml
================================================
[build-system]
requires = [
    "setuptools>=42",
    "wheel"
]
build-backend = "setuptools.build_meta"


================================================
FILE: scripts/3dplot.py
================================================
#!/usr/bin/env python3

import argparse
import calculon
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import numpy as np


def main(args):
  """Shows sample rate as a 3D surface over tensor/pipeline parallelism.

  Reads the JSON stats file named by ``args.stats``. The file is indexed as
  ``data[tp][pp]``; an entry contributes a point only when it has a
  ``'stats'`` member, from which ``['stats']['sample_rate']`` is plotted.
  Missing combinations stay NaN. ``args.title``, when set, becomes the plot
  title. The figure is displayed interactively with ``plt.show()``.

  Raises:
    AssertionError: if fewer than two distinct TP or PP values are present.
  """
  raw = calculon.io.read_json_file(args.stats)

  # JSON object keys are strings; turn them back into integers.
  data = {
    int(tp): {int(pp): entry for pp, entry in by_pp.items()}
    for tp, by_pp in raw.items()
  }
  tps = sorted(data)
  pps = sorted({pp for by_pp in data.values() for pp in by_pp})
  assert len(tps) > 1, f'len(tps)={len(tps)} can\'t plot'
  assert len(pps) > 1, f'len(pps)={len(pps)} can\'t plot'

  # Position of every TP/PP value along its axis (avoids repeated list.index).
  tp_col = {tp: i for i, tp in enumerate(tps)}
  pp_row = {pp: i for i, pp in enumerate(pps)}

  # Gathers data; rows follow PP and columns follow TP to match the meshgrid.
  fdata = np.full((len(pps), len(tps)), float('NaN'))
  for tp, by_pp in data.items():
    for pp, entry in by_pp.items():
      if 'stats' in entry:
        v = entry['stats']['sample_rate']
        fdata[pp_row[pp]][tp_col[tp]] = v
        print(f'{tp},{pp} is {v}')
      else:
        print(f'{tp},{pp} has none')

  fig = plt.figure()
  ax = fig.add_subplot(111, projection='3d')
  X, Y = np.meshgrid(list(range(len(tps))), list(range(len(pps))))
  ax.plot_surface(X, Y, fdata, rstride=1, cstride=1,
                  cmap='rainbow',
                  edgecolor='none')
  ax.set_xlabel('Tensor Parallelism')
  ax.set_ylabel('Pipeline Parallelism')
  ax.set_zlabel('Sample Rate (s/sec)')
  if args.title:
    ax.set_title(args.title)
  ax.view_init(20, 180+25)

  # The surface is drawn against list indices, so label each integer tick
  # with the actual parallelism value. (Deriving labels as 2**index is only
  # right when the values are exactly 1, 2, 4, ... and produces duplicate or
  # bogus labels on fractional and negative ticks.)
  ax.set_xticks(np.arange(len(tps)))
  ax.set_xticklabels(tps)
  ax.set_yticks(np.arange(len(pps)))
  ax.set_yticklabels(pps)
  fig.tight_layout()
  plt.show()


if __name__ == '__main__':
  # Command-line entry point: one positional stats file, optional plot title.
  parser = argparse.ArgumentParser()
  parser.add_argument('stats', type=str, help='File path to stats input')
  parser.add_argument('-t', '--title', type=str, default=None,
                      help='Title of plot')
  main(parser.parse_args())


================================================
FILE: scripts/find_huge.py
================================================
#!/usr/bin/env python3

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import tol_colors as tc

########## Utils ##########
def transformer_attn_size(hidden, layers, attn_size_step=32):
  """Returns hidden / layers rounded to a multiple of attn_size_step."""
  per_layer = hidden / layers
  return step_rounder(per_layer, attn_size_step)

def transformer_num_parameters(hidden, layers, attn_size_step=32):
  """Returns the MLP plus attention parameter count used by this script.

  The number of attention heads is taken to be equal to `layers`, and the
  per-head size comes from transformer_attn_size(). When heads * attn_size
  equals `hidden` the total reduces to 12 * layers * hidden**2.
  """
  heads = layers
  head_size = transformer_attn_size(hidden, layers, attn_size_step)
  attn_params = 4 * layers * hidden * heads * head_size
  mlp_params = 8 * layers * hidden ** 2
  total = mlp_params + attn_params
  return total

def transformer_t_params(hidden, layers):
  """Returns transformer_num_parameters(hidden, layers) divided by 10**12."""
  count = transformer_num_parameters(hidden, layers)
  return count / 10**12

def step_rounder(layer, step=1):
  """Rounds `layer` to a multiple of `step` using np.round on layer/step."""
  multiples = np.round(layer / step)
  return multiples * step

def model_ratio(hidden, layers):
  """Returns the hidden-size to layer-count ratio (true division)."""
  ratio = hidden / layers
  return ratio

def human_format(value, v_type='base10', precision=3):
  """Formats a number with a metric-style prefix and a unit suffix.

  Args:
    value: number to format, or None.
    v_type: one of 'base10', 'base2', 'bytes', 'bandwidth', 'flops' or
      'throughput'; selects the scaling step (1000 or 1024) and the suffix.
    precision: number of digits after the decimal point.

  Returns:
    A string such as '1.500k' or '-2.0kiB'. For a None value the result is
    'n/a ' followed by the suffix.

  Raises:
    ValueError: if v_type is not one of the supported names.
  """
  # v_type -> (scaling step, unit suffix)
  units = {
    'base10': (1000, ''),
    'base2': (1024, ''),
    'bytes': (1024, 'iB'),
    'bandwidth': (1000, 'B/s'),
    'flops': (1000, 'Ops'),
    'throughput': (1000, 'Op/s'),
  }
  if v_type not in units:
    raise ValueError(
      f"Type value should be 'base10', 'base2', 'bytes', 'flops', "
      f"'bandwidth', or 'throughput'. You gave {v_type}")
  step, suffix = units[v_type]
  labels = ['', 'k', 'M', 'G', 'T', 'P', 'E']

  if value is None:
    return "n/a {0}{1}".format(labels[0], suffix)

  sign = 1 if value >= 0 else -1
  abs_value = abs(value)
  index = 0
  # Stop at the largest known prefix; values beyond it are expressed as a
  # large multiple of that prefix instead of indexing past the label list.
  while abs_value >= step and index < len(labels) - 1:
    abs_value /= step
    index += 1
  return "{0:.{1}f}{2}{3}".format(
    sign * abs_value, precision, labels[index], suffix)

########## Scale rules with ratio ##########
def ratio_layer_scale(hidden, ratio=128, step=4):
  """Returns hidden / ratio rounded to a multiple of `step`."""
  unrounded_layers = hidden / ratio
  return step_rounder(unrounded_layers, step=step)
def ratio_hidden_scale(layers, ratio=128, step=4096):
  """Returns layers * ratio rounded to a multiple of `step`."""
  unrounded_hidden = layers * ratio
  return step_rounder(unrounded_hidden, step=step)
def ratio_param_layer_scale(layers, ratio=128, step=4096):
  """Returns the parameter count for `layers` with hidden size derived from the ratio."""
  hidden = ratio_hidden_scale(layers, ratio=ratio, step=step)
  return transformer_num_parameters(hidden, layers)
def ratio_param_hidden_scale(hidden, ratio=128, step=4):
  """Returns the parameter count for `hidden` with layer count derived from the ratio."""
  layers = ratio_layer_scale(hidden, ratio=ratio, step=step)
  return transformer_num_parameters(hidden, layers)


# Sweep definition: every (hidden size, block count) pair on this grid is
# evaluated for parameter count and hidden/blocks ratio.
hidden_step = 1024
layer_step = 32
hiddens = [x for x in range(24*1024, 8192*24 + 1, hidden_step)]
layers = [x for x in range(128, 576 + 1, layer_step)]
# Target ratio per block count is linear in the column index: 192 at the
# first entry (128 blocks), rising to 320 at 512 blocks, and extrapolated
# along the same line for the larger block counts.
slope = (320-192) / ((512-128)/layer_step)
y_intercept = 192
targets = [slope * x + y_intercept for x in range(len(layers))]
#targets = [200 for x in range(len(layers))]
hiddens = np.asarray(hiddens)
layers = np.asarray(layers)
# Grids are indexed [hidden row][layer column].
params_grid = np.zeros((hiddens.shape[0], layers.shape[0]), dtype="float")
ratio_grid = np.zeros((hiddens.shape[0], layers.shape[0]), dtype="float")
target_ratio_grid = np.zeros((hiddens.shape[0], layers.shape[0]), dtype="float")
for row, h in enumerate(hiddens):
  for col, l in enumerate(layers):
    params_grid[row][col] = transformer_num_parameters(h, l)
    ratio = model_ratio(h, l)
    ratio_grid[row][col] = ratio
    # 1.0 means the configuration hits the target ratio for this column.
    target_ratio_grid[row][col] = ratio / targets[col]


fig = plt.figure(figsize=(16, 16), dpi=200)
ax = fig.add_subplot(1, 1, 1)

# Color encodes how far each cell is from its target ratio (centered on 1.0).
im = ax.imshow(target_ratio_grid, cmap=tc.tol_cmap('BuRd'),
               vmin=.5, vmax=1.5, origin='lower')#, aspect=0.8)
ax.set_xlabel('# of blocks')
ax.set_ylabel('Hidden size')

# Show all ticks and label them with the respective list entries
ax.set_yticks(np.arange(hiddens.shape[0]))
ax.set_xticks(np.arange(layers.shape[0]))
ax.set_yticklabels(hiddens)
ax.set_xticklabels(layers)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Loop over data dimensions and create text annotations.
# Also prints one CSV row per block count describing the hidden size whose
# ratio is closest to the target for that column.
print('name,hidden,feedforward,seq_size,attn_heads,attn_size,num_blocks,gbs,ratio')
for col, l in enumerate(layers):
  # Find the row closest to the target; the strict '<' keeps the first of
  # any equally good rows.
  best_val = 9999
  best_row = None
  for row, h in enumerate(hiddens):
    val = abs(target_ratio_grid[row][col] - 1)
    if val < best_val:
      best_val = val
      best_row = row
  for row, h in enumerate(hiddens):
    # Cell annotation: parameter count on the first line, ratio on the second.
    result = human_format(params_grid[row][col], precision=0)
    result += "\n"
    result += human_format(ratio_grid[row][col], precision=0)
    weight = 'bold' if row == best_row else None
    text = ax.text(col, row, result, ha="center", va="center", color="k", size=8, weight=weight)
    if row == best_row:
      # NOTE(review): attn_size here is rounded with step 1, while
      # transformer_num_parameters() rounds it to a multiple of 32 by
      # default, so the printed attn_size may not match the one behind the
      # printed parameter count -- confirm which is intended.
      attn_size = int(step_rounder(hiddens[row] / layers[col]))
      params = human_format(transformer_num_parameters(hiddens[row], layers[col]), precision=0)
      ratio = hiddens[row] / layers[col]
      # Columns follow the header above; feedforward is 4x hidden, and the
      # seq_size (8192) and gbs (3072) values are fixed constants.
      print(f'{params},{hiddens[row]},{hiddens[row]*4},8192,{layers[col]},{attn_size},{layers[col]},3072,{ratio}')

# NOTE(review): the script stops here, so everything below is unreachable
# and 'huge.png' is never written. Looks like a deliberate short-circuit to
# get only the CSV output -- confirm before removing either part.
exit(0)
ax.spines[:].set_visible(False)
ax.set_xticks(np.arange(params_grid.shape[1]+1)-.5, minor=True)
ax.set_yticks(np.arange(params_grid.shape[0]+1)-.5, minor=True)
ax.grid(which="minor", color="w", linestyle='-', linewidth=2)
ax.tick_params(which="minor", bottom=False, left=False)
ax.set_title("Number of parameters in trillions, and model ratio, colored by ratio")

fig.tight_layout()
fig.savefig('huge.png')
plt.close(fig)


================================================
FILE: scripts/heatmap.py
================================================
#!/usr/bin/env python3

import argparse
import calculon
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import numpy as np
import tol_colors as tc


def main(args):
  """Writes a tensor/pipeline parallelism heatmap of sample rate.

  Reads the JSON stats file named by ``args.stats``. The file is indexed as
  ``data[tp][pp]``; an entry contributes a cell only when it has a
  ``'stats'`` member, from which ``['stats']['sample_rate']`` is taken.
  Missing combinations stay NaN. The figure is saved to ``args.output`` and
  titled with ``args.title`` when that is set.

  Raises:
    AssertionError: if fewer than two distinct TP or PP values are present.
  """
  raw = calculon.io.read_json_file(args.stats)

  # JSON object keys are strings; turn them back into integers.
  data = {
    int(tp): {int(pp): entry for pp, entry in by_pp.items()}
    for tp, by_pp in raw.items()
  }
  tps = sorted(data)
  pps = sorted({pp for by_pp in data.values() for pp in by_pp})
  assert len(tps) > 1, f'len(tps)={len(tps)} can\'t plot'
  assert len(pps) > 1, f'len(pps)={len(pps)} can\'t plot'

  # Position of every TP/PP value along its axis (avoids repeated list.index).
  tp_row = {tp: i for i, tp in enumerate(tps)}
  pp_col = {pp: i for i, pp in enumerate(pps)}

  # Gathers data; rows follow TP and columns follow PP.
  fdata = np.full((len(tps), len(pps)), float('NaN'))
  for tp, by_pp in data.items():
    for pp, entry in by_pp.items():
      if 'stats' in entry:
        v = entry['stats']['sample_rate']
        fdata[tp_row[tp]][pp_col[pp]] = v
        print(f'{tp},{pp} is {v}')
      else:
        print(f'{tp},{pp} has none')

  # Determines range. Cells without stats are NaN, and the builtin min/max
  # give order-dependent results in the presence of NaN, so use the
  # NaN-ignoring reductions instead.
  minf = np.nanmin(fdata)
  maxf = np.nanmax(fdata)
  black_threshold = minf + (maxf - minf) * 0.30
  print(f'min={minf} max={maxf} thres={black_threshold}')

  # Creates the plot
  fig = plt.figure()
  ax = fig.add_subplot(1, 1, 1)
  ax.imshow(fdata, origin='lower', cmap='hot')#, linewidth=0.5)
  ax.set_xticks(np.arange(len(pps)), labels=pps)
  ax.set_xlabel('Pipeline Parallelism')
  ax.set_yticks(np.arange(len(tps)), labels=tps)
  ax.set_ylabel('Tensor Parallelism')
  for row in range(len(tps)):
    for col in range(len(pps)):
      perf = fdata[row, col]
      # Dark text on the bright (high) cells, light text on the dark ones.
      color = 'black' if perf > black_threshold else 'white'
      ax.text(col, row, f'{perf:.1f}', ha='center', va='center',
              color=color)
  if args.title:
    ax.set_title(args.title)
  print(f'writing {args.output}')
  fig.tight_layout()
  fig.savefig(args.output)
  plt.close(fig)


if __name__ == '__main__':
  # Command-line entry point: stats input, plot output, optional title.
  parser = argparse.ArgumentParser()
  parser.add_argument('stats', type=str, help='File path to stats input')
  parser.add_argument('output', type=str, help='Output plot file')
  parser.add_argument('-t', '--title', type=str, default=None,
                      help='Title of plot')
  main(parser.parse_args())


================================================
FILE: scripts/install_hooks.sh
================================================
#!/bin/bash
#
# Installs a Git pre-commit hook that runs `make test` and rejects the commit
# when the tests fail. The hook path is relative, so run this from the
# repository root.

set -e

# Pre-commit hook
# The heredoc delimiter is quoted so the hook body is written out verbatim,
# with no parameter or command expansion at install time.
cat > .git/hooks/pre-commit <<'EOF'
#!/bin/bash
echo -n "Testing..."
if ! make test &> /dev/null; then
    echo " failed :("
    # Any non-zero status aborts the commit; 1 is a valid exit status (-1 is not).
    exit 1
fi
# Terminate the "Testing..." line on success as well.
echo " passed"
EOF

chmod a+x .git/hooks/pre-commit


================================================
FILE: scripts/json_to_csv.py
================================================
#!/usr/bin/env python3

import argparse
import calculon
import gzip
import json
import sys


def main(args):
  """Converts a two-level JSON stats file into a CSV file.

  The input is read with calculon.read_json_file(args.json_file). Columns
  are taken from entry '0': one column per key of each of its categories.
  Output goes to args.csv_file, gzip-compressed when the name ends in '.gz'.
  Each line starts with the entry name (empty for the header) and every cell
  is followed by a comma, with one more comma before the newline.
  """
  j = calculon.read_json_file(args.json_file)

  # (category, key) pairs in the order they appear in entry '0'.
  header_entries = [
    (category, key) for category in j['0'] for key in j['0'][category]]

  def encode(text):
    return bytes(text, 'utf-8')

  if args.csv_file.endswith('.gz'):
    opener = gzip.open
  else:
    opener = open

  with opener(args.csv_file, 'wb') as fd:
    # Header
    header_cells = ''.join(f'{key},' for _, key in header_entries)
    fd.write(encode(f',{header_cells},\n'))

    # Rows
    for entry in j.keys():
      row_cells = ''.join(
        f'{j[entry][category][key]},' for category, key in header_entries)
      fd.write(encode(f'{entry},{row_cells},\n'))

if __name__ == '__main__':
  # Command-line entry point: input JSON path and output CSV path.
  parser = argparse.ArgumentParser()
  parser.add_argument('json_file', help='input JSON file')
  parser.add_argument('csv_file', help='output CSV file')
  cli_args = parser.parse_args()
  sys.exit(main(cli_args))


================================================
FILE: setup.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""

import codecs
import re
import os
import sys

# setuptools is required to build/install; give a helpful hint when it is
# missing. Only ImportError is handled so that Ctrl-C or other unexpected
# failures are not misreported as a missing package.
try:
  from setuptools import setup
except ImportError:
  print('please install setuptools via pip:')
  print('  pip3 install setuptools')
  sys.exit(-1)

def find_version(*file_paths):
  """Extracts the ``__version__`` string from a source file.

  Args:
    *file_paths: path components, joined onto the directory containing this
      file, that locate the file to scan.

  Returns:
    The text between the quotes of the first line of the form
    ``__version__ = '...'`` (single or double quotes).

  Raises:
    RuntimeError: if no such line is found.
  """
  here = os.path.abspath(os.path.dirname(__file__))
  # Read inside a with-block so the file handle is closed deterministically.
  with open(os.path.join(here, *file_paths), 'r') as version_fd:
    version_file = version_fd.read()
  version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                            version_file, re.M)
  if version_match:
    return version_match.group(1)
  raise RuntimeError("Unable to find version string.")


# Package metadata handed to setuptools; the version is read from
# calculon/__init__.py by find_version().
_package_metadata = {
  'name': 'calculon',
  'version': find_version('calculon', '__init__.py'),
  'description': 'Co-design for large scale parallel applications',
  'author': 'Michael Isaev',
  'author_email': 'michael.v.isaev@gmail.com',
  'license': 'Apache 2',
  'url': 'http://github.com/calculon-ai/calculon',
  'packages': ['calculon', 'calculon.llm'],
  'scripts': ['bin/calculon'],
  'install_requires': [],
}

setup(**_package_metadata)


================================================
FILE: systems/a100_80e.json
================================================
{
  "matrix" : {
    "float16": {
      "tflops": 312,
      "gflops_efficiency": [
        [128, 0.99],
        [16, 0.9],
        [1, 0.6],
        [0, 0.1]
      ]
    }
  },
  "vector": {
    "float16": {
      "tflops": 78,
      "gflops_efficiency": [
        [16, 0.95],
        [1, 0.5],
        [0, 0.1]
      ]
    }
  },
  "mem1": {
    "GiB": 80000000000,
    "GBps": 2048,
    "MB_efficiency": [
      [100, 0.95],
      [10, 0.90],
      [1, 0.7],
      [0, 0.3]
    ]
  },
  "mem2": {
    "GiB": 512,
    "GBps": 32,
    "MB_efficiency": [
      [100, 0.95],
      [10, 0.9],
      [1, 0.7],
      [0, 0.3]
    ]
  },
  "processing_mode": "no_overlap",
  "networks": [
    {
      "bandwidth": 300,
      "efficiency": 0.65,
      "size": 8,
      "latency": 0.00001,
      "ops": {
        "p2p": [1.0, null],
        "reduce_scatter": [1.5, -1],
        "all_gather": [1.5, -1],
        "all_reduce": [2.0, -1]
      },
      "must_be_filled": true,
      "processor_usage": 0.15
    },{
      "bandwidth": 25,
      "efficiency": 0.9,
      "size": 65536,
      "latency": 0.00002,
      "ops": {
        "p2p": [1.0, null],
        "reduce_scatter": [1.0, 0],
        "all_gather": [1.0, 0],
        "all_reduce": [1.0, 0]
      },
      "must_be_filled": false,
      "processor_usage": 0.02
    }
  ]
}


================================================
FILE: systems/a100_80g.json
================================================
{
  "matrix" : {
    "float16": {
      "tflops": 312,
      "gflops_efficiency": [
        [128, 0.95],
        [16, 0.9],
        [1, 0.6],
        [0, 0.1]
      ]
    }
  },
  "vector": {
    "float16": {
      "tflops": 78,
      "gflops_efficiency": [
        [16, 0.95],
        [1, 0.5],
        [0, 0.1]
      ]
    }
  },
  "mem1": {
    "GiB": 80,
    "GBps": 2048,
    "MB_efficiency": [
      [100, 0.90],
      [10, 0.75],
      [1, 0.6],
      [0, 0.3]
    ]
  },
  "mem2": {
    "GiB": 512,
    "GBps": 32,
    "MB_efficiency": [
      [100, 0.95],
      [10, 0.9],
      [1, 0.7],
      [0, 0.3]
    ]
  },
  "processing_mode": "no_overlap",
  "networks": [
    {
      "bandwidth": 300,
      "efficiency": 0.65,
      "size": 8,
      "latency": 0.00001,
      "ops": {
        "p2p": [1.0, null],
        "reduce_scatter": [1.5, -1],
        "all_gather": [1.5, -1],
        "all_reduce": [2.0, -1]
      },
      "must_be_filled": true,
      "processor_usage": 0.15
    },{
      "bandwidth": 25,
      "efficiency": 0.9,
      "size": 65536,
      "latency": 0.00002,
      "ops": {
        "p2p": [1.0, null],
        "reduce_scatter": [1.0, 0],
        "all_gather": [1.0, 0],
        "all_reduce": [1.0, 0]
      },
      "must_be_filled": false,
      "processor_usage": 0.02
    }
  ]
}


================================================
FILE: systems/h100_80g_nvl8.json
================================================
{
  "matrix": {
    "float8": {
      "tflops": 2000,
      "gflops_efficiency": [
        [128, 0.95],
        [16, 0.9],
        [1, 0.6],
        [0, 0.1]
      ]
    },
    "float16": {
      "tflops": 1000,
      "gflops_efficiency": [
        [128, 0.95],
        [16, 0.9],
        [1, 0.6],
        [0, 0.1]
      ]
    }
  },
  "vector": {
    "float8": {
      "tflops": 120,
      "gflops_efficiency": [
        [16, 0.95],
        [1, 0.5],
        [0, 0.1]
      ]
    },
    "float16": {
      "tflops": 120,
      "gflops_efficiency": [
        [16, 0.95],
        [1, 0.5],
        [0, 0.1]
      ]
    }
  },
  "mem1": {
    "GiB": 80,
    "GBps": 3072,
    "MB_efficiency": [
      [100, 0.90],
      [10, 0.75],
      [1, 0.6],
      [0, 0.3]
    ]
  },
  "mem2": {
    "GiB": 512,
    "GBps": 450,
    "MB_efficiency": [
      [100, 0.95],
      [10, 0.9],
      [1, 0.7],
      [0, 0.3]
    ]
  },
  "processing_mode": "no_overlap",
  "networks": [
    {
      "bandwidth": 450,
      "efficiency": 0.65,
      "size": 8,
      "latency": 0.00001,
      "ops": {
        "p2p": [1.0, null],
        "reduce_scatter": [1.0, 0],
        "all_gather": [1.0, 0],
        "all_reduce": [1.0, 1]
      },
      "must_be_filled": true,
      "processor_usage": 0.15
    },{
      "bandwidth": 50,
      "efficiency": 0.9,
      "size": 65536,
      "latency": 0.00002,
      "ops": {
        "p2p": [1.0, null],
        "reduce_scatter": [1.0, 0],
        "all_gather": [1.0, 0],
        "all_reduce": [1.0, 0]
      },
      "must_be_filled": false,
      "processor_usage": 0.02
    }
  ]
}


================================================
FILE: test/__init__.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""


================================================
FILE: test/test.sh
================================================
#!/bin/bash
#
# Smoke test for the calculon command line. Every sub-command is exercised
# in turn; because of `set -e` the script stops at the first failing command.
# Paths are relative, so this is expected to run from the repository root.

set -e

# Lets ./bin/calculon import the package from the working tree.
export PYTHONPATH=.

# CLI interface infrastructure
echo -e "### Testing top level --help"
./bin/calculon --help > /dev/null
# The sub-command names are scraped from the second line of the --help
# output, which is expected to hold a "{a,b,c}" style list (braces and commas
# are turned into spaces) -- presumably the argparse usage line.
commands=$(./bin/calculon --help | head -n 2 | tail -n 1 | tr '{' ' ' | tr '}' ' ' | tr ',' ' ')
for command in $commands; do
    # The version commands are run directly; all others only get --help.
    if [ $command == 'v' ] || [ $command == 'version' ]; then
	echo -e "### Testing \"$command\""
	./bin/calculon $command
    else
	echo -e "### Testing \"$command\" --help"
	./bin/calculon $command --help > /dev/null
    fi
done
echo -e "\n\n"

# Model size calculations
echo -e "### Testing llm-parameter-calculator"
for model in models/*json; do
    ./bin/calculon llm-parameter-calculator -a 15 $model
done
echo -e "\n\n"

# Model tests
# Each model is run twice against the same example execution and system:
# once with "-" as the stats argument (output discarded) and once writing
# stats and peers files under /tmp.
echo -e "### Testing llm"
for model in models/*json; do
    echo $model
    ./bin/calculon llm $model examples/3072_t4_p64_d12_mbs4_full.json systems/a100_80e.json - > /dev/null
    ./bin/calculon llm $model examples/3072_t4_p64_d12_mbs4_full.json systems/a100_80e.json /tmp/calculon_stats.json -p /tmp/calculon_peers.json
done
echo -e "\n\n"

# Llm validation
echo -e "### Testing llm-validation"
./bin/calculon lv -v
echo -e "\n\n"

# Llm optimal execution
echo -e "### Testing llm-optimal-execution (float16) (using -f)"
./bin/calculon loe models/turing-530B.json 5128 2520 float16 systems/h100_80g_nvl8.json /tmp/calculon_530B_fp16.json -t 3 -f False --no-tp-overlap
echo -e "\n"

echo -e "### Testing llm-optimal-execution (float8) (using -m)"
./bin/calculon loe models/turing-530B.json 5128 2520 float8 systems/h100_80g_nvl8.json /tmp/calculon_530B_fp8.csv.gz -t 10 -m
echo -e "\n\n"

# Llm all executions
echo -e "### Testing llm-all-executions (float8)"
./bin/calculon lae models/turing-530B.json 5128 2520 float8 systems/h100_80g_nvl8.json /tmp/calculon_530B_fp8_all.csv.gz
echo -e "\n\n"


================================================
FILE: test/test_json_write_read.py
================================================
"""
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  https://www.apache.org/licenses/LICENSE-2.0
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
"""
import calculon
import os
import tempfile
import unittest


class JsonWriteReadTestCase(unittest.TestCase):
  """Round-trips a small document through calculon's JSON file helpers."""

  @staticmethod
  def _remove_if_present(path):
    """Deletes `path` when it exists; does nothing otherwise."""
    if os.path.exists(path):
      os.remove(path)

  def _unused_temp_path(self, suffix):
    """Returns a temporary path ending in `suffix` that is not on disk.

    mkstemp() returns an open file descriptor along with the path; the
    descriptor is closed here so it is not leaked, and the placeholder file
    is removed. A cleanup is registered so anything the test later writes to
    the path is deleted even when an assertion fails.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    os.remove(path)
    self.addCleanup(self._remove_if_present, path)
    return path

  def test_json_read_write(self):
    """Plain and gzip-compressed JSON files both read back unchanged."""
    jd = {
      'a': 1239,
      'hi': {
        '34': 'world',
        'ugh': 77,
        '1': 'hello world world world world world world world world world world'
      }
    }

    reg_file = self._unused_temp_path('.json')
    gz_file = self._unused_temp_path('.json.gz')
    foo_file = self._unused_temp_path('.json.foo')
    bar_file = self._unused_temp_path('.bar.gz')

    # Only '.json' and '.json.gz' names are accepted as JSON extensions.
    self.assertTrue(calculon.is_json_extension(reg_file))
    self.assertTrue(calculon.is_json_extension(gz_file))
    self.assertFalse(calculon.is_json_extension(foo_file))
    self.assertFalse(calculon.is_json_extension(bar_file))

    self.assertFalse(os.path.exists(reg_file))
    self.assertFalse(os.path.exists(gz_file))

    calculon.io.write_json_file(jd, reg_file)
    calculon.io.write_json_file(jd, gz_file)

    self.assertTrue(os.path.exists(reg_file))
    self.assertTrue(os.path.exists(gz_file))

    # The repetitive payload must come out smaller in the .gz file.
    reg_size = os.path.getsize(reg_file)
    gz_size = os.path.getsize(gz_file)
    self.assertTrue(reg_size > 0)
    self.assertTrue(reg_size > gz_size)
    self.assertTrue(gz_size > 0)

    reg_jd = calculon.io.read_json_file(reg_file)
    gz_jd = calculon.io.read_json_file(gz_file)

    self.assertEqual(reg_jd, jd)
    self.assertEqual(gz_jd, jd)


================================================
FILE: validation/seqsel/fig1/gpt3-175B_none.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig1/gpt3-175B_seqsel.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig1/megatron-1T_none.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig1/megatron-1T_seqsel.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation":  false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig1/megatron-22B_none.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig1/megatron-22B_seqsel.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig1/turing-530B_none.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig1/turing-530B_seqsel.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/gpt3-175B_full.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/gpt3-175B_none.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/gpt3-175B_sel.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/gpt3-175B_seq.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/gpt3-175B_seqsel.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-1T_full.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-1T_none.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-1T_sel.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-1T_seq.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-1T_seqsel.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-22B_full.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-22B_none.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-22B_sel.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-22B_seq.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/megatron-22B_seqsel.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/turing-530B_full.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/turing-530B_none.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/turing-530B_sel.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/turing-530B_seq.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "none",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/fig7/turing-530B_seqsel.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/gpt3-175B_full.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/gpt3-175B_seqsel.json
================================================
{
  "num_procs": 64,
  "tensor_par": 8,
  "pipeline_par": 8,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 64,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/megatron-1T_full.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/megatron-1T_seqsel.json
================================================
{
  "num_procs": 512,
  "tensor_par": 8,
  "pipeline_par": 64,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 512,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/megatron-22B_full.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/megatron-22B_seqsel.json
================================================
{
  "num_procs": 8,
  "tensor_par": 8,
  "pipeline_par": 1,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 4,
  "microbatch_size": 4,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 1,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/turing-530B_full.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "full",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "ar",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": false,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}


================================================
FILE: validation/seqsel/tab5/turing-530B_seqsel.json
================================================
{
  "num_procs": 280,
  "tensor_par": 8,
  "pipeline_par": 35,
  "data_par": 1,
  "tensor_par_net": 0,
  "pipeline_par_net": 1,
  "data_par_net": 1,
  "batch_size": 280,
  "microbatch_size": 1,
  "datatype": "float16",
  "fused_activation": false,
  "attention_type": "multihead",
  "activation_recompute": "attn_only",
  "pipeline_interleaving": 3,
  "optimizer_sharding": false,
  "tensor_par_comm_type": "rs_ag",
  "tensor_par_overlap": "none",
  "seq_par_ag_redo": true,
  "data_par_overlap": false,
  "weight_offload": false,
  "activations_offload": false,
  "optimizer_offload": false,
  "training": true
}