Showing preview only (2,072K chars total). Download the full file or copy to clipboard to get everything.
Repository: PrincetonUniversity/LLMCompass
Branch: main
Commit: 2e015fd2ee75
Files: 111
Total size: 2.0 MB
Directory structure:
gitextract_pu2g2804/
├── .gitignore
├── .gitmodules
├── Dockerfile
├── LICENSE
├── README.md
├── __init__.py
├── ae/
│ ├── .gitignore
│ ├── __init__.py
│ ├── figure10/
│ │ ├── __init__.py
│ │ ├── plot_latency.py
│ │ ├── run_figure10.sh
│ │ └── test_latency.py
│ ├── figure11/
│ │ ├── __init__.py
│ │ ├── plot_decoding.py
│ │ ├── run_figure11.sh
│ │ └── test_decoding.py
│ ├── figure12/
│ │ ├── __init__.py
│ │ ├── plot_throughput.py
│ │ ├── run_figure12.sh
│ │ └── test_throughput.py
│ ├── figure5/
│ │ ├── __init__.py
│ │ ├── ab/
│ │ │ ├── __init__.py
│ │ │ ├── plot_matmul.py
│ │ │ ├── real_hardware/
│ │ │ │ ├── matmul_A100.csv
│ │ │ │ └── matmul_MI210.csv
│ │ │ ├── run.sh
│ │ │ └── test_matmul.py
│ │ ├── cf/
│ │ │ ├── __init__.py
│ │ │ ├── plot_softmax.py
│ │ │ ├── real_hardware/
│ │ │ │ ├── softmax_A100.csv
│ │ │ │ └── softmax_MI210.csv
│ │ │ ├── run.sh
│ │ │ └── test_softmax.py
│ │ ├── de/
│ │ │ ├── __init__.py
│ │ │ ├── plot_layernorm.py
│ │ │ ├── real_hardware/
│ │ │ │ ├── layernorm_A100.csv
│ │ │ │ └── layernorm_MI210.csv
│ │ │ ├── run.sh
│ │ │ └── test_layernorm.py
│ │ ├── g/
│ │ │ ├── __init__.py
│ │ │ ├── plot_gelu.py
│ │ │ ├── real_hardware/
│ │ │ │ ├── gelu_A100.csv
│ │ │ │ └── gelu_MI210.csv
│ │ │ ├── run.sh
│ │ │ └── test_gelu.py
│ │ ├── h/
│ │ │ ├── __init__.py
│ │ │ ├── run.sh
│ │ │ └── test_allreduce.py
│ │ ├── ijkl/
│ │ │ ├── __init__.py
│ │ │ ├── plot_transformer.py
│ │ │ ├── real_hardware/
│ │ │ │ ├── transformerAR_A100.csv
│ │ │ │ └── transformer_A100.csv
│ │ │ ├── run.sh
│ │ │ └── test_transformer.py
│ │ └── run_figure5.sh
│ ├── figure6/
│ │ ├── real_hardware/
│ │ │ └── die_area.csv
│ │ ├── run_figure6.sh
│ │ └── test_cost_model.py
│ ├── figure7/
│ │ ├── __init__.py
│ │ ├── change_core_size.py
│ │ ├── plot_core_size.py
│ │ └── run_figure7.sh
│ ├── figure8/
│ │ ├── __init__.py
│ │ ├── change_memory_bw.py
│ │ ├── plot_memory_bw.py
│ │ └── run_figure8.sh
│ └── figure9/
│ ├── __init__.py
│ ├── change_l1_cache.py
│ ├── plot_l1_cache.py
│ └── run_figure9.sh
├── configs/
│ ├── GA100.json
│ ├── ga102_template.json
│ ├── generation_system.json
│ ├── latency_design.json
│ ├── mi210.json
│ ├── mi210_template.json
│ ├── prefilling_system.json
│ └── template.json
├── cost_model/
│ ├── __init__.py
│ ├── cost_examples.py
│ ├── cost_model.py
│ └── regfile_area.py
├── design_space_exploration/
│ ├── __init__.py
│ └── dse.py
├── docs/
│ └── run.md
├── environment.yml
├── hardware_model/
│ ├── __init__.py
│ ├── arch_template.py
│ ├── compute_module.py
│ ├── device.py
│ ├── interconnect.py
│ ├── io_module.py
│ ├── memory_module.py
│ └── system.py
├── software_model/
│ ├── __init__.py
│ ├── communication_primitives.py
│ ├── gelu.py
│ ├── layernorm.py
│ ├── matmul.py
│ ├── operators.py
│ ├── softmax.py
│ ├── transformer.py
│ └── utils.py
├── systolic_array_model/
│ ├── look_up_table.csv
│ ├── look_up_table_128_128.csv
│ ├── look_up_table_16_16.csv
│ ├── look_up_table_32_32.csv
│ ├── look_up_table_64_64.csv
│ ├── look_up_table_8_8.csv
│ └── look_up_table_old.csv
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
__pycache__/
SlurmOutput/
temp/
.vscode/
*.ncu-rep
================================================
FILE: .gitmodules
================================================
[submodule "cost_model/supply_chain"]
path = cost_model/supply_chain
url = https://github.com/PrincetonUniversity/ttm-cas.git
================================================
FILE: Dockerfile
================================================
# Start with a base image that includes Miniconda to manage our environment
FROM continuumio/miniconda3
# Set the working directory in the container to /app
WORKDIR /app
# Create the conda environment from the checked-in spec
COPY environment.yml /app/environment.yml
RUN conda env create -f /app/environment.yml
# Initialize conda in bash shell (interactive shells get the AE env)
RUN echo "source activate llmcompass_ae" > ~/.bashrc
# Put the env's binaries first on PATH for non-interactive RUN/CMD steps.
# Uses the ENV key=value form; the legacy space-separated form is deprecated.
ENV PATH=/opt/conda/envs/llmcompass_ae/bin:$PATH
# Clone the AE repository, then fetch its submodule (cost_model/supply_chain)
RUN git clone https://github.com/HenryChang213/LLMCompass_ISCA_AE.git /app/LLMCompass_ISCA_AE
RUN cd /app/LLMCompass_ISCA_AE && git submodule init && git submodule update --recursive
# Expose the port your app runs on
EXPOSE 8000
================================================
FILE: LICENSE
================================================
BSD 3-Clause License
Copyright (c) 2024, Princeton University
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: README.md
================================================
[](https://zenodo.org/doi/10.5281/zenodo.10892431)
# LLMCompass
This repository provides the implementation of **LLMCompass** from the following papers:
[**LLMCompass: Enabling Efficient Hardware Design for Large Language Model Inference**](https://parallel.princeton.edu/papers/isca24_llmcompass.pdf)
*Hengrui Zhang, August Ning, Rohan Baskar Prabhakar, David Wentzlaff*
In the Proceedings of the 51st Annual International Symposium on Computer Architecture:
```
@inproceedings{LLMCompass,
author = {Zhang, Hengrui and Ning, August and Prabhakar, Rohan Baskar and Wentzlaff, David},
title = {LLMCompass: Enabling Efficient Hardware Design for Large Language Model Inference},
year = {2024},
booktitle = {Proceedings of the 51st Annual International Symposium on Computer Architecture},
}
```
## Set up the environment
```
$ conda create -n llmcompass_ae python=3.9
$ conda activate llmcompass_ae
$ pip3 install scalesim
$ conda install pytorch==2.0.0 -c pytorch
$ pip3 install matplotlib
$ pip3 install seaborn
$ pip3 install scipy
```
## Installation
### If using Github
```
$ git clone -b ISCA_AE https://github.com/PrincetonUniversity/LLMCompass
$ cd LLMCompass
$ git submodule init
$ git submodule update --recursive
```
### If using Zenodo
Unzip the downloaded archive, then download https://github.com/PrincetonUniversity/ttm-cas.git and place it at `cost_model/supply_chain` (the same path the git submodule would use).
### If using Docker
A Dockerfile has been provided (`./Dockerfile`), including all the software dependencies and the LLMCompass source code.
A docker image has been provided [here](https://github.com/HenryChang213/LLMCompass_ISCA_AE_docker).
## AE Experiment workflow
```
# Figure 5 (around 100 min)
$ cd ae/figure5
$ bash run_figure5.sh
# Figure 6 (around 1 min)
$ cd ae/figure6
$ bash run_figure6.sh
# Figure 7 (around 20 min)
$ cd ae/figure7
$ bash run_figure7.sh
# Figure 8 (around 40 min)
$ cd ae/figure8
$ bash run_figure8.sh
# Figure 9 (around 30 min)
$ cd ae/figure9
$ bash run_figure9.sh
# Figure 10 (around 45 min)
$ cd ae/figure10
$ bash run_figure10.sh
# Figure 11 (around 5 min)
$ cd ae/figure11
$ bash run_figure11.sh
# Figure 12 (around 4 hours)
$ cd ae/figure12
$ bash run_figure12.sh
```
## AE Expected result
After running each script above, the corresponding figures
will be generated under the corresponding directory as suggested by its name.
For comparison, a copy of the expected results can be found in `ae/expected_results`.
## User Guide
A guide on "How to Run a LLMCompass Simulation" is shown [here](./docs/run.md).
================================================
FILE: __init__.py
================================================
================================================
FILE: ae/.gitignore
================================================
*.pdf
*.csv
!**/real_hardware/**/*.csv
!expected_results/*
================================================
FILE: ae/__init__.py
================================================
================================================
FILE: ae/figure10/__init__.py
================================================
================================================
FILE: ae/figure10/plot_latency.py
================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load simulation results emitted by ae/figure10/test_latency.py (headerless
# CSVs with columns: batch size, sequence length, latency).
# Decoding frames are sorted by sequence length "s" so the trapezoidal
# integration below can walk adjacent rows in order.
our_decoding = pd.read_csv(
    "our_decoding.csv", header=None, names=["bs", "s", "latency"]
).sort_values(by="s")
our_prefill = pd.read_csv("our_prefill.csv", header=None, names=["bs", "s", "latency"])
A100_decoding = pd.read_csv(
    "A100_decoding.csv", header=None, names=["bs", "s", "latency"]
).sort_values(by="s")
A100_prefill = pd.read_csv(
    "A100_prefill.csv", header=None, names=["bs", "s", "latency"]
)
def get_total_decoding_latency(df: pd.DataFrame, start, end):
    """Integrate per-token decoding latency over sequence lengths [start, end].

    Applies the trapezoidal rule to the (s, latency) samples: each pair of
    adjacent rows contributes mean(latency) * (gap in s). `df` must already be
    sorted by "s" ascending (callers sort right after reading the CSV).

    Args:
        df: DataFrame with numeric "s" (sequence length) and "latency" columns.
        start: inclusive lower bound on "s".
        end: inclusive upper bound on "s".

    Returns:
        Total latency as a float; 0.0 if fewer than two rows fall in range.
    """
    in_range = df[(df["s"] >= start) & (df["s"] <= end)]
    s = in_range["s"].to_numpy(dtype=float)
    latency = in_range["latency"].to_numpy(dtype=float)
    # Vectorized trapezoidal rule; replaces the original per-row .iloc loop,
    # which is very slow on pandas frames. Empty/1-row slices sum to 0.0.
    return float(np.sum((latency[:-1] + latency[1:]) * 0.5 * np.diff(s)))
# For every (input length, output length) pair, end-to-end latency is the
# prefill latency plus the decoding latency integrated over the growing
# context [input, input + output]. Each matrix entry is A100 latency divided
# by our latency, i.e. > 1 means the latency design is faster.
norm_perf = []
for input_length in [256, 512, 1024, 2048]:
    temp_list = []
    our_prefill_latency = our_prefill[our_prefill["s"] == input_length][
        "latency"
    ].values[0]
    A100_prefill_latency = A100_prefill[A100_prefill["s"] == input_length][
        "latency"
    ].values[0]
    for output_length in [256, 512, 768, 1024, 1280, 1536, 1792, 2048]:
        our_total_latency = our_prefill_latency + get_total_decoding_latency(
            our_decoding, input_length, input_length + output_length
        )
        A100_total_latency = A100_prefill_latency + get_total_decoding_latency(
            A100_decoding, input_length, input_length + output_length
        )
        temp_list.append(A100_total_latency / our_total_latency)
    norm_perf.append(temp_list)
cmap = sns.color_palette("viridis", as_cmap=True)
data = np.array(norm_perf)  # rows: input lengths, cols: output lengths
import statistics
# Report the geometric-mean speedup across all cells.
print(statistics.geometric_mean(data.flatten()))
fig, ax = plt.subplots()
# Heatmap of the normalized-performance matrix; color range pinned to
# [0.8, 1.0] so figures are comparable across runs.
cax = ax.imshow(data, interpolation="nearest", cmap=cmap, vmin=0.8, vmax=1)
# cax = sns.heatmap(data, cmap="viridis")
# Add a colorbar
fig.colorbar(cax, shrink=0.5)
# Convert an RGB(A) color to a perceived grayscale intensity.
def get_intensity(color):
    # Rec. 601 luma weights for the red, green, and blue channels;
    # any alpha channel is ignored.
    r, g, b = color[0], color[1], color[2]
    return 0.299 * r + 0.587 * g + 0.114 * b
# Set a threshold for deciding text color
intensity_threshold = 0.5
# Annotate every cell with its rounded value: white text on dark cells,
# black text on light cells, for readability.
for i in range(data.shape[0]):
    for j in range(data.shape[1]):
        # Get the color from the colormap
        cell_color = cax.cmap(cax.norm(data[i, j]))
        # Calculate intensity of the cell color
        intensity = get_intensity(cell_color)
        # Choose text color based on intensity
        text_color = "white" if intensity < intensity_threshold else "black"
        text = ax.text(
            j, i, round(data[i, j], 2), ha="center", va="center", color=text_color
        )
# Set the x-axis and y-axis values
x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048]
y_axis_labels = [256, 512, 1024, 2048]
# Set ticks positions
ax.set_xticks(np.arange(len(x_axis_labels)))
ax.set_yticks(np.arange(len(y_axis_labels)))
# Set ticks labels
ax.set_xticklabels(x_axis_labels)
ax.set_yticklabels(y_axis_labels)
# Set labels for axes
ax.set_xlabel("Output Length")
ax.set_ylabel("Input Length")
# Put the smallest input length at the bottom of the heatmap.
ax.invert_yaxis()
# # Rotate the tick labels for the x-axis if needed
# plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Show the plot
plt.tight_layout()
plt.savefig("figure10.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
# norm_perf = []
# norm_perf_ttft = []
# for input_length in [256, 512, 1024, 2048]:
# temp_list = []
# our_prefill_latency = our_prefill[our_prefill["s"] == input_length][
# "latency"
# ].values[0]
# A100_prefill_latency = A100_prefill[A100_prefill["s"] == input_length][
# "latency"
# ].values[0]
# for output_length in [256, 512, 768, 1024, 1280, 1536, 1792, 2048]:
# our_tbt_latency = get_total_decoding_latency(
# our_decoding, input_length, input_length + output_length
# )
# A100_tbt_latency = get_total_decoding_latency(
# A100_decoding, input_length, input_length + output_length
# )
# temp_list.append(our_tbt_latency / A100_tbt_latency)
# norm_perf.append(temp_list)
# norm_perf_ttft.append(our_prefill_latency / A100_prefill_latency)
# cmap = sns.color_palette("viridis", as_cmap=True)
# data = np.array(norm_perf)
# data_ttft = np.array(norm_perf_ttft)
# print(data)
# print(data_ttft)
# import statistics
# from matplotlib import gridspec
# print(statistics.geometric_mean(data.flatten()))
# print(statistics.geometric_mean(data_ttft))
# # fig, axs = plt.subplots(1, 2, figsize=(8, 4),
# # gridspec_kw={'width_ratios': [3, 1]}, sharey=True)
# fig = plt.figure(figsize=(8, 3)) # Define the figure size
# gs = gridspec.GridSpec(
# 1, 2, width_ratios=[4, 1]
# ) # 2 rows, 1 column, with the first row 3 times the height of the second
# ax = fig.add_subplot(gs[0])
# # ax=axs[0]
# cax = ax.imshow(data, interpolation="nearest", cmap=cmap, vmin=1.015, vmax=1.045)
# # cax = sns.heatmap(data, cmap="viridis")
# # Add a colorbar
# fig.colorbar(cax, shrink=1)
# # Function to convert RGB to grayscale intensity
# def get_intensity(color):
# return color[0] * 0.299 + color[1] * 0.587 + color[2] * 0.114
# # Set a threshold for deciding text color
# intensity_threshold = 0.5
# for i in range(data.shape[0]):
# for j in range(data.shape[1]):
# # Get the color from the colormap
# cell_color = cax.cmap(cax.norm(data[i, j]))
# # Calculate intensity of the cell color
# intensity = get_intensity(cell_color)
# # Choose text color based on intensity
# text_color = "white" if intensity < intensity_threshold else "black"
# text = ax.text(
# j, i, round(data[i, j], 3), ha="center", va="center", color=text_color
# )
# # Set the x-axis and y-axis values
# x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048]
# y_axis_labels = [256, 512, 1024, 2048]
# # Set ticks positions
# ax.set_xticks(np.arange(len(x_axis_labels)))
# ax.set_yticks(np.arange(len(y_axis_labels)))
# # Set ticks labels
# ax.set_xticklabels(x_axis_labels)
# ax.set_yticklabels(y_axis_labels)
# # Set labels for axes
# ax.set_xlabel("Output Length\n" + r"$\mathbf{Normalized\ TBT}$")
# ax.set_ylabel("Input Length")
# ax.invert_yaxis()
# # # Rotate the tick labels for the x-axis if needed
# # plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# # fig = plt.figure(figsize=(10, 5)) # Define the figure size
# axs1 = fig.add_subplot(gs[0, 1])
# axs1.barh(np.arange(len(data_ttft)) / 2 + 0.2, data_ttft, color="steelblue", height=0.3)
# axs1.set_yticks(np.arange(len(y_axis_labels)) / 2 + 0.2)
# axs1.set_yticklabels(y_axis_labels)
# axs1.set_xlabel(r"$\mathbf{Normalized\ TTFT}$")
# axs1.set_xlim(1, 2)
# # Show the plot
# plt.tight_layout()
# plt.savefig("figure11.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
================================================
FILE: ae/figure10/run_figure10.sh
================================================
# Regenerate Figure 10.
# -f: don't error on a fresh checkout where no CSV/PDF outputs exist yet.
rm -f *.csv
rm -f *.pdf
# Run the simulation from the repo root so `python -m` resolves the packages.
cd ../..
python -m ae.figure10.test_latency
cd ae/figure10
python plot_latency.py
================================================
FILE: ae/figure10/test_latency.py
================================================
from software_model.transformer import (
TransformerBlockInitComputationTP,
TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from design_space_exploration.dse import template_to_system, read_architecture_template
from multiprocessing import Process, Lock
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
import time
# Build simulated systems from the architecture templates: GA100 (the NVIDIA
# A100 baseline) and the latency-optimized design (configs/latency_design.json).
A100_specs = read_architecture_template("configs/GA100.json")
A100_system = template_to_system(A100_specs)
our_specs = read_architecture_template("configs/latency_design.json")
our_system = template_to_system(our_specs)
# Die-area estimates from the cost model, used to show the two designs are
# area-comparable.
A100_compute_area_mm2 = calc_compute_chiplet_area_mm2(A100_specs)
A100_io_area_mm2 = calc_io_die_area_mm2(A100_specs)
our_compute_area_mm2 = calc_compute_chiplet_area_mm2(our_specs)
our_io_area_mm2 = calc_io_die_area_mm2(our_specs)
print(f"A100 compute area: {A100_compute_area_mm2} mm2")
print(f"A100 IO area: {A100_io_area_mm2} mm2")
print(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2")
print(f"Our compute area: {our_compute_area_mm2} mm2")
print(f"Our IO area: {our_io_area_mm2} mm2")
print(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2")
# Persist the same area summary next to the figure outputs.
with open("ae/figure10/area.csv", "w") as f:
    f.write(f"A100 compute area: {A100_compute_area_mm2} mm2\n")
    f.write(f"A100 IO area: {A100_io_area_mm2} mm2\n")
    f.write(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2\n")
    f.write(f"Our compute area: {our_compute_area_mm2} mm2\n")
    f.write(f"Our IO area: {our_io_area_mm2} mm2\n")
    f.write(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2\n")
def simulate_decoding_latency(system, bs, seq_len, name, lock):
    """Simulate one auto-regressive decoding step on `system` and append
    "<bs>, <seq_len>, <latency>" to ae/figure10/<name>_decoding.csv."""
    # GPT-3-scale transformer block, tensor-parallel over 4 devices.
    block = TransformerBlockAutoRegressionTP(
        d_model=12288,
        n_heads=96,
        device_count=4,
        data_type=data_type_dict["fp16"],
    )
    # Decoding consumes one new token ([bs, 1, d_model]) with seq_len tokens
    # of accumulated context.
    _ = block(Tensor([bs, 1, 12288], data_type_dict["fp16"]), seq_len)
    auto_regression_latency_simulated = block.compile_and_simulate(
        system, "heuristic-GPU"
    )
    # Guard the shared CSV: multiple worker processes append concurrently.
    with lock:
        with open(f"ae/figure10/{name}_decoding.csv", "a") as f:
            f.write(f"{bs}, {seq_len}, {auto_regression_latency_simulated}\n")
def simulate_prefill_latency(system, bs, seq_len, name, lock):
    """Simulate the prefill (initial computation) phase and append
    "<bs>, <seq_len>, <latency>" to ae/figure10/<name>_prefill.csv."""
    # Same GPT-3-scale block configuration as the decoding runs.
    block = TransformerBlockInitComputationTP(
        d_model=12288,
        n_heads=96,
        device_count=4,
        data_type=data_type_dict["fp16"],
    )
    # Prefill processes the whole prompt at once: [bs, seq_len, d_model].
    _ = block(Tensor([bs, seq_len, 12288], data_type_dict["fp16"]))
    latency_simulated = block.compile_and_simulate(system, "heuristic-GPU")
    # Guard the shared CSV: multiple worker processes append concurrently.
    with lock:
        with open(f"ae/figure10/{name}_prefill.csv", "a") as f:
            f.write(f"{bs}, {seq_len}, {latency_simulated}\n")
# One lock per output CSV so workers writing the same file serialize appends.
lock_our_prefill = Lock()
lock_our_decoding = Lock()
lock_A100_prefill = Lock()
lock_A100_decoding = Lock()
processes = []
for bs in [16]: # [1, 4, 8, 16, 32, 64]:
    # Prefill runs: one process per (input length, system).
    for seq_len in [256, 512, 1024, 2048]:
        for system in [our_system, A100_system]:
            if system == A100_system:
                name = "A100"
                lock = lock_A100_prefill
            else:
                name = "our"
                lock = lock_our_prefill
            p = Process(
                target=simulate_prefill_latency, args=(system, bs, seq_len, name, lock)
            )
            processes.append(p)
    # Decoding runs: sweep context length 256..4096 in steps of 64 so the
    # plot script can integrate latency over any [input, input+output] range.
    for seq_len in range(256, 4096 + 64, 64):
        for system in [our_system, A100_system]:
            if system == A100_system:
                name = "A100"
                lock = lock_A100_decoding
            else:
                name = "our"
                lock = lock_our_decoding
            p = Process(
                target=simulate_decoding_latency, args=(system, bs, seq_len, name, lock)
            )
            processes.append(p)
try:
    # NOTE(review): all processes are started at once (no pool/throttling).
    for p in processes:
        p.start()
    print("Processes started.")
    print("number of process:", len(processes))
    # Poll instead of join() so a KeyboardInterrupt is caught promptly.
    while any(p.is_alive() for p in processes):
        time.sleep(1)
except KeyboardInterrupt:
    print("Terminating processes...")
    for p in processes:
        p.terminate()
        p.join()
print("All processes have finished.")
================================================
FILE: ae/figure11/__init__.py
================================================
================================================
FILE: ae/figure11/plot_decoding.py
================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import gmean
# Column layout of the simulator CSVs: totals first, then the per-operator
# latency breakdown of one transformer block.
categories = ["bs", "seq_len", "latency"] + [
    "Q_K_V",
    "Q_mul_K",
    "A_mul_V",
    "Wo_proj",
    "W1_proj",
    "W2_proj",
    "Softmax",
    "LayerNorm_MHA",
    "LayerNorm_FFN",
    "GeLU",
    "AllReduce_MHA",
    "AllReduce_FFN",
]
# Load simulated decoding results (produced by ae/figure11/test_decoding.py)
# and convert latency from seconds to milliseconds.
A100 = pd.read_csv("A100.csv", header=None, names=categories)
A100["latency"] = A100["latency"] * 1000
our = pd.read_csv("our.csv", header=None, names=categories)
our["latency"] = our["latency"] * 1000
bs_list = [1, 2, 4, 8, 16, 32]
colors_our = sns.color_palette("Blues", 3)[1:]
colors_a100 = sns.color_palette("summer_r", 2)
# Combine both filters into a single boolean mask. The original chained
# indexing (df[mask1][mask2], with mask2 built from the unfiltered frame)
# triggers pandas' "Boolean Series key will be reindexed" warning and relies
# on deprecated reindexing behavior; the selected rows are identical.
our_512 = our[(our.seq_len == 512) & our["bs"].isin(bs_list)]["latency"].tolist()
our_2048 = our[(our.seq_len == 2048) & our["bs"].isin(bs_list)]["latency"].tolist()
a100_512 = A100[(A100.seq_len == 512) & A100["bs"].isin(bs_list)]["latency"].tolist()
a100_2048 = A100[(A100.seq_len == 2048) & A100["bs"].isin(bs_list)]["latency"].tolist()
# Geometric-mean speedup of the latency design over GA100 across both
# context lengths and all batch sizes.
avg_speedup = gmean(
    np.concatenate(
        (
            np.array(a100_512) / np.array(our_512),
            np.array(a100_2048) / np.array(our_2048),
        )
    )
)
print(avg_speedup)
# Grouped bar chart: for each batch size, four bars — GA100 vs. the latency
# design at context lengths 512 and 2048. Legend labels are attached only in
# the first group (bs == 1) so each series appears once in the legend.
plt.figure(figsize=(8, 3.5))
x_pos = 0.25
for bs in bs_list:
    if bs == 1:
        seq_len = 512
        plt.bar(
            x_pos,
            A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency,
            width=0.5,
            label=f"GA100 (seq_len={seq_len})",
            color=colors_a100[0],
        )
        bars = plt.bar(
            x_pos + 0.5,
            our[(our.bs == bs) & (our.seq_len == seq_len)].latency,
            width=0.5,
            label=f"Latency design (seq_len={seq_len})",
            color=colors_our[0],
        )
        for bar in bars:
            bar.set_hatch("//") # Add diagonal stripes
        seq_len = 2048
        plt.bar(
            x_pos + 1,
            A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency,
            width=0.5,
            label=f"GA100 (seq_len={seq_len})",
            color=colors_a100[1],
        )
        bars = plt.bar(
            x_pos + 1.5,
            our[(our.bs == bs) & (our.seq_len == seq_len)].latency,
            width=0.5,
            label=f"Latency design (seq_len={seq_len})",
            color=colors_our[1],
        )
        for bar in bars:
            bar.set_hatch("//") # Add diagonal stripes
    else:
        seq_len = 512
        plt.bar(
            x_pos,
            A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency,
            width=0.5,
            color=colors_a100[0],
        )
        bars = plt.bar(
            x_pos + 0.5,
            our[(our.bs == bs) & (our.seq_len == seq_len)].latency,
            width=0.5,
            color=colors_our[0],
        )
        for bar in bars:
            bar.set_hatch("//") # Add diagonal stripes
        seq_len = 2048
        # NOTE(review): always true for bs_list values (max 32); looks like a
        # leftover guard for skipping large configs — confirm intent.
        if bs < 164:
            plt.bar(
                x_pos + 1,
                A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency,
                width=0.5,
                color=colors_a100[1],
            )
            bars = plt.bar(
                x_pos + 1.5,
                our[(our.bs == bs) & (our.seq_len == seq_len)].latency,
                width=0.5,
                color=colors_our[1],
            )
            for bar in bars:
                bar.set_hatch("//") # Add diagonal stripes
    x_pos += 3
# Group centers: each batch-size group spans 3 x-units.
plt.xticks([1, 4, 7, 10, 13, 16], bs_list)
plt.xlabel("Batch Size")
plt.ylabel("Latency (ms)")
plt.legend(loc="upper left")
plt.tight_layout()
plt.grid(True, axis="y", ls="--", c="0.8")
plt.savefig("figure11.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
plt.show()
================================================
FILE: ae/figure11/run_figure11.sh
================================================
# Regenerate Figure 11.
# -f: don't error on a fresh checkout where no CSV/PDF outputs exist yet.
rm -f *.csv
rm -f *.pdf
# Run the simulation from the repo root so `python -m` resolves the packages.
cd ../..
python -m ae.figure11.test_decoding
cd ae/figure11
python plot_decoding.py
================================================
FILE: ae/figure11/test_decoding.py
================================================
from software_model.transformer import (
TransformerBlockInitComputationTP,
TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from design_space_exploration.dse import template_to_system, read_architecture_template
from multiprocessing import Process, Lock
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
import time
# Build simulated systems from the architecture templates: GA100 (the NVIDIA
# A100 baseline) and the latency-optimized design (configs/latency_design.json).
A100_specs = read_architecture_template("configs/GA100.json")
A100_system = template_to_system(A100_specs)
our_specs = read_architecture_template("configs/latency_design.json")
our_system = template_to_system(our_specs)
# Die-area estimates from the cost model (printed for reference only here).
A100_compute_area_mm2 = calc_compute_chiplet_area_mm2(A100_specs)
A100_io_area_mm2 = calc_io_die_area_mm2(A100_specs)
our_compute_area_mm2 = calc_compute_chiplet_area_mm2(our_specs)
our_io_area_mm2 = calc_io_die_area_mm2(our_specs)
print(f"A100 compute area: {A100_compute_area_mm2} mm2")
print(f"A100 IO area: {A100_io_area_mm2} mm2")
print(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2")
print(f"Our compute area: {our_compute_area_mm2} mm2")
print(f"Our IO area: {our_io_area_mm2} mm2")
print(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2")
def simulate_latency(system, bs, seq_len, name, lock):
    """Simulate one auto-regressive decoding step and append the total latency
    plus the model's per-operator breakdown log to ae/figure11/<name>.csv."""
    # GPT-3-scale transformer block, tensor-parallel over 4 devices.
    ar_block = TransformerBlockAutoRegressionTP(
        d_model=12288,
        n_heads=96,
        device_count=4,
        data_type=data_type_dict["fp16"],
    )
    # Single-token decode ([bs, 1, d_model]) against seq_len context tokens.
    _ = ar_block(Tensor([bs, 1, 12288], data_type_dict["fp16"]), seq_len)
    simulated_latency = ar_block.compile_and_simulate(system, "heuristic-GPU")
    # "simluate_log" (sic) is the attribute name exposed by the model class.
    # Guard the shared CSV: multiple worker processes append concurrently.
    with lock:
        with open(f"ae/figure11/{name}.csv", "a") as f:
            f.write(
                f"{bs}, {seq_len}, {simulated_latency}, {ar_block.simluate_log}\n"
            )
# A single shared lock serializes CSV appends from all worker processes.
lock = Lock()
processes = []
# One process per (batch size, context length, system) combination.
for bs in [1, 2, 4, 8, 16, 32, 64]:
    for seq_len in [512, 2048]:
        for system in [our_system, A100_system]:
            if system == A100_system:
                name = "A100"
            else:
                name = "our"
            p = Process(target=simulate_latency, args=(system, bs, seq_len, name, lock))
            processes.append(p)
try:
    # NOTE(review): all processes are started at once (no pool/throttling).
    for p in processes:
        p.start()
    # Poll instead of join() so a KeyboardInterrupt is caught promptly.
    while any(p.is_alive() for p in processes):
        time.sleep(1)
except KeyboardInterrupt:
    print("Terminating processes...")
    for p in processes:
        p.terminate()
        p.join()
print("All processes have finished.")
================================================
FILE: ae/figure12/__init__.py
================================================
================================================
FILE: ae/figure12/plot_throughput.py
================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
# CSV filename prefixes: results live in per-design subdirectories, e.g.
# our/our_<input>_<output>_prefill.csv (see the read_csv calls below).
our_directory = "our/our"
A100_directory = "A100/A100"
# Column layout: totals first, then the per-operator latency breakdown.
categories = ["bs", "s", "latency"] + [
    "Q_K_V",
    "Q_mul_K",
    "A_mul_V",
    "Wo_proj",
    "W1_proj",
    "W2_proj",
    "Softmax",
    "LayerNorm_MHA",
    "LayerNorm_FFN",
    "GeLU",
    "AllReduce_MHA",
    "AllReduce_FFN",
]
# Per-input-length rows accumulated by the main loop below.
throughput_our = []
bs_our = []
latency_our = []
throughput_A100 = []
bs_A100 = []
latency_A100 = []
def get_total_decoding_latency(df: pd.DataFrame, start, end):
    """Integrate per-token decoding latency over sequence lengths [start, end].

    Applies the trapezoidal rule to the (s, latency) samples: each pair of
    adjacent rows contributes mean(latency) * (gap in s). `df` must already be
    sorted by "s" ascending (callers sort right after reading the CSV).

    Args:
        df: DataFrame with numeric "s" (sequence length) and "latency" columns.
        start: inclusive lower bound on "s".
        end: inclusive upper bound on "s".

    Returns:
        Total latency as a float; 0.0 if fewer than two rows fall in range.
    """
    in_range = df[(df["s"] >= start) & (df["s"] <= end)]
    s = in_range["s"].to_numpy(dtype=float)
    latency = in_range["latency"].to_numpy(dtype=float)
    # Vectorized trapezoidal rule; replaces the original per-row .iloc loop,
    # which is very slow on pandas frames. Empty/1-row slices sum to 0.0.
    return float(np.sum((latency[:-1] + latency[1:]) * 0.5 * np.diff(s)))
# For each (input length, output length) pair, read that run's prefill and
# decoding CSVs, integrate decoding latency over the growing context, and
# derive throughput = bs * generated tokens / end-to-end latency / 12.
# NOTE(review): the /12 scaling factor is not explained here — presumably it
# converts block-level results to a per-device or full-model figure; confirm
# against the test_throughput driver before relying on absolute numbers.
for input_length in [256, 512, 1024, 2048]:
    temp_our = []
    temp_A100 = []
    temp_our_bs = []
    temp_A100_bs = []
    temp_our_latency = []
    temp_A100_latency = []
    for output_length in [256, 512, 768, 1024, 1280, 1536, 1792, 2048]:
        our_prefill_df = pd.read_csv(
            f"{our_directory}_{input_length}_{output_length}_prefill.csv",
            header=None,
            names=categories,
        )
        # print(our_prefill_df)
        our_prefill_latency = our_prefill_df.iloc[0]["latency"]
        our_bs = our_prefill_df.iloc[0]["bs"]
        temp_our_bs.append(our_bs)
        our_decoding_df = pd.read_csv(
            f"{our_directory}_{input_length}_{output_length}_decoding.csv",
            header=None,
            names=categories,
        ).sort_values(by="s")
        our_decoding_latency = get_total_decoding_latency(
            our_decoding_df, input_length, input_length + output_length
        )
        # print(our_decoding_latency)
        our_throughput = (
            our_bs * output_length / (our_prefill_latency + our_decoding_latency) / 12
        )
        temp_our.append(our_throughput)
        temp_our_latency.append(our_prefill_latency + our_decoding_latency)
        A100_prefill_df = pd.read_csv(
            f"{A100_directory}_{input_length}_{output_length}_prefill.csv",
            header=None,
            names=categories,
        )
        A100_prefill_latency = A100_prefill_df.iloc[0]["latency"]
        A100_bs = A100_prefill_df.iloc[0]["bs"]
        temp_A100_bs.append(A100_bs)
        A100_decoding_df = pd.read_csv(
            f"{A100_directory}_{input_length}_{output_length}_decoding.csv",
            header=None,
            names=categories,
        ).sort_values(by="s")
        A100_decoding_latency = get_total_decoding_latency(
            A100_decoding_df, input_length, input_length + output_length
        )
        A100_throughput = (
            A100_bs
            * output_length
            / (A100_prefill_latency + A100_decoding_latency)
            / 12
        )
        temp_A100.append(A100_throughput)
        temp_A100_latency.append(A100_prefill_latency + A100_decoding_latency)
    throughput_our.append(temp_our)
    throughput_A100.append(temp_A100)
    bs_our.append(temp_our_bs)
    bs_A100.append(temp_A100_bs)
    latency_our.append(temp_our_latency)
    latency_A100.append(temp_A100_latency)
# print(throughput_our)
# print(throughput_A100)
print(latency_our)
print(latency_A100)
# Geometric mean of end-to-end latency ratios (our design vs. A100).
print(
    statistics.geometric_mean(
        (np.array(latency_our) / np.array(latency_A100)).flatten()
    )
)
# Convert an RGB(A) color to a perceived grayscale intensity.
def get_intensity(color):
    # Rec. 601 luma weights for the red, green, and blue channels;
    # any alpha channel is ignored.
    r, g, b = color[0], color[1], color[2]
    return 0.299 * r + 0.587 * g + 0.114 * b
# Figure 12a: absolute throughput of the latency design as a heatmap over
# (input length, output length).
cmap = sns.color_palette("viridis", as_cmap=True)
data = np.array(throughput_our) # / np.array(throughput_A100)
print(data.mean())
fig, ax = plt.subplots()
cax = ax.imshow(data, interpolation="nearest", cmap=cmap)
# cax = sns.heatmap(data, cmap="Blues")
# Add a colorbar
fig.colorbar(cax, shrink=0.5)
# Set a threshold for deciding text color
intensity_threshold = 0.5
# Annotate every cell with its integer value; white text on dark cells.
for i in range(data.shape[0]):
    for j in range(data.shape[1]):
        # Get the color from the colormap
        cell_color = cax.cmap(cax.norm(data[i, j]))
        # Calculate intensity of the cell color
        intensity = get_intensity(cell_color)
        # Choose text color based on intensity
        text_color = "white" if intensity < intensity_threshold else "black"
        text = ax.text(
            j, i, int(data[i, j]), ha="center", va="center", color=text_color
        )
# Set the x-axis and y-axis values
x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048]
y_axis_labels = [256, 512, 1024, 2048]
# Set ticks positions
ax.set_xticks(np.arange(len(x_axis_labels)))
ax.set_yticks(np.arange(len(y_axis_labels)))
# Set ticks labels
ax.set_xticklabels(x_axis_labels)
ax.set_yticklabels(y_axis_labels)
# Set labels for axes
ax.set_xlabel("Output Length")
ax.set_ylabel("Input Length")
# Put the smallest input length at the bottom of the heatmap.
ax.invert_yaxis()
# # Rotate the tick labels for the x-axis if needed
# plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Show the plot
plt.tight_layout()
plt.savefig("figure12a.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
import statistics
cmap = sns.color_palette("viridis", as_cmap=True)
data = np.array(throughput_our) / np.array(throughput_A100)
print(statistics.geometric_mean(data.flatten()))
fig, ax = plt.subplots()
cax = ax.imshow(
data,
interpolation="nearest",
cmap=cmap,
)
# cax = sns.heatmap(data, cmap="viridis")
# Add a colorbar
fig.colorbar(cax, shrink=0.5)
# Set a threshold for deciding text color
intensity_threshold = 0.5
for i in range(data.shape[0]):
for j in range(data.shape[1]):
# Get the color from the colormap
cell_color = cax.cmap(cax.norm(data[i, j]))
# Calculate intensity of the cell color
intensity = get_intensity(cell_color)
# Choose text color based on intensity
text_color = "white" if intensity < intensity_threshold else "black"
text = ax.text(
j, i, round(data[i, j], 2), ha="center", va="center", color=text_color
)
# Set the x-axis and y-axis values
x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048]
y_axis_labels = [256, 512, 1024, 2048]
# Set ticks positions
ax.set_xticks(np.arange(len(x_axis_labels)))
ax.set_yticks(np.arange(len(y_axis_labels)))
# Set ticks labels
ax.set_xticklabels(x_axis_labels)
ax.set_yticklabels(y_axis_labels)
# Set labels for axes
ax.set_xlabel("Output Length")
ax.set_ylabel("Input Length")
ax.invert_yaxis()
# # Rotate the tick labels for the x-axis if needed
# plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Show the plot
plt.tight_layout()
plt.savefig("figure12b.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
================================================
FILE: ae/figure12/run_figure12.sh
================================================
# Regenerate the Figure 12 data and plots from scratch.
# mkdir -p: succeed whether or not the output dirs already exist.
mkdir -p A100 our
# rm -f: do not fail when the globs match nothing (fresh checkout).
rm -f A100/*.csv our/*.csv *.pdf
cd ../.. || exit 1
python -m ae.figure12.test_throughput
cd ae/figure12 || exit 1
python plot_throughput.py
================================================
FILE: ae/figure12/test_throughput.py
================================================
from software_model.transformer import (
TransformerBlockInitComputationTP,
TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from design_space_exploration.dse import template_to_system, read_architecture_template
from multiprocessing import Process, Lock
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
import time
# Build the two systems under comparison from their JSON templates.
A100_specs = read_architecture_template("configs/GA100.json")
A100_system = template_to_system(A100_specs)
our_specs = read_architecture_template("configs/prefilling_system.json")
our_system = template_to_system(our_specs)

# Die areas from the cost model, for both designs.
A100_compute_area_mm2 = calc_compute_chiplet_area_mm2(A100_specs)
A100_io_area_mm2 = calc_io_die_area_mm2(A100_specs)
our_compute_area_mm2 = calc_compute_chiplet_area_mm2(our_specs)
our_io_area_mm2 = calc_io_die_area_mm2(our_specs)

# Report the areas on stdout and persist the same lines next to the
# figure data so the run is self-documenting.
_area_lines = [
    f"A100 compute area: {A100_compute_area_mm2} mm2",
    f"A100 IO area: {A100_io_area_mm2} mm2",
    f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2",
    f"Our compute area: {our_compute_area_mm2} mm2",
    f"Our IO area: {our_io_area_mm2} mm2",
    f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2",
]
for _line in _area_lines:
    print(_line)
with open("ae/figure12/area.csv", "w") as f:
    f.write("\n".join(_area_lines) + "\n")
def simulate_decoding_latency(system, bs, seq_len, name, lock, heuristics):
    """Simulate one auto-regression (decoding) step and log the result.

    Builds a GPT-3-sized transformer block (d_model=12288, 96 heads, fp16,
    single device), simulates generating one token for a batch of `bs`
    sequences with a history of `seq_len`, and appends
    "bs, seq_len, latency, log" to ae/figure12/<name>_decoding.csv.
    `lock` serializes the file append across worker processes.
    """
    block = TransformerBlockAutoRegressionTP(
        d_model=12288,
        n_heads=96,
        device_count=1,
        data_type=data_type_dict["fp16"],
    )
    # One new token per sequence, attending over seq_len of context.
    block(
        Tensor([bs, 1, 12288], data_type_dict["fp16"]),
        seq_len,
    )
    latency = block.compile_and_simulate(system, heuristics)
    row = f"{bs}, {seq_len}, {latency}, {block.simluate_log}\n"
    with lock:
        with open(f"ae/figure12/{name}_decoding.csv", "a") as f:
            f.write(row)
def simulate_prefill_latency(system, bs, seq_len, name, lock, heuristics):
    """Simulate the prefill (initial computation) phase and log the result.

    Builds a GPT-3-sized transformer block (d_model=12288, 96 heads, fp16,
    single device), simulates processing a [bs, seq_len, 12288] input, and
    appends "bs, seq_len, latency, log" to ae/figure12/<name>_prefill.csv.
    `lock` serializes the file append across worker processes.
    """
    block = TransformerBlockInitComputationTP(
        d_model=12288,
        n_heads=96,
        device_count=1,
        data_type=data_type_dict["fp16"],
    )
    block(
        Tensor([bs, seq_len, 12288], data_type_dict["fp16"]),
    )
    latency = block.compile_and_simulate(system, heuristics)
    row = f"{bs}, {seq_len}, {latency}, {block.simluate_log}\n"
    with lock:
        with open(f"ae/figure12/{name}_prefill.csv", "a") as f:
            f.write(row)
# One lock per output CSV family so concurrent worker processes append
# whole lines atomically.
lock_our_prefill = Lock()
lock_our_decoding = Lock()
lock_A100_prefill = Lock()
lock_A100_decoding = Lock()
processes = []
# Build one prefill job plus a series of decoding jobs for every
# (input length, output length, system) combination.
for input_seq_len in [
    256,
    512,
    1024,
    2048,
]:
    for output_seq_len in [
        256,
        512,
        768,
        1024,
        1280,
        1536,
        1792,
        2048,
    ]:
        seq_len = input_seq_len + output_seq_len
        for system in [our_system, A100_system]:
            if system == A100_system:
                name = f"A100/A100_{input_seq_len}_{output_seq_len}"
                lock = lock_A100_prefill
                # NOTE(review): bs looks like the largest batch whose
                # per-token KV cache fits in 80 GB of HBM after subtracting
                # 2 * 12 * 12288^2 * 12 bytes of weights -- confirm the
                # constant breakdown against the model config.
                bs = (80e9 - 2 * 12 * 12288**2 * 12) // ((12 * 4 + 8) * seq_len * 12288)
                heuristics = "heuristic-GPU"
            else:
                name = f"our/our_{input_seq_len}_{output_seq_len}"
                lock = lock_our_prefill
                # Same sizing rule with the proposed design's 512 GB capacity.
                bs = (512e9 - 2 * 12 * 12288**2 * 12) // (
                    (12 * 4 + 8) * seq_len * 12288
                )
                heuristics = "heuristic-our-throughput"
            bs = int(bs)
            # print(bs)
            p = Process(
                target=simulate_prefill_latency,
                args=(system, bs, input_seq_len, name, lock, heuristics),
            )
            processes.append(p)
            # Sample the decoding latency every 64 tokens of generated
            # context, from input_seq_len up to (and including) seq_len.
            for decoding_seq_len in range(input_seq_len, seq_len + 64, 64):
                if system == A100_system:
                    name = f"A100/A100_{input_seq_len}_{output_seq_len}"
                    lock = lock_A100_decoding
                    heuristics = "heuristic-GPU"
                else:
                    name = f"our/our_{input_seq_len}_{output_seq_len}"
                    lock = lock_our_decoding
                    heuristics = "heuristic-our-throughput"
                p = Process(
                    target=simulate_decoding_latency,
                    args=(system, bs, decoding_seq_len, name, lock, heuristics),
                )
                processes.append(p)
print(len(processes))
# exit()
try:
    # NOTE(review): every job is started at once (hundreds of processes);
    # this assumes the host has the cores/memory for that fan-out.
    for p in processes:
        p.start()
    print("Processes started.")
    print("number of process:", len(processes))
    # Poll until every worker has exited; Ctrl-C tears them all down.
    while any(p.is_alive() for p in processes):
        time.sleep(1)
except KeyboardInterrupt:
    print("Terminating processes...")
    for p in processes:
        p.terminate()
        p.join()
print("All processes have finished.")
================================================
FILE: ae/figure5/__init__.py
================================================
================================================
FILE: ae/figure5/ab/__init__.py
================================================
================================================
FILE: ae/figure5/ab/plot_matmul.py
================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
def _load_matmul_csv(path):
    """Read one matmul benchmark CSV into a DataFrame indexed by (M, N, K).

    Each row is "M, N, K, <latency>ms, <throughput>Tflops"; the throughput
    column is parsed out of its "...Tflops" string into a float.
    """
    df = pd.read_csv(
        path, header=None, names=["M", "N", "K", "latency", "throughput"]
    )
    df["throughput"] = df["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
    df.set_index(["M", "N", "K"], inplace=True)
    return df


# Simulated / roofline results produced by test_matmul.py, plus measured
# numbers for the real devices under real_hardware/.
matmul_TPUv3_sim = _load_matmul_csv("matmul_TPUv3_sim.csv")
matmul_TPUv3_roofline = _load_matmul_csv("matmul_TPUv3_roofline.csv")
matmul_A100 = _load_matmul_csv("real_hardware/matmul_A100.csv")
matmul_A100_sim = _load_matmul_csv("matmul_A100_sim.csv")
matmul_A100_roofline = _load_matmul_csv("matmul_A100_roofline.csv")
matmul_MI210 = _load_matmul_csv("real_hardware/matmul_MI210.csv")
matmul_MI210_sim = _load_matmul_csv("matmul_MI210_sim.csv")
matmul_MI210_roofline = _load_matmul_csv("matmul_MI210_roofline.csv")
# Per-vendor color triples: index 0 is used for the roofline, 1 for the
# real-hardware series, 2 for the simulated series.
color_NV = sns.color_palette("Greens_d", 4)[1:]
color_Google = sns.color_palette("Blues_d", 4)[1:]
color_AMD = sns.color_palette("flare", 3)
# Figure 5(b): matmul throughput while sweeping M with K = N = 12288.
K = 12288
N = K
title = f"Performance of Matmul with K={K}, N={N}"
M_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
# Collect series for M = 2^6 .. 2^15 (the CSVs also contain 2^5, which is
# not plotted).
for M in range(6, 16):
    M = 2**M
    M_list.append(M)
    throughput_TPU_sim_list.append(matmul_TPUv3_sim.loc[(M, N, K), "throughput"])
    throughput_TPU_roofline_list.append(
        matmul_TPUv3_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_GPU_list.append(matmul_A100.loc[(M, N, K), "throughput"])
    throughput_GPU_sim_list.append(matmul_A100_sim.loc[(M, N, K), "throughput"])
    throughput_GPU_roofline_list.append(
        matmul_A100_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_AMD_list.append(matmul_MI210.loc[(M, N, K), "throughput"])
    throughput_AMD_sim_list.append(matmul_MI210_sim.loc[(M, N, K), "throughput"])
    throughput_AMD_roofline_list.append(
        matmul_MI210_roofline.loc[(M, N, K), "throughput"]
    )
# plt.figure(figsize=(6, 2.8))
plt.figure(figsize=(3.64, 2.8))
plt.xscale("log", base=2)
# Per device: roofline (dashed, no marker), real hardware (o), simulated (x).
plt.plot(
    M_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    M_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    M_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    M_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    M_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    M_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    M_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
# No measured TPUv3 series is plotted -- only roofline and simulation.
plt.plot(
    M_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# handles, labels = plt.gca().get_legend_handles_labels()
# plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1))
# plt.title(title)
plt.xlabel("M")
plt.ylabel("TFLOPS")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.tight_layout()
plt.savefig("figure5b.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# Figure 5(a): matmul throughput while sweeping N = K with M fixed at 8192.
M = 8192
title = f"Performance of Matmul with M={M}"
K_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
# Collect series for N = K = 2^6 .. 2^15.
for K in range(6, 16):
    K = 2**K
    N = K
    K_list.append(K)
    throughput_TPU_sim_list.append(matmul_TPUv3_sim.loc[(M, N, K), "throughput"])
    throughput_TPU_roofline_list.append(
        matmul_TPUv3_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_GPU_list.append(matmul_A100.loc[(M, N, K), "throughput"])
    throughput_GPU_sim_list.append(matmul_A100_sim.loc[(M, N, K), "throughput"])
    throughput_GPU_roofline_list.append(
        matmul_A100_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_AMD_list.append(matmul_MI210.loc[(M, N, K), "throughput"])
    throughput_AMD_sim_list.append(matmul_MI210_sim.loc[(M, N, K), "throughput"])
    throughput_AMD_roofline_list.append(
        matmul_MI210_roofline.loc[(M, N, K), "throughput"]
    )
plt.figure(figsize=(3.64, 2.8))
plt.xscale("log", base=2)
# Per device: roofline (dashed, no marker), real hardware (o), simulated (x).
plt.plot(
    K_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    K_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    K_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    K_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    K_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    K_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    K_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
# No measured TPUv3 series is plotted -- only roofline and simulation.
plt.plot(
    K_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# plt.legend()
# plt.title(title)
plt.xlabel("N=K")
plt.ylabel("TFLOPS")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.tight_layout()
plt.savefig("figure5a.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
================================================
FILE: ae/figure5/ab/real_hardware/matmul_A100.csv
================================================
64, 12288, 12288, 0.1900ms, 101.7124Tflops
128, 12288, 12288, 0.2003ms, 193.0114Tflops
256, 12288, 12288, 0.3185ms, 242.7090Tflops
512, 12288, 12288, 0.6118ms, 252.7351Tflops
1024, 12288, 12288, 1.1990ms, 257.9115Tflops
2048, 12288, 12288, 2.3586ms, 262.2263Tflops
4096, 12288, 12288, 4.4576ms, 277.4929Tflops
8192, 12288, 12288, 8.6216ms, 286.9431Tflops
16384, 12288, 12288, 17.0307ms, 290.5223Tflops
32768, 12288, 12288, 35.4407ms, 279.2160Tflops
8192, 64, 64, 0.0296ms, 2.2700Tflops
8192, 128, 128, 0.0310ms, 8.6608Tflops
8192, 256, 256, 0.0356ms, 30.1244Tflops
8192, 512, 512, 0.0471ms, 91.2121Tflops
8192, 1024, 1024, 0.0927ms, 185.2380Tflops
8192, 2048, 2048, 0.2818ms, 243.8497Tflops
8192, 4096, 4096, 1.0210ms, 269.2169Tflops
8192, 8192, 8192, 3.9614ms, 277.5532Tflops
8192, 16384, 16384, 15.0087ms, 293.0334Tflops
8192, 32768, 32768, 61.1346ms, 287.7616Tflops
================================================
FILE: ae/figure5/ab/real_hardware/matmul_MI210.csv
================================================
32, 12288, 12288, 0.5493ms, 17.5922Tflops
64, 12288, 12288, 0.5584ms, 34.6135Tflops
128, 12288, 12288, 0.5932ms, 65.1646Tflops
256, 12288, 12288, 0.7699ms, 100.4209Tflops
512, 12288, 12288, 1.4054ms, 110.0209Tflops
1024, 12288, 12288, 2.7173ms, 113.8051Tflops
2048, 12288, 12288, 5.3905ms, 114.7338Tflops
4096, 12288, 12288, 10.4494ms, 118.3752Tflops
8192, 12288, 12288, 20.7849ms, 119.0242Tflops
16384, 12288, 12288, 41.1353ms, 120.2811Tflops
32768, 12288, 12288, 81.4046ms, 121.5608Tflops
8192, 32, 32, 0.0333ms, 0.5044Tflops
8192, 64, 64, 0.0345ms, 1.9479Tflops
8192, 128, 128, 0.0396ms, 6.7825Tflops
8192, 256, 256, 0.0485ms, 22.1307Tflops
8192, 512, 512, 0.0863ms, 49.7635Tflops
8192, 1024, 1024, 0.1950ms, 88.0900Tflops
8192, 2048, 2048, 0.5822ms, 118.0305Tflops
8192, 4096, 4096, 2.2901ms, 120.0272Tflops
8192, 8192, 8192, 9.4150ms, 116.7826Tflops
8192, 16384, 16384, 36.7552ms, 119.6578Tflops
8192, 32768, 32768, 146.2553ms, 120.2841Tflops
================================================
FILE: ae/figure5/ab/run.sh
================================================
# Regenerate Figure 5 (a)(b): matmul validation for A100 / TPUv3 / MI210.
# rm -f: do not fail when the globs match nothing (fresh checkout).
rm -f *.csv
rm -f *.pdf
cd ../../.. || exit 1
# Roofline estimates first, then full simulation, for each device.
python -m ae.figure5.ab.test_matmul --simgpu --roofline
python -m ae.figure5.ab.test_matmul --simtpu --roofline
python -m ae.figure5.ab.test_matmul --simamd --roofline
python -m ae.figure5.ab.test_matmul --simgpu
python -m ae.figure5.ab.test_matmul --simtpu
python -m ae.figure5.ab.test_matmul --simamd
cd ae/figure5/ab || exit 1
python plot_matmul.py
================================================
FILE: ae/figure5/ab/test_matmul.py
================================================
from software_model.matmul import Matmul
from software_model.utils import data_type_dict, Tensor
from hardware_model.device import device_dict
import argparse
if __name__ == "__main__":
    # Benchmark / simulate matmul across devices and record CSVs that
    # plot_matmul.py consumes.  Exactly one of the mode flags is expected
    # per invocation (see run.sh).
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", action="store_true", help="Enable GPU")
    parser.add_argument("--simtpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simtpu-new", action="store_true", help="Enable simulation")
    parser.add_argument("--simgpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simamd", action="store_true", help="amd simulation")
    parser.add_argument("--roofline", action="store_true", help="Roofline simulation")
    args = parser.parse_args()

    # Device model for the selected simulation mode.
    if args.simtpu:
        pcb = device_dict["TPUv3"]
    if args.simtpu_new:
        pcb = device_dict["TPUv3_new"]
    if args.simgpu:
        pcb = device_dict["A100_80GB_fp16"]
    MI210 = device_dict["MI210"]
    # NOTE(review): the MI210 *softmax* overhead is reused as the matmul
    # launch overhead -- confirm this is intentional.
    amd_overhead = MI210.compute_module.overhead.softmax

    # Sweep 1: fixed K = N = 12288, M from 2^5 to 2^15.
    K = 12288
    N = K
    title = f"Performance of Matmul with K={K}, N={N}"
    print(f"{title}")
    test_overhead = True
    # file_name stays None in --gpu mode (real-hardware results live under
    # real_hardware/); guarding the writes below fixes a NameError that
    # previously crashed `--gpu` runs on the first iteration.
    file_name = None
    for M in range(5, 16):
        M = 2**M
        model = Matmul(data_type=data_type_dict["fp16"])
        _ = model(
            Tensor([M, K]),
            Tensor([K, N]),
        )
        if args.gpu:
            # Measure the kernel-launch overhead once, on the first size.
            if test_overhead:
                model.gpu_kernel_launch_overhead()
                test_overhead = False
            latency = model.run_on_gpu()
        if args.simtpu:
            if args.roofline:
                latency = model.roofline_model(pcb) + 110e-6
                file_name = "matmul_TPUv3_roofline.csv"
            else:
                latency = (
                    model.compile_and_simulate(pcb, compile_mode="heuristic-TPU")
                    + 110e-6
                )
                file_name = "matmul_TPUv3_sim.csv"
        if args.simtpu_new:
            # Experimental TPU mapping; results are printed but not saved.
            if args.roofline:
                latency = model.roofline_model(pcb) + 110e-6
            else:
                latency = (
                    model.compile_and_simulate(pcb, compile_mode="heuristic-TPU-new")
                    + 110e-6
                )
        if args.simgpu:
            if args.roofline:
                latency = model.roofline_model(pcb) + 2.1e-5
                file_name = "matmul_A100_roofline.csv"
            else:
                latency = (
                    model.compile_and_simulate(pcb, compile_mode="heuristic-GPU")
                    + 2.1e-5
                )
                file_name = "matmul_A100_sim.csv"
        if args.simamd:
            if args.roofline:
                latency = model.roofline_model(pcb_module=MI210) + amd_overhead
                file_name = "matmul_MI210_roofline.csv"
            else:
                latency = (
                    model.compile_and_simulate(
                        pcb_module=MI210, compile_mode="heuristic-GPU"
                    )
                    + amd_overhead
                )
                file_name = "matmul_MI210_sim.csv"
        tflops = 2 * M * N * K / latency / 1e12
        print(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops", flush=True)
        if file_name is not None:
            with open(f"ae/figure5/ab/{file_name}", "a") as f:
                f.write(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops\n")

    # Sweep 2: fixed M = 8192, N = K from 2^5 to 2^15.  file_name carries
    # over from sweep 1 so both sweeps append to the same CSV.
    M = 8192
    print(f"Performance of Matmul with M={M}, N=K")
    for K in range(5, 16):
        K = 2**K
        N = K
        model = Matmul(data_type=data_type_dict["fp16"])
        _ = model(
            Tensor([M, K]),
            Tensor([K, N]),
        )
        if args.gpu:
            latency = model.run_on_gpu()
        if args.simtpu:
            if args.roofline:
                latency = model.roofline_model(pcb) + 110e-6
            else:
                latency = (
                    model.compile_and_simulate(pcb, compile_mode="heuristic-TPU")
                    + 110e-6
                )
        if args.simtpu_new:
            if args.roofline:
                latency = model.roofline_model(pcb) + 110e-6
            else:
                latency = (
                    model.compile_and_simulate(pcb, compile_mode="heuristic-TPU-new")
                    + 110e-6
                )
        if args.simgpu:
            if args.roofline:
                latency = model.roofline_model(pcb) + 2.1e-5
            else:
                latency = (
                    model.compile_and_simulate(pcb, compile_mode="heuristic-GPU")
                    + 2.1e-5
                )
        if args.simamd:
            if args.roofline:
                latency = model.roofline_model(pcb_module=MI210) + amd_overhead
            else:
                latency = (
                    model.compile_and_simulate(
                        pcb_module=MI210, compile_mode="heuristic-GPU"
                    )
                    + amd_overhead
                )
        tflops = 2 * M * N * K / latency / 1e12
        print(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops", flush=True)
        if file_name is not None:
            with open(f"ae/figure5/ab/{file_name}", "a") as f:
                f.write(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops\n")
================================================
FILE: ae/figure5/cf/__init__.py
================================================
================================================
FILE: ae/figure5/cf/plot_softmax.py
================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
def _load_softmax_csv(path):
    """Read one softmax benchmark CSV (rows "M, N, throughput") into a
    DataFrame indexed by (M, N)."""
    df = pd.read_csv(path, header=None, names=["M", "N", "throughput"])
    df.set_index(["M", "N"], inplace=True)
    return df


# Simulated / roofline results produced by test_softmax.py, plus measured
# numbers for the real devices under real_hardware/.
softmax_TPUv3_sim = _load_softmax_csv("softmax_TPUv3_sim.csv")
softmax_TPUv3_roofline = _load_softmax_csv("softmax_TPUv3_roofline.csv")
softmax_A100 = _load_softmax_csv("real_hardware/softmax_A100.csv")
softmax_A100_sim = _load_softmax_csv("softmax_A100_sim.csv")
softmax_A100_roofline = _load_softmax_csv("softmax_A100_roofline.csv")
softmax_MI210 = _load_softmax_csv("real_hardware/softmax_MI210.csv")
softmax_MI210_sim = _load_softmax_csv("softmax_MI210_sim.csv")
softmax_MI210_roofline = _load_softmax_csv("softmax_MI210_roofline.csv")
# Per-vendor color triples: index 0 is used for the roofline, 1 for the
# real-hardware series, 2 for the simulated series.
color_NV = sns.color_palette("Greens_d", 4)[1:]
color_Google = sns.color_palette("Blues_d", 4)[1:]
color_AMD = sns.color_palette("flare", 3)
# Figure 5(f): softmax throughput while sweeping N with M fixed at 4096.
M = 4096
title = f"Performance of softmax with M={M}"
N_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
# Collect series for N = 2^6 .. 2^15.  .values[0] takes the first matching
# row: the CSVs contain (4096, 4096) twice (once per sweep), so .loc can
# return more than one row for that point.
for N in range(6, 16):
    N = 2**N
    N_list.append(N)
    # print(M,N)
    # print(softmax_TPUv3.loc[(M, N), 'throughput'])
    throughput_TPU_sim_list.append(
        softmax_TPUv3_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_TPU_roofline_list.append(
        softmax_TPUv3_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_list.append(softmax_A100.loc[(M, N), "throughput"].values[0])
    throughput_GPU_sim_list.append(softmax_A100_sim.loc[(M, N), "throughput"].values[0])
    throughput_GPU_roofline_list.append(
        softmax_A100_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_list.append(softmax_MI210.loc[(M, N), "throughput"].values[0])
    throughput_AMD_sim_list.append(
        softmax_MI210_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_roofline_list.append(
        softmax_MI210_roofline.loc[(M, N), "throughput"].values[0]
    )
plt.figure(figsize=(3.7, 2))
plt.xscale("log", base=2)
# Per device: roofline (dashed, no marker), real hardware (o), simulated (x).
plt.plot(
    N_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    N_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    N_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    N_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    N_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    N_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    N_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
# No measured TPUv3 series is plotted -- only roofline and simulation.
plt.plot(
    N_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# Legend drawn outside the axes, to the right.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1.1))
# plt.legend()
# plt.title(title)
plt.xlabel("N")
plt.ylabel("G Elements/s")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.savefig("figure5f.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# Figure 5(c): softmax throughput while sweeping M with N fixed at 4096.
N = 4096
title = f"Performance of softmax with N={N}"
M_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
# Collect series for M = 2^6 .. 2^15; .values[0] handles the duplicated
# (4096, 4096) index entry (it appears in both sweeps of the CSVs).
for M in range(6, 16):
    M = 2**M
    M_list.append(M)
    throughput_TPU_sim_list.append(
        softmax_TPUv3_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_TPU_roofline_list.append(
        softmax_TPUv3_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_list.append(softmax_A100.loc[(M, N), "throughput"].values[0])
    throughput_GPU_sim_list.append(softmax_A100_sim.loc[(M, N), "throughput"].values[0])
    throughput_GPU_roofline_list.append(
        softmax_A100_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_list.append(softmax_MI210.loc[(M, N), "throughput"].values[0])
    throughput_AMD_sim_list.append(
        softmax_MI210_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_roofline_list.append(
        softmax_MI210_roofline.loc[(M, N), "throughput"].values[0]
    )
plt.figure(figsize=(3.7, 2.8))
plt.xscale("log", base=2)
# Per device: roofline (dashed, no marker), real hardware (o), simulated (x).
plt.plot(
    M_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    M_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    M_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    M_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    M_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    M_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    M_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
# No measured TPUv3 series is plotted -- only roofline and simulation.
plt.plot(
    M_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# Legend drawn outside the axes, to the right.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1))
# plt.legend()
# plt.title(title)
plt.xlabel("M")
plt.ylabel("G Elements/s")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.savefig("figure5c.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
================================================
FILE: ae/figure5/cf/real_hardware/softmax_A100.csv
================================================
4096, 32, 9.99556025250909
4096, 64, 19.634136210285714
4096, 128, 37.27158060257627
4096, 256, 68.719476736
4096, 512, 111.34294964820253
4096, 1024, 152.9755308210087
4096, 2048, 167.5446289944381
4096, 4096, 204.5603028420465
4096, 8192, 216.51921285435077
4096, 16384, 242.0249154863766
4096, 32768, 240.88573103179803
32, 4096, 8.457781752123077
64, 4096, 16.65926708751515
128, 4096, 30.97215852890141
256, 4096, 57.11748715719481
512, 4096, 96.6603628814066
1024, 4096, 137.438953472
2048, 4096, 166.75057862005687
4096, 4096, 203.9673744280116
8192, 4096, 229.96321626687583
16384, 4096, 247.12465031664266
32768, 4096, 278.826128490001
================================================
FILE: ae/figure5/cf/real_hardware/softmax_MI210.csv
================================================
4096, 32, 5.389762881254902
4096, 64, 10.372751205433962
4096, 128, 13.49094021811043
4096, 256, 36.049561566426235
4096, 512, 60.662710497986204
4096, 1024, 87.52331365381094
4096, 2048, 75.18028224109402
4096, 4096, 80.05545412703526
4096, 8192, 70.97200623062432
4096, 16384, 66.22940628486023
4096, 32768, 61.59189862377593
32, 4096, 4.822419420070175
64, 4096, 9.012390391606559
128, 4096, 15.820311191021583
256, 4096, 28.28325730613505
512, 4096, 43.76165682690547
1024, 4096, 60.662710497986204
2048, 4096, 72.54509709037526
4096, 4096, 80.28379255865829
8192, 4096, 84.78161949116145
16384, 4096, 86.98237846435599
32768, 4096, 88.01594018469544
================================================
FILE: ae/figure5/cf/run.sh
================================================
rm *.csv
rm *.pdf
cd ../../..
python -m ae.figure5.cf.test_softmax --simgpu --roofline
python -m ae.figure5.cf.test_softmax --simtpu --roofline
python -m ae.figure5.cf.test_softmax --simamd --roofline
python -m ae.figure5.cf.test_softmax --simgpu
python -m ae.figure5.cf.test_softmax --simtpu
python -m ae.figure5.cf.test_softmax --simamd
cd ae/figure5/cf
python plot_softmax.py
================================================
FILE: ae/figure5/cf/test_softmax.py
================================================
from software_model.softmax import Softmax
from software_model.utils import data_type_dict, Tensor
from hardware_model.device import device_dict
import argparse
if __name__ == "__main__":
    # Benchmark / simulate softmax across devices and record CSVs that
    # plot_softmax.py consumes.  Exactly one of the mode flags is expected
    # per invocation (see run.sh).
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", action="store_true", help="Enable GPU")
    parser.add_argument("--simgpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simtpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simamd", action="store_true", help="amd simulation")
    parser.add_argument("--roofline", action="store_true", help="Roofline simulation")
    args = parser.parse_args()

    A100 = device_dict["A100_80GB_fp16"]
    TPU = device_dict["TPUv3"]
    MI210 = device_dict["MI210"]
    # Fixed per-device kernel-launch overheads added on top of the model latency.
    tpu_overhead = 300e-6
    gpu_overhead = 12e-6
    amd_overhead = MI210.compute_module.overhead.softmax
    if args.gpu:
        gpu_kernel_launch_overhead = Softmax.gpu_kernel_launch_overhead()
    print(f"Performance of Softmax")

    # file_name stays None when only --gpu is given (real-hardware numbers
    # live under real_hardware/); guarding the writes below fixes a
    # NameError that previously crashed `--gpu` runs.
    file_name = None
    # Sweep 1: fixed M = 4096, N from 2^5 to 2^15.
    M = 2**12
    for N in range(5, 16):
        N = 2**N
        if args.simtpu:
            # TPU path simulates in fp32.
            model = Softmax(data_type=data_type_dict["fp32"])
            _ = model(Tensor([M, N], data_type=data_type_dict["fp32"]))
            if args.roofline:
                latency = model.roofline_model(pcb_module=TPU) + tpu_overhead
                file_name = "softmax_TPUv3_roofline.csv"
            else:
                latency = model.compile_and_simulate(pcb_module=TPU) + tpu_overhead
                file_name = "softmax_TPUv3_sim.csv"
        else:
            model = Softmax(data_type=data_type_dict["fp16"])
            _ = model(
                Tensor([M, N]),
            )
            if args.gpu:
                latency = model.run_on_gpu()
            if args.simgpu:
                if args.roofline:
                    latency = model.roofline_model(pcb_module=A100) + gpu_overhead
                    file_name = "softmax_A100_roofline.csv"
                else:
                    latency = model.compile_and_simulate(pcb_module=A100) + gpu_overhead
                    file_name = "softmax_A100_sim.csv"
            if args.simamd:
                # AMD path also simulates in fp32.
                model = Softmax(data_type=data_type_dict["fp32"])
                _ = model(Tensor([M, N], data_type=data_type_dict["fp32"]))
                if args.roofline:
                    latency = model.roofline_model(pcb_module=MI210) + amd_overhead
                    file_name = "softmax_MI210_roofline.csv"
                else:
                    latency = (
                        model.compile_and_simulate(pcb_module=MI210) + amd_overhead
                    )
                    file_name = "softmax_MI210_sim.csv"
        # Throughput in G elements/s.
        print(f"{M}, {N}, {M*N/latency/1e9}")
        if file_name is not None:
            with open(f"ae/figure5/cf/{file_name}", "a") as f:
                f.write(f"{M}, {N}, {M*N/latency/1e9}\n")

    # Sweep 2: fixed N = 4096, M from 2^5 to 2^15; file_name carries over
    # from sweep 1, so both sweeps append to the same CSV.
    N = 2**12
    for M in range(5, 16):
        M = 2**M
        if args.simtpu:
            model = Softmax(data_type=data_type_dict["fp32"])
            _ = model(Tensor([M, N], data_type=data_type_dict["fp32"]))
            if args.roofline:
                latency = model.roofline_model(pcb_module=TPU) + tpu_overhead
            else:
                latency = model.compile_and_simulate(pcb_module=TPU) + tpu_overhead
        else:
            model = Softmax(data_type=data_type_dict["fp16"])
            _ = model(
                Tensor([M, N]),
            )
            if args.gpu:
                latency = model.run_on_gpu()
            if args.simgpu:
                if args.roofline:
                    latency = model.roofline_model(pcb_module=A100) + gpu_overhead
                else:
                    latency = model.compile_and_simulate(pcb_module=A100) + gpu_overhead
            if args.simamd:
                model = Softmax(data_type=data_type_dict["fp32"])
                _ = model(Tensor([M, N], data_type=data_type_dict["fp32"]))
                if args.roofline:
                    latency = model.roofline_model(pcb_module=MI210) + amd_overhead
                else:
                    latency = (
                        model.compile_and_simulate(pcb_module=MI210) + amd_overhead
                    )
        print(f"{M}, {N}, {M*N/latency/1e9}")
        if file_name is not None:
            with open(f"ae/figure5/cf/{file_name}", "a") as f:
                f.write(f"{M}, {N}, {M*N/latency/1e9}\n")
================================================
FILE: ae/figure5/de/__init__.py
================================================
================================================
FILE: ae/figure5/de/plot_layernorm.py
================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Rebuild figures 5d/5e: layernorm throughput (G elements/s) on real and
# simulated hardware, sweeping one of (M, N) while the other is fixed at 4096.


def _load_table(path):
    """Read a throughput CSV (columns M, N, throughput) indexed by (M, N)."""
    table = pd.read_csv(path, header=None, names=["M", "N", "throughput"])
    table.set_index(["M", "N"], inplace=True)
    return table


layernorm_TPUv3_sim = _load_table("layernorm_TPUv3_sim.csv")
layernorm_TPUv3_roofline = _load_table("layernorm_TPUv3_roofline.csv")
layernorm_A100 = _load_table("real_hardware/layernorm_A100.csv")
layernorm_A100_sim = _load_table("layernorm_A100_sim.csv")
layernorm_A100_roofline = _load_table("layernorm_A100_roofline.csv")

# Color families: greens for NVIDIA, blues for Google, flare reserved for AMD.
color_NV = sns.color_palette("Greens_d", 4)[1:]
color_Google = sns.color_palette("Blues_d", 4)[1:]
color_AMD = sns.color_palette("flare", 3)


def _column(table, keys):
    """Extract throughput values for a list of (M, N) index keys.

    .values[0] is used because (4096, 4096) appears twice in the real-hardware
    CSVs, making the index non-unique.
    """
    return [table.loc[key, "throughput"].values[0] for key in keys]


def _draw(x_values, x_label, gpu_roofline, gpu_real, gpu_sim, tpu_roofline,
          tpu_sim, out_file):
    """Render one throughput-vs-size panel and save it as a PDF."""
    plt.figure(figsize=(3.64, 2))
    plt.xscale("log", base=2)
    # Rooflines are dashed without markers; real runs use "o", simulated "x".
    plt.plot(x_values, gpu_roofline, marker=" ", linewidth=1.5, linestyle="--",
             label="Roofline of NVIDIA A100", color=color_NV[0])
    plt.plot(x_values, gpu_real, marker="o", label="Real NVIDIA A100",
             color=color_NV[1])
    plt.plot(x_values, gpu_sim, marker="x", label="Simulated NVIDIA A100",
             color=color_NV[2])
    plt.plot(x_values, tpu_roofline, marker=" ", linewidth=1.5, linestyle="--",
             label="Roofline of Google TPUv3", color=color_Google[0])
    plt.plot(x_values, tpu_sim, marker="x", label="Simulated Google TPUv3",
             color=color_Google[2])
    plt.xlabel(x_label)
    plt.ylabel("G Elements/s")
    plt.grid(True, which="both", ls="--", c="0.7")  # grid for readability
    plt.savefig(out_file, bbox_inches="tight", pad_inches=0.01, dpi=300)


# Figure 5e: fix M = 4096 and sweep N over powers of two.
M = 4096
title = f"Performance of layernorm with M={M}"
N_list = [2**exp for exp in range(6, 16)]
keys = [(M, n) for n in N_list]
_draw(
    N_list,
    "N",
    _column(layernorm_A100_roofline, keys),
    _column(layernorm_A100, keys),
    _column(layernorm_A100_sim, keys),
    _column(layernorm_TPUv3_roofline, keys),
    _column(layernorm_TPUv3_sim, keys),
    "figure5e.pdf",
)

# Figure 5d: fix N = 4096 and sweep M over powers of two.
N = 4096
title = f"Performance of layernorm with N={N}"
M_list = [2**exp for exp in range(6, 16)]
keys = [(m, N) for m in M_list]
_draw(
    M_list,
    "M",
    _column(layernorm_A100_roofline, keys),
    _column(layernorm_A100, keys),
    _column(layernorm_A100_sim, keys),
    _column(layernorm_TPUv3_roofline, keys),
    _column(layernorm_TPUv3_sim, keys),
    "figure5d.pdf",
)
================================================
FILE: ae/figure5/de/real_hardware/layernorm_A100.csv
================================================
4096, 32, 2.476377540036036
4096, 64, 4.8436635584845815
4096, 128, 10.064179659276888
4096, 256, 19.590407621844097
4096, 512, 39.26827242057143
4096, 1024, 78.888726656574
4096, 2048, 139.06866438273516
4096, 4096, 211.31755008307508
4096, 8192, 265.2921552409576
4096, 16384, 210.44858071824746
4096, 32768, 199.2743197951547
32, 4096, 2.4988900631272726
64, 4096, 5.020601040073059
128, 4096, 9.99556025250909
256, 4096, 20.082404160292235
512, 4096, 40.16480832058447
1024, 4096, 79.6026517846878
2048, 4096, 141.87246810012903
4096, 4096, 211.6353208350797
8192, 4096, 284.89370112414576
16384, 4096, 345.7923546813956
32768, 4096, 385.84643825998086
================================================
FILE: ae/figure5/de/real_hardware/layernorm_MI210.csv
================================================
4096, 32, 2.3695247806698823
4096, 64, 4.729122950631834
4096, 128, 9.17590195596949
4096, 256, 18.43588592802619
4096, 512, 31.59641013907877
4096, 1024, 52.44807583429779
4096, 2048, 83.74459902278055
4096, 4096, 89.41186119969811
4096, 8192, 86.39576298097589
4096, 16384, 86.3601901230308
4096, 32768, 78.76990948928942
32, 4096, 3.885694391427885
64, 4096, 6.742243759280835
128, 4096, 11.78431162745035
256, 4096, 22.64512970663591
512, 4096, 39.53264699736632
1024, 4096, 58.88026281772146
2048, 4096, 75.69152349271899
4096, 4096, 89.32684940809148
8192, 4096, 97.43158473170392
16384, 4096, 101.13477801032278
32768, 4096, 104.08326540288465
================================================
FILE: ae/figure5/de/run.sh
================================================
# Regenerate figures 5d/5e (layernorm) from scratch.
# -f: do not fail on the first run, when no stale outputs exist yet.
rm -f *.csv
rm -f *.pdf
cd ../../..
python -m ae.figure5.de.test_layernorm --simgpu --roofline
python -m ae.figure5.de.test_layernorm --simtpu --roofline
python -m ae.figure5.de.test_layernorm --simgpu
python -m ae.figure5.de.test_layernorm --simtpu
cd ae/figure5/de
python plot_layernorm.py
================================================
FILE: ae/figure5/de/test_layernorm.py
================================================
from software_model.layernorm import LayerNorm
from software_model.utils import data_type_dict, Tensor
from hardware_model.device import device_dict
import argparse
if __name__ == "__main__":
    # Sweep LayerNorm performance over (M, N) shapes on simulated A100 / TPUv3 /
    # MI210 (and optionally real GPU/AMD hardware) and append throughput rows
    # "M, N, G-elements/s" to the per-device CSV consumed by plot_layernorm.py.
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", action="store_true", help="Enable GPU")
    parser.add_argument("--amd", action="store_true", help="Enable AMD")
    parser.add_argument("--simgpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simtpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simamd", action="store_true", help="Enable simulation")
    parser.add_argument("--roofline", action="store_true", help="Roofline simulation")
    args = parser.parse_args()
    A100 = device_dict["A100_80GB_fp16"]
    TPU = device_dict["TPUv3"]
    MI210 = device_dict["MI210"]
    if args.gpu:
        gpu_kernel_launch_overhead = LayerNorm.gpu_kernel_launch_overhead()
    print(f"Performance of LayerNorm")

    def _measure(M, N):
        """Run one (M, N) LayerNorm data point under the selected backend(s).

        Returns (latency_seconds, csv_file_name). latency is None when no
        backend flag was selected; file_name is None for modes that do not log
        to a CSV (--gpu/--amd). Previously an unset file_name caused a
        NameError at the CSV write below.
        """
        latency = None
        file_name = None
        if args.simtpu:
            # TPU paths simulate in fp32; 140e-6 is the TPU launch overhead.
            model = LayerNorm(data_type=data_type_dict["fp32"])
            _ = model(Tensor([M, N], data_type=data_type_dict["fp32"]))
            if args.roofline:
                latency = model.roofline_model(TPU) + 140e-6
                file_name = "layernorm_TPUv3_roofline.csv"
            else:
                latency = (
                    model.compile_and_simulate(
                        pcb_module=TPU, compile_mode="heuristic-TPU"
                    )
                    + 140e-6
                )
                file_name = "layernorm_TPUv3_sim.csv"
        else:
            model = LayerNorm(data_type=data_type_dict["fp16"])
            _ = model(
                Tensor([M, N]),
            )
            if args.gpu:
                latency = model.run_on_gpu()
            if args.amd:
                latency = model.run_on_amd()
            if args.simgpu:
                # 4.5e-5: measured A100 kernel launch overhead.
                if args.roofline:
                    latency = model.roofline_model(A100) + 4.5e-5
                    file_name = "layernorm_A100_roofline.csv"
                else:
                    latency = (
                        model.compile_and_simulate(
                            pcb_module=A100, compile_mode="heuristic-GPU"
                        )
                        + 4.5e-5
                    )
                    file_name = "layernorm_A100_sim.csv"
            if args.simamd:
                # MI210 simulation uses fp32, matching the real-hardware runs.
                model = LayerNorm(data_type=data_type_dict["fp32"])
                _ = model(Tensor([M, N], data_type=data_type_dict["fp32"]))
                if args.roofline:
                    latency = (
                        model.roofline_model(MI210)
                        + MI210.compute_module.overhead.layernorm
                    )
                    file_name = "layernorm_MI210_roofline.csv"
                else:
                    latency = (
                        model.compile_and_simulate(
                            pcb_module=MI210, compile_mode="heuristic-GPU"
                        )
                        + MI210.compute_module.overhead.layernorm
                    )
                    file_name = "layernorm_MI210_sim.csv"
        return latency, file_name

    def _record(M, N, latency, file_name):
        """Print throughput for one data point and append it to the CSV log."""
        if latency is None:
            # No backend selected: skip instead of crashing (was a NameError).
            return
        throughput = M * N / latency / 1e9
        print(f"{M}, {N}, {throughput}")
        if file_name is not None:
            with open(f"ae/figure5/de/{file_name}", "a") as f:
                f.write(f"{M}, {N}, {throughput}\n")

    # Sweep N with M fixed at 4096 (figure 5e) ...
    M = 2**12
    for exp in range(5, 16):
        N = 2**exp
        latency, file_name = _measure(M, N)
        _record(M, N, latency, file_name)
    # ... then sweep M with N fixed at 4096 (figure 5d).
    N = 2**12
    for exp in range(5, 16):
        M = 2**exp
        latency, file_name = _measure(M, N)
        _record(M, N, latency, file_name)
================================================
FILE: ae/figure5/g/__init__.py
================================================
================================================
FILE: ae/figure5/g/plot_gelu.py
================================================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Rebuild figure 5g: GeLU throughput (G elements/s) versus element count on
# real and simulated NVIDIA A100 and AMD MI210, plus simulated Google TPUv3.


def _load_table(path):
    """Read a GeLU throughput CSV with columns (M, throughput)."""
    return pd.read_csv(path, header=None, names=["M", "throughput"])


gelu_TPUv3_sim = _load_table("gelu_TPUv3_sim.csv")
gelu_TPUv3_roofline = _load_table("gelu_TPUv3_roofline.csv")
gelu_A100 = _load_table("real_hardware/gelu_A100.csv")
gelu_A100_sim = _load_table("gelu_A100_sim.csv")
gelu_A100_roofline = _load_table("gelu_A100_roofline.csv")
gelu_MI210 = _load_table("real_hardware/gelu_MI210.csv")
gelu_MI210_sim = _load_table("gelu_MI210_sim.csv")
gelu_MI210_roofline = _load_table("gelu_MI210_roofline.csv")

# Color families: greens for NVIDIA, blues for Google, flare for AMD.
color_NV = sns.color_palette("Greens_d", 4)[1:]
color_Google = sns.color_palette("Blues_d", 4)[1:]
color_AMD = sns.color_palette("flare", 3)

M = 4096
title = f"Performance of gelu with M={M}"


def _lookup(table, m):
    """First throughput entry recorded for element count m."""
    return table[table["M"] == m]["throughput"].iloc[0]


# Element counts swept by test_gelu.py: 2**10 .. 2**29.
M_list = [2**exp for exp in range(10, 30)]
throughput_TPU_sim_list = [_lookup(gelu_TPUv3_sim, m) for m in M_list]
throughput_TPU_roofline_list = [_lookup(gelu_TPUv3_roofline, m) for m in M_list]
throughput_GPU_list = [_lookup(gelu_A100, m) for m in M_list]
throughput_GPU_sim_list = [_lookup(gelu_A100_sim, m) for m in M_list]
throughput_GPU_roofline_list = [_lookup(gelu_A100_roofline, m) for m in M_list]
throughput_AMD_list = [_lookup(gelu_MI210, m) for m in M_list]
throughput_AMD_sim_list = [_lookup(gelu_MI210_sim, m) for m in M_list]
throughput_AMD_roofline_list = [_lookup(gelu_MI210_roofline, m) for m in M_list]

plt.figure(figsize=(6, 2.3))
plt.xscale("log", base=2)
# Rooflines are dashed lines without markers; real runs use "o", simulated "x".
plt.plot(M_list, throughput_GPU_roofline_list, marker=" ", linewidth=1.5,
         linestyle="--", label="Roofline of NVIDIA A100", color=color_NV[0])
plt.plot(M_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100",
         color=color_NV[1])
plt.plot(M_list, throughput_GPU_sim_list, marker="x",
         label="Simulated NVIDIA A100", color=color_NV[2])
plt.plot(M_list, throughput_AMD_roofline_list, marker=" ", linewidth=1.5,
         linestyle="--", label="Roofline of AMD MI210", color=color_AMD[0])
plt.plot(M_list, throughput_AMD_list, marker="o", label="Real AMD MI210",
         color=color_AMD[1])
plt.plot(M_list, throughput_AMD_sim_list, marker="x",
         label="Simulated AMD MI210", color=color_AMD[2])
# The TPU roofline uses a thicker dash (3.5) than the others.
plt.plot(M_list, throughput_TPU_roofline_list, marker=" ", linewidth=3.5,
         linestyle="--", label="Roofline of Google TPUv3", color=color_Google[0])
plt.plot(M_list, throughput_TPU_sim_list, marker="x",
         label="Simulated Google TPUv3", color=color_Google[2])
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1.05))
plt.xlabel("# Elements")
plt.ylabel("G Elements/s")
plt.grid(True, which="both", ls="--", c="0.7")  # grid for readability
plt.savefig(f"figure5g.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
================================================
FILE: ae/figure5/g/real_hardware/gelu_A100.csv
================================================
1024, 0.021262214336633663
2048, 0.04338350804040404
4096, 0.08547198599004975
8192, 0.1700977146930693
16384, 0.3368601800784314
32768, 0.6889170600100251
65536, 1.38129601479397
131072, 2.735103551681592
262144, 5.52518405917588
524288, 11.19095804352163
1048576, 21.935394070344138
2097152, 43.980465111040004
4194304, 86.66101499712316
8388608, 141.87246810012903
16777216, 210.36993775086398
33554432, 277.042299912063
67108864, 328.2507017033889
134217728, 360.17271492086496
268435456, 379.53814489891255
536870912, 390.32758080867535
================================================
FILE: ae/figure5/g/real_hardware/gelu_MI210.csv
================================================
1024, 0.047197442813186816
2048, 0.09761289309090909
4096, 0.19522578618181818
8192, 0.39045157236363637
16384, 0.7898790429425288
32768, 1.5618062894545455
65536, 3.0885158083595505
131072, 6.108397932088889
262144, 11.822705675010754
524288, 21.55905152501961
1048576, 37.27158060257627
2097152, 59.43306096086487
4194304, 87.96093022208001
8388608, 114.60707520792182
16777216, 133.52702880012146
33554432, 146.984322042118
67108864, 151.08694402074934
134217728, 152.91320207016489
268435456, 154.04294798777178
536870912, 154.44972829556897
================================================
FILE: ae/figure5/g/run.sh
================================================
# Regenerate figure 5g (GeLU) from scratch.
# -f: do not fail on the first run, when no stale outputs exist yet.
rm -f *.csv
rm -f *.pdf
cd ../../..
python -m ae.figure5.g.test_gelu --simgpu --roofline
python -m ae.figure5.g.test_gelu --simtpu --roofline
python -m ae.figure5.g.test_gelu --simamd --roofline
python -m ae.figure5.g.test_gelu --simgpu
python -m ae.figure5.g.test_gelu --simtpu
python -m ae.figure5.g.test_gelu --simamd
cd ae/figure5/g
python plot_gelu.py
================================================
FILE: ae/figure5/g/test_gelu.py
================================================
from software_model.gelu import GeLU
from software_model.utils import data_type_dict, Tensor
from hardware_model.device import device_dict
import argparse
if __name__ == "__main__":
    # Sweep GeLU throughput over element counts 2**10 .. 2**29 on the selected
    # backend(s) and append rows "M, G-elements/s" to the per-device CSV used
    # by plot_gelu.py.
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", action="store_true", help="Enable GPU")
    parser.add_argument("--simgpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simtpu", action="store_true", help="Enable simulation")
    parser.add_argument("--roofline", action="store_true", help="Roofline simulation")
    parser.add_argument("--amd", action="store_true", help="Enable AMD")
    parser.add_argument("--simamd", action="store_true", help="Enable simulation")
    args = parser.parse_args()
    A100 = device_dict["A100_80GB_fp16"]
    TPU = device_dict["TPUv3"]
    MI210 = device_dict["MI210"]
    if args.gpu:
        gpu_kernel_launch_overhead = GeLU.gpu_kernel_launch_overhead()
    print(f"Performance of GELU")
    for exp in range(10, 30):
        M = 2**exp
        # Fix: initialize both so that running with no backend flag, or with
        # --gpu/--amd only, no longer raises NameError at the CSV write below.
        latency = None
        file_name = None
        if args.simtpu:
            # TPU simulation runs in fp32; 100e-6 is the TPU launch overhead.
            model = GeLU(data_type=data_type_dict["fp32"])
            _ = model(Tensor([M], data_type=data_type_dict["fp32"]))
            if args.roofline:
                latency = model.roofline_model(TPU) + 100e-6
                file_name = "gelu_TPUv3_roofline.csv"
            else:
                latency = (
                    model.compile_and_simulate(
                        pcb_module=TPU, compile_mode="heuristic-TPU"
                    )
                    + 100e-6
                )
                file_name = "gelu_TPUv3_sim.csv"
        else:
            model = GeLU(data_type=data_type_dict["fp16"])
            _ = model(
                Tensor([M]),
            )
            if args.gpu:
                latency = model.run_on_gpu()
            if args.amd:
                model.amd_kernel_launch_overhead()
                latency = model.run_on_amd()
            if args.simgpu:
                # 4.5e-5: measured A100 kernel launch overhead.
                if args.roofline:
                    latency = model.roofline_model(A100) + 4.5e-5
                    file_name = "gelu_A100_roofline.csv"
                else:
                    latency = (
                        model.compile_and_simulate(
                            pcb_module=A100, compile_mode="heuristic-GPU"
                        )
                        + 4.5e-5
                    )
                    file_name = "gelu_A100_sim.csv"
            if args.simamd:
                # NOTE(review): unlike the layernorm/softmax scripts, --simamd
                # here reuses the fp16 model instead of rebuilding it in fp32 —
                # confirm this is intended.
                if args.roofline:
                    latency = (
                        model.roofline_model(MI210) + MI210.compute_module.overhead.gelu
                    )
                    file_name = "gelu_MI210_roofline.csv"
                else:
                    latency = (
                        model.compile_and_simulate(
                            pcb_module=MI210, compile_mode="heuristic-GPU"
                        )
                        + MI210.compute_module.overhead.gelu
                    )
                    file_name = "gelu_MI210_sim.csv"
        if latency is None:
            continue
        print(f"{M}, {M/latency/1e9}")
        if file_name is not None:
            with open(f"ae/figure5/g/{file_name}", "a") as f:
                f.write(f"{M}, {M/latency/1e9}\n")
================================================
FILE: ae/figure5/h/__init__.py
================================================
================================================
FILE: ae/figure5/h/run.sh
================================================
# Regenerate figure 5h (all-reduce bandwidth) from scratch.
# -f: do not fail on the first run, when no stale outputs exist yet.
rm -f *.csv
rm -f *.pdf
cd ../../..
python -m ae.figure5.h.test_allreduce
================================================
FILE: ae/figure5/h/test_allreduce.py
================================================
from software_model.communication_primitives import AllReduceMultiPCB
from software_model.utils import data_type_dict, Tensor
from hardware_model.interconnect import interconnect_module_dict
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
if __name__ == "__main__":
    # Figure 5h: all-reduce bandwidth of a 4x A100 NVLink node (measured vs.
    # simulated) and a simulated 8-chip TPUv3 node, across message sizes.
    interconnect_module = interconnect_module_dict["NVLinkV3_FC_4"]
    # Measured all-reduce latencies (us) on a real 4x A100 NVLink node, one
    # entry per message size in size_list below.
    gpu_latency_list = [
        12.52,
        13.92,
        12.39,
        13.22,
        12.35,
        12.45,
        13.12,
        13.02,
        15.12,
        15.23,
        15.99,
        17.39,
        20.00,
        22.93,
        28.66,
        35.93,
        47.27,
        60.75,
        66.40,
        84.75,
        128.8,
        195.7,
        279.7,
        532.3,
        961.7,
        1883.7,
        3659.0,
        7219.2,
        14136,
        27944,
        55384,
        110277,
    ]
    # Message sizes in bytes: 2**3 .. 2**34 (was a hand-written literal list).
    size_list = [2**i for i in range(3, 35)]
    assert len(size_list) == len(gpu_latency_list)
    simulated_latency_list = []
    data_type = data_type_dict["fp16"]
    for data_size in size_list:
        model = AllReduceMultiPCB(data_type=data_type)
        # fp16 elements are 2 bytes, so data_size bytes correspond to
        # data_size // 2 elements. Fix: use integer division for the element
        # count (the TPU loop below already did; this loop passed a float).
        _ = model(
            Tensor([data_size // 2]),
        )
        our_latency = model.simulate(interconnect_module=interconnect_module)
        simulated_latency_list.append(our_latency * 1e6)  # seconds -> us
    # bytes / us / 1e3 = GB/s.
    gpu_bandwidth_list = np.array(size_list) / np.array(gpu_latency_list) / 1e3
    simulated_gpu_bandwidth_list = (
        np.array(size_list) / np.array(simulated_latency_list) / 1e3
    )
    # Drop the 9 smallest sizes, where launch overhead dominates the plot.
    size_list = size_list[9:]
    gpu_bandwidth_list = gpu_bandwidth_list[9:]
    simulated_gpu_bandwidth_list = simulated_gpu_bandwidth_list[9:]
    color_NV = sns.color_palette("Greens_d", 4)[1:]
    color_Google = sns.color_palette("Blues_d", 4)[1:]
    plt.figure(figsize=(6, 2.3))
    plt.xscale("log", base=2)
    plt.plot(
        size_list,
        gpu_bandwidth_list,
        marker="o",
        label="Real NVIDIA A100 Node",
        color=color_NV[0],
    )
    plt.plot(
        size_list,
        simulated_gpu_bandwidth_list,
        marker="x",
        label="Simulated NVIDIA A100 Node",
        color=color_NV[2],
    )
    # Repeat the simulation for an 8-chip TPUv3 node over the truncated sizes.
    interconnect_module = interconnect_module_dict["TPUv3Link_8"]
    simulated_tpu_bandwidth_list = []
    data_type = data_type_dict["fp16"]
    for data_size in size_list:
        model = AllReduceMultiPCB(data_type=data_type)
        _ = model(
            Tensor([data_size // 2]),
        )
        our_latency = model.simulate(interconnect_module=interconnect_module)
        # bytes / s / 1e9 = GB/s.
        simulated_tpu_bandwidth_list.append(data_size / our_latency / 1e9)
    plt.plot(
        size_list,
        simulated_tpu_bandwidth_list,
        marker="x",
        label="Simulated Google TPU v3 Node",
        color=color_Google[2],
    )
    plt.xlabel("Data Size (Bytes)")
    plt.ylabel("Bandwidth (GB/s)")
    plt.grid(
        True, which="both", ls="--", c="0.7"
    )  # Adding a grid for better readability
    plt.legend()
    plt.savefig(
        "ae/figure5/h/figure5h.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300
    )
================================================
FILE: ae/figure5/ijkl/__init__.py
================================================
================================================
FILE: ae/figure5/ijkl/plot_transformer.py
================================================
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd
def read_csv(filename: str):
    """Read a one-column CSV of numbers and return them as a list of floats."""
    with open(filename, "r") as csvfile:
        return [float(row[0]) for row in csv.reader(csvfile)]
# Per-operator categories of one GPT-3 transformer layer, in stacking order;
# also used as column names when reading the simulator CSV output.
categories = [
    "Q_K_V",
    "Q_mul_K",
    "A_mul_V",
    "Wo_proj",
    "W1_proj",
    "W2_proj",
    "Softmax",
    "LayerNorm_MHA",
    "LayerNorm_FFN",
    "GeLU",
    "AllReduce_MHA",
    "AllReduce_FFN",
]
# One color family per operator class: 6 matmuls, 3 normalizations, GeLU,
# 2 all-reduces — 12 colors total, matching `categories`.
colors_matmul = sns.color_palette("flare_r", 6)
colors_normalization = sns.color_palette("summer", 3)
colors_gelu = sns.color_palette("pink", 1)
colors_allreduce = sns.color_palette("Blues_r", 2)
colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce
# Prefilling (initial computation) data: simulator CSVs hold one 12-column row
# of per-operator latencies (seconds); the real-hardware CSV has one per line.
# values_simgpu = read_csv("transformer_A100_sim.csv")
values_simgpu = pd.read_csv("transformer_A100_sim.csv", header=None, names=categories, index_col=None).iloc[0].tolist()
print(values_simgpu)
values_gpu = read_csv("real_hardware/transformer_A100.csv")
# values_gpu_roofline = read_csv("transformer_A100_roofline.csv")
values_gpu_roofline = pd.read_csv("transformer_A100_roofline.csv", header=None, names=categories, index_col=None).iloc[0].tolist()
# values_simtpu = read_csv("transformer_TPUv3_sim.csv")
values_simtpu = pd.read_csv("transformer_TPUv3_sim.csv", header=None, names=categories, index_col=None).iloc[0].tolist()
# values_tpu_roofline = read_csv("transformer_TPUv3_roofline.csv")
values_tpu_roofline = pd.read_csv("transformer_TPUv3_roofline.csv", header=None, names=categories, index_col=None).iloc[0].tolist()
# ---- Figure 5i: A100 prefilling, real vs. simulated vs. roofline -----------
plt.figure(figsize=(3, 2.8))
# Create the stacked bar graph
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_gpu)):
    plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5)
    bottom += value
value_gt = bottom  # total measured latency (ground truth)
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_simgpu)):
    plt.bar(2, value, bottom=bottom, color=colors[i], width=0.5)
    bottom += value
value_sim = bottom  # total simulated latency
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_gpu_roofline)):
    plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5)
    bottom += value
value_roofline = bottom  # total roofline latency
# Report simulated/roofline totals relative to the measured total.
print(f"gpu prefilling: {value_sim/value_gt}, {value_roofline/value_gt}")
# Set the title, legend, and display the graph
# plt.title(
#     "GPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Initial computation, batch size = 8, sequence length = 2048)"
# )
plt.ylabel("Latency (s)")
# plt.xlabel('Bar Sets')
plt.xticks([1, 2, 3], ["Real\nA100", "Simulated\nA100", "Roofline\nModel"])
# handles, labels = plt.gca().get_legend_handles_labels()
# plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.savefig("figure5i.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# ---- Figure 5j: TPUv3 prefilling, simulated vs. roofline --------------------
# (no real-TPU measurement available; bar position 1 is left empty)
plt.figure(figsize=(3, 2.8))
# Create the stacked bar graph
# bottom = 0
# for i, (category, value) in enumerate(zip(categories, values_tpu)):
#     plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5)
#     bottom += value
# value_gt = bottom
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_simtpu)):
    plt.bar(2, value, bottom=bottom, color=colors[i], width=0.5)
    bottom += value
value_sim = bottom
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_tpu_roofline)):
    plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5)
    bottom += value
value_roofline = bottom
# print(f"tpu prefilling: {value_sim/value_gt}, {value_roofline/value_gt}")
# Set the title, legend, and display the graph
# plt.title(
#     "TPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Initial computation, batch size = 8, sequence length = 2048)"
# )
plt.ylabel("Latency (s)")
# plt.xlabel('Bar Sets')
plt.xticks([1, 2, 3], ["Real\nTPUv3", "Simulated\nTPUv3", "Roofline\nModel"])
# handles, labels = plt.gca().get_legend_handles_labels()
# plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.savefig("figure5j.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# ---- Figure 5k: A100 auto-regression (AR), values scaled s -> ms ------------
# values_simgpu = read_csv("transformerAR_A100_sim.csv")
values_simgpu=pd.read_csv("transformerAR_A100_sim.csv",header=None,names=categories,index_col=None).iloc[0].tolist()
values_gpu = read_csv("real_hardware/transformerAR_A100.csv")
# values_gpu_roofline = read_csv("transformerAR_A100_roofline.csv")
values_gpu_roofline=pd.read_csv("transformerAR_A100_roofline.csv",header=None,names=categories,index_col=None).iloc[0].tolist()
plt.figure(figsize=(3, 2.8))
# Create the stacked bar graph
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_gpu)):
    value = value * 1e3  # seconds -> milliseconds
    plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5)
    bottom += value
value_gt = bottom  # total measured AR latency (ms); also read by figure 5l below
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_simgpu)):
    value = value * 1e3
    plt.bar(2, value, bottom=bottom, color=colors[i], width=0.5)
    bottom += value
value_sim = bottom
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_gpu_roofline)):
    value = value * 1e3
    plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5)
    bottom += value
value_roofline = bottom
print(value_sim / value_gt, value_roofline / value_gt)
# Set the title, legend, and display the graph
# plt.title(
#     "GPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Auto regression, batch size = 8, sequence length = 2048)"
# )
plt.ylabel("Latency (ms)")
# plt.xlabel('Bar Sets')
plt.xticks([1, 2, 3], ["Real\nA100", "Simulated\nA100", "Roofline\nModel"])
# handles, labels = plt.gca().get_legend_handles_labels()
# plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.savefig("figure5k.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# ---- Figure 5l: TPUv3 auto-regression, simulated vs. roofline ---------------
# values_simtpu = read_csv("transformerAR_TPUv3_sim.csv")
values_simtpu = pd.read_csv("transformerAR_TPUv3_sim.csv", header=None, names=categories, index_col=None).iloc[0].tolist()
# values_tpu = read_csv("real_hardware/transformerAR_TPUv3.csv")
# values_tpu_roofline=read_csv("transformerAR_TPUv3_roofline.csv")
values_tpu_roofline=pd.read_csv("transformerAR_TPUv3_roofline.csv",header=None,names=categories,index_col=None).iloc[0].tolist()
plt.figure(figsize=(4.5, 2.8))
# Create the stacked bar graph
# bottom = 0
# for i, (category, value) in enumerate(zip(categories, values_tpu)):
#     value=value*1e3
#     plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5)
#     bottom += value
# value_gt=bottom
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_simtpu)):
    value=value*1e3
    plt.bar(2, value, bottom=bottom, color=colors[i], label=category,width=0.5)
    bottom += value
value_sim=bottom
bottom = 0
for i, (category, value) in enumerate(zip(categories, values_tpu_roofline)):
    value=value*1e3
    plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5)
    bottom += value
value_roofline=bottom
# NOTE(review): value_gt here is stale — since the real-TPU section above is
# commented out, it still holds the figure-5k total for the real A100, so this
# prints TPU-sim/roofline totals relative to the A100 measurement. Confirm
# whether this print should be removed or re-based like the figure-5j one.
print(value_sim/value_gt,value_roofline/value_gt)
# Set the title, legend, and display the graph
# plt.title(
#     "GPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Auto regression, batch size = 8, input(output) sequence length = 2048(1024))"
# )
plt.ylabel("Latency (ms)")
# plt.xlabel('Bar Sets')
plt.xticks([1, 2, 3], ["Real\nTPUv3", "Simulated\nTPUv3", "Roofline\nModel"])
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.05))
plt.tight_layout()
plt.savefig("figure5l.pdf",bbox_inches="tight", pad_inches=0.01, dpi=300)
================================================
FILE: ae/figure5/ijkl/real_hardware/transformerAR_A100.csv
================================================
0.0002124309539794922
0.00010609626770019531
0.0001386404037475586
6.890296936035156e-05
0.00018596649169921875
0.00018608570098876953
1.6689300537109375e-05
4.8041343688964844e-05
4.8041343688964844e-05
4.792213439941406e-05
26.04e-06
26.04e-06
================================================
FILE: ae/figure5/ijkl/real_hardware/transformer_A100.csv
================================================
0.013721823692321777
0.0018811225891113281
0.001183152198791504
0.0045403242111206055
0.017464280128479004
0.017485618591308594
0.00280153751373291
0.0006816387176513672
0.0006816387176513672
0.0005242824554443359
0.0028909
0.0028909
================================================
FILE: ae/figure5/ijkl/run.sh
================================================
# Regenerate figures 5i/5j/5k/5l (transformer breakdowns) from scratch.
# -f: do not fail on the first run, when no stale outputs exist yet.
rm -f *.csv
rm -f *.pdf
cd ../../..
python -m ae.figure5.ijkl.test_transformer --simgpu --roofline
python -m ae.figure5.ijkl.test_transformer --simtpu --roofline
python -m ae.figure5.ijkl.test_transformer --simgpu --init --roofline
python -m ae.figure5.ijkl.test_transformer --simtpu --init --roofline
python -m ae.figure5.ijkl.test_transformer --simgpu
python -m ae.figure5.ijkl.test_transformer --simtpu
python -m ae.figure5.ijkl.test_transformer --simgpu --init
python -m ae.figure5.ijkl.test_transformer --simtpu --init
cd ae/figure5/ijkl
python plot_transformer.py
================================================
FILE: ae/figure5/ijkl/test_transformer.py
================================================
"""AE figure 5 i/j/k/l driver: simulate (or measure) one transformer block.

Flags (see run.sh for the invocations used by the artifact):
  --init     prefill ("initial computation") instead of auto-regression
  --simgpu   simulate on the A100x4 system model
  --simtpu   simulate on the TPUv3x8 system model
  --gpu      run on a real GPU (writes no log file)
  --roofline use the roofline model instead of compile-and-simulate
"""
from software_model.transformer import (
    TransformerBlockInitComputationTP,
    TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from hardware_model.system import system_dict
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--init", action="store_true", help="initial computation")
    parser.add_argument("--gpu", action="store_true", help="Enable GPU")
    parser.add_argument("--simgpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simtpu", action="store_true", help="Enable simulation")
    parser.add_argument("--roofline", action="store_true", help="use roofline")
    args = parser.parse_args()
    bs = 8  # batch size
    s = 2048  # input (prompt) sequence length
    # Only the simulation paths produce a log file; --gpu alone writes nothing.
    file_name = None
    if args.init:
        print("Initial computation")
        if args.simgpu:
            # 12288-wide, 96-head block, tensor-parallel over 4 devices.
            model = TransformerBlockInitComputationTP(
                d_model=12288,
                n_heads=96,
                device_count=4,
                data_type=data_type_dict["fp16"],
            )
            A100_system = system_dict["A100_4_fp16"]
            # from design_space_exploration.dse import read_architecture_template, template_to_system
            # arch_specs = read_architecture_template("configs/template.json")
            # A100_system = template_to_system(arch_specs)
            _ = model(Tensor([bs, s, 12288], data_type_dict["fp16"]))
            if args.roofline:
                model.roofline_model(A100_system)
                file_name = "transformer_A100_roofline.csv"
            else:
                model.compile_and_simulate(A100_system, compile_mode="heuristic-GPU")
                file_name = "transformer_A100_sim.csv"
        if args.simtpu:
            model = TransformerBlockInitComputationTP(
                d_model=12288,
                n_heads=96,
                device_count=8,
                data_type=data_type_dict["fp16"],
            )
            TPU_system = system_dict["TPUv3_8"]
            _ = model(Tensor([bs, s, 12288], data_type_dict["fp16"]))
            if args.roofline:
                model.roofline_model(TPU_system)
                file_name = "transformer_TPUv3_roofline.csv"
            else:
                model.compile_and_simulate(TPU_system, compile_mode="heuristic-TPU")
                file_name = "transformer_TPUv3_sim.csv"
        if args.gpu:
            model = TransformerBlockInitComputationTP(
                d_model=12288,
                n_heads=96,
                device_count=4,
                data_type=data_type_dict["fp16"],
            )
            _ = model(Tensor([bs, s, 12288], data_type_dict["fp16"]))
            model.run_on_gpu()
    else:
        print("Auto-regression")
        output_token_length = 1024
        if args.simgpu:
            model = TransformerBlockAutoRegressionTP(
                d_model=12288,
                n_heads=96,
                device_count=4,
                data_type=data_type_dict["fp16"],
            )
            A100_system = system_dict["A100_4_fp16"]
            # One new token at total sequence length prompt + generated.
            _ = model(
                Tensor([bs, 1, 12288], data_type_dict["fp16"]), s + output_token_length
            )
            if args.roofline:
                model.roofline_model(A100_system)
                file_name = "transformerAR_A100_roofline.csv"
            else:
                model.compile_and_simulate(A100_system, compile_mode="heuristic-GPU")
                file_name = "transformerAR_A100_sim.csv"
        if args.simtpu:
            model = TransformerBlockAutoRegressionTP(
                d_model=12288,
                n_heads=96,
                device_count=8,
                data_type=data_type_dict["fp16"],
            )
            TPU_system = system_dict["TPUv3_8"]
            _ = model(
                Tensor([bs, 1, 12288], data_type_dict["fp16"]), s + output_token_length
            )
            if args.roofline:
                model.roofline_model(TPU_system)
                file_name = "transformerAR_TPUv3_roofline.csv"
            else:
                model.compile_and_simulate(TPU_system, compile_mode="heuristic-TPU")
                file_name = "transformerAR_TPUv3_sim.csv"
        if args.gpu:
            model = TransformerBlockAutoRegressionTP(
                d_model=12288,
                n_heads=96,
                device_count=4,
                data_type=data_type_dict["fp16"],
            )
            _ = model(
                Tensor([bs, 1, 12288], data_type_dict["fp16"]), s + output_token_length
            )
            model.run_on_gpu()
    # Bug fix: previously this write ran unconditionally and raised NameError
    # when no simulation flag set file_name (e.g. running with --gpu alone).
    if file_name is not None:
        with open(f"ae/figure5/ijkl/{file_name}", "w") as f:
            if args.roofline:
                f.write(model.roofline_log)
            else:
                # `simluate_log` [sic] is the attribute name the software
                # model defines; keep the misspelling for compatibility.
                f.write(model.simluate_log)
================================================
FILE: ae/figure5/run_figure5.sh
================================================
# Run every figure-5 sub-experiment in its own sub-directory.
for sub in ab cf de g h ijkl; do
    cd "$sub"
    bash run.sh
    cd ..
done
================================================
FILE: ae/figure6/real_hardware/die_area.csv
================================================
476.25, 446.22
76.44, 33
119.31, 25.2
58, 83.26
31.77, 40.83
20.95, 45.52
0, 42
40, 4
================================================
FILE: ae/figure6/run_figure6.sh
================================================
# Regenerate figure 6 (die-area validation of the cost model).
# -f keeps the cleanup quiet and idempotent when no stale outputs exist.
rm -f *.csv
rm -f *.pdf
cd ../..
python -m ae.figure6.test_cost_model
================================================
FILE: ae/figure6/test_cost_model.py
================================================
"""AE figure 6: validate the LLMCompass cost model by comparing its area
breakdowns against real die measurements for NVIDIA GA100 (A100) and AMD
Aldebaran (MI210).

Writes ae/figure6/figure6a.pdf (whole-die breakdown) and
ae/figure6/figure6b.pdf (per-core breakdown).
"""
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
from design_space_exploration.dse import read_architecture_template
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


def _stacked_bar(x, values, bar_colors, labels=None):
    # Draw one stacked bar at position x, one segment per entry of `values`.
    # `bar_colors` is indexed per segment; `labels` (if given) registers each
    # segment in the legend (matplotlib legends ignore artists with label None).
    bottom = 0
    for i, value in enumerate(values):
        plt.bar(
            x,
            value,
            bottom=bottom,
            color=bar_colors[i],
            label=labels[i] if labels is not None else None,
            width=0.5,
        )
        bottom += value


# Simulated GA100 breakdown from the cost model.
arch_specs = read_architecture_template("configs/GA100.json")
compute_chiplet_area_mm2, A100_core_breakdown_map, compute_total_die_map = (
    calc_compute_chiplet_area_mm2(arch_specs, verbose=True)
)
io_die_area_mm2, io_total_die_map = calc_io_die_area_mm2(arch_specs, verbose=True)
categories = [
    "Cores",
    "On-chip interconnect",
    "Global buffer",
    "Memory(PHY)",
    "Memory(Control)",
    "Off-chip interconnect\n(PHY)",
    "Off-chip interconnect\n(Control)",
    "Other",
]
# Real die areas: one row per category above, columns A100 / MI210.
die_area = pd.read_csv(
    "ae/figure6/real_hardware/die_area.csv", header=None, names=["A100", "MI210"]
)
values_a100 = die_area["A100"].tolist()
values_mi210 = die_area["MI210"].tolist()
values_a100_sim = [
    compute_total_die_map["cores_area"],
    compute_total_die_map["crossbar_area"],
    io_total_die_map["global_buffer_area"],
    io_total_die_map["mem_phy_area"],
    io_total_die_map["mem_controller_area"],
    io_total_die_map["device_phy_area"],
    io_total_die_map["device_controller_area"],
    0,
]
# Simulated MI210 (Aldebaran) breakdown.
arch_specs = read_architecture_template("configs/mi210_template.json")
compute_chiplet_area_mm2, MI210_core_breakdown_map, compute_total_die_map = (
    calc_compute_chiplet_area_mm2(arch_specs, verbose=True)
)
io_die_area_mm2, io_total_die_map = calc_io_die_area_mm2(arch_specs, verbose=True)
values_mi210_sim = [
    compute_total_die_map["cores_area"],
    compute_total_die_map["crossbar_area"],
    io_total_die_map["global_buffer_area"],
    io_total_die_map["mem_phy_area"],
    io_total_die_map["mem_controller_area"],
    io_total_die_map["device_phy_area"],
    io_total_die_map["device_controller_area"],
    0,
]
# ---- Figure 6a: whole-die area breakdown -------------------------------
plt.figure(figsize=(4, 2))
colors_matmul = sns.color_palette("flare_r", 7)[5:6]
colors_normalization = sns.color_palette("summer", 2)
colors_gelu = sns.color_palette("pink", 5)[2:4]
colors_allreduce = sns.color_palette("Blues_r", 2)
colors = (
    colors_matmul
    + colors_normalization
    + colors_gelu
    + colors_allreduce
    + sns.color_palette("Greys_r", 1)
)
# Only the first bar group carries legend labels so each category appears once.
_stacked_bar(1, values_a100, colors, labels=categories)
_stacked_bar(2, values_a100_sim, colors)
_stacked_bar(3, values_mi210, colors)
_stacked_bar(4, values_mi210_sim, colors)
plt.ylabel("Area ($mm^2$)")
plt.xticks(
    [1, 2, 3, 4],
    ["Real\nGA100", "Simulated\nGA100", "Real\nAldebaran", "Simulated\nAldebaran"],
)
handles, labels = plt.gca().get_legend_handles_labels()
# Reverse the legend so it reads top-to-bottom in stacking order.
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.1))
plt.savefig("ae/figure6/figure6a.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# ---- Figure 6b: per-core area breakdown --------------------------------
# NOTE(review): 3.75 / 4.02 mm^2 appear to be the real per-core areas for
# GA100 / Aldebaran -- confirm the source.
values_a100 = [3.75]
values_mi210 = [4.02]
values_a100_sim = [
    A100_core_breakdown_map["control_area"],
    A100_core_breakdown_map["alu_area"],
    A100_core_breakdown_map["sa_area"],
    A100_core_breakdown_map["regfile_area"],
    A100_core_breakdown_map["local_buffer_area"],
]
values_mi210_sim = [
    MI210_core_breakdown_map["control_area"],
    MI210_core_breakdown_map["alu_area"],
    MI210_core_breakdown_map["sa_area"],
    MI210_core_breakdown_map["regfile_area"],
    MI210_core_breakdown_map["local_buffer_area"],
]
categories = [
    "Control logic",
    "ALUs",
    "Systolic array",
    "Register file",
    "Local buffer",
]
colors = colors_matmul + colors_normalization + colors_allreduce
color_gt = sns.color_palette("Greys_r", 1)[0]
plt.figure(figsize=(4, 1.5))
# Real per-core bars are a single grey segment; simulated bars are labeled.
_stacked_bar(1, values_a100, [color_gt] * len(values_a100))
_stacked_bar(2, values_a100_sim, colors, labels=categories)
_stacked_bar(3, values_mi210, [color_gt] * len(values_mi210))
_stacked_bar(4, values_mi210_sim, colors)
plt.ylabel("Area ($mm^2$)")
plt.xticks(
    [1, 2, 3, 4],
    ["Real\nGA100", "Simulated\nGA100", "Real\nAldebaran", "Simulated\nAldebaran"],
)
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1))
plt.savefig("ae/figure6/figure6b.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
================================================
FILE: ae/figure7/__init__.py
================================================
================================================
FILE: ae/figure7/change_core_size.py
================================================
"""AE figure 7 sweep: vary the core organization (core count, sublane count,
systolic-array size, vector width, SRAM) of the GA100 template and
re-simulate one transformer block in both phases."""
from software_model.transformer import (
    TransformerBlockInitComputationTP,
    TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from design_space_exploration.dse import template_to_system, read_architecture_template
from multiprocessing import Process, Lock
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
import time

# Workload: batch 8, 2048-token prefill, 1024 generated tokens.
input_seq_length = 2048
batch_size = 8
output_seq_length = 1024
# Baseline architecture template; test_core_size mutates this dict in place.
# NOTE(review): isolation between sweep points relies on each one running in
# its own worker process (see the Process driver below).
arch_specs = read_architecture_template("configs/GA100.json")
device_count = arch_specs["device_count"]
# 12288-wide, 96-head transformer block, tensor-parallel across all devices.
model_init = TransformerBlockInitComputationTP(
    d_model=12288,
    n_heads=96,
    device_count=device_count,
    data_type=data_type_dict["fp16"],
)
model_auto_regression = TransformerBlockAutoRegressionTP(
    d_model=12288,
    n_heads=96,
    device_count=device_count,
    data_type=data_type_dict["fp16"],
)
# Bind input shapes: prefill sees the whole prompt; auto-regression sees one
# token at total sequence length prompt + generated.
_ = model_init(
    Tensor([batch_size, input_seq_length, model_init.d_model], data_type_dict["fp16"])
)
_ = model_auto_regression(
    Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]),
    input_seq_length + output_seq_length,
)
def test_core_size(core_configs, lock):
    """Apply one core-size configuration to the template, then report its
    area and the simulated prefill/auto-regression latencies.

    core_configs: (name, core_count, sublane_count, array_width,
                   vector_width, sram_KB) tuple.
    lock: serializes the CSV appends across worker processes.
    Appends one row per phase to ae/figure7/core_size_results_{init,ar}.csv.
    """
    name, core_count, sublane_count, array_width, vector_width, sram_KB = core_configs
    arch_specs["device"]["compute_chiplet"]["core_count"] = core_count
    arch_specs["device"]["compute_chiplet"]["core"]["sublane_count"] = sublane_count
    # Square systolic array: height is set from array_width as well.
    arch_specs["device"]["compute_chiplet"]["core"]["systolic_array"][
        "array_width"
    ] = array_width
    arch_specs["device"]["compute_chiplet"]["core"]["systolic_array"][
        "array_height"
    ] = array_width
    arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"][
        "vector_width"
    ] = vector_width
    arch_specs["device"]["compute_chiplet"]["core"]["SRAM_KB"] = sram_KB
    # for area
    arch_specs["device"]["compute_chiplet"]["physical_core_count"] = core_count
    # Scale the ALU mix with the vector width (int32/fp32 at half, fp64 at a
    # quarter), matching the ratios in the GA100 template.
    arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"]["int32_count"] = (
        vector_width // 2
    )
    arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"]["fp32_count"] = (
        vector_width // 2
    )
    arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"]["fp64_count"] = (
        vector_width // 4
    )
    # Register file scaling: widths <= 32 scale the register count (512 per
    # lane); wider units add one register file per 32 lanes instead.
    # NOTE(review): the wide branch leaves num_registers at the template
    # default -- confirm that is the intended per-file size.
    if vector_width <= 32:
        arch_specs["device"]["compute_chiplet"]["core"]["register_file"][
            "num_registers"
        ] = (vector_width * 512)
    else:
        arch_specs["device"]["compute_chiplet"]["core"]["register_file"][
            "num_reg_files"
        ] = (vector_width // 32)
    compute_area_mm2 = calc_compute_chiplet_area_mm2(arch_specs)
    io_area_mm2 = calc_io_die_area_mm2(arch_specs)
    print(f"{name}, {compute_area_mm2}, {io_area_mm2}, {compute_area_mm2+io_area_mm2}")
    # exit()
    system = template_to_system(arch_specs)
    auto_regression_latency_simulated = model_auto_regression.compile_and_simulate(
        system, "heuristic-GPU"
    )
    init_latency_simulated = model_init.compile_and_simulate(system, "heuristic-GPU")
    print(f"{name}, {init_latency_simulated}, {auto_regression_latency_simulated}")
    with lock:
        with open(f"ae/figure7/core_size_results_init.csv", "a") as f:
            f.write(
                f"{name}, {compute_area_mm2+io_area_mm2}, {init_latency_simulated}, {model_init.simluate_log}\n"
            )
        with open(f"ae/figure7/core_size_results_ar.csv", "a") as f:
            f.write(
                f"{name}, {compute_area_mm2+io_area_mm2}, {auto_regression_latency_simulated}, {model_auto_regression.simluate_log}\n"
            )
lock = Lock()
# One worker process per configuration:
# (name, core_count, sublane_count, array_width, vector_width, sram_KB)
configs = [
    ("A", 128, 4, 8, 8, 192),
    ("B", 128, 4, 16, 32, 192),
    ("C", 128, 1, 32, 128, 192),
    ("D", 32, 1, 64, 512, 768),
    ("E", 8, 1, 128, 2048, 3072),
]
workers = []
for cfg in configs:
    workers.append(Process(target=test_core_size, args=(cfg, lock)))
try:
    for worker in workers:
        worker.start()
    # Poll rather than join() so a Ctrl+C is noticed promptly.
    while any(worker.is_alive() for worker in workers):
        time.sleep(1)
except KeyboardInterrupt:
    print("Terminating processes...")
    for worker in workers:
        worker.terminate()
        worker.join()
print("All processes have finished.")
================================================
FILE: ae/figure7/plot_core_size.py
================================================
"""AE figure 7 plotting: stacked per-operator latency bars for the five
core-size configurations (A-E), for prefill (figure7a) and generation
(figure7b), with simulated die area overlaid on a twin y-axis.

Reads core_size_results_{init,ar}.csv produced by change_core_size.py.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd

# Per-operator latency columns, in the order change_core_size.py writes them.
categories = [
    "Q_K_V",
    "Q_mul_K",
    "A_mul_V",
    "Wo_proj",
    "W1_proj",
    "W2_proj",
    "Softmax",
    "LayerNorm_MHA",
    "LayerNorm_FFN",
    "GeLU",
    "AllReduce_MHA",
    "AllReduce_FFN",
]
# CSV row layout: config name (index), die area, total latency, then the
# twelve per-operator latencies above.
col_names = ["area", "latency"] + categories
# Color families: warm tones for matmuls, greens for normalizations,
# pink for GeLU, blues for all-reduce.
colors_matmul = sns.color_palette("flare_r", 6)
colors_normalization = sns.color_palette("summer", 3)
colors_gelu = sns.color_palette("pink", 1)
colors_allreduce = sns.color_palette("Blues_r", 2)
colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce
core_size_init = pd.read_csv(
    "core_size_results_init.csv", header=None, names=col_names, index_col=0
)
# NOTE(review): astype() returns a new Index and is not assigned back, so
# this line has no effect.
core_size_init.index.astype(str)
core_size_ar = pd.read_csv(
    "core_size_results_ar.csv", header=None, names=col_names, index_col=0
)
core_size_ar.index.astype(str)
# Sort by config name so the area curve follows the A..E plotting order
# (worker processes may append rows out of order).
df_sorted = core_size_init.sort_index()
areas = df_sorted["area"].tolist()
# print(areas)
# exit()
# areas = [
#     475.52039916931585,
#     826.76355498007,
#     826.76355498007,
#     793.3380639020086,
#     763.3465573533286,
# ]
plt.figure(figsize=(7, 3))
# Create the stacked bar graph
x = 0
for row_index in ["A", "B", "C", "D", "E"]:
    x = x + 1
    values = core_size_init.loc[row_index].tolist()
    bottom = 0
    # values[2:] skips the area and total-latency columns.
    for i, (category, value) in enumerate(zip(categories, values[2:])):
        if row_index == "A":
            # Label only the first bar group so each operator appears once.
            plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5)
        else:
            plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5)
        bottom += value
plt.ylabel("Latency (s)")
plt.xlabel("Configurations")
plt.xticks([1, 2, 3, 4, 5], ["A", "B", "C", "D", "E"])
handles, labels = plt.gca().get_legend_handles_labels()
# Reverse the legend so it reads top-to-bottom in stacking order.
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.05))
plt.tight_layout()
xticks = plt.gca().get_xticks()
xticklabels = plt.gca().get_xticklabels()
# Highlight config "B" (matches the GA100 template's core organization)
# in NVIDIA green.
index_to_color_red = list(xticks).index(2)
xticklabels[index_to_color_red].set_color("#76B900")
# Overlay total die area on a secondary axis.
ax1 = plt.gca()
ax2 = ax1.twinx()
ax2.plot(
    [1, 2, 3, 4, 5],
    areas,
    color="dimgray",
    linestyle="dashed",
    marker="x",
    label="Area",
)
ax2.set_ylabel("Area ($mm^2$)")
ax2.set_ylim([0, 1000])
plt.legend(loc="upper right")
plt.savefig(
    "figure7a.pdf",
    dpi=300,
    bbox_inches="tight",
    pad_inches=0.01,
)
plt.show()
# ---- figure 7b: generation (auto-regression) latency ----
plt.figure(figsize=(7, 3))
x = 0
for row_index in ["A", "B", "C", "D", "E"]:
    x = x + 1
    values = core_size_ar.loc[row_index].tolist()
    bottom = 0
    for i, (category, value) in enumerate(zip(categories, values[2:])):
        # seconds -> milliseconds
        value = value * 1e3
        if row_index == "A":
            plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5)
        else:
            plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5)
        bottom += value
# Set the title, legend, and display the graph
# plt.title(
#     "Generation latency under different organization"
# )
plt.ylabel("Latency (ms)")
plt.xlabel("Configurations")
plt.xticks([1, 2, 3, 4, 5], ["A", "B", "C", "D", "E"])
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.05))
plt.tight_layout()
xticks = plt.gca().get_xticks()
xticklabels = plt.gca().get_xticklabels()
index_to_color_red = list(xticks).index(2)
xticklabels[index_to_color_red].set_color("#76B900")
ax1 = plt.gca()
ax2 = ax1.twinx()
ax2.plot(
    [1, 2, 3, 4, 5],
    areas,
    color="dimgrey",
    linestyle="dashed",
    marker="x",
    label="Area",
)
ax2.set_ylabel("Area ($mm^2$)")
ax2.set_ylim([0, 1000])
plt.legend(loc="upper left")
plt.savefig(
    "figure7b.pdf",
    dpi=300,
    bbox_inches="tight",
    pad_inches=0.01,
)
================================================
FILE: ae/figure7/run_figure7.sh
================================================
# Regenerate figure 7 (core-size design sweep).
# -f keeps the cleanup quiet and idempotent when no stale outputs exist.
rm -f *.csv
rm -f *.pdf
cd ../..
python -m ae.figure7.change_core_size
cd ae/figure7
python plot_core_size.py
================================================
FILE: ae/figure8/__init__.py
================================================
================================================
FILE: ae/figure8/change_memory_bw.py
================================================
"""AE figure 8 sweep: vary the number of active HBM memory channels (i.e.
memory bandwidth) and re-simulate one transformer block in both phases."""
# NOTE(review): several of these imports appear unused in this script; they
# are kept as-is, but the exact duplicate of the cost_model import below was
# removed (bug fix: it appeared twice).
import json, re
from hardware_model.compute_module import (
    VectorUnit,
    SystolicArray,
    Core,
    ComputeModule,
    overhead_dict,
)
from hardware_model.io_module import IOModule
from hardware_model.memory_module import MemoryModule
from hardware_model.device import Device
from hardware_model.interconnect import LinkModule, InterConnectModule, TopologyType
from hardware_model.system import System
from software_model.transformer import (
    TransformerBlockInitComputationTP,
    TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
from math import ceil
from design_space_exploration.dse import template_to_system, read_architecture_template
from multiprocessing import Process, Lock
import time

# Workload: batch 8, 2048-token prefill, 1024 generated tokens.
input_seq_length = 2048
batch_size = 8
output_seq_length = 1024
# Baseline architecture template; test_memory_bandwidth mutates this dict in
# place. Isolation between sweep points relies on each one running in its
# own worker process (see the Process driver below).
arch_specs = read_architecture_template("configs/template.json")
device_count = arch_specs["device_count"]
# 12288-wide, 96-head transformer block, tensor-parallel across all devices.
model_init = TransformerBlockInitComputationTP(
    d_model=12288,
    n_heads=96,
    device_count=device_count,
    data_type=data_type_dict["fp16"],
)
model_auto_regression = TransformerBlockAutoRegressionTP(
    d_model=12288,
    n_heads=96,
    device_count=device_count,
    data_type=data_type_dict["fp16"],
)
# Bind input shapes: prefill sees the whole prompt; auto-regression sees one
# token at total sequence length prompt + generated.
_ = model_init(
    Tensor([batch_size, input_seq_length, model_init.d_model], data_type_dict["fp16"])
)
_ = model_auto_regression(
    Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]),
    input_seq_length + output_seq_length,
)
def test_memory_bandwidth(memory_bandwidth, lock):
    """Simulate both transformer phases with `memory_bandwidth` memory
    channels and append the results to the figure-8 CSVs.

    memory_bandwidth: a channel count, not GB/s; the CSV index scales it by
        400 (assumes ~400 GB/s per channel -- TODO confirm against the
        template's pin count and per-pin bit rate).
    lock: serializes the CSV appends across worker processes.
    """
    # Mutate the shared template in place (each sweep point runs in its own
    # worker process, so points do not interfere with one another).
    arch_specs["device"]["io"]["memory_channel_physical_count"] = memory_bandwidth
    arch_specs["device"]["io"]["memory_channel_active_count"] = memory_bandwidth
    compute_area_mm2 = calc_compute_chiplet_area_mm2(arch_specs)
    io_area_mm2 = calc_io_die_area_mm2(arch_specs)
    print(
        f"{memory_bandwidth}, {compute_area_mm2}, {io_area_mm2}, {compute_area_mm2+io_area_mm2}"
    )
    system = template_to_system(arch_specs)
    auto_regression_latency_simulated = model_auto_regression.compile_and_simulate(
        system, "heuristic-GPU"
    )
    init_latency_simulated = model_init.compile_and_simulate(system, "heuristic-GPU")
    print(
        f"{memory_bandwidth}, {init_latency_simulated}, {auto_regression_latency_simulated}"
    )
    with lock:
        with open(f"ae/figure8/memory_bw_results_bs{batch_size}_init.csv", "a") as f:
            f.write(
                f"{memory_bandwidth*400}, {compute_area_mm2+io_area_mm2}, {init_latency_simulated}, {model_init.simluate_log}\n"
            )
        with open(f"ae/figure8/memory_bw_results_bs{batch_size}_ar.csv", "a") as f:
            f.write(
                f"{memory_bandwidth*400}, {compute_area_mm2+io_area_mm2}, {auto_regression_latency_simulated}, {model_auto_regression.simluate_log}\n"
            )
lock = Lock()
# One worker process per channel count (1..8 active HBM channels).
channel_counts = [1, 2, 3, 4, 5, 6, 7, 8]
workers = [
    Process(target=test_memory_bandwidth, args=(count, lock))
    for count in channel_counts
]
try:
    for worker in workers:
        worker.start()
    # Poll rather than join() so a Ctrl+C is noticed promptly.
    while any(worker.is_alive() for worker in workers):
        time.sleep(1)
except KeyboardInterrupt:
    print("Terminating processes...")
    for worker in workers:
        worker.terminate()
        worker.join()
print("All processes have finished.")
================================================
FILE: ae/figure8/plot_memory_bw.py
================================================
"""AE figure 8 plotting: stacked per-operator latency bars versus memory
bandwidth, for prefill (figure8a) and generation (figure8b).

Reads memory_bw_results_bs{batch_size}_{init,ar}.csv produced by
change_memory_bw.py.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd

# Per-operator latency columns, in the order change_memory_bw.py writes them.
categories = [
    "Q_K_V",
    "Q_mul_K",
    "A_mul_V",
    "Wo_proj",
    "W1_proj",
    "W2_proj",
    "Softmax",
    "LayerNorm_MHA",
    "LayerNorm_FFN",
    "GeLU",
    "AllReduce_MHA",
    "AllReduce_FFN",
]
# CSV row layout: bandwidth in GB/s (index), die area, total latency, then
# the twelve per-operator latencies above.
col_names = ["area", "latency"] + categories
# Color families: warm tones for matmuls, greens for normalizations,
# pink for GeLU, blues for all-reduce.
colors_matmul = sns.color_palette("flare_r", 6)
colors_normalization = sns.color_palette("summer", 3)
colors_gelu = sns.color_palette("pink", 1)
colors_allreduce = sns.color_palette("Blues_r", 2)
colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce
batch_size = 8
results_init = pd.read_csv(
    f"memory_bw_results_bs{batch_size}_init.csv",
    header=None,
    names=col_names,
    index_col=0,
)
# NOTE(review): astype() returns a new Index and is not assigned back, so
# this line has no effect.
results_init.index.astype(int)
results_ar = pd.read_csv(
    f"memory_bw_results_bs{batch_size}_ar.csv",
    header=None,
    names=col_names,
    index_col=0,
)
results_ar.index.astype(int)
plt.figure(figsize=(7, 3))
# Create the stacked bar graph
x = 0
# Sweep points: 1..8 channels at 400 GB/s each (matches change_memory_bw.py).
x_labels = [i * 400 for i in [1, 2, 3, 4, 5, 6, 7, 8]]
for row_index in x_labels:
    x = x + 1
    values = results_init.loc[row_index].tolist()
    bottom = 0
    # values[2:] skips the area and total-latency columns.
    for i, (category, value) in enumerate(zip(categories, values[2:])):
        if row_index == x_labels[0]:
            # Label only the first bar group so each operator appears once.
            plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5)
        else:
            plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5)
        bottom += value
# Set the title, legend, and display the graph
# plt.title(
#     "Prefilling Latency per Layer"
# )
plt.ylabel("Latency (s)")
plt.xlabel("Memory bandwidth (GB/s)")
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8], x_labels)
handles, labels = plt.gca().get_legend_handles_labels()
# Reverse the legend so it reads top-to-bottom in stacking order.
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.05))
plt.tight_layout()
xticks = plt.gca().get_xticks()
xticklabels = plt.gca().get_xticklabels()
# Highlight the 2000 GB/s point (tick position 5) in red -- presumably the
# baseline bandwidth; confirm against the template.
index_to_color_red = list(xticks).index(5)
xticklabels[index_to_color_red].set_color("red")
plt.savefig(f"figure8a.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
plt.show()
plt.figure(figsize=(7, 3))
x = 0
for row_index in x_labels:
    x = x + 1
    values = results_ar.loc[row_index].tolist()
    bottom = 0
    for i, (category, value) in enumerate(zip(categories, values[2:])):
        # seconds -> milliseconds
        value = value * 1e3
        if row_index == x_labels[0]:
            plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5)
        else:
            plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5)
        bottom += value
# Set the title, legend, and display the graph
# plt.title(
#     "Generation Latency per Layer per Token"
# )
plt.ylabel("Latency (ms)")
plt.xlabel("Memory bandwidth (GB/s)")
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8], x_labels)
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.05))
plt.tight_layout()
xticks = plt.gca().get_xticks()
xticklabels = plt.gca().get_xticklabels()
index_to_color_red = list(xticks).index(5)
xticklabels[index_to_color_red].set_color("red")
plt.savefig(f"figure8b.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
================================================
FILE: ae/figure8/run_figure8.sh
================================================
# Regenerate figure 8 (memory-bandwidth sweep).
# -f keeps the cleanup quiet and idempotent when no stale outputs exist.
rm -f *.csv
rm -f *.pdf
cd ../..
python -m ae.figure8.change_memory_bw
cd ae/figure8
python plot_memory_bw.py
================================================
FILE: ae/figure9/__init__.py
================================================
================================================
FILE: ae/figure9/change_l1_cache.py
================================================
"""AE figure 9 sweep: vary each core's local SRAM (L1) size and re-simulate
one transformer block in both phases."""
from software_model.transformer import (
    TransformerBlockInitComputationTP,
    TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from design_space_exploration.dse import template_to_system, read_architecture_template
from multiprocessing import Process, Lock
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
import time

# Workload: batch 8, 2048-token prefill, 1024 generated tokens.
input_seq_length = 2048
batch_size = 8
output_seq_length = 1024
# Baseline architecture template; test_SRAM_KB mutates this dict in place.
# NOTE(review): isolation between sweep points relies on each one running in
# its own worker process (see the Process driver below).
arch_specs = read_architecture_template("configs/template.json")
device_count = arch_specs["device_count"]
# 12288-wide, 96-head transformer block, tensor-parallel across all devices.
model_init = TransformerBlockInitComputationTP(
    d_model=12288,
    n_heads=96,
    device_count=device_count,
    data_type=data_type_dict["fp16"],
)
model_auto_regression = TransformerBlockAutoRegressionTP(
    d_model=12288,
    n_heads=96,
    device_count=device_count,
    data_type=data_type_dict["fp16"],
)
# Bind input shapes: prefill sees the whole prompt; auto-regression sees one
# token at total sequence length prompt + generated.
_ = model_init(
    Tensor([batch_size, input_seq_length, model_init.d_model], data_type_dict["fp16"])
)
_ = model_auto_regression(
    Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]),
    input_seq_length + output_seq_length,
)
def test_SRAM_KB(SRAM_KB, lock):
    """Set each core's local SRAM to `SRAM_KB` and re-simulate both phases.

    Appends one CSV row per phase under ae/figure9/; `lock` serializes the
    appends across worker processes.
    """
    arch_specs["device"]["compute_chiplet"]["core"]["SRAM_KB"] = SRAM_KB
    compute_mm2 = calc_compute_chiplet_area_mm2(arch_specs)
    io_mm2 = calc_io_die_area_mm2(arch_specs)
    print(f"{SRAM_KB}, {compute_mm2}, {io_mm2}, {compute_mm2+io_mm2}")
    system = template_to_system(arch_specs)
    ar_latency = model_auto_regression.compile_and_simulate(system, "heuristic-GPU")
    init_latency = model_init.compile_and_simulate(system, "heuristic-GPU")
    print(f"{SRAM_KB}, {init_latency}, {ar_latency}")
    with lock:
        # One output file per phase; rows share the same area column.
        rows = [
            (
                "ae/figure9/l1_cache_results_init.csv",
                init_latency,
                model_init.simluate_log,
            ),
            (
                "ae/figure9/l1_cache_results_ar.csv",
                ar_latency,
                model_auto_regression.simluate_log,
            ),
        ]
        for path, latency, log in rows:
            with open(path, "a") as f:
                f.write(f"{SRAM_KB}, {compute_mm2+io_mm2}, {latency}, {log}\n")
# for SRAM_KB in [64, 128, 192, 256, 512, 1024]:
# test_SRAM_KB(SRAM_KB, None)
lock = Lock()
# One worker process per SRAM size.
sram_sizes = [64, 128, 192, 256, 512, 1024]
workers = [Process(target=test_SRAM_KB, args=(size, lock)) for size in sram_sizes]
try:
    for worker in workers:
        worker.start()
    # Poll rather than join() so a Ctrl+C is noticed promptly.
    while any(worker.is_alive() for worker in workers):
        time.sleep(1)
except KeyboardInterrupt:
    print("Terminating processes...")
    for worker in workers:
        worker.terminate()
        worker.join()
print("All processes have finished.")
# for SRAM_KB in [64, 128, 192, 256, 512, 1024]:
# arch_specs["device"]["compute_chiplet"]["core"][
# "SRAM_KB"
# ] = SRAM_KB
# system=template_to_system(arch_specs)
# auto_regression_latency_simulated = model_auto_regression.compile_and_simulate(system, 'heuristic-GPU')
# init_latency_simulated = model_init.compile_and_simulate(system, 'heuristic-GPU')
# print(f'{SRAM_KB}, {init_latency_simulated}, {auto_regression_latency_simulated}')
# with open(f'test/case_study/l1_cache/l1_cache_results.csv', 'a') as f:
# f.write(f'{SRAM_KB}, {init_latency_simulated}, {auto_regression_latency_simulated}\n')
================================================
FILE: ae/figure9/plot_l1_cache.py
================================================
"""AE figure 9 plotting: stacked per-operator latency bars versus local
buffer (L1 SRAM) size, for prefill (figure9a) and generation (figure9b),
with die area overlaid on a twin y-axis.

Reads l1_cache_results_{init,ar}.csv produced by change_l1_cache.py.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import pandas as pd

# Per-operator latency columns, in the order change_l1_cache.py writes them.
categories = [
    "Q_K_V",
    "Q_mul_K",
    "A_mul_V",
    "Wo_proj",
    "W1_proj",
    "W2_proj",
    "Softmax",
    "LayerNorm_MHA",
    "LayerNorm_FFN",
    "GeLU",
    "AllReduce_MHA",
    "AllReduce_FFN",
]
# CSV row layout: SRAM KB (index), die area, total latency, then the twelve
# per-operator latencies above.
col_names = ["area", "latency"] + categories
# Color families: warm tones for matmuls, greens for normalizations,
# pink for GeLU, blues for all-reduce.
colors_matmul = sns.color_palette("flare_r", 6)
colors_normalization = sns.color_palette("summer", 3)
colors_gelu = sns.color_palette("pink", 1)
colors_allreduce = sns.color_palette("Blues_r", 2)
colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce
results_init = pd.read_csv(
    "l1_cache_results_init.csv", header=None, names=col_names, index_col=0
)
# NOTE(review): astype() returns a new Index and is not assigned back, so
# this line has no effect.
results_init.index.astype(int)
results_ar = pd.read_csv(
    "l1_cache_results_ar.csv", header=None, names=col_names, index_col=0
)
results_ar.index.astype(int)
# Hard-coded total die areas (mm^2) for the six SRAM sizes below --
# presumably pre-computed by the cost model; confirm they stay in sync with
# change_l1_cache.py.
areas = [
    782.1048032068737,
    794.1065561553206,
    826.76355498007,
    848.4527315580167,
    913.304090096728,
    1064.9121549472263,
]
plt.figure(figsize=(7, 3))
# Create the stacked bar graph
x = 0
x_labels = [64, 128, 192, 256, 512, 1024]
for row_index in x_labels:
    x = x + 1
    values = results_init.loc[row_index].tolist()
    bottom = 0
    # values[2:] skips the area and total-latency columns.
    for i, (category, value) in enumerate(zip(categories, values[2:])):
        if row_index == x_labels[0]:
            # Label only the first bar group so each operator appears once.
            plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5)
        else:
            plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5)
        bottom += value
# Set the title, legend, and display the graph
# plt.title(
#     "Prefilling Latency per Layer"
# )
plt.ylabel("Latency (s)")
plt.xlabel("Local buffer size (KB)")
plt.xticks([1, 2, 3, 4, 5, 6], x_labels)
handles, labels = plt.gca().get_legend_handles_labels()
# Reverse the legend so it reads top-to-bottom in stacking order.
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.03))
plt.tight_layout()
xticks = plt.gca().get_xticks()
xticklabels = plt.gca().get_xticklabels()
# Highlight 192 KB (matches the GA100 template's SRAM_KB) in NVIDIA green.
index_to_color_red = list(xticks).index(3)
xticklabels[index_to_color_red].set_color("#76B900")
# Overlay total die area on a secondary axis.
ax1 = plt.gca()
ax2 = ax1.twinx()
ax2.plot(
    [1, 2, 3, 4, 5, 6],
    areas,
    color="dimgray",
    linestyle="dashed",
    marker="x",
    label="Area",
)
ax2.set_ylabel("Area ($mm^2$)")
ax2.set_ylim([0, 1200])
plt.legend(loc="upper center")
plt.savefig("figure9a.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
plt.show()
plt.figure(figsize=(7, 3))
x = 0
for row_index in x_labels:
    x = x + 1
    values = results_ar.loc[row_index].tolist()
    bottom = 0
    for i, (category, value) in enumerate(zip(categories, values[2:])):
        # seconds -> milliseconds
        value = value * 1e3
        if row_index == x_labels[0]:
            plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5)
        else:
            plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5)
        bottom += value
# Set the title, legend, and display the graph
# plt.title(
#     "Generation Latency per Layer per Token"
# )
plt.ylabel("Latency (ms)")
plt.xlabel("Local buffer size (KB)")
plt.xticks([1, 2, 3, 4, 5, 6], x_labels)
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.03))
plt.tight_layout()
xticks = plt.gca().get_xticks()
xticklabels = plt.gca().get_xticklabels()
index_to_color_red = list(xticks).index(3)
xticklabels[index_to_color_red].set_color("#76B900")
ax1 = plt.gca()
# Clamp the latency axis so small per-token latencies stay readable.
ax1.set_ylim([0, 1.2])
ax2 = ax1.twinx()
ax2.plot(
    [1, 2, 3, 4, 5, 6],
    areas,
    color="dimgray",
    linestyle="dashed",
    marker="x",
    label="Area",
)
ax2.set_ylabel("Area ($mm^2$)")
ax2.set_ylim([0, 1200])
plt.legend(loc="upper center")
plt.savefig("figure9b.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01)
================================================
FILE: ae/figure9/run_figure9.sh
================================================
# Regenerate figure 9 (L1/local-buffer size sweep).
# -f keeps the cleanup quiet and idempotent when no stale outputs exist.
rm -f *.csv
rm -f *.pdf
cd ../..
python -m ae.figure9.change_l1_cache
cd ae/figure9
python plot_l1_cache.py
================================================
FILE: configs/GA100.json
================================================
{
"name": "NVIDIA A100(80GB)x4",
"device_count": 4,
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC"
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 128,
"core_count": 128,
"process_node": "7nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 16,
"array_height": 16,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 32,
"flop_per_cycle": 4,
"data_type": "fp16",
"int32_count": 16,
"fp16_count": 0,
"fp32_count": 16,
"fp64_count": 8
},
"register_file": {
"num_reg_files": 1,
"num_registers": 16384,
"register_bitwidth":32,
"num_rdwr_ports":4
},
"SRAM_KB": 192
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 48,
"physical_global_buffer_MB": 48,
"global_buffer_bandwidth_per_cycle_byte": 5120,
"memory_channel_physical_count": 6,
"memory_channel_active_count": 5,
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 80
}
}
}
================================================
FILE: configs/ga102_template.json
================================================
{
"name": "NVIDIA A100(80GB)x4",
"device_count": 4,
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC"
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 128,
"core_count": 108,
"process_node": "7nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 16,
"array_height": 16,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 32,
"flop_per_cycle": 4,
"data_type": "fp16",
"int32_count": 16,
"fp16_count": 0,
"fp32_count": 32,
"fp64_count": 0.5
},
"SRAM_KB": 128
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 40,
"physical_global_buffer_MB": 48,
"global_buffer_bandwidth_per_cycle_byte": 5120,
"memory_channel_physical_count": 6,
"memory_channel_active_count": 5,
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 80
}
}
}
================================================
FILE: configs/generation_system.json
================================================
{
"name": "NVIDIA A100(80GB)x4",
"device_count": 4,
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC"
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 128,
"core_count": 128,
"process_node": "7nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 8,
"array_height": 8,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 8,
"flop_per_cycle": 4,
"data_type": "fp16",
"int32_count": 4,
"fp16_count": 32,
"fp32_count": 0,
"fp64_count": 0
},
"register_file": {
"num_reg_files": 1,
"num_registers": 4096,
"register_bitwidth": 32,
"num_rdwr_ports": 4
},
"SRAM_KB": 48
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 24,
"physical_global_buffer_MB": 24,
"global_buffer_bandwidth_per_cycle_byte": 2560,
"memory_channel_physical_count": 6,
"memory_channel_active_count": 5,
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 80
}
}
}
================================================
FILE: configs/latency_design.json
================================================
{
"name": "NVIDIA A100(80GB)x4",
"device_count": 4,
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC"
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 64,
"core_count": 64,
"process_node": "7nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 16,
"array_height": 16,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 32,
"flop_per_cycle": 4,
"data_type": "fp16",
"int32_count": 16,
"fp16_count": 0,
"fp32_count": 16,
"fp64_count": 8
},
"register_file": {
"num_reg_files": 1,
"num_registers": 16384,
"register_bitwidth": 32,
"num_rdwr_ports": 4
},
"SRAM_KB": 192
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 24,
"physical_global_buffer_MB": 24,
"global_buffer_bandwidth_per_cycle_byte": 2560,
"memory_channel_physical_count": 6,
"memory_channel_active_count": 5,
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 80
}
}
}
================================================
FILE: configs/mi210.json
================================================
{
"name": "AMD MI210",
"device_count": 4,
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC"
},
"device": {
"frequency_Hz": 1400e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"core_count": 104,
"process_node": "7nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 16,
"array_height": 16,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 16,
"flop_per_cycle": 2,
"data_type": "fp32",
"int32_count": 16,
"fp32_count": 16,
"fp64_count": 8
},
"SRAM_KB": 128
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 8,
"global_buffer_bandwidth_per_cycle_byte": 4096,
"memory_channel_physical_count": 6,
"memory_channel_active_count": 4,
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 64
}
}
}
================================================
FILE: configs/mi210_template.json
================================================
{
"name": "AMD MI210",
"device_count": 4,
"interconnect": {
"link": {
"name": "InfinityFabric",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 8,
"topology": "FC"
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 112,
"core_count": 108,
"process_node": "6nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 16,
"array_height": 16,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 16,
"flop_per_cycle": 4,
"data_type": "fp16",
"int32_count": 16,
"fp16_count": 0,
"fp32_count": 0,
"fp64_count": 16
},
"register_file": {
"num_reg_files": 64,
"num_registers": 512,
"register_bitwidth":32,
"num_rdwr_ports":4
},
"SRAM_KB": 80
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "6nm",
"global_buffer_MB": 8,
"physical_global_buffer_MB": 8,
"global_buffer_bandwidth_per_cycle_byte": 5120,
"memory_channel_physical_count": 8,
"memory_channel_active_count": 8,
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 80
}
}
}
================================================
FILE: configs/prefilling_system.json
================================================
{
"name": "NVIDIA A100(80GB)x4",
"device_count": 4,
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC"
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 64,
"core_count": 64,
"process_node": "7nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 32,
"array_height": 32,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 32,
"flop_per_cycle": 4,
"data_type": "fp16",
"int32_count": 16,
"fp16_count": 0,
"fp32_count": 16,
"fp64_count": 8
},
"register_file": {
"num_reg_files": 1,
"num_registers": 16384,
"register_bitwidth": 32,
"num_rdwr_ports": 4
},
"SRAM_KB": 768
}
},
"memory_protocol": "PCIe5",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 48,
"physical_global_buffer_MB": 48,
"global_buffer_bandwidth_per_cycle_byte": 5120,
"memory_channel_physical_count": 256,
"memory_channel_active_count": 256,
"pin_count_per_channel": 1,
"bandwidth_per_pin_bit": 32e9
},
"memory": {
"total_capacity_GB": 160
}
}
}
================================================
FILE: configs/template.json
================================================
{
"name": "NVIDIA A100(80GB)x4",
"device_count": 4,
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC"
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 128,
"core_count": 108,
"process_node": "7nm",
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 16,
"array_height": 16,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 32,
"flop_per_cycle": 4,
"data_type": "fp16",
"int32_count": 16,
"fp16_count": 0,
"fp32_count": 16,
"fp64_count": 8
},
"register_file": {
"num_reg_files": 1,
"num_registers": 16384,
"register_bitwidth":32,
"num_rdwr_ports":4
},
"SRAM_KB": 192
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 40,
"physical_global_buffer_MB": 48,
"global_buffer_bandwidth_per_cycle_byte": 5120,
"memory_channel_physical_count": 6,
"memory_channel_active_count": 5,
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 80
}
}
}
================================================
FILE: cost_model/__init__.py
================================================
# Package marker so cost_model is importable as a module.
================================================
FILE: cost_model/cost_examples.py
================================================
# Example driver for the cost model: compute the compute-chiplet and IO-die
# areas for the system described in a config JSON, then print the totals.
# Run from the repository root (paths are relative to it).
import cost_model.cost_model as cost_model
import json
# example chip with a 32 wide vector, 16x16 SA, 256kb cache core, 8 cores per die
# io die with 64 mb cache, 8 nvlinks, 32 pcie phys
# all at 5nm and 7nm
with open("./configs/prefilling_system.json", "r") as f:
    # with open('../configs/mi210_template.json', 'r') as f:
    configs_dict = json.load(f)
# print(configs_dict['device'])
# print(data['device']['compute_chiplet_count'])
compute_area = cost_model.calc_compute_chiplet_area_mm2(configs_dict)
io_area = cost_model.calc_io_die_area_mm2(configs_dict)
print(
    f"compute area: {compute_area}, io area: {io_area}, total area: {compute_area+io_area}"
)
# NOTE(review): exit(0) makes everything below unreachable. The calls below
# appear to target an older cost-model API (calc_compute_core_area_mm2 is not
# defined in cost_model.py as shown, and calc_io_die_area_mm2 now takes a
# config dict, not positional parameters) — they would fail if executed.
exit(0)
core_compute_area_mm2 = cost_model.calc_compute_core_area_mm2(
    32,
    16,
    2**18,
    cost_model.transistor_density_7nm,
    cost_model.sram_bit_cell_density_7nm,
)
io_die_area_mm2 = cost_model.calc_io_die_area_mm2(
    2**25,
    cost_model.PCIE5,
    32,
    8,
    cost_model.transistor_density_7nm,
    cost_model.sram_bit_cell_density_7nm,
)
print(core_compute_area_mm2)
print(io_die_area_mm2)
================================================
FILE: cost_model/cost_model.py
================================================
# Author: August Ning aning@princeton.edu
# Date started: 12 October 2023
# This file is the cost model for Naivesim
import numpy as np
import math
# import supply_chain.supply_chain_model as scm
import cost_model.supply_chain.supply_chain_model as scm
# lots of parameters required for calculating silicon die area cost
# these are in terms of million transistors per mm2
transistor_density_7nm = scm.transistor_density_arr[scm.PN_7_INDEX]
transistor_density_6nm = 114.2
transistor_density_5nm = scm.transistor_density_arr[scm.PN_5_INDEX]
# SRAM bit-cell areas, mm^2 per bit, by process node
sram_bit_cell_density_7nm = 1.70e-07
sram_bit_cell_density_6nm = 1.40e-07
sram_bit_cell_density_5nm = 1.25e-07
# cache size overheads derived from cacti for cache sizes
# 4096, 8192, 16384, ..., 1 MB
# index i corresponds to a cache unit of 2**(12+i) bytes
cache_area_efficiency_arr = [0.076, 0.142, 0.247, 0.393, 0.559, \
    0.704, 0.526, 0.602, 0.561]
# fpu transistor counts are for 64 bit FPU, based off Ariane and OpenPiton's SPARC T1
# assume that fp32 are half the transistors
# int32 transistor count is based off of Ariane's Mult and OpenPiton's SPARC T1
# systolic array is for 1x1 area
# scale FPU area by mantissa bits quadratically
fpu64_transistor_count = 685300
fpu32_transistor_count = fpu64_transistor_count * ((23 / 52) ** 2)
fpu16_transistor_count = fpu64_transistor_count * ((10 / 52) ** 2)
int32_transistor_count = 177690
# based off of A100 SM and MI 210 CU
# these overheads are per sublane, per vector width
# (ex 32 for A100, 16 for MI 210)
per_sublane_control_transistor_count = 996200
nvidia_per_sublane_control_transistor_count = 725650
amd_per_sublane_control_transistor_count = 1534500
# NOTE(review): both brands map to the generic per_sublane_control value; the
# nvidia_/amd_-specific constants above are currently unused — confirm intended.
per_sublane_control_dict = {'nvidia':per_sublane_control_transistor_count, \
    'amd':per_sublane_control_transistor_count}
per_core_comm_transistor_count = 44300000
nvidia_per_core_comm_transistor_count = 55000000
amd_per_core_comm_transistor_count = 33600000
# NOTE(review): same pattern here — brand-specific comm constants are unused.
per_core_comm_dict = {'nvidia':per_core_comm_transistor_count, \
    'amd':per_core_comm_transistor_count}
# memory controllers scale with process node, but PHYs do not
# pcie, ddr, hbm
# note: DDR link unit is 32 bits
# PHY areas are fixed mm^2 per lane / link unit (analog, does not shrink)
pcie5_phy_mm2_per_lane = 0.64
pcie4_phy_mm2_per_lane = 0.48
ddr5_phy_mm2_per_link_unit = 1.45
hbm2e_phy_mm2_per_link_unit = 10.45
nvlink3_phy_mm2_per_link_unit = 1.888
nvlink4_phy_mm2_per_link_unit = 0.965
infinity_fabric_phy_mm2_per_link_unit = 5.69
# controller logic transistor counts (scaled by process density at use site)
pcie5_ctrl_transistors_per_lane = 5372100
pcie4_ctrl_transistors_per_lane = 3962500
ddr5_ctrl_transistors_per_link_unit = 90446400
hbm2e_ctrl_transistors_per_link_unit = 552743000
nvlink3_ctrl_transistors_per_link_unit = 74632000
nvlink4_ctrl_transistors_per_link_unit = 86628000
infinity_fabric_ctrl_transistors_per_link_unit = 348148000
# mem tech keywords
# these strings must match "memory_protocol" / link "name" values in configs/
PCIE5 = 'PCIe5'
PCIE4 = 'PCIe4'
DDR5 = 'DDR5'
HBM = 'HBM2e'
NVLINK3 = 'NVLink3'
NVLINK4 = 'NVLink4'
INFINITYFABRIC = 'InfinityFabric'
# average via dramexchange spot price, Oct 2023
ddr5_cost_per_gb = 2.4
hbm_cost_per_gb = 7
# return die area for a dimension x dimension SA with a
# give bitwidth FPU at a given process node
# right now, we model each PE's MAC as a FPU
def calc_systolic_array_area_mm2(dimension_x, dimension_y, bitwidth, transistor_density_mil_mm2):
    """Return silicon area (mm^2) of a dimension_x * dimension_y systolic array.

    Each PE's MAC is modeled as a single FPU of the given precision.

    Args:
        dimension_x: array width in PEs.
        dimension_y: array height in PEs.
        bitwidth: one of 'fp64', 'fp32', 'fp16'.
        transistor_density_mil_mm2: logic density, million transistors per mm^2.

    Raises:
        ValueError: for an unsupported bitwidth (the original code would have
            raised UnboundLocalError on the return statement instead).
    """
    per_pe_transistors = {
        'fp64': fpu64_transistor_count,
        'fp32': fpu32_transistor_count,
        'fp16': fpu16_transistor_count,
    }
    if bitwidth not in per_pe_transistors:
        raise ValueError(f"Unsupported systolic array data type: {bitwidth}")
    total_transistor_count = per_pe_transistors[bitwidth] * dimension_x * dimension_y
    return total_transistor_count / 1e6 / transistor_density_mil_mm2
# vector width corresponds to number of FPUs you have
def calc_vector_area_mm2(int32_count, fp16_count, fp32_count, fp64_count, transistor_density_mil_mm2):
    """Area (mm^2) of a vector unit built from the given mix of INT32 ALUs
    and fp16/fp32/fp64 FPUs, at the given logic density."""
    total_transistor_count = (
        int32_count * int32_transistor_count
        + fp16_count * fpu16_transistor_count
        + fp32_count * fpu32_transistor_count
        + fp64_count * fpu64_transistor_count
    )
    return total_transistor_count / 1e6 / transistor_density_mil_mm2
# for cache designs, if the desired capacity is larger than the max cache unit
# split them up into multiple units of the max capacity
# min cache size is 4096 bytes
def calc_cache_sram_area_mm2(capacity_bytes, sram_bitcell_area_mm2, max_cache_unit_bytes=(2**19)):
    """SRAM cache area (mm^2) including CACTI-derived layout overhead.

    Capacities above max_cache_unit_bytes are split into ceil(capacity/max)
    identical units of the maximum size. The efficiency table covers unit
    sizes from 4 KB to 1 MB, so smaller requests are rounded up to 4 KB.
    """
    if capacity_bytes > max_cache_unit_bytes:
        num_cache_units = math.ceil(capacity_bytes / max_cache_unit_bytes)
        unit_size_bytes = max_cache_unit_bytes
    else:
        num_cache_units = 1
        unit_size_bytes = capacity_bytes
    # clamp to the smallest size the efficiency model supports (4 KB)
    unit_size_bytes = max(unit_size_bytes, 2 ** 12)
    efficiency = cache_area_efficiency_arr[math.ceil(math.log(unit_size_bytes, 2)) - 12]
    raw_bit_area_mm2 = unit_size_bytes * 8 * sram_bitcell_area_mm2
    return num_cache_units * raw_bit_area_mm2 / efficiency
# area model comes from EMPIRE
# num_reg_files: how many distinct register files each sublanes has
# D: how many registers there are in each RF
# W: bits per register
# P: number of read/write ports
def calc_reg_file_area(num_reg_files, D, W, P, transistor_density_mil_mm2):
    """Total register-file area (mm^2) for num_reg_files copies of a
    D-entry, W-bit, P-port RF, scaled from the 90nm fit to the target node."""
    # quadratic area model fit at 90nm, in um^2 (EMPIRE)
    area_90nm_um2 = (3.29 * 10**4) - (1.09 * 10**3 * D) - (8.83 * 10**2 * W) - (5.55 * 10**3 * P) \
        + (5.35 * 10**1 * D * W) + (1.50 * 10**-2 * D**2) + (1.08 * 10**-2 * W**2) \
        + (5.86 * 10**-1 * P**2) + (1.42 * 10**2 * D * P) + (3.68 * 10**2 * W * P)
    # um^2 -> mm^2, then scale by the 90nm-to-target transistor density ratio
    node_scale = scm.transistor_density_arr[scm.PN_90_INDEX] / transistor_density_mil_mm2
    return num_reg_files * (area_90nm_um2 / 1e6) * node_scale
# for width, for PCIe and NVLink, it is the whole lane
# for DDR and HBM, it's 128 bits and 1024 bits respectively
def calc_mem_controller_area_mm2(mem_tech, width, transistor_density_mil_mm2):
    """Memory/link controller logic area (mm^2).

    Args:
        mem_tech: one of the tech keyword constants (PCIE5, PCIE4, DDR5,
            HBM, NVLINK3, NVLINK4, INFINITYFABRIC).
        width: lane count (PCIe/NVLink) or link-unit count (DDR/HBM).
        transistor_density_mil_mm2: logic density, million transistors per mm^2.

    Raises:
        ValueError: for an unmodeled technology. The original code silently
            returned a negative area (-1/1e6/density) in that case — e.g.
            'DDR4' appears in configs' _memory_protocol_list but has no
            constants here, so a config using it would corrupt the total.
    """
    ctrl_transistors_per_unit = {
        PCIE5: pcie5_ctrl_transistors_per_lane,
        PCIE4: pcie4_ctrl_transistors_per_lane,
        DDR5: ddr5_ctrl_transistors_per_link_unit,
        HBM: hbm2e_ctrl_transistors_per_link_unit,
        NVLINK3: nvlink3_ctrl_transistors_per_link_unit,
        NVLINK4: nvlink4_ctrl_transistors_per_link_unit,
        INFINITYFABRIC: infinity_fabric_ctrl_transistors_per_link_unit,
    }
    if mem_tech not in ctrl_transistors_per_unit:
        raise ValueError(f"Unsupported memory technology: {mem_tech}")
    controller_transistor_count = ctrl_transistors_per_unit[mem_tech] * width
    return (controller_transistor_count / 1e6) / transistor_density_mil_mm2
def calc_mem_phy_area_mm2(mem_tech, width):
    """PHY area (mm^2) for `width` lanes/link-units of the given technology.

    PHYs are analog and do not scale with process node, so no density
    argument is taken. Returns -1 for an unknown technology (sentinel
    preserved from the original implementation).
    """
    phy_mm2_per_unit = {
        PCIE5: pcie5_phy_mm2_per_lane,
        PCIE4: pcie4_phy_mm2_per_lane,
        DDR5: ddr5_phy_mm2_per_link_unit,
        HBM: hbm2e_phy_mm2_per_link_unit,
        NVLINK3: nvlink3_phy_mm2_per_link_unit,
        NVLINK4: nvlink4_phy_mm2_per_link_unit,
        INFINITYFABRIC: infinity_fabric_phy_mm2_per_link_unit,
    }
    if mem_tech in phy_mm2_per_unit:
        return phy_mm2_per_unit[mem_tech] * width
    return -1
def find_logic_sram_transistor_density(process_node):
    """Map a process-node string (e.g. '7nm') to a
    (logic density in Mtx/mm^2, SRAM bit-cell area in mm^2) pair.

    Matching is by digit substring, checked in the same order as the
    original if/elif chain ('7', then '6', then '5').
    """
    node_table = (
        ('7', (transistor_density_7nm, sram_bit_cell_density_7nm)),
        ('6', (transistor_density_6nm, sram_bit_cell_density_6nm)),
        ('5', (transistor_density_5nm, sram_bit_cell_density_5nm)),
    )
    for digit, densities in node_table:
        if digit in process_node:
            return densities
    raise Exception("Invalid Process Node")
# a compute core consists of a fixed control overhead
# a specified width fp32 vector engine
# a specified dimmension fp16 systolic array
# a specified L1 cache
# at a specified process node
# NB: you can fit multiple cores onto a single die for chiplet systems
def calc_compute_chiplet_area_mm2(configs_dict, verbose=False):
    """Compute-chiplet silicon area (mm^2) from an architecture config dict.

    Per core, sums: sublane control logic (scales with vector_width),
    vector unit, systolic array, register files, and L1 SRAM; then adds a
    per-core crossbar overhead across all physical cores on the chiplet.

    Returns:
        total area, or (total, per-core breakdown map, die-level breakdown
        map) when verbose=True.
    """
    total_die_map = {}
    core_breakdown_map = {}
    device_name = configs_dict['name']
    # brand selects calibration constants; any non-NVIDIA name is treated as AMD
    device_brand = 'nvidia' if 'nvidia' in device_name.lower() else 'amd'
    vector_width = configs_dict['device']['compute_chiplet']['core']['vector_unit']['vector_width']
    vector_int32_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['int32_count']
    vector_fp16_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['fp16_count']
    vector_fp32_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['fp32_count']
    vector_fp64_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['fp64_count']
    sa_dim_x = configs_dict['device']['compute_chiplet']['core']['systolic_array']['array_width']
    sa_dim_y = configs_dict['device']['compute_chiplet']['core']['systolic_array']['array_height']
    sa_bitwidth = configs_dict['device']['compute_chiplet']['core']['systolic_array']['data_type']
    num_reg_files = configs_dict['device']['compute_chiplet']['core']['register_file']['num_reg_files']
    num_registers = configs_dict['device']['compute_chiplet']['core']['register_file']['num_registers']
    register_bitwidth = configs_dict['device']['compute_chiplet']['core']['register_file']['register_bitwidth']
    num_rdwr_ports = configs_dict['device']['compute_chiplet']['core']['register_file']['num_rdwr_ports']
    sublane_count = configs_dict['device']['compute_chiplet']['core']['sublane_count']
    cache_size_bytes = configs_dict['device']['compute_chiplet']['core']['SRAM_KB'] * (2 ** 10)
    process_node = configs_dict['device']['compute_chiplet']['process_node']
    # NOTE: uses physical_core_count (not the active core_count) — configs
    # like mi210.json that lack this key will raise KeyError here.
    cores_per_chiplet = configs_dict['device']['compute_chiplet']['physical_core_count']
    # each sublane has a SA and vector unit. a core is made up of sublanes. a chiplet has multiple cores
    transistor_density_mil_mm2, sram_density_bitcell_mm2 = find_logic_sram_transistor_density(process_node)
    per_sublane_area_mm2 = 0
    # control overhead is modeled per vector lane, so it scales with vector_width
    per_sublane_control_area_mm2 = per_sublane_control_dict[device_brand] / 1e6 / transistor_density_mil_mm2
    per_sublane_area_mm2 += (vector_width * per_sublane_control_area_mm2)
    # at this point per_sublane_area_mm2 holds only control logic, so this is
    # the core's total control area
    control_logic_area = per_sublane_area_mm2 * sublane_count
    per_lane_vector_area = calc_vector_area_mm2(vector_int32_count, vector_fp16_count, vector_fp32_count, vector_fp64_count, transistor_density_mil_mm2)
    per_sublane_area_mm2 += per_lane_vector_area
    per_lane_sa_area = calc_systolic_array_area_mm2(sa_dim_x, sa_dim_y, sa_bitwidth, transistor_density_mil_mm2)
    per_sublane_area_mm2 += per_lane_sa_area
    per_lane_regfile_area = calc_reg_file_area(num_reg_files, num_registers, register_bitwidth, num_rdwr_ports, transistor_density_mil_mm2)
    per_sublane_area_mm2 += per_lane_regfile_area
    per_core_compute_area_mm2 = per_sublane_area_mm2 * sublane_count
    # L1/local SRAM is per core (not per sublane)
    cache_area_mm2 = calc_cache_sram_area_mm2(cache_size_bytes, sram_density_bitcell_mm2)
    per_core_area_mm2 = per_core_compute_area_mm2 + cache_area_mm2
    core_breakdown_map['total_core_area'] = per_core_area_mm2
    core_breakdown_map['control_area'] = control_logic_area
    core_breakdown_map['alu_area'] = per_lane_vector_area * sublane_count
    core_breakdown_map['sa_area'] = per_lane_sa_area * sublane_count
    core_breakdown_map['regfile_area'] = per_lane_regfile_area * sublane_count
    core_breakdown_map['local_buffer_area'] = cache_area_mm2
    total_cores_area = per_core_area_mm2 * cores_per_chiplet
    total_crossbar_area = (per_core_comm_dict[device_brand] / 1e6 / transistor_density_mil_mm2) * cores_per_chiplet
    # each core has an area overhead to connect to the xbar
    compute_chiplet_area_mm2 = total_cores_area + total_crossbar_area
    total_die_map['total_area'] = compute_chiplet_area_mm2
    total_die_map['cores_area'] = total_cores_area
    total_die_map['crossbar_area'] = total_crossbar_area
    if verbose:
        return compute_chiplet_area_mm2, core_breakdown_map, total_die_map
    else:
        return compute_chiplet_area_mm2
# NB: for mem_tech, if you are using DDR or HBM, it will be 128 bits and 1024 bits respectively per lane
# for PCIe and NVLink, specify the number of lanes (128bits per lane)
# def calc_io_die_area_mm2(cache_size_bytes, mem_tech, mem_tech_width, num_nvlink_phys, \
#                          transistor_density_mil_mm2, sram_density_bitcell_mm2):
def calc_io_die_area_mm2(config_dict, verbose=False):
    """IO-die area (mm^2): global buffer SRAM + off-chip memory PHY and
    controller + device-to-device link PHY and controller.

    Returns the total, or (total, breakdown map) when verbose=True.
    """
    io_specs = config_dict['device']['io']
    cache_size_bytes = io_specs['physical_global_buffer_MB'] * (2 ** 20)
    mem_tech = config_dict['device']['memory_protocol']
    num_mem_tech_units = io_specs['memory_channel_physical_count']
    gpu_gpu_comm_tech = config_dict['interconnect']['link']['name']
    num_gpu_gpu_comm_phy = config_dict['interconnect']['link_count_per_device']
    process_node = io_specs['process_node']
    transistor_density_mil_mm2, sram_density_bitcell_mm2 = find_logic_sram_transistor_density(process_node)
    # component areas
    global_buffer_area = calc_cache_sram_area_mm2(cache_size_bytes, sram_density_bitcell_mm2)
    # interface to off-chip memory
    mem_phy_area = calc_mem_phy_area_mm2(mem_tech, num_mem_tech_units)
    mem_controller_area = calc_mem_controller_area_mm2(mem_tech, num_mem_tech_units, transistor_density_mil_mm2)
    # chip-to-chip links (e.g. NVLink)
    device_phy_area = calc_mem_phy_area_mm2(gpu_gpu_comm_tech, num_gpu_gpu_comm_phy)
    device_controller_area = calc_mem_controller_area_mm2(gpu_gpu_comm_tech, num_gpu_gpu_comm_phy, transistor_density_mil_mm2)
    io_die_area_mm2 = (
        global_buffer_area
        + mem_phy_area
        + mem_controller_area
        + device_phy_area
        + device_controller_area
    )
    if not verbose:
        return io_die_area_mm2
    total_die_map = {
        'total_die_area': io_die_area_mm2,
        'global_buffer_area': global_buffer_area,
        'mem_phy_area': mem_phy_area,
        'mem_controller_area': mem_controller_area,
        'device_phy_area': device_phy_area,
        'device_controller_area': device_controller_area,
    }
    return io_die_area_mm2, total_die_map
================================================
FILE: cost_model/regfile_area.py
================================================
def calculate_regfile_area(D, W, P, target_density_mil_mm2=96.3):
    """Estimate register-file area in mm^2 at a target process node.

    Uses a quadratic area model fit at 90nm (yielding um^2), converts to
    mm^2, then scales by the logic-density ratio between 90nm
    (1.6 Mtx/mm^2) and the target node.

    Args:
        D: number of registers in the file.
        W: bits per register.
        P: number of read/write ports.
        target_density_mil_mm2: target-node logic density in million
            transistors per mm^2. Default 96.3 corresponds to 7nm, which
            generalizes the previously hard-coded (1.6 / 96.3) ratio while
            keeping existing calls backward-compatible.

    Returns:
        Estimated area in mm^2 at the target node.
    """
    area_90nm_um2 = (3.29 * 10**4) - (1.09 * 10**3 * D) - (8.83 * 10**2 * W) - (5.55 * 10**3 * P) \
        + (5.35 * 10**1 * D * W) + (1.50 * 10**-2 * D**2) + (1.08 * 10**-2 * W**2) \
        + (5.86 * 10**-1 * P**2) + (1.42 * 10**2 * D * P) + (3.68 * 10**2 * W * P)
    # um^2 -> mm^2, then scale from 90nm to the target node by density ratio
    area_90nm_mm2 = area_90nm_um2 / 1e6
    return area_90nm_mm2 * (1.6 / target_density_mil_mm2)
# Spot-check areas for a few register-file configurations (all 32-bit,
# 4-port): one 16384-entry RF (matches configs/template.json), 64 copies of
# a 512-entry RF (matches configs/mi210_template.json), and one 800-entry RF.
for copies, depth in ((1, 16384), (64, 512), (1, 800)):
    print(copies * calculate_regfile_area(depth, 32, 4))
================================================
FILE: design_space_exploration/__init__.py
================================================
================================================
FILE: design_space_exploration/dse.py
================================================
import json, re
from hardware_model.compute_module import (
VectorUnit,
SystolicArray,
Core,
ComputeModule,
overhead_dict,
)
from hardware_model.io_module import IOModule
from hardware_model.memory_module import MemoryModule
from hardware_model.device import Device
from hardware_model.interconnect import LinkModule, InterConnectModule, TopologyType
from hardware_model.system import System
from software_model.transformer import (
TransformerBlockInitComputationTP,
TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
# from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
from math import ceil
def read_architecture_template(file_path):
    """Load an architecture template (JSON) from file_path into a dict."""
    with open(file_path) as handle:
        return json.load(handle)
def template_to_system(arch_specs):
    """Build a hardware System object from an architecture-template dict
    (schema as in configs/template.json)."""
    device_specs = arch_specs["device"]
    compute_chiplet_specs = device_specs["compute_chiplet"]
    io_specs = device_specs["io"]
    core_specs = compute_chiplet_specs["core"]
    sublane_count = core_specs["sublane_count"]
    # vector unit
    vector_unit_specs = core_specs["vector_unit"]
    vector_unit = VectorUnit(
        # total vector flops per cycle across all sublanes
        sublane_count
        * vector_unit_specs["vector_width"]
        * vector_unit_specs["flop_per_cycle"],
        # word size in bytes, parsed from e.g. "fp16" -> 16 bits -> 2 bytes
        int(re.search(r"(\d+)", vector_unit_specs["data_type"]).group(1)) // 8,
        # NOTE(review): 35 is an unexplained constant passed positionally —
        # confirm its meaning against VectorUnit's signature
        35,
        vector_unit_specs["vector_width"],
        sublane_count,
    )
    # systolic array
    systolic_array_specs = core_specs["systolic_array"]
    systolic_array = SystolicArray(
        systolic_array_specs["array_height"],
        systolic_array_specs["array_width"],
        systolic_array_specs["mac_per_cycle"],
        # same byte width passed twice — presumably input and output word
        # sizes; verify against SystolicArray's signature
        int(re.search(r"(\d+)", systolic_array_specs["data_type"]).group(1)) // 8,
        int(re.search(r"(\d+)", systolic_array_specs["data_type"]).group(1)) // 8,
    )
    # core
    core = Core(
        vector_unit,
        systolic_array,
        sublane_count,
        core_specs["SRAM_KB"] * 1024,  # per-core SRAM in bytes
    )
    # compute module
    compute_module = ComputeModule(
        core,
        # total active cores across all compute chiplets
        compute_chiplet_specs["core_count"] * device_specs["compute_chiplet_count"],
        device_specs["frequency_Hz"],
        io_specs["global_buffer_MB"] * 1024 * 1024,  # global buffer in bytes
        io_specs["global_buffer_bandwidth_per_cycle_byte"],
        # NOTE(review): overhead profile is hard-coded to "A100" regardless of
        # the template's device name — confirm this is intended
        overhead_dict["A100"],
    )
    # io module
    io_module = IOModule(
        # aggregate off-chip memory bandwidth in bytes/s:
        # channels * pins/channel * bits/s/pin / 8
        io_specs["memory_channel_active_count"]
        * io_specs["pin_count_per_channel"]
        * io_specs["bandwidth_per_pin_bit"]
        // 8,
        1e-6,  # presumably memory access latency in seconds — verify
    )
    # memory module
    memory_module = MemoryModule(
        device_specs["memory"]["total_capacity_GB"] * 1024 * 1024 * 1024
    )
    # device
    device = Device(compute_module, io_module, memory_module)
    # interconnect
    interconnect_specs = arch_specs["interconnect"]
    link_specs = interconnect_specs["link"]
    link_module = LinkModule(
        link_specs["bandwidth_per_direction_byte"],
        link_specs["bandwidth_both_directions_byte"],
        link_specs["latency_second"],
        link_specs["flit_size_byte"],
        link_specs["max_payload_size_byte"],
        link_specs["header_size_byte"],
    )
    # any topology string other than "FC" is mapped to a ring
    interconnect_module = InterConnectModule(
        arch_specs["device_count"],
        TopologyType.FC
        if interconnect_specs["topology"] == "FC"
        else TopologyType.RING,
        link_module,
        interconnect_specs["link_count_per_device"],
    )
    # system
    system = System(device, interconnect_module)
    return system
def test_template_to_system():
    """Smoke test: build the A100-like system from configs/template.json and
    run the prefill transformer block's roofline model against it."""
    arch_specs = read_architecture_template("configs/template.json")
    A100_system = template_to_system(arch_specs)
    batch_size = 8
    seq_length = 2048
    model = TransformerBlockInitComputationTP(
        d_model=12288,
        n_heads=96,
        device_count=4,
        data_type=data_type_dict["fp16"],
    )
    _ = model(Tensor([batch_size, seq_length, 12288], data_type_dict["fp16"]))
    model.roofline_model(A100_system)
def find_cheapest_design(
    d_model,
    n_heads,
    n_layers,
    batch_size,
    input_seq_length,
    init_latency,
    output_seq_length,
    auto_regression_latency,
):
    """Exhaustively search the hardware design space for the cheapest system
    (smallest total silicon area across all devices) that meets the given
    whole-model latency budgets.

    Parameters:
        d_model, n_heads, n_layers: transformer model dimensions.
        batch_size, input_seq_length, output_seq_length: workload shape.
        init_latency: latency budget for the prefill (init) stage, seconds.
        auto_regression_latency: latency budget for one decoding step, seconds.

    Side effects:
        Writes the best configuration found (or `null` if none qualifies) to
        "configs/best_arch_specs.json".
    """
    import copy

    i = 0  # count of candidate designs that passed every constraint
    smallest_total_area_mm2 = float("inf")
    best_arch_specs = None
    arch_specs = read_architecture_template("configs/template.json")
    for device_count in [4, 8, 12, 16]:
        # Model parameters were previously hard-coded to 12288/96; now taken
        # from the arguments (identical for the existing GPT-3 caller).
        model_init = TransformerBlockInitComputationTP(
            d_model=d_model,
            n_heads=n_heads,
            device_count=device_count,
            data_type=data_type_dict["fp16"],
        )
        model_auto_regression = TransformerBlockAutoRegressionTP(
            d_model=d_model,
            n_heads=n_heads,
            device_count=device_count,
            data_type=data_type_dict["fp16"],
        )
        _ = model_init(
            Tensor(
                [batch_size, input_seq_length, model_init.d_model],
                data_type_dict["fp16"],
            )
        )
        _ = model_auto_regression(
            Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]),
            input_seq_length + output_seq_length,
        )
        # Memory needed for the whole model, rounded up to a multiple of 16 GB.
        # Invariant of all inner loops, so computed once per device_count
        # (hoisted out of the global-buffer loop where it originally lived).
        memory_capacity_requirement_GB = (
            ceil(model_auto_regression.memory_requirement * n_layers / 1e9 / 16) * 16
        )
        arch_specs["device_count"] = device_count
        # Fully-connected interconnect only for small nodes; ring otherwise.
        if device_count <= 4:
            topology = "FC"
        else:
            topology = "RING"
        arch_specs["interconnect"]["topology"] = topology
        for link_count_per_device in [6, 12, 18, 24]:
            arch_specs["interconnect"]["link_count_per_device"] = link_count_per_device
            # device
            for core_count in [32, 64, 128, 256]:
                arch_specs["device"]["compute_chiplet"]["core_count"] = core_count
                # core
                core_specs = arch_specs["device"]["compute_chiplet"]["core"]
                for sublane_count in [1, 2, 4, 8]:
                    core_specs["sublane_count"] = sublane_count
                    # systolic array (always square)
                    for array_height in [16, 32, 64, 128]:
                        core_specs["systolic_array"]["array_height"] = array_height
                        core_specs["systolic_array"]["array_width"] = array_height
                        # vector unit
                        for vector_width in [16, 32, 64, 128]:
                            core_specs["vector_unit"]["vector_width"] = vector_width
                            for SRAM_KB in [64, 128, 256, 512, 1024]:
                                core_specs["SRAM_KB"] = SRAM_KB
                                # global buffer (total MB split across devices)
                                for total_global_buffer_MB in [
                                    80,
                                    160,
                                    240,
                                    320,
                                    400,
                                    480,
                                    640,
                                    800,
                                    960,
                                ]:
                                    global_buffer_MB = (
                                        total_global_buffer_MB // device_count
                                    )
                                    # Scale bandwidth with capacity
                                    # (A100 reference: 5120 B/cycle at 40 MB).
                                    global_buffer_bandwidth_per_cycle_byte = (
                                        5120 * global_buffer_MB // 40
                                    )
                                    arch_specs["device"]["io"][
                                        "global_buffer_MB"
                                    ] = global_buffer_MB
                                    arch_specs["device"]["io"][
                                        "global_buffer_bandwidth_per_cycle_byte"
                                    ] = global_buffer_bandwidth_per_cycle_byte
                                    # memory protocol and channel count
                                    for memory_protocol in [
                                        "HBM2e",
                                        "DDR5",
                                        "PCIe5",
                                        # "GDDR6X"
                                    ]:
                                        arch_specs["device"][
                                            "memory_protocol"
                                        ] = memory_protocol
                                        if memory_protocol == "HBM2e":
                                            # 400 GB/s per channel, 16 GB per stack
                                            channel_count = (
                                                memory_capacity_requirement_GB // 16
                                            )
                                            if channel_count > 8:
                                                continue
                                            channel_count_list = [channel_count]
                                            pin_count_per_channel = 1024
                                            bandwidth_per_pin_bit = 3.2e9
                                        elif memory_protocol == "DDR5":
                                            # 19.2 GB/s per channel, 2 channels per DIMM
                                            channel_count_list = [16, 24, 32]
                                            pin_count_per_channel = 32
                                            bandwidth_per_pin_bit = 4.8e9
                                        else:  # PCIe5: 4 GB/s per channel
                                            channel_count_list = [64, 96, 128]
                                            pin_count_per_channel = 1
                                            bandwidth_per_pin_bit = 32e9
                                        for channel_count in channel_count_list:
                                            io_specs = arch_specs["device"]["io"]
                                            arch_specs["device"]["memory"][
                                                "total_capacity_GB"
                                            ] = memory_capacity_requirement_GB
                                            io_specs[
                                                "memory_channel_active_count"
                                            ] = channel_count
                                            io_specs[
                                                "memory_channel_physical_count"
                                            ] = channel_count
                                            io_specs[
                                                "pin_count_per_channel"
                                            ] = pin_count_per_channel
                                            io_specs[
                                                "bandwidth_per_pin_bit"
                                            ] = bandwidth_per_pin_bit
                                            total_area_mm2 = (
                                                calc_compute_chiplet_area_mm2(arch_specs)
                                                + calc_io_die_area_mm2(arch_specs)
                                            )
                                            # Reticle-limit style area cap per device.
                                            if total_area_mm2 > 900:
                                                continue
                                            system = template_to_system(arch_specs)
                                            # Cheap roofline bounds first; only run the
                                            # (slow) simulation when they pass.
                                            init_roofline_latency = (
                                                model_init.roofline_model(system)
                                                * n_layers
                                            )
                                            if init_roofline_latency > init_latency:
                                                continue
                                            auto_regression_roofline_latency = (
                                                model_auto_regression.roofline_model(
                                                    system
                                                )
                                                * n_layers
                                            )
                                            if (
                                                auto_regression_roofline_latency
                                                > auto_regression_latency
                                            ):
                                                continue
                                            # BUGFIX: the simulated per-block latencies
                                            # were compared against the whole-model
                                            # budgets without scaling by n_layers,
                                            # making these checks nearly vacuous;
                                            # scale to match the roofline checks.
                                            auto_regression_latency_simulated = (
                                                model_auto_regression.compile_and_simulate(
                                                    system, "heuristic-GPU"
                                                )
                                                * n_layers
                                            )
                                            if (
                                                auto_regression_latency_simulated
                                                > auto_regression_latency
                                            ):
                                                continue
                                            init_latency_simulated = (
                                                model_init.compile_and_simulate(
                                                    system, "heuristic-GPU"
                                                )
                                                * n_layers
                                            )
                                            if init_latency_simulated > init_latency:
                                                continue
                                            if (
                                                total_area_mm2 * device_count
                                                < smallest_total_area_mm2
                                            ):
                                                smallest_total_area_mm2 = (
                                                    total_area_mm2 * device_count
                                                )
                                                # BUGFIX: deep-copy the specs. The dict
                                                # keeps being mutated by the search, so
                                                # a plain reference would end up holding
                                                # the LAST design tried, not the best.
                                                best_arch_specs = copy.deepcopy(
                                                    arch_specs
                                                )
                                                best_arch_specs[
                                                    "area_per_device_mm2"
                                                ] = total_area_mm2
                                            i = i + 1
                                            if i % 100 == 0:
                                                print(f"i={i}")
    print(f"number of potential designs={i}")
    # Writes `null` if no design met the constraints.
    with open("configs/best_arch_specs.json", "w") as f:
        json.dump(best_arch_specs, f, indent=4)
if __name__ == "__main__":
    # test_template_to_system()
    # Search for the cheapest design for a GPT-3-scale model:
    # d_model=12288, n_heads=96, n_layers=96, batch_size=8,
    # input_seq_length=2048, init_latency=5, output_seq_length=1024,
    # auto_regression_latency=0.1
    # (latency budgets presumably in seconds — TODO confirm units against
    # the roofline comparisons inside find_cheapest_design).
    find_cheapest_design(12288, 96, 96, 8, 2048, 5, 1024, 0.1)
================================================
FILE: docs/run.md
================================================
# User Guide: How to Run a LLMCompass Simulation
## Step 1: Build a Hardware Configuration
Follow the [NVIDIA GA100 example](../configs/GA100.json). This is a 4-GA100 node connected with NVLinks.
### Explanations on the Knobs
Most of the attributes are self-explanatory:
```json
{
"name": "NVIDIA A100(80GB)x4",
"device_count": 4, # how many devices in a node
"interconnect": {
"link": {
"name": "NVLink3",
"bandwidth_per_direction_byte": 25e9,
"bandwidth_both_directions_byte": 50e9,
"latency_second": 8.92e-6,
"flit_size_byte": 16,
"header_size_byte": 16,
"max_payload_size_byte": 256
},
"link_count_per_device": 12,
"topology": "FC" # currently support FC (fully-connected) and RING
},
"device": {
"frequency_Hz": 1410e6,
"compute_chiplet_count": 1,
"compute_chiplet": {
"physical_core_count": 128, # used for area model
"core_count": 128, # used for performance model
"process_node": "7nm", # currently support 7nm, 6nm, 5nm
"core": {
"sublane_count": 4,
"systolic_array": {
"array_width": 16,
"array_height": 16,
"data_type": "fp16",
"mac_per_cycle": 1
},
"vector_unit": {
"vector_width": 32,
"flop_per_cycle": 4, # 32*4=128 flops per cycle per vector unit
"data_type": "fp16",
"int32_count": 16, # the number of int32 ALUs, used for area model
"fp16_count": 0,
"fp32_count": 16,
"fp64_count": 8
},
"register_file": {
"num_reg_files": 1,
"num_registers": 16384,
"register_bitwidth":32,
"num_rdwr_ports":4
},
"SRAM_KB": 192
}
},
"memory_protocol": "HBM2e",
"_memory_protocol_list": [
"HBM2e",
"DDR4",
"DDR5",
"PCIe4",
"PCIe5"
],
"io": {
"process_node": "7nm",
"global_buffer_MB": 48,
"physical_global_buffer_MB": 48,
"global_buffer_bandwidth_per_cycle_byte": 5120,
"memory_channel_physical_count": 6, # used for area model
"memory_channel_active_count": 5, # used for performance model
"pin_count_per_channel": 1024,
"bandwidth_per_pin_bit": 3.2e9
},
"memory": {
"total_capacity_GB": 80
}
}
}
```
## Step 2: Build a LLM Computational Graph
Transformer blocks are provided in [`transformer.py`](../software_model/transformer.py), including Initial Computation (also called the Prefill or Context stage) and Auto Regression (also called the Decoding or Generation stage), with Tensor Parallelism support (automatically turned off if the system only has 1 device).
The user needs to provide these parameters:
* `d_model`: the hidden dimension, 12288 for GPT3
* `n_heads`: the number of heads, 96 for GPT3
* `device_count`: tensor parallelism
* `data_type`: `int8`, `fp16`, or `fp32`
### Build Your Own LLM
The user can also build their own computational graph following the [`transformer.py`](../software_model/transformer.py) example using provided operators: [`matmul`](../software_model/matmul.py), [`softmax`](../software_model/softmax.py), [`layernorm`](../software_model/layernorm.py), [`gelu`](../software_model/gelu.py), and [`allreduce`](../software_model/communication_primitives.py).
The user needs to define a new `class` by inheriting the `Operator` class and configure these fields:
* `__init__`: define the needed operators in the initial function
* `__call__`: build the computational graph. The shape of Tensors will be automatically calculated and used for simulation.
* `compile_and_simulate`: simulate all the operators and get the total latency as well as other runtimes.
* `roofline_model` (optional): a roofline model analysis.
* `run_on_gpu` (optional): run the computational graph on real-world GPUs with PyTorch.
## Step 3: Run a LLMCompass Simulation
First, read the hardware configuration and parse it to LLMCompass:
```python
from design_space_exploration.dse import template_to_system, read_architecture_template
specs = read_architecture_template("PATH/TO/YOUR/JSON")
system = template_to_system(specs)
```
Next, instantiate an LLM as in this example:
```python
model_auto_regression = TransformerBlockAutoRegressionTP(
d_model=12288,
n_heads=96,
device_count=1,
data_type=data_type_dict["fp16"],
)
_ = model_auto_regression(
Tensor([bs, 1, 12288], data_type_dict["fp16"]),
seq_len,
)
```
Finally, run the simulation
```
auto_regression_latency_simulated = model_auto_regression.compile_and_simulate(
system, "heuristic-GPU"
)
```
================================================
FILE: environment.yml
================================================
name: llmcompass_ae
channels:
- pytorch
- defaults
dependencies:
- python=3.9
- pytorch
- pip:
- scalesim
- matplotlib
- seaborn
- scipy
================================================
FILE: hardware_model/__init__.py
================================================
# from compute_module import *
# from io_module import *
# from memory_module import *
# from chiplet import *
# from interconnect import *
# from pcb import *
================================================
FILE: hardware_model/arch_template.py
================================================
class ArchitectureTemplate:
    """Placeholder for a structured architecture-template class.

    NOTE(review): the original `__init__` had no body (a syntax error), so
    this module could never be imported; `pass` is added to make it valid
    Python. The dict-based JSON template handled in
    design_space_exploration/dse.py appears to be what is actually used.
    """

    def __init__(self):
        # No configuration knobs are implemented yet.
        pass
================================================
FILE: hardware_model/compute_module.py
================================================
from math import ceil
from software_model.utils import DataType, data_type_dict
class VectorUnit:
    """Performance model of a core's vector (SIMD) unit."""

    def __init__(
        self,
        total_vector_flops_per_cycle,
        word_size,
        flops_per_exp,
        vector_width,
        vector_count,
        data_type=data_type_dict["fp16"],
    ):
        # Geometry of the unit: lanes per vector and vectors per core.
        self.vector_width = vector_width
        self.vector_count = vector_count
        # Aggregate throughput, and the per-lane throughput it implies.
        self.total_vector_flops_per_cycle = total_vector_flops_per_cycle
        self.flops_per_cycle = ceil(
            total_vector_flops_per_cycle / vector_width / vector_count
        )
        self.word_size = word_size  # Byte
        self.flops_per_exp = flops_per_exp  # flops per exp instruction
        self.data_type = data_type
# Preset vector-unit configurations keyed by device name.
# Argument order: VectorUnit(total_vector_flops_per_cycle, word_size,
#                            flops_per_exp, vector_width, vector_count[, data_type])
vector_unit_dict = {
    "A100_fp16": VectorUnit(512, 2, 35, 32, 4),
    "TPUv3_fp32": VectorUnit(128 * 8, 4, 15, 128, 8, data_type_dict["fp32"]),
    "MI210_fp32": VectorUnit(128, 4, 18, 16, 4, data_type_dict["fp32"]),
    "TPUv3_new": VectorUnit(128 * 4, 4, 15, 128, 4, data_type_dict["fp32"]),
}
class SystolicArray:
    """Performance model of a systolic MAC array inside a core."""

    def __init__(
        self,
        array_height,
        array_width,
        mac_per_cycle,
        input_word_size,
        output_word_size,
    ):
        # Grid dimensions (number of PEs per side).
        self.array_height, self.array_width = array_height, array_width
        # MACs each PE completes per clock (may be fractional, e.g. 0.5).
        self.mac_per_cycle = mac_per_cycle
        # Operand and result sizes in bytes.
        self.input_word_size, self.output_word_size = input_word_size, output_word_size
# Preset systolic arrays keyed by device/data-type name.
# Argument order: SystolicArray(array_height, array_width, mac_per_cycle,
#                               input_word_size, output_word_size)
systolic_array_dict = {
    "A100_fp16": SystolicArray(16, 16, 1, 2, 2),
    "A100_int8": SystolicArray(16, 16, 2, 1, 4),
    "TPUv3_bf16": SystolicArray(128, 128, 1, 2, 4),
    "MI210_fp16": SystolicArray(16, 16, 0.5, 2, 2),
    "TPUv3_new": SystolicArray(128, 128, 1, 2, 4),
}
class Core:
    """A single core: one vector unit plus several systolic arrays sharing
    an on-chip SRAM."""

    def __init__(
        self,
        vector_unit: VectorUnit,
        systolic_array: SystolicArray,
        systolic_array_count,
        SRAM_size,
    ):
        self.SRAM_size = SRAM_size  # Byte
        self.systolic_array = systolic_array
        self.systolic_array_count = systolic_array_count
        self.vector_unit = vector_unit
        # The core's word size is taken from the vector unit; the check that
        # it matches the systolic array's was disabled in the original:
        # assert(vector_unit.word_size==systolic_array.word_size)
        self.vector_word_size = vector_unit.word_size
# Preset core configurations keyed by device name.
# Argument order: Core(vector_unit, systolic_array, systolic_array_count,
#                      SRAM_size_bytes)
core_dict = {
    "SM_A100_fp16": Core(
        vector_unit_dict["A100_fp16"], systolic_array_dict["A100_fp16"], 4, 192 * 1024
    ),
    "SM_A100_int8": Core(
        vector_unit_dict["A100_fp16"], systolic_array_dict["A100_int8"], 4, 192 * 1024
    ),
    "Core_TPUv3_bf16": Core(
        vector_unit_dict["TPUv3_fp32"],
        systolic_array_dict["TPUv3_bf16"],
        2,
        16 * 1024 * 1024,
    ),
    "CU_MI210_fp16": Core(
        vector_unit_dict["MI210_fp32"], systolic_array_dict["MI210_fp16"], 4, 128 * 1024
    ),
    "Core_TPUv3_new": Core(
        vector_unit_dict["TPUv3_new"],
        systolic_array_dict["TPUv3_new"],
        1,
        8 * 1024 * 1024,
    ),
}
# compute_tile_dict={'SM_A100_int8':ComputeTile(512, 4096, 192*1024*8,3.41, 'TSMC N7', 128*8),'SM_A100_fp16':ComputeTile(512, 2048, 192*1024*8,3.41, 'TSMC N7', 128),}
# flops: https://docs.nvidia.com/deeplearning/performance/dl-performance-gpu-background/index.html#gpu-arch__fig2
# area: https://pbs.twimg.com/media/FOT_-NJWUAARrtB?format=jpg&name=large
class Overhead:
    """Fixed per-kernel launch overhead, one value per operator type."""

    def __init__(self, matmul, softmax, layernorm, gelu):
        # Overheads in seconds (see overhead_dict for device presets).
        self.matmul, self.softmax = matmul, softmax
        self.layernorm, self.gelu = layernorm, gelu
# Per-kernel launch overheads (seconds) per device — presumably measured
# empirically on real hardware; TODO confirm source.
# Argument order: Overhead(matmul, softmax, layernorm, gelu)
overhead_dict = {
    "A100": Overhead(2.1e-5, 1.2e-5, 4.5e-5, 4.5e-5),
    "TPUv3": Overhead(11e-5, 30e-5, 14e-5, 10e-5),
    "MI210": Overhead(3.4e-5, 2.2e-5, 2.8e-5, 2.1e-5),
}
class ComputeModule:
    """Aggregate compute model for one device: `core_count` identical cores
    plus a shared last-level (L2) buffer."""

    def __init__(
        self,
        core: Core,
        core_count,
        clock_freq,
        l2_size,
        l2_bandwidth_per_cycle,
        overhead: Overhead = overhead_dict["A100"],
    ):
        self.core = core
        self.core_count = core_count
        self.clock_freq = clock_freq
        self.overhead = overhead
        # Shared L2 buffer capacity and bandwidth.
        self.l2_size = int(l2_size)  # Byte
        self.l2_bandwidth_per_cycle = l2_bandwidth_per_cycle  # Byte/clock
        # Device-wide vector throughput.
        per_cycle_vector_flops = (
            core.vector_unit.total_vector_flops_per_cycle * core_count
        )
        self.total_vector_flops_per_cycle = per_cycle_vector_flops
        self.total_vector_flops = per_cycle_vector_flops * clock_freq
        # Device-wide systolic-array throughput: 2 flops (multiply + add)
        # per MAC per PE. Multiplication order kept from the original.
        self.total_systolic_array_flops = (
            core_count
            * core.systolic_array_count
            * core.systolic_array.mac_per_cycle
            * 2
            * core.systolic_array.array_height
            * core.systolic_array.array_width
            * clock_freq
        )
# Preset compute modules keyed by device/data-type name.
# Argument order: ComputeModule(core, core_count, clock_freq_Hz,
#                               l2_size_bytes, l2_bandwidth_bytes_per_cycle,
#                               overhead)
# NOTE(review): TPUv3 entries use 16 GiB as "l2" — presumably modeling its
# large on-package memory as the last-level buffer; confirm.
compute_module_dict = {
    "A100_fp16": ComputeModule(
        core_dict["SM_A100_fp16"],
        108,
        1.41e9,
        40 * 1024**2,
        5120,
        overhead_dict["A100"],
    ),
    "A100_int8": ComputeModule(
        core_dict["SM_A100_int8"],
        108,
        1.41e9,
        40 * 1024**2,
        5120,
        overhead_dict["A100"],
    ),
    "TPUv3_bf16": ComputeModule(
        core_dict["Core_TPUv3_bf16"],
        1,
        940e6,
        16 * 1024**3,
        490,
        overhead_dict["TPUv3"],
    ),
    "MI210_fp16": ComputeModule(
        core_dict["CU_MI210_fp16"],
        104,
        1.4e9,
        8 * 1024**2,
        4096,
        overhead_dict["MI210"],
    ),
    "TPUv3_new": ComputeModule(
        core_dict["Core_TPUv3_new"],
        2,
        940e6,
        16 * 1024**3,
        490,
        overhead_dict["TPUv3"],
    ),
}
================================================
FILE: hardware_model/device.py
================================================
from hardware_model.compute_module import ComputeModule, compute_module_dict
from hardware_model.io_module import IOModule, IO_module_dict
from hardware_model.memory_module import MemoryModule, memory_module_dict
class Device:
def __init__(
self,
compute_module: ComputeModule,
io_module: IOModu
gitextract_pu2g2804/ ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── ae/ │ ├── .gitignore │ ├── __init__.py │ ├── figure10/ │ │ ├── __init__.py │ │ ├── plot_latency.py │ │ ├── run_figure10.sh │ │ └── test_latency.py │ ├── figure11/ │ │ ├── __init__.py │ │ ├── plot_decoding.py │ │ ├── run_figure11.sh │ │ └── test_decoding.py │ ├── figure12/ │ │ ├── __init__.py │ │ ├── plot_throughput.py │ │ ├── run_figure12.sh │ │ └── test_throughput.py │ ├── figure5/ │ │ ├── __init__.py │ │ ├── ab/ │ │ │ ├── __init__.py │ │ │ ├── plot_matmul.py │ │ │ ├── real_hardware/ │ │ │ │ ├── matmul_A100.csv │ │ │ │ └── matmul_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_matmul.py │ │ ├── cf/ │ │ │ ├── __init__.py │ │ │ ├── plot_softmax.py │ │ │ ├── real_hardware/ │ │ │ │ ├── softmax_A100.csv │ │ │ │ └── softmax_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_softmax.py │ │ ├── de/ │ │ │ ├── __init__.py │ │ │ ├── plot_layernorm.py │ │ │ ├── real_hardware/ │ │ │ │ ├── layernorm_A100.csv │ │ │ │ └── layernorm_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_layernorm.py │ │ ├── g/ │ │ │ ├── __init__.py │ │ │ ├── plot_gelu.py │ │ │ ├── real_hardware/ │ │ │ │ ├── gelu_A100.csv │ │ │ │ └── gelu_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_gelu.py │ │ ├── h/ │ │ │ ├── __init__.py │ │ │ ├── run.sh │ │ │ └── test_allreduce.py │ │ ├── ijkl/ │ │ │ ├── __init__.py │ │ │ ├── plot_transformer.py │ │ │ ├── real_hardware/ │ │ │ │ ├── transformerAR_A100.csv │ │ │ │ └── transformer_A100.csv │ │ │ ├── run.sh │ │ │ └── test_transformer.py │ │ └── run_figure5.sh │ ├── figure6/ │ │ ├── real_hardware/ │ │ │ └── die_area.csv │ │ ├── run_figure6.sh │ │ └── test_cost_model.py │ ├── figure7/ │ │ ├── __init__.py │ │ ├── change_core_size.py │ │ ├── plot_core_size.py │ │ └── run_figure7.sh │ ├── figure8/ │ │ ├── __init__.py │ │ ├── change_memory_bw.py │ │ ├── plot_memory_bw.py │ │ └── run_figure8.sh │ └── figure9/ │ ├── __init__.py │ ├── change_l1_cache.py │ ├── plot_l1_cache.py │ └── 
run_figure9.sh ├── configs/ │ ├── GA100.json │ ├── ga102_template.json │ ├── generation_system.json │ ├── latency_design.json │ ├── mi210.json │ ├── mi210_template.json │ ├── prefilling_system.json │ └── template.json ├── cost_model/ │ ├── __init__.py │ ├── cost_examples.py │ ├── cost_model.py │ └── regfile_area.py ├── design_space_exploration/ │ ├── __init__.py │ └── dse.py ├── docs/ │ └── run.md ├── environment.yml ├── hardware_model/ │ ├── __init__.py │ ├── arch_template.py │ ├── compute_module.py │ ├── device.py │ ├── interconnect.py │ ├── io_module.py │ ├── memory_module.py │ └── system.py ├── software_model/ │ ├── __init__.py │ ├── communication_primitives.py │ ├── gelu.py │ ├── layernorm.py │ ├── matmul.py │ ├── operators.py │ ├── softmax.py │ ├── transformer.py │ └── utils.py ├── systolic_array_model/ │ ├── look_up_table.csv │ ├── look_up_table_128_128.csv │ ├── look_up_table_16_16.csv │ ├── look_up_table_32_32.csv │ ├── look_up_table_64_64.csv │ ├── look_up_table_8_8.csv │ └── look_up_table_old.csv └── utils.py
SYMBOL INDEX (182 symbols across 28 files)
FILE: ae/figure10/plot_latency.py
function get_total_decoding_latency (line 19) | def get_total_decoding_latency(df: pd.DataFrame, start, end):
function get_intensity (line 69) | def get_intensity(color):
FILE: ae/figure10/test_latency.py
function simulate_decoding_latency (line 35) | def simulate_decoding_latency(system, bs, seq_len, name, lock):
function simulate_prefill_latency (line 54) | def simulate_prefill_latency(system, bs, seq_len, name, lock):
FILE: ae/figure11/test_decoding.py
function simulate_latency (line 27) | def simulate_latency(system, bs, seq_len, name, lock):
FILE: ae/figure12/plot_throughput.py
function get_total_decoding_latency (line 33) | def get_total_decoding_latency(df: pd.DataFrame, start, end):
function get_intensity (line 125) | def get_intensity(color):
FILE: ae/figure12/test_throughput.py
function simulate_decoding_latency (line 34) | def simulate_decoding_latency(system, bs, seq_len, name, lock, heuristics):
function simulate_prefill_latency (line 55) | def simulate_prefill_latency(system, bs, seq_len, name, lock, heuristics):
FILE: ae/figure5/ijkl/plot_transformer.py
function read_csv (line 6) | def read_csv(filename: str):
FILE: ae/figure7/change_core_size.py
function test_core_size (line 38) | def test_core_size(core_configs, lock):
FILE: ae/figure8/change_memory_bw.py
function test_memory_bandwidth (line 54) | def test_memory_bandwidth(memory_bandwidth, lock):
FILE: ae/figure9/change_l1_cache.py
function test_SRAM_KB (line 38) | def test_SRAM_KB(SRAM_KB, lock):
FILE: cost_model/cost_model.py
function calc_systolic_array_area_mm2 (line 91) | def calc_systolic_array_area_mm2(dimension_x, dimension_y, bitwidth, tra...
function calc_vector_area_mm2 (line 102) | def calc_vector_area_mm2(int32_count, fp16_count, fp32_count, fp64_count...
function calc_cache_sram_area_mm2 (line 114) | def calc_cache_sram_area_mm2(capacity_bytes, sram_bitcell_area_mm2, max_...
function calc_reg_file_area (line 137) | def calc_reg_file_area(num_reg_files, D, W, P, transistor_density_mil_mm2):
function calc_mem_controller_area_mm2 (line 150) | def calc_mem_controller_area_mm2(mem_tech, width, transistor_density_mil...
function calc_mem_phy_area_mm2 (line 170) | def calc_mem_phy_area_mm2(mem_tech, width):
function find_logic_sram_transistor_density (line 188) | def find_logic_sram_transistor_density(process_node):
function calc_compute_chiplet_area_mm2 (line 205) | def calc_compute_chiplet_area_mm2(configs_dict, verbose=False):
function calc_io_die_area_mm2 (line 272) | def calc_io_die_area_mm2(config_dict, verbose=False):
FILE: cost_model/regfile_area.py
function calculate_regfile_area (line 1) | def calculate_regfile_area(D, W, P):
FILE: design_space_exploration/dse.py
function read_architecture_template (line 22) | def read_architecture_template(file_path):
function template_to_system (line 28) | def template_to_system(arch_specs):
function test_template_to_system (line 110) | def test_template_to_system():
function find_cheapest_design (line 125) | def find_cheapest_design(
FILE: hardware_model/arch_template.py
class ArchitectureTemplate (line 1) | class ArchitectureTemplate:
method __init__ (line 2) | def __init__(self,
FILE: hardware_model/compute_module.py
class VectorUnit (line 5) | class VectorUnit:
method __init__ (line 6) | def __init__(
class SystolicArray (line 34) | class SystolicArray:
method __init__ (line 35) | def __init__(
class Core (line 59) | class Core:
method __init__ (line 60) | def __init__(
class Overhead (line 103) | class Overhead:
method __init__ (line 104) | def __init__(self, matmul, softmax, layernorm, gelu):
class ComputeModule (line 118) | class ComputeModule:
method __init__ (line 119) | def __init__(
FILE: hardware_model/device.py
class Device (line 6) | class Device:
method __init__ (line 7) | def __init__(
FILE: hardware_model/interconnect.py
class TopologyType (line 5) | class TopologyType(Enum):
class LinkModule (line 10) | class LinkModule:
method __init__ (line 11) | def __init__(
class InterConnectModule (line 35) | class InterConnectModule:
method __init__ (line 36) | def __init__(
FILE: hardware_model/io_module.py
class IOModule (line 1) | class IOModule:
method __init__ (line 2) | def __init__(self, bandwidth, latency):
FILE: hardware_model/memory_module.py
class MemoryModule (line 1) | class MemoryModule:
method __init__ (line 2) | def __init__(self, memory_capacity):
FILE: hardware_model/system.py
class System (line 6) | class System:
method __init__ (line 7) | def __init__(self, pcb_module: Device, interconnect: InterConnectModul...
FILE: software_model/communication_primitives.py
class CommunicationPrimitive (line 14) | class CommunicationPrimitive:
method __init__ (line 15) | def __init__(self, data_type: DataType) -> None:
class AllReduceMultiPCB (line 21) | class AllReduceMultiPCB(CommunicationPrimitive):
method __init__ (line 22) | def __init__(self, data_type: DataType) -> None:
method __call__ (line 25) | def __call__(self, tensor: Tensor) -> Any:
method simulate (line 30) | def simulate(self, interconnect_module: InterConnectModule) -> None:
class Broadcast (line 111) | class Broadcast:
method __init__ (line 112) | def __init__(self):
method __call__ (line 116) | def __call__(self, src: int, tensor: Tensor):
FILE: software_model/gelu.py
function gelu_gpu (line 14) | def gelu_gpu(input: torch.Tensor) -> torch.Tensor:
class GeLU (line 19) | class GeLU(Operator):
method __init__ (line 20) | def __init__(self, data_type: DataType):
method __call__ (line 24) | def __call__(self, input: Tensor) -> Tensor:
method roofline_model (line 31) | def roofline_model(self, pcb_module: Device):
method print_latency (line 55) | def print_latency(self):
class ComputationalGraph (line 58) | class ComputationalGraph:
method __init__ (line 59) | def __init__(self, M: int, data_type: DataType):
method compile_and_simulate (line 63) | def compile_and_simulate(self, pcb_module: Device, compile_mode: str):
method run_on_gpu (line 93) | def run_on_gpu(self):
method gpu_kernel_launch_overhead (line 114) | def gpu_kernel_launch_overhead():
FILE: software_model/layernorm.py
function layernorm_gpu (line 14) | def layernorm_gpu(input: torch.Tensor) -> torch.Tensor:
class LayerNorm (line 18) | class LayerNorm(Operator):
method __init__ (line 19) | def __init__(self, data_type: DataType):
method __call__ (line 23) | def __call__(self, input: Tensor) -> Tensor:
method roofline_model (line 33) | def roofline_model(self, pcb_module: Device):
method print_latency (line 47) | def print_latency(self):
class ComputationalGraph (line 50) | class ComputationalGraph:
method __init__ (line 51) | def __init__(self, M: int, N: int, data_type: DataType):
class Mapping (line 56) | class Mapping:
method __init__ (line 57) | def __init__(
method display (line 69) | def display(self):
method compile_and_simulate (line 75) | def compile_and_simulate(self, pcb_module: Device, compile_mode: str):
method simulate (line 128) | def simulate(
class L2TileSimulator (line 169) | class L2TileSimulator:
method __init__ (line 170) | def __init__(
method simulate_l2_tile_io_cycle_count (line 190) | def simulate_l2_tile_io_cycle_count(
method simulate_l2_tile_compute_cycle_count (line 203) | def simulate_l2_tile_compute_cycle_count(
class L1TileSimulator (line 235) | class L1TileSimulator:
method __init__ (line 236) | def __init__(
method simulate_l1_tile_io_cycle_count (line 269) | def simulate_l1_tile_io_cycle_count(
method simulate_l1_tile_compute_cycle_count (line 279) | def simulate_l1_tile_compute_cycle_count(
method run_on_gpu (line 332) | def run_on_gpu(self):
method gpu_kernel_launch_overhead (line 357) | def gpu_kernel_launch_overhead():
FILE: software_model/matmul.py
class BatchedMatmul (line 17) | class BatchedMatmul(Operator):
method __init__ (line 18) | def __init__(self, data_type: DataType):
method __call__ (line 24) | def __call__(self, input1: Tensor, input2: Tensor) -> Tensor:
method roofline_model (line 40) | def roofline_model(self, pcb_module: Device):
method compile_and_simulate (line 57) | def compile_and_simulate(self, pcb_module: Device, compile_mode: str):
method run_on_gpu (line 79) | def run_on_gpu(
method gpu_kernel_launch_overhead (line 105) | def gpu_kernel_launch_overhead():
class Matmul (line 122) | class Matmul(Operator):
method __init__ (line 123) | def __init__(self, data_type: DataType):
method __call__ (line 131) | def __call__(self, input1: Tensor, input2: Tensor) -> Tensor:
method roofline_model (line 154) | def roofline_model(self, pcb_module: Device):
method print_latency (line 166) | def print_latency(self):
method generate_tile_loops (line 173) | def generate_tile_loops(loop_M: int, loop_N: int, loop_K: int, loop_or...
class ComputationalGraph (line 206) | class ComputationalGraph:
method __init__ (line 207) | def __init__(self, M: int, N: int, K: int, data_type: DataType):
method display (line 213) | def display(self):
class Mapping (line 219) | class Mapping:
method __init__ (line 220) | def __init__(
method display (line 250) | def display(self):
method find_permutations (line 263) | def find_permutations(n):
method compile_and_simulate (line 275) | def compile_and_simulate(
method simulate (line 742) | def simulate(
class L2TileSimulator (line 972) | class L2TileSimulator:
method __init__ (line 973) | def __init__(
method simulate_l2_tile_io_cycle_count (line 1009) | def simulate_l2_tile_io_cycle_count(
method simulate_l2_tile_compute_cycle_count (line 1022) | def simulate_l2_tile_compute_cycle_count(
class L1TileSimulator (line 1293) | class L1TileSimulator:
method __init__ (line 1294) | def __init__(
method simulate_l1_tile_compute_cycle_count (line 1312) | def simulate_l1_tile_compute_cycle_count(
method simulate_systolic_array_cycle_count (line 1357) | def simulate_systolic_array_cycle_count(
method run_on_gpu (line 1479) | def run_on_gpu(
method gpu_kernel_launch_overhead (line 1528) | def gpu_kernel_launch_overhead():
FILE: software_model/operators.py
class Operator (line 7) | class Operator:
method __init__ (line 8) | def __init__(
class mapping (line 35) | class mapping:
class Reshape (line 42) | class Reshape(Operator):
method __init__ (line 43) | def __init__(self, data_type: DataType):
method __call__ (line 48) | def __call__(self, input: Tensor, output_shape: List[int]) -> Tensor:
class Concat (line 61) | class Concat(Operator):
method __init__ (line 62) | def __init__(self, data_type: DataType):
method __call__ (line 69) | def __call__(self, input1: Tensor, input2: Tensor, concat_dim: int) ->...
class Transpose (line 91) | class Transpose(Operator):
method __init__ (line 92) | def __init__(self, data_type: DataType):
method __call__ (line 97) | def __call__(self, input: Tensor, permute: List[int]) -> Tensor:
FILE: software_model/softmax.py
class Softmax (line 13) | class Softmax(Operator):
method __init__ (line 14) | def __init__(self, data_type: DataType):
method __call__ (line 18) | def __call__(self, input: Tensor) -> Tensor:
method print_latency (line 28) | def print_latency(self):
class ComputationalGraph (line 31) | class ComputationalGraph:
method __init__ (line 32) | def __init__(self, M: int, N: int, data_type: DataType):
class Mapping (line 37) | class Mapping:
method __init__ (line 38) | def __init__(
method display (line 54) | def display(self):
method roofline_model (line 60) | def roofline_model(self, pcb_module: Device):
method compile_and_simulate (line 66) | def compile_and_simulate(self, pcb_module: Device, compile_mode=None):
method simulate (line 116) | def simulate(
class L2TileSimulator (line 167) | class L2TileSimulator:
method __init__ (line 168) | def __init__(
method simulate_l2_tile_io_cycle_count (line 188) | def simulate_l2_tile_io_cycle_count(
method simulate_l2_tile_compute_cycle_count (line 201) | def simulate_l2_tile_compute_cycle_count(
class L1TileSimulator (line 234) | class L1TileSimulator:
method __init__ (line 235) | def __init__(
method simulate_l1_tile_io_cycle_count (line 269) | def simulate_l1_tile_io_cycle_count(
method simulate_l1_tile_compute_cycle_count (line 279) | def simulate_l1_tile_compute_cycle_count(
method run_on_gpu (line 294) | def run_on_gpu(self):
method gpu_kernel_launch_overhead (line 313) | def gpu_kernel_launch_overhead():
FILE: software_model/transformer.py
class TransformerBlockInitComputationTP (line 20) | class TransformerBlockInitComputationTP(Operator):
method __init__ (line 21) | def __init__(self, d_model, n_heads, device_count, data_type: DataType):
method __call__ (line 60) | def __call__(self, X: Tensor) -> Tensor:
method roofline_model (line 114) | def roofline_model(self, system: System):
method compile_and_simulate (line 194) | def compile_and_simulate(self, system: System, compile_mode: str):
method run_on_gpu (line 286) | def run_on_gpu(self):
class TransformerBlockAutoRegressionTP (line 355) | class TransformerBlockAutoRegressionTP(Operator):
method __init__ (line 356) | def __init__(self, d_model, n_heads, device_count, data_type: DataType):
method __call__ (line 397) | def __call__(self, x: Tensor, seq_len: int) -> Tensor:
method roofline_model (line 470) | def roofline_model(self, system: System):
method compile_and_simulate (line 551) | def compile_and_simulate(self, system: System, compile_mode: str):
method run_on_gpu (line 642) | def run_on_gpu(self):
class LLMInitComputationTP (line 712) | class LLMInitComputationTP:
method __init__ (line 713) | def __init__(
FILE: software_model/utils.py
class DataType (line 5) | class DataType:
method __init__ (line 6) | def __init__(self, name: str, word_size: int) -> None:
class Tensor (line 12) | class Tensor:
method __init__ (line 13) | def __init__(
FILE: utils.py
function size_of_list (line 3) | def size_of_list(list: List):
function size (line 9) | def size(list):
function closest_factors (line 15) | def closest_factors(n):
Condensed preview — 111 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,132K chars).
[
{
"path": ".gitignore",
"chars": 51,
"preview": "__pycache__/\nSlurmOutput/\ntemp/\n.vscode/\n*.ncu-rep\n"
},
{
"path": ".gitmodules",
"chars": 128,
"preview": "[submodule \"cost_model/supply_chain\"]\n\tpath = cost_model/supply_chain\n\turl = https://github.com/PrincetonUniversity/ttm-"
},
{
"path": "Dockerfile",
"chars": 691,
"preview": "# Start with a base image that includes Miniconda to manage our environment\nFROM continuumio/miniconda3\n\n# Set the worki"
},
{
"path": "LICENSE",
"chars": 1507,
"preview": "BSD 3-Clause License\n\nCopyright (c) 2024, Princeton University\n\nRedistribution and use in source and binary forms, with "
},
{
"path": "README.md",
"chars": 2581,
"preview": "[](https://zenodo.org/doi/10.5281/zenodo.10892431)\n\n# LLMCompass\n\nThis rep"
},
{
"path": "__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/.gitignore",
"chars": 58,
"preview": "*.pdf\n*.csv\n!**/real_hardware/**/*.csv\n!expected_results/*"
},
{
"path": "ae/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure10/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure10/plot_latency.py",
"chars": 7296,
"preview": "import pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\nour_decoding = pd.read_cs"
},
{
"path": "ae/figure10/run_figure10.sh",
"chars": 102,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../..\n\npython -m ae.figure10.test_latency\n\ncd ae/figure10\npython plot_latency.py"
},
{
"path": "ae/figure10/test_latency.py",
"chars": 4237,
"preview": "from software_model.transformer import (\n TransformerBlockInitComputationTP,\n TransformerBlockAutoRegressionTP,\n)\n"
},
{
"path": "ae/figure11/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure11/plot_decoding.py",
"chars": 3735,
"preview": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nfrom scipy.stats import gme"
},
{
"path": "ae/figure11/run_figure11.sh",
"chars": 104,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../..\n\npython -m ae.figure11.test_decoding\n\ncd ae/figure11\npython plot_decoding.py"
},
{
"path": "ae/figure11/test_decoding.py",
"chars": 2557,
"preview": "from software_model.transformer import (\n TransformerBlockInitComputationTP,\n TransformerBlockAutoRegressionTP,\n)\n"
},
{
"path": "ae/figure12/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure12/plot_throughput.py",
"chars": 7153,
"preview": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport statistics\n\nour_dire"
},
{
"path": "ae/figure12/run_figure12.sh",
"chars": 148,
"preview": "rm A100/*.csv\nrm our/*.csv\nrm *.pdf\n\nmkdir A100\nmkdir our\n\ncd ../..\n\npython -m ae.figure12.test_throughput\n\ncd ae/figure"
},
{
"path": "ae/figure12/test_throughput.py",
"chars": 5275,
"preview": "from software_model.transformer import (\n TransformerBlockInitComputationTP,\n TransformerBlockAutoRegressionTP,\n)\n"
},
{
"path": "ae/figure5/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure5/ab/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure5/ab/plot_matmul.py",
"chars": 7871,
"preview": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\n\n\nmatmul_TPUv3_sim = pd.rea"
},
{
"path": "ae/figure5/ab/real_hardware/matmul_A100.csv",
"chars": 868,
"preview": "64, 12288, 12288, 0.1900ms, 101.7124Tflops\n128, 12288, 12288, 0.2003ms, 193.0114Tflops\n256, 12288, 12288, 0.3185ms, 242."
},
{
"path": "ae/figure5/ab/real_hardware/matmul_MI210.csv",
"chars": 947,
"preview": "32, 12288, 12288, 0.5493ms, 17.5922Tflops\n64, 12288, 12288, 0.5584ms, 34.6135Tflops\n128, 12288, 12288, 0.5932ms, 65.1646"
},
{
"path": "ae/figure5/ab/run.sh",
"chars": 375,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../../..\n\npython -m ae.figure5.ab.test_matmul --simgpu --roofline\npython -m ae.figure5.ab.test_mat"
},
{
"path": "ae/figure5/ab/test_matmul.py",
"chars": 5208,
"preview": "from software_model.matmul import Matmul\nfrom software_model.utils import data_type_dict, Tensor\nfrom hardware_model.dev"
},
{
"path": "ae/figure5/cf/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure5/cf/plot_softmax.py",
"chars": 7087,
"preview": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\n\n\nsoftmax_TPUv3_sim = pd.re"
},
{
"path": "ae/figure5/cf/real_hardware/softmax_A100.csv",
"chars": 644,
"preview": "4096, 32, 9.99556025250909\n4096, 64, 19.634136210285714\n4096, 128, 37.27158060257627\n4096, 256, 68.719476736\n4096, 512, "
},
{
"path": "ae/figure5/cf/real_hardware/softmax_MI210.csv",
"chars": 654,
"preview": "4096, 32, 5.389762881254902\n4096, 64, 10.372751205433962\n4096, 128, 13.49094021811043\n4096, 256, 36.049561566426235\n4096"
},
{
"path": "ae/figure5/cf/run.sh",
"chars": 382,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../../..\n\npython -m ae.figure5.cf.test_softmax --simgpu --roofline\npython -m ae.figure5.cf.test_so"
},
{
"path": "ae/figure5/cf/test_softmax.py",
"chars": 4360,
"preview": "from software_model.softmax import Softmax\nfrom software_model.utils import data_type_dict, Tensor\nfrom hardware_model.d"
},
{
"path": "ae/figure5/de/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure5/de/plot_layernorm.py",
"chars": 4777,
"preview": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\n\n\nlayernorm_TPUv3_sim = pd."
},
{
"path": "ae/figure5/de/real_hardware/layernorm_A100.csv",
"chars": 656,
"preview": "4096, 32, 2.476377540036036\n4096, 64, 4.8436635584845815\n4096, 128, 10.064179659276888\n4096, 256, 19.590407621844097\n409"
},
{
"path": "ae/figure5/de/real_hardware/layernorm_MI210.csv",
"chars": 650,
"preview": "4096, 32, 2.3695247806698823\n4096, 64, 4.729122950631834\n4096, 128, 9.17590195596949\n4096, 256, 18.43588592802619\n4096, "
},
{
"path": "ae/figure5/de/run.sh",
"chars": 289,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../../..\n\npython -m ae.figure5.de.test_layernorm --simgpu --roofline\npython -m ae.figure5.de.test_"
},
{
"path": "ae/figure5/de/test_layernorm.py",
"chars": 5510,
"preview": "from software_model.layernorm import LayerNorm\nfrom software_model.utils import data_type_dict, Tensor\nfrom hardware_mod"
},
{
"path": "ae/figure5/g/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure5/g/plot_gelu.py",
"chars": 3840,
"preview": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\n\n\ngelu_TPUv3_sim = pd.read_"
},
{
"path": "ae/figure5/g/real_hardware/gelu_A100.csv",
"chars": 541,
"preview": "1024, 0.021262214336633663\n2048, 0.04338350804040404\n4096, 0.08547198599004975\n8192, 0.1700977146930693\n16384, 0.3368601"
},
{
"path": "ae/figure5/g/real_hardware/gelu_MI210.csv",
"chars": 545,
"preview": "1024, 0.047197442813186816\n2048, 0.09761289309090909\n4096, 0.19522578618181818\n8192, 0.39045157236363637\n16384, 0.789879"
},
{
"path": "ae/figure5/g/run.sh",
"chars": 354,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../../..\n\npython -m ae.figure5.g.test_gelu --simgpu --roofline\npython -m ae.figure5.g.test_gelu --"
},
{
"path": "ae/figure5/g/test_gelu.py",
"chars": 3186,
"preview": "from software_model.gelu import GeLU\nfrom software_model.utils import data_type_dict, Tensor\nfrom hardware_model.device "
},
{
"path": "ae/figure5/h/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure5/h/run.sh",
"chars": 70,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../../..\n\npython -m ae.figure5.h.test_allreduce\n"
},
{
"path": "ae/figure5/h/test_allreduce.py",
"chars": 3556,
"preview": "from software_model.communication_primitives import AllReduceMultiPCB\nfrom software_model.utils import data_type_dict, T"
},
{
"path": "ae/figure5/ijkl/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure5/ijkl/plot_transformer.py",
"chars": 7852,
"preview": "import matplotlib.pyplot as plt\nimport seaborn as sns\nimport csv\nimport pandas as pd\n\ndef read_csv(filename: str):\n n"
},
{
"path": "ae/figure5/ijkl/real_hardware/transformerAR_A100.csv",
"chars": 245,
"preview": "0.0002124309539794922\n0.00010609626770019531\n0.0001386404037475586\n6.890296936035156e-05\n0.00018596649169921875\n0.000186"
},
{
"path": "ae/figure5/ijkl/real_hardware/transformer_A100.csv",
"chars": 233,
"preview": "0.013721823692321777\n0.0018811225891113281\n0.001183152198791504\n0.0045403242111206055\n0.017464280128479004\n0.01748561859"
},
{
"path": "ae/figure5/ijkl/run.sh",
"chars": 567,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../../..\n\npython -m ae.figure5.ijkl.test_transformer --simgpu --roofline\npython -m ae.figure5.ijkl"
},
{
"path": "ae/figure5/ijkl/test_transformer.py",
"chars": 4761,
"preview": "from software_model.transformer import (\n TransformerBlockInitComputationTP,\n TransformerBlockAutoRegressionTP,\n)\n"
},
{
"path": "ae/figure5/run_figure5.sh",
"chars": 148,
"preview": "cd ab\nbash run.sh\ncd ..\n\ncd cf\nbash run.sh\ncd ..\n\ncd de\nbash run.sh\ncd ..\n\ncd g\nbash run.sh\ncd ..\n\ncd h\nbash run.sh\ncd ."
},
{
"path": "ae/figure6/real_hardware/die_area.csv",
"chars": 85,
"preview": "476.25,\t446.22\n76.44,\t33\n119.31,\t25.2\n58,\t83.26\n31.77,\t40.83\n20.95,\t45.52\n0,\t42\n40,\t4"
},
{
"path": "ae/figure6/run_figure6.sh",
"chars": 65,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../..\n\npython -m ae.figure6.test_cost_model"
},
{
"path": "ae/figure6/test_cost_model.py",
"chars": 5161,
"preview": "from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2\nfrom design_space_exploration.dse "
},
{
"path": "ae/figure7/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure7/change_core_size.py",
"chars": 4174,
"preview": "from software_model.transformer import (\n TransformerBlockInitComputationTP,\n TransformerBlockAutoRegressionTP,\n)\n"
},
{
"path": "ae/figure7/plot_core_size.py",
"chars": 3809,
"preview": "import matplotlib.pyplot as plt\nimport seaborn as sns\nimport csv\nimport pandas as pd\n\ncategories = [\n \"Q_K_V\",\n \"Q"
},
{
"path": "ae/figure7/run_figure7.sh",
"chars": 106,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../..\n\npython -m ae.figure7.change_core_size\n\ncd ae/figure7\npython plot_core_size.py"
},
{
"path": "ae/figure8/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure8/change_memory_bw.py",
"chars": 3423,
"preview": "import json, re\nfrom hardware_model.compute_module import (\n VectorUnit,\n SystolicArray,\n Core,\n ComputeModu"
},
{
"path": "ae/figure8/plot_memory_bw.py",
"chars": 3231,
"preview": "import matplotlib.pyplot as plt\nimport seaborn as sns\nimport csv\nimport pandas as pd\n\ncategories = [\n \"Q_K_V\",\n \"Q"
},
{
"path": "ae/figure8/run_figure8.sh",
"chars": 106,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../..\n\npython -m ae.figure8.change_memory_bw\n\ncd ae/figure8\npython plot_memory_bw.py"
},
{
"path": "ae/figure9/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ae/figure9/change_l1_cache.py",
"chars": 3484,
"preview": "from software_model.transformer import (\n TransformerBlockInitComputationTP,\n TransformerBlockAutoRegressionTP,\n)\n"
},
{
"path": "ae/figure9/plot_l1_cache.py",
"chars": 3805,
"preview": "import matplotlib.pyplot as plt\nimport seaborn as sns\nimport csv\nimport pandas as pd\n\ncategories = [\n \"Q_K_V\",\n \"Q"
},
{
"path": "ae/figure9/run_figure9.sh",
"chars": 104,
"preview": "rm *.csv\nrm *.pdf\n\ncd ../..\n\npython -m ae.figure9.change_l1_cache\n\ncd ae/figure9\npython plot_l1_cache.py"
},
{
"path": "configs/GA100.json",
"chars": 2195,
"preview": "{\n \"name\": \"NVIDIA A100(80GB)x4\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": "
},
{
"path": "configs/ga102_template.json",
"chars": 1976,
"preview": "{\n \"name\": \"NVIDIA A100(80GB)x4\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": "
},
{
"path": "configs/generation_system.json",
"chars": 2191,
"preview": "{\n \"name\": \"NVIDIA A100(80GB)x4\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": "
},
{
"path": "configs/latency_design.json",
"chars": 2195,
"preview": "{\n \"name\": \"NVIDIA A100(80GB)x4\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": "
},
{
"path": "configs/mi210.json",
"chars": 1841,
"preview": "{\n \"name\": \"AMD MI210\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": \"NVLink3\","
},
{
"path": "configs/mi210_template.json",
"chars": 2187,
"preview": "{\n \"name\": \"AMD MI210\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": \"InfinityF"
},
{
"path": "configs/prefilling_system.json",
"chars": 2196,
"preview": "{\n \"name\": \"NVIDIA A100(80GB)x4\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": "
},
{
"path": "configs/template.json",
"chars": 2195,
"preview": "{\n \"name\": \"NVIDIA A100(80GB)x4\",\n \"device_count\": 4,\n \"interconnect\": {\n \"link\": {\n \"name\": "
},
{
"path": "cost_model/__init__.py",
"chars": 30,
"preview": "## made this for first commit\n"
},
{
"path": "cost_model/cost_examples.py",
"chars": 1087,
"preview": "import cost_model.cost_model as cost_model\nimport json\n\n# example chip with a 32 wide vector, 16x16 SA, 256kb cache core"
},
{
"path": "cost_model/cost_model.py",
"chars": 14735,
"preview": "# Author: August Ning aning@princeton.edu\n# Date started: 12 October 2023\n# This file is the cost model for Naivesim\n\nim"
},
{
"path": "cost_model/regfile_area.py",
"chars": 673,
"preview": "def calculate_regfile_area(D, W, P):\n area_90nm_um2 = (3.29 * 10**4) - (1.09 * 10**3 * D) - (8.83 * 10**2 * W) - (5.5"
},
{
"path": "design_space_exploration/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "design_space_exploration/dse.py",
"chars": 13789,
"preview": "import json, re\nfrom hardware_model.compute_module import (\n VectorUnit,\n SystolicArray,\n Core,\n ComputeModu"
},
{
"path": "docs/run.md",
"chars": 5115,
"preview": "# User Guide: How to Run a LLMCompass Simulation\n\n## Step 1: Build a Hardware Configuration\n\nFollow the [NVIDIA GA100 ex"
},
{
"path": "environment.yml",
"chars": 162,
"preview": "name: llmcompass_ae\nchannels:\n - pytorch\n - defaults\ndependencies:\n - python=3.9\n - pytorch\n - pip:\n - scalesim\n"
},
{
"path": "hardware_model/__init__.py",
"chars": 160,
"preview": "# from compute_module import *\n# from io_module import *\n# from memory_module import *\n# from chiplet import *\n# from in"
},
{
"path": "hardware_model/arch_template.py",
"chars": 93,
"preview": "class ArchitectureTemplate:\n def __init__(self, \n # ve\n ):"
},
{
"path": "hardware_model/compute_module.py",
"chars": 5468,
"preview": "from math import ceil\nfrom software_model.utils import DataType, data_type_dict\n\n\nclass VectorUnit:\n def __init__(\n "
},
{
"path": "hardware_model/device.py",
"chars": 1102,
"preview": "from hardware_model.compute_module import ComputeModule, compute_module_dict\nfrom hardware_model.io_module import IOModu"
},
{
"path": "hardware_model/interconnect.py",
"chars": 2259,
"preview": "from enum import Enum, auto\nfrom math import ceil\n\n\nclass TopologyType(Enum):\n RING = auto()\n FC = auto()\n\n\nclass "
},
{
"path": "hardware_model/io_module.py",
"chars": 264,
"preview": "class IOModule:\n def __init__(self, bandwidth, latency):\n self.bandwidth = bandwidth\n self.latency = la"
},
{
"path": "hardware_model/memory_module.py",
"chars": 228,
"preview": "class MemoryModule:\n def __init__(self, memory_capacity):\n self.memory_capacity = memory_capacity\n\nmemory_modu"
},
{
"path": "hardware_model/system.py",
"chars": 572,
"preview": "from hardware_model.device import Device, device_dict\nfrom hardware_model.interconnect import InterConnectModule, interc"
},
{
"path": "software_model/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "software_model/communication_primitives.py",
"chars": 4262,
"preview": "from hardware_model.device import Device\nfrom hardware_model.interconnect import (\n LinkModule,\n InterConnectModul"
},
{
"path": "software_model/gelu.py",
"chars": 4653,
"preview": "from utils import size\nfrom typing import List, Tuple\nfrom hardware_model.device import Device\nfrom software_model.opera"
},
{
"path": "software_model/layernorm.py",
"chars": 12476,
"preview": "from utils import size\nfrom typing import List, Tuple\nfrom hardware_model.device import Device\nfrom software_model.opera"
},
{
"path": "software_model/matmul.py",
"chars": 63682,
"preview": "from utils import size\nfrom typing import List, Tuple\nfrom hardware_model.device import Device\nfrom software_model.opera"
},
{
"path": "software_model/operators.py",
"chars": 3503,
"preview": "from utils import size, closest_factors\nfrom typing import List, Tuple, Union\nfrom hardware_model.device import Device\nf"
},
{
"path": "software_model/softmax.py",
"chars": 11641,
"preview": "from utils import size\nfrom typing import List, Tuple\nfrom hardware_model.device import Device\nfrom software_model.opera"
},
{
"path": "software_model/transformer.py",
"chars": 27286,
"preview": "from software_model.operators import (\n Operator,\n Reshape,\n Concat,\n Transpose,\n)\nfrom software_model.matmu"
},
{
"path": "software_model/utils.py",
"chars": 505,
"preview": "from typing import List\nfrom utils import size\n\n\nclass DataType:\n def __init__(self, name: str, word_size: int) -> No"
},
{
"path": "systolic_array_model/look_up_table.csv",
"chars": 2107,
"preview": "64,16,4096,128,128,os,4349,5.886\n32,32,4096,128,128,os,4349,5.886\n64,32,4096,128,128,os,4349,11.773\n64,32,2048,128,128,o"
},
{
"path": "systolic_array_model/look_up_table_128_128.csv",
"chars": 14859,
"preview": "64,16,4096,128,128,os,4349,5.886\n32,32,4096,128,128,os,4349,5.886\n64,32,4096,128,128,os,4349,11.773\n64,32,2048,128,128,o"
},
{
"path": "systolic_array_model/look_up_table_16_16.csv",
"chars": 1020385,
"preview": "32,16,256,16,16,os,571,89.667\n16,16,512,16,16,os,541,94.64\n32,32,128,16,16,os,631,81.141\n8,32,256,16,16,os,571,44.834\n32"
},
{
"path": "systolic_array_model/look_up_table_32_32.csv",
"chars": 544559,
"preview": "32,16,1024,32,32,os,1085,47.189\n32,32,512,32,32,os,573,89.354\n8,32,1024,32,32,os,1085,23.594\n32,16,512,32,32,os,573,44.6"
},
{
"path": "systolic_array_model/look_up_table_64_64.csv",
"chars": 11693,
"preview": "8,16,128,64,64,os,253,1.581\n4,32,128,64,64,os,253,1.581\n8,32,64,64,64,os,189,2.116\n8,16,64,64,64,os,189,1.058\n4,32,64,64"
},
{
"path": "systolic_array_model/look_up_table_8_8.csv",
"chars": 3537,
"preview": "16,16,4,8,8,os,71,22.535\n8,32,64,8,8,os,311,82.315\n16,32,4,8,8,os,143,22.378\n4,64,32,8,8,os,367,34.877\n16,64,4,8,8,os,28"
},
{
"path": "systolic_array_model/look_up_table_old.csv",
"chars": 134488,
"preview": "16,16,448,16,16,os,477,93.92\n16,16,896,16,16,os,925,96.865\n16,32,224,16,16,os,507,88.363\n32,16,224,16,16,os,507,88.363\n8"
},
{
"path": "utils.py",
"chars": 393,
"preview": "from typing import List\n\ndef size_of_list(list: List):\n result = 1\n for i in list:\n result *= i\n return "
}
]
About this extraction
This page contains the full source code of the PrincetonUniversity/LLMCompass GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 111 files (2.0 MB), approximately 1.2M tokens, and a symbol index with 182 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.