Repository: PrincetonUniversity/LLMCompass Branch: main Commit: 2e015fd2ee75 Files: 111 Total size: 2.0 MB Directory structure: gitextract_pu2g2804/ ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── ae/ │ ├── .gitignore │ ├── __init__.py │ ├── figure10/ │ │ ├── __init__.py │ │ ├── plot_latency.py │ │ ├── run_figure10.sh │ │ └── test_latency.py │ ├── figure11/ │ │ ├── __init__.py │ │ ├── plot_decoding.py │ │ ├── run_figure11.sh │ │ └── test_decoding.py │ ├── figure12/ │ │ ├── __init__.py │ │ ├── plot_throughput.py │ │ ├── run_figure12.sh │ │ └── test_throughput.py │ ├── figure5/ │ │ ├── __init__.py │ │ ├── ab/ │ │ │ ├── __init__.py │ │ │ ├── plot_matmul.py │ │ │ ├── real_hardware/ │ │ │ │ ├── matmul_A100.csv │ │ │ │ └── matmul_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_matmul.py │ │ ├── cf/ │ │ │ ├── __init__.py │ │ │ ├── plot_softmax.py │ │ │ ├── real_hardware/ │ │ │ │ ├── softmax_A100.csv │ │ │ │ └── softmax_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_softmax.py │ │ ├── de/ │ │ │ ├── __init__.py │ │ │ ├── plot_layernorm.py │ │ │ ├── real_hardware/ │ │ │ │ ├── layernorm_A100.csv │ │ │ │ └── layernorm_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_layernorm.py │ │ ├── g/ │ │ │ ├── __init__.py │ │ │ ├── plot_gelu.py │ │ │ ├── real_hardware/ │ │ │ │ ├── gelu_A100.csv │ │ │ │ └── gelu_MI210.csv │ │ │ ├── run.sh │ │ │ └── test_gelu.py │ │ ├── h/ │ │ │ ├── __init__.py │ │ │ ├── run.sh │ │ │ └── test_allreduce.py │ │ ├── ijkl/ │ │ │ ├── __init__.py │ │ │ ├── plot_transformer.py │ │ │ ├── real_hardware/ │ │ │ │ ├── transformerAR_A100.csv │ │ │ │ └── transformer_A100.csv │ │ │ ├── run.sh │ │ │ └── test_transformer.py │ │ └── run_figure5.sh │ ├── figure6/ │ │ ├── real_hardware/ │ │ │ └── die_area.csv │ │ ├── run_figure6.sh │ │ └── test_cost_model.py │ ├── figure7/ │ │ ├── __init__.py │ │ ├── change_core_size.py │ │ ├── plot_core_size.py │ │ └── run_figure7.sh │ ├── figure8/ │ │ ├── __init__.py │ │ ├── change_memory_bw.py │ │ ├── 
plot_memory_bw.py │ │ └── run_figure8.sh │ └── figure9/ │ ├── __init__.py │ ├── change_l1_cache.py │ ├── plot_l1_cache.py │ └── run_figure9.sh ├── configs/ │ ├── GA100.json │ ├── ga102_template.json │ ├── generation_system.json │ ├── latency_design.json │ ├── mi210.json │ ├── mi210_template.json │ ├── prefilling_system.json │ └── template.json ├── cost_model/ │ ├── __init__.py │ ├── cost_examples.py │ ├── cost_model.py │ └── regfile_area.py ├── design_space_exploration/ │ ├── __init__.py │ └── dse.py ├── docs/ │ └── run.md ├── environment.yml ├── hardware_model/ │ ├── __init__.py │ ├── arch_template.py │ ├── compute_module.py │ ├── device.py │ ├── interconnect.py │ ├── io_module.py │ ├── memory_module.py │ └── system.py ├── software_model/ │ ├── __init__.py │ ├── communication_primitives.py │ ├── gelu.py │ ├── layernorm.py │ ├── matmul.py │ ├── operators.py │ ├── softmax.py │ ├── transformer.py │ └── utils.py ├── systolic_array_model/ │ ├── look_up_table.csv │ ├── look_up_table_128_128.csv │ ├── look_up_table_16_16.csv │ ├── look_up_table_32_32.csv │ ├── look_up_table_64_64.csv │ ├── look_up_table_8_8.csv │ └── look_up_table_old.csv └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ __pycache__/ SlurmOutput/ temp/ .vscode/ *.ncu-rep ================================================ FILE: .gitmodules ================================================ [submodule "cost_model/supply_chain"] path = cost_model/supply_chain url = https://github.com/PrincetonUniversity/ttm-cas.git ================================================ FILE: Dockerfile ================================================ # Start with a base image that includes Miniconda to manage our environment FROM continuumio/miniconda3 # Set the working directory in the container to /app WORKDIR /app # Create the conda environment 
COPY environment.yml /app/environment.yml RUN conda env create -f /app/environment.yml # Initialize conda in bash shell RUN echo "source activate llmcompass_ae" > ~/.bashrc ENV PATH /opt/conda/envs/llmcompass_ae/bin:$PATH # Clone your GitHub repository RUN git clone https://github.com/HenryChang213/LLMCompass_ISCA_AE.git /app/LLMCompass_ISCA_AE RUN cd /app/LLMCompass_ISCA_AE && git submodule init && git submodule update --recursive # Expose the port your app runs on EXPOSE 8000 ================================================ FILE: LICENSE ================================================ BSD 3-Clause License Copyright (c) 2024, Princeton University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.md ================================================ [![DOI](https://zenodo.org/badge/779008229.svg)](https://zenodo.org/doi/10.5281/zenodo.10892431) # LLMCompass This repository provides the implementation of **LLMCompass** from the following papers: [**LLMCompass: Enabling Efficient Hardware Design for Large Language Model Inference**](https://parallel.princeton.edu/papers/isca24_llmcompass.pdf) *Hengrui Zhang, August Ning, Rohan Baskar Prabhakar, David Wentzlaff* In the Proceedings of the 51st Annual International Symposium on Computer Architecture: ``` @inproceedings{LLMCompass, author = {Zhang, Hengrui and Ning, August and Prabhakar, Rohan Baskar and Wentzlaff, David}, title = {LLMCompass: Enabling Efficient Hardware Design for Large Language Model Inference}, year = {2024}, booktitle = {Proceedings of the 51st Annual International Symposium on Computer Architecture}, } ``` ## Set up the environment ``` $ conda create -n llmcompass_ae python=3.9 $ conda activate llmcompass_ae $ pip3 install scalesim $ conda install pytorch==2.0.0 -c pytorch $ pip3 install matplotlib $ pip3 install seaborn $ pip3 install scipy ``` ## Installation ### If using Github ``` $ git clone -b ISCA_AE https://github.com/PrincetonUniversity/LLMCompass $ cd LLMCompass $ git submodule init $ git submodule update --recursive ``` ### If using Zenodo Unzip the file and download from 
https://github.com/PrincetonUniversity/ttm-cas.git as `cost_model\supply_chain` ### If using Docker A Dockerfile has been provided (`./Dockerfile`), including all the software dependencies and the LLMCompass source code. A docker image has been provided [here](https://github.com/HenryChang213/LLMCompass_ISCA_AE_docker). ## AE Experiment workflow ``` # Figure 5 (around 100 min) $ cd ae/figure5 $ bash run_figure5.sh # Figure 6 (around 1 min) $ cd ae/figure6 $ bash run_figure6.sh # Figure 7 (around 20 min) $ cd ae/figure7 $ bash run_figure7.sh # Figure 8 (around 40 min) $ cd ae/figure8 $ bash run_figure8.sh # Figure 9 (around 30 min) $ cd ae/figure9 $ bash run_figure9.sh # Figure 10 (around 45 min) $ cd ae/figure10 $ bash run_figure10.sh # Figure 11 (around 5 min) $ cd ae/figure11 $ bash run_figure11.sh # Figure 12 (around 4 hours) $ cd ae/figure12 $ bash run_figure12.sh ``` ## AE Expected result After running each script above, the corresponding figures will be generated under the corresponding directory as suggested by its name. For comparison, a copy of the expected results can be found in `ae\expected_results` ## User Guide A guide on "How to Run a LLMCompass Simulation" is shown [here](./docs/run.md). 
================================================ FILE: __init__.py ================================================ ================================================ FILE: ae/.gitignore ================================================ *.pdf *.csv !**/real_hardware/**/*.csv !expected_results/* ================================================ FILE: ae/__init__.py ================================================ ================================================ FILE: ae/figure10/__init__.py ================================================ ================================================ FILE: ae/figure10/plot_latency.py ================================================ import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt our_decoding = pd.read_csv( "our_decoding.csv", header=None, names=["bs", "s", "latency"] ).sort_values(by="s") our_prefill = pd.read_csv("our_prefill.csv", header=None, names=["bs", "s", "latency"]) A100_decoding = pd.read_csv( "A100_decoding.csv", header=None, names=["bs", "s", "latency"] ).sort_values(by="s") A100_prefill = pd.read_csv( "A100_prefill.csv", header=None, names=["bs", "s", "latency"] ) def get_total_decoding_latency(df: pd.DataFrame, start, end): df_filtered = df[(df["s"] >= start) & (df["s"] <= end)] total_latency = 0 # Calculate the mean of the values for each length interval and add to the sum for i in range(len(df_filtered) - 1): # Calculate the mean of current and next value mean = (df_filtered.iloc[i]["latency"] + df_filtered.iloc[i + 1]["latency"]) / 2 # Calculate the difference in length length_interval = df_filtered.iloc[i + 1]["s"] - df_filtered.iloc[i]["s"] # Multiply the mean value by the length interval and add to the sum total_latency += mean * length_interval # print(total_latency) return total_latency norm_perf = [] for input_length in [256, 512, 1024, 2048]: temp_list = [] our_prefill_latency = our_prefill[our_prefill["s"] == input_length][ "latency" ].values[0] 
A100_prefill_latency = A100_prefill[A100_prefill["s"] == input_length][ "latency" ].values[0] for output_length in [256, 512, 768, 1024, 1280, 1536, 1792, 2048]: our_total_latency = our_prefill_latency + get_total_decoding_latency( our_decoding, input_length, input_length + output_length ) A100_total_latency = A100_prefill_latency + get_total_decoding_latency( A100_decoding, input_length, input_length + output_length ) temp_list.append(A100_total_latency / our_total_latency) norm_perf.append(temp_list) cmap = sns.color_palette("viridis", as_cmap=True) data = np.array(norm_perf) import statistics print(statistics.geometric_mean(data.flatten())) fig, ax = plt.subplots() cax = ax.imshow(data, interpolation="nearest", cmap=cmap, vmin=0.8, vmax=1) # cax = sns.heatmap(data, cmap="viridis") # Add a colorbar fig.colorbar(cax, shrink=0.5) # Function to convert RGB to grayscale intensity def get_intensity(color): return color[0] * 0.299 + color[1] * 0.587 + color[2] * 0.114 # Set a threshold for deciding text color intensity_threshold = 0.5 for i in range(data.shape[0]): for j in range(data.shape[1]): # Get the color from the colormap cell_color = cax.cmap(cax.norm(data[i, j])) # Calculate intensity of the cell color intensity = get_intensity(cell_color) # Choose text color based on intensity text_color = "white" if intensity < intensity_threshold else "black" text = ax.text( j, i, round(data[i, j], 2), ha="center", va="center", color=text_color ) # Set the x-axis and y-axis values x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048] y_axis_labels = [256, 512, 1024, 2048] # Set ticks positions ax.set_xticks(np.arange(len(x_axis_labels))) ax.set_yticks(np.arange(len(y_axis_labels))) # Set ticks labels ax.set_xticklabels(x_axis_labels) ax.set_yticklabels(y_axis_labels) # Set labels for axes ax.set_xlabel("Output Length") ax.set_ylabel("Input Length") ax.invert_yaxis() # # Rotate the tick labels for the x-axis if needed # plt.setp(ax.get_xticklabels(), rotation=45, 
ha="right", rotation_mode="anchor") # Show the plot plt.tight_layout() plt.savefig("figure10.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) # norm_perf = [] # norm_perf_ttft = [] # for input_length in [256, 512, 1024, 2048]: # temp_list = [] # our_prefill_latency = our_prefill[our_prefill["s"] == input_length][ # "latency" # ].values[0] # A100_prefill_latency = A100_prefill[A100_prefill["s"] == input_length][ # "latency" # ].values[0] # for output_length in [256, 512, 768, 1024, 1280, 1536, 1792, 2048]: # our_tbt_latency = get_total_decoding_latency( # our_decoding, input_length, input_length + output_length # ) # A100_tbt_latency = get_total_decoding_latency( # A100_decoding, input_length, input_length + output_length # ) # temp_list.append(our_tbt_latency / A100_tbt_latency) # norm_perf.append(temp_list) # norm_perf_ttft.append(our_prefill_latency / A100_prefill_latency) # cmap = sns.color_palette("viridis", as_cmap=True) # data = np.array(norm_perf) # data_ttft = np.array(norm_perf_ttft) # print(data) # print(data_ttft) # import statistics # from matplotlib import gridspec # print(statistics.geometric_mean(data.flatten())) # print(statistics.geometric_mean(data_ttft)) # # fig, axs = plt.subplots(1, 2, figsize=(8, 4), # # gridspec_kw={'width_ratios': [3, 1]}, sharey=True) # fig = plt.figure(figsize=(8, 3)) # Define the figure size # gs = gridspec.GridSpec( # 1, 2, width_ratios=[4, 1] # ) # 2 rows, 1 column, with the first row 3 times the height of the second # ax = fig.add_subplot(gs[0]) # # ax=axs[0] # cax = ax.imshow(data, interpolation="nearest", cmap=cmap, vmin=1.015, vmax=1.045) # # cax = sns.heatmap(data, cmap="viridis") # # Add a colorbar # fig.colorbar(cax, shrink=1) # # Function to convert RGB to grayscale intensity # def get_intensity(color): # return color[0] * 0.299 + color[1] * 0.587 + color[2] * 0.114 # # Set a threshold for deciding text color # intensity_threshold = 0.5 # for i in range(data.shape[0]): # for j in range(data.shape[1]): # # 
Get the color from the colormap # cell_color = cax.cmap(cax.norm(data[i, j])) # # Calculate intensity of the cell color # intensity = get_intensity(cell_color) # # Choose text color based on intensity # text_color = "white" if intensity < intensity_threshold else "black" # text = ax.text( # j, i, round(data[i, j], 3), ha="center", va="center", color=text_color # ) # # Set the x-axis and y-axis values # x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048] # y_axis_labels = [256, 512, 1024, 2048] # # Set ticks positions # ax.set_xticks(np.arange(len(x_axis_labels))) # ax.set_yticks(np.arange(len(y_axis_labels))) # # Set ticks labels # ax.set_xticklabels(x_axis_labels) # ax.set_yticklabels(y_axis_labels) # # Set labels for axes # ax.set_xlabel("Output Length\n" + r"$\mathbf{Normalized\ TBT}$") # ax.set_ylabel("Input Length") # ax.invert_yaxis() # # # Rotate the tick labels for the x-axis if needed # # plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") # # fig = plt.figure(figsize=(10, 5)) # Define the figure size # axs1 = fig.add_subplot(gs[0, 1]) # axs1.barh(np.arange(len(data_ttft)) / 2 + 0.2, data_ttft, color="steelblue", height=0.3) # axs1.set_yticks(np.arange(len(y_axis_labels)) / 2 + 0.2) # axs1.set_yticklabels(y_axis_labels) # axs1.set_xlabel(r"$\mathbf{Normalized\ TTFT}$") # axs1.set_xlim(1, 2) # # Show the plot # plt.tight_layout() # plt.savefig("figure11.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) ================================================ FILE: ae/figure10/run_figure10.sh ================================================ rm *.csv rm *.pdf cd ../.. 
python -m ae.figure10.test_latency cd ae/figure10 python plot_latency.py ================================================ FILE: ae/figure10/test_latency.py ================================================ from software_model.transformer import ( TransformerBlockInitComputationTP, TransformerBlockAutoRegressionTP, ) from software_model.utils import data_type_dict, Tensor from design_space_exploration.dse import template_to_system, read_architecture_template from multiprocessing import Process, Lock from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2 import time A100_specs = read_architecture_template("configs/GA100.json") A100_system = template_to_system(A100_specs) our_specs = read_architecture_template("configs/latency_design.json") our_system = template_to_system(our_specs) A100_compute_area_mm2 = calc_compute_chiplet_area_mm2(A100_specs) A100_io_area_mm2 = calc_io_die_area_mm2(A100_specs) our_compute_area_mm2 = calc_compute_chiplet_area_mm2(our_specs) our_io_area_mm2 = calc_io_die_area_mm2(our_specs) print(f"A100 compute area: {A100_compute_area_mm2} mm2") print(f"A100 IO area: {A100_io_area_mm2} mm2") print(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2") print(f"Our compute area: {our_compute_area_mm2} mm2") print(f"Our IO area: {our_io_area_mm2} mm2") print(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2") with open("ae/figure10/area.csv", "w") as f: f.write(f"A100 compute area: {A100_compute_area_mm2} mm2\n") f.write(f"A100 IO area: {A100_io_area_mm2} mm2\n") f.write(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2\n") f.write(f"Our compute area: {our_compute_area_mm2} mm2\n") f.write(f"Our IO area: {our_io_area_mm2} mm2\n") f.write(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2\n") def simulate_decoding_latency(system, bs, seq_len, name, lock): model_auto_regression = TransformerBlockAutoRegressionTP( d_model=12288, n_heads=96, device_count=4, 
data_type=data_type_dict["fp16"], ) _ = model_auto_regression( Tensor([bs, 1, 12288], data_type_dict["fp16"]), seq_len, ) auto_regression_latency_simulated = model_auto_regression.compile_and_simulate( system, "heuristic-GPU" ) with lock: with open(f"ae/figure10/{name}_decoding.csv", "a") as f: f.write(f"{bs}, {seq_len}, {auto_regression_latency_simulated}\n") def simulate_prefill_latency(system, bs, seq_len, name, lock): model = TransformerBlockInitComputationTP( d_model=12288, n_heads=96, device_count=4, data_type=data_type_dict["fp16"], ) _ = model( Tensor([bs, seq_len, 12288], data_type_dict["fp16"]), ) latency_simulated = model.compile_and_simulate(system, "heuristic-GPU") with lock: with open(f"ae/figure10/{name}_prefill.csv", "a") as f: f.write(f"{bs}, {seq_len}, {latency_simulated}\n") lock_our_prefill = Lock() lock_our_decoding = Lock() lock_A100_prefill = Lock() lock_A100_decoding = Lock() processes = [] for bs in [16]: # [1, 4, 8, 16, 32, 64]: for seq_len in [256, 512, 1024, 2048]: for system in [our_system, A100_system]: if system == A100_system: name = "A100" lock = lock_A100_prefill else: name = "our" lock = lock_our_prefill p = Process( target=simulate_prefill_latency, args=(system, bs, seq_len, name, lock) ) processes.append(p) for seq_len in range(256, 4096 + 64, 64): for system in [our_system, A100_system]: if system == A100_system: name = "A100" lock = lock_A100_decoding else: name = "our" lock = lock_our_decoding p = Process( target=simulate_decoding_latency, args=(system, bs, seq_len, name, lock) ) processes.append(p) try: for p in processes: p.start() print("Processes started.") print("number of process:", len(processes)) while any(p.is_alive() for p in processes): time.sleep(1) except KeyboardInterrupt: print("Terminating processes...") for p in processes: p.terminate() p.join() print("All processes have finished.") ================================================ FILE: ae/figure11/__init__.py ================================================ 
================================================ FILE: ae/figure11/plot_decoding.py ================================================ import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np from scipy.stats import gmean categories = ["bs", "seq_len", "latency"] + [ "Q_K_V", "Q_mul_K", "A_mul_V", "Wo_proj", "W1_proj", "W2_proj", "Softmax", "LayerNorm_MHA", "LayerNorm_FFN", "GeLU", "AllReduce_MHA", "AllReduce_FFN", ] A100 = pd.read_csv("A100.csv", header=None, names=categories) A100["latency"] = A100["latency"] * 1000 our = pd.read_csv("our.csv", header=None, names=categories) our["latency"] = our["latency"] * 1000 bs_list = [1, 2, 4, 8, 16, 32] colors_our = sns.color_palette("Blues", 3)[1:] colors_a100 = sns.color_palette("summer_r", 2) our_512 = our[our.seq_len == 512][our["bs"].isin(bs_list)]["latency"].tolist() our_2048 = our[our.seq_len == 2048][our["bs"].isin(bs_list)]["latency"].tolist() a100_512 = A100[A100.seq_len == 512][A100["bs"].isin(bs_list)]["latency"].tolist() a100_2048 = A100[A100.seq_len == 2048][A100["bs"].isin(bs_list)]["latency"].tolist() avg_speedup = gmean( np.concatenate( ( np.array(a100_512) / np.array(our_512), np.array(a100_2048) / np.array(our_2048), ) ) ) print(avg_speedup) plt.figure(figsize=(8, 3.5)) x_pos = 0.25 for bs in bs_list: if bs == 1: seq_len = 512 plt.bar( x_pos, A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency, width=0.5, label=f"GA100 (seq_len={seq_len})", color=colors_a100[0], ) bars = plt.bar( x_pos + 0.5, our[(our.bs == bs) & (our.seq_len == seq_len)].latency, width=0.5, label=f"Latency design (seq_len={seq_len})", color=colors_our[0], ) for bar in bars: bar.set_hatch("//") # Add diagonal stripes seq_len = 2048 plt.bar( x_pos + 1, A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency, width=0.5, label=f"GA100 (seq_len={seq_len})", color=colors_a100[1], ) bars = plt.bar( x_pos + 1.5, our[(our.bs == bs) & (our.seq_len == seq_len)].latency, width=0.5, label=f"Latency design 
(seq_len={seq_len})", color=colors_our[1], ) for bar in bars: bar.set_hatch("//") # Add diagonal stripes else: seq_len = 512 plt.bar( x_pos, A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency, width=0.5, color=colors_a100[0], ) bars = plt.bar( x_pos + 0.5, our[(our.bs == bs) & (our.seq_len == seq_len)].latency, width=0.5, color=colors_our[0], ) for bar in bars: bar.set_hatch("//") # Add diagonal stripes seq_len = 2048 if bs < 164: plt.bar( x_pos + 1, A100[(A100.bs == bs) & (A100.seq_len == seq_len)].latency, width=0.5, color=colors_a100[1], ) bars = plt.bar( x_pos + 1.5, our[(our.bs == bs) & (our.seq_len == seq_len)].latency, width=0.5, color=colors_our[1], ) for bar in bars: bar.set_hatch("//") # Add diagonal stripes x_pos += 3 plt.xticks([1, 4, 7, 10, 13, 16], bs_list) plt.xlabel("Batch Size") plt.ylabel("Latency (ms)") plt.legend(loc="upper left") plt.tight_layout() plt.grid(True, axis="y", ls="--", c="0.8") plt.savefig("figure11.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300) plt.show() ================================================ FILE: ae/figure11/run_figure11.sh ================================================ rm *.csv rm *.pdf cd ../.. 
python -m ae.figure11.test_decoding cd ae/figure11 python plot_decoding.py ================================================ FILE: ae/figure11/test_decoding.py ================================================ from software_model.transformer import ( TransformerBlockInitComputationTP, TransformerBlockAutoRegressionTP, ) from software_model.utils import data_type_dict, Tensor from design_space_exploration.dse import template_to_system, read_architecture_template from multiprocessing import Process, Lock from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2 import time A100_specs = read_architecture_template("configs/GA100.json") A100_system = template_to_system(A100_specs) our_specs = read_architecture_template("configs/latency_design.json") our_system = template_to_system(our_specs) A100_compute_area_mm2 = calc_compute_chiplet_area_mm2(A100_specs) A100_io_area_mm2 = calc_io_die_area_mm2(A100_specs) our_compute_area_mm2 = calc_compute_chiplet_area_mm2(our_specs) our_io_area_mm2 = calc_io_die_area_mm2(our_specs) print(f"A100 compute area: {A100_compute_area_mm2} mm2") print(f"A100 IO area: {A100_io_area_mm2} mm2") print(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2") print(f"Our compute area: {our_compute_area_mm2} mm2") print(f"Our IO area: {our_io_area_mm2} mm2") print(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2") def simulate_latency(system, bs, seq_len, name, lock): model_auto_regression = TransformerBlockAutoRegressionTP( d_model=12288, n_heads=96, device_count=4, data_type=data_type_dict["fp16"], ) _ = model_auto_regression( Tensor([bs, 1, 12288], data_type_dict["fp16"]), seq_len, ) auto_regression_latency_simulated = model_auto_regression.compile_and_simulate( system, "heuristic-GPU" ) with lock: with open(f"ae/figure11/{name}.csv", "a") as f: f.write( f"{bs}, {seq_len}, {auto_regression_latency_simulated}, {model_auto_regression.simluate_log}\n" ) lock = Lock() processes = [] for bs in [1, 2, 4, 
8, 16, 32, 64]: for seq_len in [512, 2048]: for system in [our_system, A100_system]: if system == A100_system: name = "A100" else: name = "our" p = Process(target=simulate_latency, args=(system, bs, seq_len, name, lock)) processes.append(p) try: for p in processes: p.start() while any(p.is_alive() for p in processes): time.sleep(1) except KeyboardInterrupt: print("Terminating processes...") for p in processes: p.terminate() p.join() print("All processes have finished.") ================================================ FILE: ae/figure12/__init__.py ================================================ ================================================ FILE: ae/figure12/plot_throughput.py ================================================ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import statistics our_directory = "our/our" A100_directory = "A100/A100" categories = ["bs", "s", "latency"] + [ "Q_K_V", "Q_mul_K", "A_mul_V", "Wo_proj", "W1_proj", "W2_proj", "Softmax", "LayerNorm_MHA", "LayerNorm_FFN", "GeLU", "AllReduce_MHA", "AllReduce_FFN", ] throughput_our = [] bs_our = [] latency_our = [] throughput_A100 = [] bs_A100 = [] latency_A100 = [] def get_total_decoding_latency(df: pd.DataFrame, start, end): df_filtered = df[(df["s"] >= start) & (df["s"] <= end)] total_latency = 0 # Calculate the mean of the values for each length interval and add to the sum for i in range(len(df_filtered) - 1): # Calculate the mean of current and next value mean = (df_filtered.iloc[i]["latency"] + df_filtered.iloc[i + 1]["latency"]) / 2 # Calculate the difference in length length_interval = df_filtered.iloc[i + 1]["s"] - df_filtered.iloc[i]["s"] # Multiply the mean value by the length interval and add to the sum total_latency += mean * length_interval # print(total_latency) return total_latency for input_length in [256, 512, 1024, 2048]: temp_our = [] temp_A100 = [] temp_our_bs = [] temp_A100_bs = [] temp_our_latency = [] temp_A100_latency = [] for 
output_length in [256, 512, 768, 1024, 1280, 1536, 1792, 2048]: our_prefill_df = pd.read_csv( f"{our_directory}_{input_length}_{output_length}_prefill.csv", header=None, names=categories, ) # print(our_prefill_df) our_prefill_latency = our_prefill_df.iloc[0]["latency"] our_bs = our_prefill_df.iloc[0]["bs"] temp_our_bs.append(our_bs) our_decoding_df = pd.read_csv( f"{our_directory}_{input_length}_{output_length}_decoding.csv", header=None, names=categories, ).sort_values(by="s") our_decoding_latency = get_total_decoding_latency( our_decoding_df, input_length, input_length + output_length ) # print(our_decoding_latency) our_throughput = ( our_bs * output_length / (our_prefill_latency + our_decoding_latency) / 12 ) temp_our.append(our_throughput) temp_our_latency.append(our_prefill_latency + our_decoding_latency) A100_prefill_df = pd.read_csv( f"{A100_directory}_{input_length}_{output_length}_prefill.csv", header=None, names=categories, ) A100_prefill_latency = A100_prefill_df.iloc[0]["latency"] A100_bs = A100_prefill_df.iloc[0]["bs"] temp_A100_bs.append(A100_bs) A100_decoding_df = pd.read_csv( f"{A100_directory}_{input_length}_{output_length}_decoding.csv", header=None, names=categories, ).sort_values(by="s") A100_decoding_latency = get_total_decoding_latency( A100_decoding_df, input_length, input_length + output_length ) A100_throughput = ( A100_bs * output_length / (A100_prefill_latency + A100_decoding_latency) / 12 ) temp_A100.append(A100_throughput) temp_A100_latency.append(A100_prefill_latency + A100_decoding_latency) throughput_our.append(temp_our) throughput_A100.append(temp_A100) bs_our.append(temp_our_bs) bs_A100.append(temp_A100_bs) latency_our.append(temp_our_latency) latency_A100.append(temp_A100_latency) # print(throughput_our) # print(throughput_A100) print(latency_our) print(latency_A100) print( statistics.geometric_mean( (np.array(latency_our) / np.array(latency_A100)).flatten() ) ) # Function to convert RGB to grayscale intensity def 
get_intensity(color): return color[0] * 0.299 + color[1] * 0.587 + color[2] * 0.114 cmap = sns.color_palette("viridis", as_cmap=True) data = np.array(throughput_our) # / np.array(throughput_A100) print(data.mean()) fig, ax = plt.subplots() cax = ax.imshow(data, interpolation="nearest", cmap=cmap) # cax = sns.heatmap(data, cmap="Blues") # Add a colorbar fig.colorbar(cax, shrink=0.5) # Set a threshold for deciding text color intensity_threshold = 0.5 for i in range(data.shape[0]): for j in range(data.shape[1]): # Get the color from the colormap cell_color = cax.cmap(cax.norm(data[i, j])) # Calculate intensity of the cell color intensity = get_intensity(cell_color) # Choose text color based on intensity text_color = "white" if intensity < intensity_threshold else "black" text = ax.text( j, i, int(data[i, j]), ha="center", va="center", color=text_color ) # Set the x-axis and y-axis values x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048] y_axis_labels = [256, 512, 1024, 2048] # Set ticks positions ax.set_xticks(np.arange(len(x_axis_labels))) ax.set_yticks(np.arange(len(y_axis_labels))) # Set ticks labels ax.set_xticklabels(x_axis_labels) ax.set_yticklabels(y_axis_labels) # Set labels for axes ax.set_xlabel("Output Length") ax.set_ylabel("Input Length") ax.invert_yaxis() # # Rotate the tick labels for the x-axis if needed # plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") # Show the plot plt.tight_layout() plt.savefig("figure12a.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) import statistics cmap = sns.color_palette("viridis", as_cmap=True) data = np.array(throughput_our) / np.array(throughput_A100) print(statistics.geometric_mean(data.flatten())) fig, ax = plt.subplots() cax = ax.imshow( data, interpolation="nearest", cmap=cmap, ) # cax = sns.heatmap(data, cmap="viridis") # Add a colorbar fig.colorbar(cax, shrink=0.5) # Set a threshold for deciding text color intensity_threshold = 0.5 for i in range(data.shape[0]): for 
j in range(data.shape[1]): # Get the color from the colormap cell_color = cax.cmap(cax.norm(data[i, j])) # Calculate intensity of the cell color intensity = get_intensity(cell_color) # Choose text color based on intensity text_color = "white" if intensity < intensity_threshold else "black" text = ax.text( j, i, round(data[i, j], 2), ha="center", va="center", color=text_color ) # Set the x-axis and y-axis values x_axis_labels = [256, 512, 768, 1024, 1280, 1536, 1792, 2048] y_axis_labels = [256, 512, 1024, 2048] # Set ticks positions ax.set_xticks(np.arange(len(x_axis_labels))) ax.set_yticks(np.arange(len(y_axis_labels))) # Set ticks labels ax.set_xticklabels(x_axis_labels) ax.set_yticklabels(y_axis_labels) # Set labels for axes ax.set_xlabel("Output Length") ax.set_ylabel("Input Length") ax.invert_yaxis() # # Rotate the tick labels for the x-axis if needed # plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") # Show the plot plt.tight_layout() plt.savefig("figure12b.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) ================================================ FILE: ae/figure12/run_figure12.sh ================================================ rm A100/*.csv rm our/*.csv rm *.pdf mkdir A100 mkdir our cd ../.. 
python -m ae.figure12.test_throughput
cd ae/figure12
python plot_throughput.py


================================================
FILE: ae/figure12/test_throughput.py
================================================
# Figure 12 driver: simulates prefill + decoding latency of a GPT-3-scale
# transformer block on the A100 baseline and on "our" proposed system,
# fanning one Process out per (input_len, output_len, system, step) point.
from software_model.transformer import (
    TransformerBlockInitComputationTP,
    TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor
from design_space_exploration.dse import template_to_system, read_architecture_template
from multiprocessing import Process, Lock
from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
import time

# Build both systems from their JSON architecture templates.
A100_specs = read_architecture_template("configs/GA100.json")
A100_system = template_to_system(A100_specs)
our_specs = read_architecture_template("configs/prefilling_system.json")
our_system = template_to_system(our_specs)
# Report silicon area of both designs (printed and persisted for the paper).
A100_compute_area_mm2 = calc_compute_chiplet_area_mm2(A100_specs)
A100_io_area_mm2 = calc_io_die_area_mm2(A100_specs)
our_compute_area_mm2 = calc_compute_chiplet_area_mm2(our_specs)
our_io_area_mm2 = calc_io_die_area_mm2(our_specs)
print(f"A100 compute area: {A100_compute_area_mm2} mm2")
print(f"A100 IO area: {A100_io_area_mm2} mm2")
print(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2")
print(f"Our compute area: {our_compute_area_mm2} mm2")
print(f"Our IO area: {our_io_area_mm2} mm2")
print(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2")
with open("ae/figure12/area.csv", "w") as f:
    f.write(f"A100 compute area: {A100_compute_area_mm2} mm2\n")
    f.write(f"A100 IO area: {A100_io_area_mm2} mm2\n")
    f.write(f"A100 total area: {A100_compute_area_mm2+A100_io_area_mm2} mm2\n")
    f.write(f"Our compute area: {our_compute_area_mm2} mm2\n")
    f.write(f"Our IO area: {our_io_area_mm2} mm2\n")
    f.write(f"Our total area: {our_compute_area_mm2+our_io_area_mm2} mm2\n")


def simulate_decoding_latency(system, bs, seq_len, name, lock, heuristics):
    # Simulate one auto-regressive decoding step at context length seq_len
    # and append the result to ae/figure12/{name}_decoding.csv.
    # `lock` serializes the append across worker processes.
    model_auto_regression = TransformerBlockAutoRegressionTP(
        d_model=12288,
        n_heads=96,
        device_count=1,
        data_type=data_type_dict["fp16"],
    )
    _ = model_auto_regression(
        Tensor([bs, 1, 12288], data_type_dict["fp16"]),
        seq_len,
    )
    auto_regression_latency_simulated = model_auto_regression.compile_and_simulate(
        system, heuristics
    )
    with lock:
        with open(f"ae/figure12/{name}_decoding.csv", "a") as f:
            # NOTE(review): 'simluate_log' is the attribute's spelling on the
            # model class — keep in sync with software_model.transformer.
            f.write(
                f"{bs}, {seq_len}, {auto_regression_latency_simulated}, {model_auto_regression.simluate_log}\n"
            )


def simulate_prefill_latency(system, bs, seq_len, name, lock, heuristics):
    # Simulate the prefill (initial computation) pass over a [bs, seq_len]
    # batch and append the result to ae/figure12/{name}_prefill.csv.
    model = TransformerBlockInitComputationTP(
        d_model=12288,
        n_heads=96,
        device_count=1,
        data_type=data_type_dict["fp16"],
    )
    _ = model(
        Tensor([bs, seq_len, 12288], data_type_dict["fp16"]),
    )
    latency_simulated = model.compile_and_simulate(system, heuristics)
    with lock:
        with open(f"ae/figure12/{name}_prefill.csv", "a") as f:
            f.write(f"{bs}, {seq_len}, {latency_simulated}, {model.simluate_log}\n")


# One lock per output file family so concurrent appends do not interleave.
lock_our_prefill = Lock()
lock_our_decoding = Lock()
lock_A100_prefill = Lock()
lock_A100_decoding = Lock()
processes = []
for input_seq_len in [
    256,
    512,
    1024,
    2048,
]:
    for output_seq_len in [
        256,
        512,
        768,
        1024,
        1280,
        1536,
        1792,
        2048,
    ]:
        seq_len = input_seq_len + output_seq_len
        for system in [our_system, A100_system]:
            if system == A100_system:
                name = f"A100/A100_{input_seq_len}_{output_seq_len}"
                lock = lock_A100_prefill
                # Largest batch that fits: presumably
                # (HBM bytes - fp16 weight bytes) / per-token KV+activation
                # bytes — TODO confirm against the memory model.
                bs = (80e9 - 2 * 12 * 12288**2 * 12) // ((12 * 4 + 8) * seq_len * 12288)
                heuristics = "heuristic-GPU"
            else:
                name = f"our/our_{input_seq_len}_{output_seq_len}"
                lock = lock_our_prefill
                # Same formula with the proposed system's 512 GB capacity.
                bs = (512e9 - 2 * 12 * 12288**2 * 12) // (
                    (12 * 4 + 8) * seq_len * 12288
                )
                heuristics = "heuristic-our-throughput"
            bs = int(bs)
            # print(bs)
            p = Process(
                target=simulate_prefill_latency,
                args=(system, bs, input_seq_len, name, lock, heuristics),
            )
            processes.append(p)
            # Sample decoding latency every 64 tokens across the generation.
            for decoding_seq_len in range(input_seq_len, seq_len + 64, 64):
                if system == A100_system:
                    name = f"A100/A100_{input_seq_len}_{output_seq_len}"
                    lock = lock_A100_decoding
                    heuristics = "heuristic-GPU"
                else:
                    name = f"our/our_{input_seq_len}_{output_seq_len}"
                    lock = lock_our_decoding
                    heuristics = "heuristic-our-throughput"
                p = Process(
                    target=simulate_decoding_latency,
                    args=(system, bs, decoding_seq_len, name, lock, heuristics),
                )
                processes.append(p)
print(len(processes))
# exit()
try:
    # Launch everything at once and poll until all workers exit.
    for p in processes:
        p.start()
    print("Processes started.")
    print("number of process:", len(processes))
    while any(p.is_alive() for p in processes):
        time.sleep(1)
except KeyboardInterrupt:
    print("Terminating processes...")
    for p in processes:
        p.terminate()
        p.join()
print("All processes have finished.")


================================================
FILE: ae/figure5/__init__.py
================================================


================================================
FILE: ae/figure5/ab/__init__.py
================================================


================================================
FILE: ae/figure5/ab/plot_matmul.py
================================================
# Figure 5a/5b: matmul throughput, real vs. simulated vs. roofline, for
# A100 / MI210 / TPUv3. CSVs are produced by test_matmul.py; the real-hardware
# numbers are checked in under real_hardware/.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Each CSV has rows "M, N, K, <latency>ms, <throughput>Tflops"; strip the
# unit suffix from the throughput column and index by (M, N, K).
matmul_TPUv3_sim = pd.read_csv(
    "matmul_TPUv3_sim.csv", header=None, names=["M", "N", "K", "latency", "throughput"]
)
matmul_TPUv3_sim["throughput"] = (
    matmul_TPUv3_sim["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_TPUv3_sim.set_index(["M", "N", "K"], inplace=True)
matmul_TPUv3_roofline = pd.read_csv(
    "matmul_TPUv3_roofline.csv",
    header=None,
    names=["M", "N", "K", "latency", "throughput"],
)
matmul_TPUv3_roofline["throughput"] = (
    matmul_TPUv3_roofline["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_TPUv3_roofline.set_index(["M", "N", "K"], inplace=True)
matmul_A100 = pd.read_csv(
    "real_hardware/matmul_A100.csv",
    header=None,
    names=["M", "N", "K", "latency", "throughput"],
)
matmul_A100["throughput"] = (
    matmul_A100["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_A100.set_index(["M", "N", "K"], inplace=True)
matmul_A100_sim = pd.read_csv(
    "matmul_A100_sim.csv",
    header=None,
    names=["M", "N", "K", "latency", "throughput"],
)
matmul_A100_sim["throughput"] = (
    matmul_A100_sim["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_A100_sim.set_index(["M", "N", "K"], inplace=True)
matmul_A100_roofline = pd.read_csv(
    "matmul_A100_roofline.csv",
    header=None,
    names=["M", "N", "K", "latency", "throughput"],
)
matmul_A100_roofline["throughput"] = (
    matmul_A100_roofline["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_A100_roofline.set_index(["M", "N", "K"], inplace=True)
matmul_MI210 = pd.read_csv(
    "real_hardware/matmul_MI210.csv",
    header=None,
    names=["M", "N", "K", "latency", "throughput"],
)
matmul_MI210["throughput"] = (
    matmul_MI210["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_MI210.set_index(["M", "N", "K"], inplace=True)
matmul_MI210_sim = pd.read_csv(
    "matmul_MI210_sim.csv", header=None, names=["M", "N", "K", "latency", "throughput"]
)
matmul_MI210_sim["throughput"] = (
    matmul_MI210_sim["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_MI210_sim.set_index(["M", "N", "K"], inplace=True)
matmul_MI210_roofline = pd.read_csv(
    "matmul_MI210_roofline.csv",
    header=None,
    names=["M", "N", "K", "latency", "throughput"],
)
matmul_MI210_roofline["throughput"] = (
    matmul_MI210_roofline["throughput"].str.extract(r"(\d+\.?\d*)").astype(float)
)
matmul_MI210_roofline.set_index(["M", "N", "K"], inplace=True)
# Consistent vendor color families across all figure 5 plots.
color_NV = sns.color_palette("Greens_d", 4)[1:]
color_Google = sns.color_palette("Blues_d", 4)[1:]
color_AMD = sns.color_palette("flare", 3)
# Figure 5b: sweep M with K = N = 12288 fixed.
K = 12288
N = K
title = f"Performance of Matmul with K={K}, N={N}"
M_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
for M in range(6, 16):
    M = 2**M
    M_list.append(M)
    throughput_TPU_sim_list.append(matmul_TPUv3_sim.loc[(M, N, K), "throughput"])
    throughput_TPU_roofline_list.append(
        matmul_TPUv3_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_GPU_list.append(matmul_A100.loc[(M, N, K), "throughput"])
    throughput_GPU_sim_list.append(matmul_A100_sim.loc[(M, N, K), "throughput"])
    throughput_GPU_roofline_list.append(
        matmul_A100_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_AMD_list.append(matmul_MI210.loc[(M, N, K), "throughput"])
    throughput_AMD_sim_list.append(matmul_MI210_sim.loc[(M, N, K), "throughput"])
    throughput_AMD_roofline_list.append(
        matmul_MI210_roofline.loc[(M, N, K), "throughput"]
    )
# plt.figure(figsize=(6, 2.8))
plt.figure(figsize=(3.64, 2.8))
plt.xscale("log", base=2)
plt.plot(
    M_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    M_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    M_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    M_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    M_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    M_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    M_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
plt.plot(
    M_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# handles, labels = plt.gca().get_legend_handles_labels()
# plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1))
# plt.title(title)
plt.xlabel("M")
plt.ylabel("TFLOPS")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.tight_layout()
plt.savefig("figure5b.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
M = 8192
# Figure 5a: sweep N = K with M = 8192 fixed.
title = f"Performance of Matmul with M={M}"
K_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
for K in range(6, 16):
    K = 2**K
    N = K
    K_list.append(K)
    throughput_TPU_sim_list.append(matmul_TPUv3_sim.loc[(M, N, K), "throughput"])
    throughput_TPU_roofline_list.append(
        matmul_TPUv3_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_GPU_list.append(matmul_A100.loc[(M, N, K), "throughput"])
    throughput_GPU_sim_list.append(matmul_A100_sim.loc[(M, N, K), "throughput"])
    throughput_GPU_roofline_list.append(
        matmul_A100_roofline.loc[(M, N, K), "throughput"]
    )
    throughput_AMD_list.append(matmul_MI210.loc[(M, N, K), "throughput"])
    throughput_AMD_sim_list.append(matmul_MI210_sim.loc[(M, N, K), "throughput"])
    throughput_AMD_roofline_list.append(
        matmul_MI210_roofline.loc[(M, N, K), "throughput"]
    )
plt.figure(figsize=(3.64, 2.8))
plt.xscale("log", base=2)
plt.plot(
    K_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    K_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    K_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    K_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    K_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    K_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    K_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
plt.plot(
    K_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# plt.legend()
# plt.title(title)
plt.xlabel("N=K")
plt.ylabel("TFLOPS")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.tight_layout()
plt.savefig("figure5a.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)


================================================
FILE: ae/figure5/ab/real_hardware/matmul_A100.csv
================================================
64, 12288, 12288, 0.1900ms, 101.7124Tflops
128, 12288, 12288, 0.2003ms, 193.0114Tflops
256, 12288, 12288, 0.3185ms, 242.7090Tflops
512, 12288, 12288, 0.6118ms, 252.7351Tflops
1024, 12288, 12288, 1.1990ms, 257.9115Tflops
2048, 12288, 12288, 2.3586ms, 262.2263Tflops
4096, 12288, 12288, 4.4576ms, 277.4929Tflops
8192, 12288, 12288, 8.6216ms, 286.9431Tflops
16384, 12288, 12288, 17.0307ms, 290.5223Tflops
32768, 12288, 12288, 35.4407ms, 279.2160Tflops
8192, 64, 64, 0.0296ms, 2.2700Tflops
8192, 128, 128, 0.0310ms, 8.6608Tflops
8192, 256, 256, 0.0356ms, 30.1244Tflops
8192, 512, 512, 0.0471ms, 91.2121Tflops
8192, 1024, 1024, 0.0927ms, 185.2380Tflops
8192, 2048, 2048, 0.2818ms, 243.8497Tflops
8192, 4096, 4096, 1.0210ms, 269.2169Tflops
8192, 8192, 8192, 3.9614ms, 277.5532Tflops
8192, 16384, 16384, 15.0087ms, 293.0334Tflops
8192, 32768, 32768, 61.1346ms, 287.7616Tflops


================================================
FILE: ae/figure5/ab/real_hardware/matmul_MI210.csv
================================================
32, 12288, 12288, 0.5493ms, 17.5922Tflops
64, 12288, 12288, 0.5584ms, 34.6135Tflops
128, 12288, 12288, 0.5932ms, 65.1646Tflops
256, 12288, 12288, 0.7699ms, 100.4209Tflops
512, 12288, 12288, 1.4054ms, 110.0209Tflops
1024, 12288, 12288, 2.7173ms, 113.8051Tflops
2048, 12288, 12288, 5.3905ms, 114.7338Tflops
4096, 12288, 12288, 10.4494ms, 118.3752Tflops
8192, 12288, 12288, 20.7849ms, 119.0242Tflops
16384, 12288, 12288, 41.1353ms, 120.2811Tflops
32768, 12288, 12288, 81.4046ms, 121.5608Tflops
8192, 32, 32, 0.0333ms, 0.5044Tflops
8192, 64, 64, 0.0345ms, 1.9479Tflops
8192, 128, 128, 0.0396ms, 6.7825Tflops
8192, 256, 256, 0.0485ms, 22.1307Tflops
8192, 512, 512, 0.0863ms, 49.7635Tflops
8192, 1024, 1024, 0.1950ms, 88.0900Tflops
8192, 2048, 2048, 0.5822ms, 118.0305Tflops
8192, 4096, 4096, 2.2901ms, 120.0272Tflops
8192, 8192, 8192, 9.4150ms, 116.7826Tflops
8192, 16384, 16384, 36.7552ms, 119.6578Tflops
8192, 32768, 32768, 146.2553ms, 120.2841Tflops


================================================
FILE: ae/figure5/ab/run.sh
================================================
# Regenerate figure 5a/5b: clean old outputs, run every simulator mode from
# the repo root (module imports require it), then plot from this directory.
rm *.csv
rm *.pdf
cd ../../..
python -m ae.figure5.ab.test_matmul --simgpu --roofline
python -m ae.figure5.ab.test_matmul --simtpu --roofline
python -m ae.figure5.ab.test_matmul --simamd --roofline
python -m ae.figure5.ab.test_matmul --simgpu
python -m ae.figure5.ab.test_matmul --simtpu
python -m ae.figure5.ab.test_matmul --simamd
cd ae/figure5/ab
python plot_matmul.py


================================================
FILE: ae/figure5/ab/test_matmul.py
================================================
# Matmul validation harness: measures (or simulates) GEMM latency on A100,
# MI210, and TPUv3 for the two shape sweeps plotted in figure 5a/5b.
from software_model.matmul import Matmul
from software_model.utils import data_type_dict, Tensor
from hardware_model.device import device_dict
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", action="store_true", help="Enable GPU")
    parser.add_argument("--simtpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simtpu-new", action="store_true", help="Enable simulation")
    parser.add_argument("--simgpu", action="store_true", help="Enable simulation")
    parser.add_argument("--simamd", action="store_true", help="amd simulation")
    parser.add_argument("--roofline", action="store_true", help="Roofline simulation")
    args = parser.parse_args()
    # `pcb` is the simulated device; only set when a --sim* flag selects one.
    if args.simtpu:
        pcb = device_dict["TPUv3"]
    if args.simtpu_new:
        pcb = device_dict["TPUv3_new"]
    if args.simgpu:
        pcb = device_dict["A100_80GB_fp16"]
    MI210 = device_dict["MI210"]
    # MI210 kernel-launch overhead reuses the calibrated softmax overhead.
    amd_overhead = MI210.compute_module.overhead.softmax
    K = 12288
    N = K
titile = f"Performance of Matmul with K={K}, N={N}" print(f"{titile}") test_overhead = True for M in range(5, 16): M = 2**M model = Matmul(data_type=data_type_dict["fp16"]) _ = model( Tensor([M, K]), Tensor([K, N]), ) if args.gpu: if test_overhead: model.gpu_kernel_launch_overhead() test_overhead = False latency = model.run_on_gpu() if args.simtpu: if args.roofline: latency = model.roofline_model(pcb) + 110e-6 file_name='matmul_TPUv3_roofline.csv' else: latency = ( model.compile_and_simulate(pcb, compile_mode="heuristic-TPU") + 110e-6 ) file_name='matmul_TPUv3_sim.csv' if args.simtpu_new: if args.roofline: latency = model.roofline_model(pcb) + 110e-6 else: latency = ( model.compile_and_simulate(pcb, compile_mode="heuristic-TPU-new") + 110e-6 ) if args.simgpu: if args.roofline: latency = model.roofline_model(pcb) + 2.1e-5 file_name='matmul_A100_roofline.csv' else: latency = ( model.compile_and_simulate(pcb, compile_mode="heuristic-GPU") + 2.1e-5 ) file_name='matmul_A100_sim.csv' if args.simamd: if args.roofline: latency = model.roofline_model(pcb_module=MI210) + amd_overhead file_name='matmul_MI210_roofline.csv' else: latency = ( model.compile_and_simulate( pcb_module=MI210, compile_mode="heuristic-GPU" ) + amd_overhead ) file_name='matmul_MI210_sim.csv' tflops = 2 * M * N * K / latency / 1e12 print(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops", flush=True) with open(f'ae/figure5/ab/{file_name}', 'a') as f: f.write(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops\n") M = 8192 print(f"Performance of Matmul with M={M}, N=K") for K in range(5, 16): K = 2**K N = K model = Matmul(data_type=data_type_dict["fp16"]) _ = model( Tensor([M, K]), Tensor([K, N]), ) if args.gpu: latency = model.run_on_gpu() if args.simtpu: if args.roofline: latency = model.roofline_model(pcb) + 110e-6 else: latency = ( model.compile_and_simulate(pcb, compile_mode="heuristic-TPU") + 110e-6 ) if args.simtpu_new: if args.roofline: latency = model.roofline_model(pcb) + 110e-6 
else: latency = ( model.compile_and_simulate(pcb, compile_mode="heuristic-TPU-new") + 110e-6 ) if args.simgpu: if args.roofline: latency = model.roofline_model(pcb) + 2.1e-5 else: latency = ( model.compile_and_simulate(pcb, compile_mode="heuristic-GPU") + 2.1e-5 ) if args.simamd: if args.roofline: latency = model.roofline_model(pcb_module=MI210) + amd_overhead else: latency = ( model.compile_and_simulate( pcb_module=MI210, compile_mode="heuristic-GPU" ) + amd_overhead ) tflops = 2 * M * N * K / latency / 1e12 print(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops", flush=True) with open(f'ae/figure5/ab/{file_name}', 'a') as f: f.write(f"{M}, {N}, {K}, {latency*1e3:.4f}ms, {tflops:.4f}Tflops\n") ================================================ FILE: ae/figure5/cf/__init__.py ================================================ ================================================ FILE: ae/figure5/cf/plot_softmax.py ================================================ import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np softmax_TPUv3_sim = pd.read_csv( "softmax_TPUv3_sim.csv", header=None, names=["M", "N", "throughput"] ) softmax_TPUv3_sim.set_index(["M", "N"], inplace=True) softmax_TPUv3_roofline = pd.read_csv( "softmax_TPUv3_roofline.csv", header=None, names=["M", "N", "throughput"] ) softmax_TPUv3_roofline.set_index(["M", "N"], inplace=True) softmax_A100 = pd.read_csv( "real_hardware/softmax_A100.csv", header=None, names=["M", "N", "throughput"] ) softmax_A100.set_index(["M", "N"], inplace=True) softmax_A100_sim = pd.read_csv( "softmax_A100_sim.csv", header=None, names=["M", "N", "throughput"] ) softmax_A100_sim.set_index(["M", "N"], inplace=True) softmax_A100_roofline = pd.read_csv( "softmax_A100_roofline.csv", header=None, names=["M", "N", "throughput"] ) softmax_A100_roofline.set_index(["M", "N"], inplace=True) softmax_MI210 = pd.read_csv( "real_hardware/softmax_MI210.csv", header=None, names=["M", "N", "throughput"] ) 
softmax_MI210.set_index(["M", "N"], inplace=True)
softmax_MI210_sim = pd.read_csv(
    "softmax_MI210_sim.csv", header=None, names=["M", "N", "throughput"]
)
softmax_MI210_sim.set_index(["M", "N"], inplace=True)
softmax_MI210_roofline = pd.read_csv(
    "softmax_MI210_roofline.csv", header=None, names=["M", "N", "throughput"]
)
softmax_MI210_roofline.set_index(["M", "N"], inplace=True)
# Consistent vendor color families across all figure 5 plots.
color_NV = sns.color_palette("Greens_d", 4)[1:]
color_Google = sns.color_palette("Blues_d", 4)[1:]
color_AMD = sns.color_palette("flare", 3)
# Figure 5f: sweep N with M = 4096 fixed.
M = 4096
title = f"Performance of softmax with M={M}"
N_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
for N in range(6, 16):
    N = 2**N
    N_list.append(N)
    # print(M,N)
    # print(softmax_TPUv3.loc[(M, N), 'throughput'])
    throughput_TPU_sim_list.append(
        softmax_TPUv3_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_TPU_roofline_list.append(
        softmax_TPUv3_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_list.append(softmax_A100.loc[(M, N), "throughput"].values[0])
    throughput_GPU_sim_list.append(softmax_A100_sim.loc[(M, N), "throughput"].values[0])
    throughput_GPU_roofline_list.append(
        softmax_A100_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_list.append(softmax_MI210.loc[(M, N), "throughput"].values[0])
    throughput_AMD_sim_list.append(
        softmax_MI210_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_roofline_list.append(
        softmax_MI210_roofline.loc[(M, N), "throughput"].values[0]
    )
plt.figure(figsize=(3.7, 2))
plt.xscale("log", base=2)
plt.plot(
    N_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    N_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    N_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    N_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    N_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    N_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    N_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
plt.plot(
    N_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# Legend is drawn outside the axes on the right.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1.1))
# plt.legend()
# plt.title(title)
plt.xlabel("N")
plt.ylabel("G Elements/s")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.savefig("figure5f.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# Figure 5c: sweep M with N = 4096 fixed.
N = 4096
title = f"Performance of softmax with N={N}"
M_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
throughput_AMD_list = []
throughput_AMD_sim_list = []
throughput_AMD_roofline_list = []
for M in range(6, 16):
    M = 2**M
    M_list.append(M)
    throughput_TPU_sim_list.append(
        softmax_TPUv3_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_TPU_roofline_list.append(
        softmax_TPUv3_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_list.append(softmax_A100.loc[(M, N), "throughput"].values[0])
    throughput_GPU_sim_list.append(softmax_A100_sim.loc[(M, N), "throughput"].values[0])
    throughput_GPU_roofline_list.append(
        softmax_A100_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_list.append(softmax_MI210.loc[(M, N), "throughput"].values[0])
    throughput_AMD_sim_list.append(
        softmax_MI210_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_AMD_roofline_list.append(
        softmax_MI210_roofline.loc[(M, N), "throughput"].values[0]
    )
plt.figure(figsize=(3.7, 2.8))
plt.xscale("log", base=2)
plt.plot(
    M_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    M_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    M_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    M_list,
    throughput_AMD_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of AMD MI210",
    color=color_AMD[0],
)
plt.plot(
    M_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1]
)
plt.plot(
    M_list,
    throughput_AMD_sim_list,
    marker="x",
    label="Simulated AMD MI210",
    color=color_AMD[2],
)
plt.plot(
    M_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
plt.plot(
    M_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1))
# plt.legend()
# plt.title(title)
plt.xlabel("M")
plt.ylabel("G Elements/s")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.savefig("figure5c.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)


================================================
FILE: ae/figure5/cf/real_hardware/softmax_A100.csv
================================================
4096, 32, 9.99556025250909
4096, 64, 19.634136210285714
4096, 128, 37.27158060257627
4096, 256, 68.719476736
4096, 512, 111.34294964820253
4096, 1024, 152.9755308210087
4096, 2048, 167.5446289944381
4096, 4096, 204.5603028420465
4096, 8192, 216.51921285435077
4096, 16384, 242.0249154863766
4096, 32768, 240.88573103179803
32, 4096, 8.457781752123077
64, 4096, 16.65926708751515
128, 4096, 30.97215852890141
256, 4096, 57.11748715719481
512, 4096, 96.6603628814066
1024, 4096, 137.438953472
2048, 4096, 166.75057862005687
4096, 4096, 203.9673744280116
8192, 4096, 229.96321626687583
16384, 4096, 247.12465031664266
32768, 4096, 278.826128490001


================================================
FILE: ae/figure5/cf/real_hardware/softmax_MI210.csv
================================================
4096, 32, 5.389762881254902
4096, 64, 10.372751205433962
4096, 128, 13.49094021811043
4096, 256, 36.049561566426235
4096, 512, 60.662710497986204
4096, 1024, 87.52331365381094
4096, 2048, 75.18028224109402
4096, 4096, 80.05545412703526
4096, 8192, 70.97200623062432
4096, 16384, 66.22940628486023
4096, 32768, 61.59189862377593
32, 4096, 4.822419420070175
64, 4096, 9.012390391606559
128, 4096, 15.820311191021583
256, 4096, 28.28325730613505
512, 4096, 43.76165682690547
1024, 4096, 60.662710497986204
2048, 4096, 72.54509709037526
4096, 4096, 80.28379255865829
8192, 4096, 84.78161949116145
16384, 4096, 86.98237846435599
32768, 4096, 88.01594018469544


================================================
FILE: ae/figure5/cf/run.sh
================================================
# Regenerate figure 5c/5f: clean outputs, then run from the repo root.
rm *.csv
rm *.pdf
cd ../../..
python -m ae.figure5.cf.test_softmax --simgpu --roofline python -m ae.figure5.cf.test_softmax --simtpu --roofline python -m ae.figure5.cf.test_softmax --simamd --roofline python -m ae.figure5.cf.test_softmax --simgpu python -m ae.figure5.cf.test_softmax --simtpu python -m ae.figure5.cf.test_softmax --simamd cd ae/figure5/cf python plot_softmax.py ================================================ FILE: ae/figure5/cf/test_softmax.py ================================================ from software_model.softmax import Softmax from software_model.utils import data_type_dict, Tensor from hardware_model.device import device_dict import argparse if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--gpu", action="store_true", help="Enable GPU") parser.add_argument("--simgpu", action="store_true", help="Enable simulation") parser.add_argument("--simtpu", action="store_true", help="Enable simulation") parser.add_argument("--simamd", action="store_true", help="amd simulation") parser.add_argument("--roofline", action="store_true", help="Roofline simulation") args = parser.parse_args() A100 = device_dict["A100_80GB_fp16"] TPU = device_dict["TPUv3"] MI210 = device_dict["MI210"] tpu_overhead = 300e-6 gpu_overhead = 12e-6 amd_overhead = MI210.compute_module.overhead.softmax if args.gpu: gpu_kernel_launch_overhead = Softmax.gpu_kernel_launch_overhead() print(f"Performance of Softmax") M = 2**12 for N in range(5, 16): N = 2**N if args.simtpu: model = Softmax(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = model.roofline_model(pcb_module=TPU) + tpu_overhead file_name = "softmax_TPUv3_roofline.csv" else: latency = model.compile_and_simulate(pcb_module=TPU) + tpu_overhead file_name = "softmax_TPUv3_sim.csv" else: model = Softmax(data_type=data_type_dict["fp16"]) _ = model( Tensor([M, N]), ) if args.gpu: latency = model.run_on_gpu() if args.simgpu: if args.roofline: latency = 
model.roofline_model(pcb_module=A100) + gpu_overhead file_name = "softmax_A100_roofline.csv" else: latency = model.compile_and_simulate(pcb_module=A100) + gpu_overhead file_name = "softmax_A100_sim.csv" if args.simamd: model = Softmax(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = model.roofline_model(pcb_module=MI210) + amd_overhead file_name = "softmax_MI210_roofline.csv" else: latency = ( model.compile_and_simulate(pcb_module=MI210) + amd_overhead ) file_name = "softmax_MI210_sim.csv" print(f"{M}, {N}, {M*N/latency/1e9}") with open(f"ae/figure5/cf/{file_name}", "a") as f: f.write(f"{M}, {N}, {M*N/latency/1e9}\n") N = 2**12 for M in range(5, 16): M = 2**M if args.simtpu: model = Softmax(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = model.roofline_model(pcb_module=TPU) + tpu_overhead else: latency = model.compile_and_simulate(pcb_module=TPU) + tpu_overhead else: model = Softmax(data_type=data_type_dict["fp16"]) _ = model( Tensor([M, N]), ) if args.gpu: latency = model.run_on_gpu() if args.simgpu: if args.roofline: latency = model.roofline_model(pcb_module=A100) + gpu_overhead else: latency = model.compile_and_simulate(pcb_module=A100) + gpu_overhead if args.simamd: model = Softmax(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = model.roofline_model(pcb_module=MI210) + amd_overhead else: latency = ( model.compile_and_simulate(pcb_module=MI210) + amd_overhead ) print(f"{M}, {N}, {M*N/latency/1e9}") with open(f"ae/figure5/cf/{file_name}", "a") as f: f.write(f"{M}, {N}, {M*N/latency/1e9}\n") ================================================ FILE: ae/figure5/de/__init__.py ================================================ ================================================ FILE: ae/figure5/de/plot_layernorm.py 
================================================
# Figure 5d/5e: layernorm throughput (G elements/s), real vs. simulated vs.
# roofline, for A100 and TPUv3. CSV rows are "M, N, throughput".
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

layernorm_TPUv3_sim = pd.read_csv(
    "layernorm_TPUv3_sim.csv", header=None, names=["M", "N", "throughput"]
)
layernorm_TPUv3_sim.set_index(["M", "N"], inplace=True)
layernorm_TPUv3_roofline = pd.read_csv(
    "layernorm_TPUv3_roofline.csv", header=None, names=["M", "N", "throughput"]
)
layernorm_TPUv3_roofline.set_index(["M", "N"], inplace=True)
layernorm_A100 = pd.read_csv(
    "real_hardware/layernorm_A100.csv", header=None, names=["M", "N", "throughput"]
)
layernorm_A100.set_index(["M", "N"], inplace=True)
layernorm_A100_sim = pd.read_csv(
    "layernorm_A100_sim.csv", header=None, names=["M", "N", "throughput"]
)
layernorm_A100_sim.set_index(["M", "N"], inplace=True)
layernorm_A100_roofline = pd.read_csv(
    "layernorm_A100_roofline.csv", header=None, names=["M", "N", "throughput"]
)
layernorm_A100_roofline.set_index(["M", "N"], inplace=True)
# Consistent vendor color families across all figure 5 plots
# (color_AMD kept for parity with the sibling plot scripts).
color_NV = sns.color_palette("Greens_d", 4)[1:]
color_Google = sns.color_palette("Blues_d", 4)[1:]
color_AMD = sns.color_palette("flare", 3)
# Figure 5e: sweep N with M = 4096 fixed.
M = 4096
title = f"Performance of layernorm with M={M}"
N_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
for N in range(6, 16):
    N = 2**N
    N_list.append(N)
    throughput_TPU_sim_list.append(
        layernorm_TPUv3_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_TPU_roofline_list.append(
        layernorm_TPUv3_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_list.append(layernorm_A100.loc[(M, N), "throughput"].values[0])
    throughput_GPU_sim_list.append(
        layernorm_A100_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_roofline_list.append(
        layernorm_A100_roofline.loc[(M, N), "throughput"].values[0]
    )
plt.figure(figsize=(3.64, 2))
plt.xscale("log", base=2)
plt.plot(
    N_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    N_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    N_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    N_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
plt.plot(
    N_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# plt.legend()
# plt.title(title)
plt.xlabel("N")
plt.ylabel("G Elements/s")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.savefig("figure5e.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)
# Figure 5d: sweep M with N = 4096 fixed.
N = 4096
title = f"Performance of layernorm with N={N}"
M_list = []
throughput_TPU_list = []
throughput_TPU_sim_list = []
throughput_TPU_roofline_list = []
throughput_GPU_list = []
throughput_GPU_sim_list = []
throughput_GPU_roofline_list = []
for M in range(6, 16):
    M = 2**M
    M_list.append(M)
    throughput_TPU_sim_list.append(
        layernorm_TPUv3_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_TPU_roofline_list.append(
        layernorm_TPUv3_roofline.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_list.append(layernorm_A100.loc[(M, N), "throughput"].values[0])
    throughput_GPU_sim_list.append(
        layernorm_A100_sim.loc[(M, N), "throughput"].values[0]
    )
    throughput_GPU_roofline_list.append(
        layernorm_A100_roofline.loc[(M, N), "throughput"].values[0]
    )
plt.figure(figsize=(3.64, 2))
plt.xscale("log", base=2)
plt.plot(
    M_list,
    throughput_GPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of NVIDIA A100",
    color=color_NV[0],
)
plt.plot(
    M_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1]
)
plt.plot(
    M_list,
    throughput_GPU_sim_list,
    marker="x",
    label="Simulated NVIDIA A100",
    color=color_NV[2],
)
plt.plot(
    M_list,
    throughput_TPU_roofline_list,
    marker=" ",
    linewidth=1.5,
    linestyle="--",
    label="Roofline of Google TPUv3",
    color=color_Google[0],
)
plt.plot(
    M_list,
    throughput_TPU_sim_list,
    marker="x",
    label="Simulated Google TPUv3",
    color=color_Google[2],
)
# plt.legend()
# plt.title(title)
plt.xlabel("M")
plt.ylabel("G Elements/s")
plt.grid(True, which="both", ls="--", c="0.7")  # Adding a grid for better readability
plt.savefig("figure5d.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300)


================================================
FILE: ae/figure5/de/real_hardware/layernorm_A100.csv
================================================
4096, 32, 2.476377540036036
4096, 64, 4.8436635584845815
4096, 128, 10.064179659276888
4096, 256, 19.590407621844097
4096, 512, 39.26827242057143
4096, 1024, 78.888726656574
4096, 2048, 139.06866438273516
4096, 4096, 211.31755008307508
4096, 8192, 265.2921552409576
4096, 16384, 210.44858071824746
4096, 32768, 199.2743197951547
32, 4096, 2.4988900631272726
64, 4096, 5.020601040073059
128, 4096, 9.99556025250909
256, 4096, 20.082404160292235
512, 4096, 40.16480832058447
1024, 4096, 79.6026517846878
2048, 4096, 141.87246810012903
4096, 4096, 211.6353208350797
8192, 4096, 284.89370112414576
16384, 4096, 345.7923546813956
32768, 4096, 385.84643825998086


================================================
FILE: ae/figure5/de/real_hardware/layernorm_MI210.csv
================================================
4096, 32, 2.3695247806698823
4096, 64, 4.729122950631834
4096, 128, 9.17590195596949
4096, 256, 18.43588592802619
4096, 512, 31.59641013907877
4096, 1024, 52.44807583429779
4096, 2048, 83.74459902278055
4096, 4096, 89.41186119969811
4096, 8192, 86.39576298097589
4096, 16384, 86.3601901230308
4096, 32768, 78.76990948928942
32, 4096, 3.885694391427885
64, 4096, 6.742243759280835
128, 4096, 11.78431162745035
256, 4096, 22.64512970663591
512, 4096, 39.53264699736632
1024, 4096, 58.88026281772146
2048, 4096, 75.69152349271899
4096, 4096, 89.32684940809148
8192, 4096, 97.43158473170392
16384, 4096, 101.13477801032278 32768, 4096, 104.08326540288465 ================================================ FILE: ae/figure5/de/run.sh ================================================ rm *.csv rm *.pdf cd ../../.. python -m ae.figure5.de.test_layernorm --simgpu --roofline python -m ae.figure5.de.test_layernorm --simtpu --roofline python -m ae.figure5.de.test_layernorm --simgpu python -m ae.figure5.de.test_layernorm --simtpu cd ae/figure5/de python plot_layernorm.py ================================================ FILE: ae/figure5/de/test_layernorm.py ================================================ from software_model.layernorm import LayerNorm from software_model.utils import data_type_dict, Tensor from hardware_model.device import device_dict import argparse if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--gpu", action="store_true", help="Enable GPU") parser.add_argument("--amd", action="store_true", help="Enable AMD") parser.add_argument("--simgpu", action="store_true", help="Enable simulation") parser.add_argument("--simtpu", action="store_true", help="Enable simulation") parser.add_argument("--simamd", action="store_true", help="Enable simulation") parser.add_argument("--roofline", action="store_true", help="Roofline simulation") args = parser.parse_args() A100 = device_dict["A100_80GB_fp16"] TPU = device_dict["TPUv3"] MI210 = device_dict["MI210"] if args.gpu: gpu_kernel_launch_overhead = LayerNorm.gpu_kernel_launch_overhead() print(f"Performance of LayerNorm") M = 2**12 for N in range(5, 16): N = 2**N # N = 2**15 if args.simtpu: model = LayerNorm(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = model.roofline_model(TPU) + 140e-6 file_name = "layernorm_TPUv3_roofline.csv" else: latency = ( model.compile_and_simulate( pcb_module=TPU, compile_mode="heuristic-TPU" ) + 140e-6 ) file_name = "layernorm_TPUv3_sim.csv" else: model = 
LayerNorm(data_type=data_type_dict["fp16"]) _ = model( Tensor([M, N]), ) if args.gpu: latency = model.run_on_gpu() if args.amd: # model.amd_kernel_launch_overhead() latency = model.run_on_amd() if args.simgpu: if args.roofline: latency = model.roofline_model(A100) + 4.5e-5 file_name = "layernorm_A100_roofline.csv" else: latency = ( model.compile_and_simulate( pcb_module=A100, compile_mode="heuristic-GPU" ) + 4.5e-5 ) file_name = "layernorm_A100_sim.csv" if args.simamd: model = LayerNorm(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = ( model.roofline_model(MI210) + MI210.compute_module.overhead.layernorm ) file_name = "layernorm_MI210_roofline.csv" else: latency = ( model.compile_and_simulate( pcb_module=MI210, compile_mode="heuristic-GPU" ) + MI210.compute_module.overhead.layernorm ) file_name = "layernorm_MI210_sim.csv" print(f"{M}, {N}, {M*N/latency/1e9}") with open(f"ae/figure5/de/{file_name}", "a") as f: f.write(f"{M}, {N}, {M*N/latency/1e9}\n") N = 2**12 for M in range(5, 16): M = 2**M if args.simtpu: model = LayerNorm(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = model.roofline_model(TPU) + 140e-6 else: latency = ( model.compile_and_simulate( pcb_module=TPU, compile_mode="heuristic-TPU" ) + 140e-6 ) else: model = LayerNorm(data_type=data_type_dict["fp16"]) _ = model( Tensor([M, N]), ) if args.gpu: latency = model.run_on_gpu() if args.amd: latency = model.run_on_amd() if args.simgpu: if args.roofline: latency = model.roofline_model(A100) + 4.5e-5 else: latency = ( model.compile_and_simulate( pcb_module=A100, compile_mode="heuristic-GPU" ) + 4.5e-5 ) if args.simamd: model = LayerNorm(data_type=data_type_dict["fp32"]) _ = model(Tensor([M, N], data_type=data_type_dict["fp32"])) if args.roofline: latency = ( model.roofline_model(MI210) + MI210.compute_module.overhead.layernorm ) else: latency = ( 
model.compile_and_simulate( pcb_module=MI210, compile_mode="heuristic-GPU" ) + MI210.compute_module.overhead.layernorm ) print(f"{M}, {N}, {M*N/latency/1e9}") with open(f"ae/figure5/de/{file_name}", "a") as f: f.write(f"{M}, {N}, {M*N/latency/1e9}\n") ================================================ FILE: ae/figure5/g/__init__.py ================================================ ================================================ FILE: ae/figure5/g/plot_gelu.py ================================================ import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np gelu_TPUv3_sim = pd.read_csv( "gelu_TPUv3_sim.csv", header=None, names=["M", "throughput"] ) gelu_TPUv3_roofline = pd.read_csv( "gelu_TPUv3_roofline.csv", header=None, names=["M", "throughput"] ) gelu_A100 = pd.read_csv("real_hardware/gelu_A100.csv", header=None, names=["M", "throughput"]) gelu_A100_sim = pd.read_csv("gelu_A100_sim.csv", header=None, names=["M", "throughput"]) gelu_A100_roofline = pd.read_csv( "gelu_A100_roofline.csv", header=None, names=["M", "throughput"] ) gelu_MI210 = pd.read_csv("real_hardware/gelu_MI210.csv", header=None, names=["M", "throughput"]) gelu_MI210_sim = pd.read_csv( "gelu_MI210_sim.csv", header=None, names=["M", "throughput"] ) gelu_MI210_roofline = pd.read_csv( "gelu_MI210_roofline.csv", header=None, names=["M", "throughput"] ) color_NV = sns.color_palette("Greens_d", 4)[1:] color_Google = sns.color_palette("Blues_d", 4)[1:] color_AMD = sns.color_palette("flare", 3) M = 4096 title = f"Performance of gelu with M={M}" M_list = [] throughput_TPU_list = [] throughput_TPU_sim_list = [] throughput_TPU_roofline_list = [] throughput_GPU_list = [] throughput_GPU_sim_list = [] throughput_GPU_roofline_list = [] throughput_AMD_list = [] throughput_AMD_sim_list = [] throughput_AMD_roofline_list = [] for M in range(10, 30): M = 2**M M_list.append(M) throughput_TPU_sim_list.append( gelu_TPUv3_sim[gelu_TPUv3_sim["M"] == M]["throughput"].iloc[0] ) 
throughput_TPU_roofline_list.append( gelu_TPUv3_roofline[gelu_TPUv3_roofline["M"] == M]["throughput"].iloc[0] ) throughput_GPU_list.append(gelu_A100[gelu_A100["M"] == M]["throughput"].iloc[0]) throughput_GPU_sim_list.append( gelu_A100_sim[gelu_A100_sim["M"] == M]["throughput"].iloc[0] ) throughput_GPU_roofline_list.append( gelu_A100_roofline[gelu_A100_roofline["M"] == M]["throughput"].iloc[0] ) throughput_AMD_list.append(gelu_MI210[gelu_MI210["M"] == M]["throughput"].iloc[0]) throughput_AMD_sim_list.append( gelu_MI210_sim[gelu_MI210_sim["M"] == M]["throughput"].iloc[0] ) throughput_AMD_roofline_list.append( gelu_MI210_roofline[gelu_MI210_roofline["M"] == M]["throughput"].iloc[0] ) plt.figure(figsize=(6, 2.3)) plt.xscale("log", base=2) plt.plot( M_list, throughput_GPU_roofline_list, marker=" ", linewidth=1.5, linestyle="--", label="Roofline of NVIDIA A100", color=color_NV[0], ) plt.plot( M_list, throughput_GPU_list, marker="o", label="Real NVIDIA A100", color=color_NV[1] ) plt.plot( M_list, throughput_GPU_sim_list, marker="x", label="Simulated NVIDIA A100", color=color_NV[2], ) plt.plot( M_list, throughput_AMD_roofline_list, marker=" ", linewidth=1.5, linestyle="--", label="Roofline of AMD MI210", color=color_AMD[0], ) plt.plot( M_list, throughput_AMD_list, marker="o", label="Real AMD MI210", color=color_AMD[1] ) plt.plot( M_list, throughput_AMD_sim_list, marker="x", label="Simulated AMD MI210", color=color_AMD[2], ) plt.plot( M_list, throughput_TPU_roofline_list, marker=" ", linewidth=3.5, linestyle="--", label="Roofline of Google TPUv3", color=color_Google[0], ) plt.plot( M_list, throughput_TPU_sim_list, marker="x", label="Simulated Google TPUv3", color=color_Google[2], ) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1.05)) # plt.title(title) plt.xlabel("# Elements") plt.ylabel("G Elements/s") plt.grid(True, which="both", ls="--", c="0.7") # Adding a grid for better readability 
plt.savefig(f"figure5g.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) ================================================ FILE: ae/figure5/g/real_hardware/gelu_A100.csv ================================================ 1024, 0.021262214336633663 2048, 0.04338350804040404 4096, 0.08547198599004975 8192, 0.1700977146930693 16384, 0.3368601800784314 32768, 0.6889170600100251 65536, 1.38129601479397 131072, 2.735103551681592 262144, 5.52518405917588 524288, 11.19095804352163 1048576, 21.935394070344138 2097152, 43.980465111040004 4194304, 86.66101499712316 8388608, 141.87246810012903 16777216, 210.36993775086398 33554432, 277.042299912063 67108864, 328.2507017033889 134217728, 360.17271492086496 268435456, 379.53814489891255 536870912, 390.32758080867535 ================================================ FILE: ae/figure5/g/real_hardware/gelu_MI210.csv ================================================ 1024, 0.047197442813186816 2048, 0.09761289309090909 4096, 0.19522578618181818 8192, 0.39045157236363637 16384, 0.7898790429425288 32768, 1.5618062894545455 65536, 3.0885158083595505 131072, 6.108397932088889 262144, 11.822705675010754 524288, 21.55905152501961 1048576, 37.27158060257627 2097152, 59.43306096086487 4194304, 87.96093022208001 8388608, 114.60707520792182 16777216, 133.52702880012146 33554432, 146.984322042118 67108864, 151.08694402074934 134217728, 152.91320207016489 268435456, 154.04294798777178 536870912, 154.44972829556897 ================================================ FILE: ae/figure5/g/run.sh ================================================ rm *.csv rm *.pdf cd ../../.. 
python -m ae.figure5.g.test_gelu --simgpu --roofline python -m ae.figure5.g.test_gelu --simtpu --roofline python -m ae.figure5.g.test_gelu --simamd --roofline python -m ae.figure5.g.test_gelu --simgpu python -m ae.figure5.g.test_gelu --simtpu python -m ae.figure5.g.test_gelu --simamd cd ae/figure5/g python plot_gelu.py ================================================ FILE: ae/figure5/g/test_gelu.py ================================================ from software_model.gelu import GeLU from software_model.utils import data_type_dict, Tensor from hardware_model.device import device_dict import argparse if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--gpu", action="store_true", help="Enable GPU") parser.add_argument("--simgpu", action="store_true", help="Enable simulation") parser.add_argument("--simtpu", action="store_true", help="Enable simulation") parser.add_argument("--roofline", action="store_true", help="Roofline simulation") parser.add_argument("--amd", action="store_true", help="Enable AMD") parser.add_argument("--simamd", action="store_true", help="Enable simulation") args = parser.parse_args() A100 = device_dict["A100_80GB_fp16"] TPU = device_dict["TPUv3"] MI210 = device_dict["MI210"] if args.gpu: gpu_kernel_launch_overhead = GeLU.gpu_kernel_launch_overhead() print(f"Performance of GELU") for M in range(10, 30): M = 2**M # N = 2**15 if args.simtpu: model = GeLU(data_type=data_type_dict["fp32"]) _ = model(Tensor([M], data_type=data_type_dict["fp32"])) if args.roofline: latency = model.roofline_model(TPU) + 100e-6 file_name = "gelu_TPUv3_roofline.csv" else: latency = ( model.compile_and_simulate( pcb_module=TPU, compile_mode="heuristic-TPU" ) + 100e-6 ) file_name = "gelu_TPUv3_sim.csv" else: model = GeLU(data_type=data_type_dict["fp16"]) _ = model( Tensor([M]), ) if args.gpu: latency = model.run_on_gpu() if args.amd: model.amd_kernel_launch_overhead() latency = model.run_on_amd() if args.simgpu: if args.roofline: latency = 
model.roofline_model(A100) + 4.5e-5 file_name = "gelu_A100_roofline.csv" else: latency = ( model.compile_and_simulate( pcb_module=A100, compile_mode="heuristic-GPU" ) + 4.5e-5 ) file_name = "gelu_A100_sim.csv" if args.simamd: if args.roofline: latency = ( model.roofline_model(MI210) + MI210.compute_module.overhead.gelu ) file_name = "gelu_MI210_roofline.csv" else: latency = ( model.compile_and_simulate( pcb_module=MI210, compile_mode="heuristic-GPU" ) + MI210.compute_module.overhead.gelu ) file_name = "gelu_MI210_sim.csv" print(f"{M}, {M/latency/1e9}") with open(f"ae/figure5/g/{file_name}", "a") as f: f.write(f"{M}, {M/latency/1e9}\n") ================================================ FILE: ae/figure5/h/__init__.py ================================================ ================================================ FILE: ae/figure5/h/run.sh ================================================ rm *.csv rm *.pdf cd ../../.. python -m ae.figure5.h.test_allreduce ================================================ FILE: ae/figure5/h/test_allreduce.py ================================================ from software_model.communication_primitives import AllReduceMultiPCB from software_model.utils import data_type_dict, Tensor from hardware_model.interconnect import interconnect_module_dict import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np if __name__ == "__main__": interconnect_module = interconnect_module_dict["NVLinkV3_FC_4"] gpu_latency_list = [ 12.52, 13.92, 12.39, 13.22, 12.35, 12.45, 13.12, 13.02, 15.12, 15.23, 15.99, 17.39, 20.00, 22.93, 28.66, 35.93, 47.27, 60.75, 66.40, 84.75, 128.8, 195.7, 279.7, 532.3, 961.7, 1883.7, 3659.0, 7219.2, 14136, 27944, 55384, 110277, ] size_list = [ 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648, 4294967296, 8589934592, 17179869184, 
] simulated_latency_list = [] data_type = data_type_dict["fp16"] for data_size in size_list: model = AllReduceMultiPCB(data_type=data_type) _ = model( Tensor([data_size / 2]), ) our_latency = model.simulate(interconnect_module=interconnect_module) simulated_latency_list.append(our_latency * 1e6) gpu_bandwidth_list = np.array(size_list) / np.array(gpu_latency_list) / 1e3 simulated_gpu_bandwidth_list = ( np.array(size_list) / np.array(simulated_latency_list) / 1e3 ) size_list = size_list[9:] gpu_bandwidth_list = gpu_bandwidth_list[9:] simulated_gpu_bandwidth_list = simulated_gpu_bandwidth_list[9:] color_NV = sns.color_palette("Greens_d", 4)[1:] color_Google = sns.color_palette("Blues_d", 4)[1:] plt.figure(figsize=(6, 2.3)) plt.xscale("log", base=2) plt.plot( size_list, gpu_bandwidth_list, marker="o", label="Real NVIDIA A100 Node", color=color_NV[0], ) plt.plot( size_list, simulated_gpu_bandwidth_list, marker="x", label="Simulated NVIDIA A100 Node", color=color_NV[2], ) interconnect_module = interconnect_module_dict["TPUv3Link_8"] simulated_tpu_bandwidth_list = [] data_type = data_type_dict["fp16"] for data_size in size_list: model = AllReduceMultiPCB(data_type=data_type) _ = model( Tensor([data_size // 2]), ) our_latency = model.simulate(interconnect_module=interconnect_module) simulated_tpu_bandwidth_list.append(data_size / our_latency / 1e9) plt.plot( size_list, simulated_tpu_bandwidth_list, marker="x", label="Simulated Google TPU v3 Node", color=color_Google[2], ) plt.xlabel("Data Size (Bytes)") plt.ylabel("Bandwidth (GB/s)") plt.grid( True, which="both", ls="--", c="0.7" ) # Adding a grid for better readability plt.legend() plt.savefig( "ae/figure5/h/figure5h.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300 ) ================================================ FILE: ae/figure5/ijkl/__init__.py ================================================ ================================================ FILE: ae/figure5/ijkl/plot_transformer.py 
================================================ import matplotlib.pyplot as plt import seaborn as sns import csv import pandas as pd def read_csv(filename: str): numbers = [] # Open the CSV file and read the numbers with open(filename, "r") as csvfile: reader = csv.reader(csvfile) for row in reader: # Since each row contains only one number, we use row[0] numbers.append(float(row[0])) return numbers categories = [ "Q_K_V", "Q_mul_K", "A_mul_V", "Wo_proj", "W1_proj", "W2_proj", "Softmax", "LayerNorm_MHA", "LayerNorm_FFN", "GeLU", "AllReduce_MHA", "AllReduce_FFN", ] colors_matmul = sns.color_palette("flare_r", 6) colors_normalization = sns.color_palette("summer", 3) colors_gelu = sns.color_palette("pink", 1) colors_allreduce = sns.color_palette("Blues_r", 2) colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce # values_simgpu = read_csv("transformer_A100_sim.csv") values_simgpu = pd.read_csv("transformer_A100_sim.csv", header=None, names=categories, index_col=None).iloc[0].tolist() print(values_simgpu) values_gpu = read_csv("real_hardware/transformer_A100.csv") # values_gpu_roofline = read_csv("transformer_A100_roofline.csv") values_gpu_roofline = pd.read_csv("transformer_A100_roofline.csv", header=None, names=categories, index_col=None).iloc[0].tolist() # values_simtpu = read_csv("transformer_TPUv3_sim.csv") values_simtpu = pd.read_csv("transformer_TPUv3_sim.csv", header=None, names=categories, index_col=None).iloc[0].tolist() # values_tpu_roofline = read_csv("transformer_TPUv3_roofline.csv") values_tpu_roofline = pd.read_csv("transformer_TPUv3_roofline.csv", header=None, names=categories, index_col=None).iloc[0].tolist() plt.figure(figsize=(3, 2.8)) # Create the stacked bar graph bottom = 0 for i, (category, value) in enumerate(zip(categories, values_gpu)): plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5) bottom += value value_gt = bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, 
values_simgpu)): plt.bar(2, value, bottom=bottom, color=colors[i], width=0.5) bottom += value value_sim = bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, values_gpu_roofline)): plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5) bottom += value value_roofline = bottom print(f"gpu prefilling: {value_sim/value_gt}, {value_roofline/value_gt}") # Set the title, legend, and display the graph # plt.title( # "GPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Initial computation, batch size = 8, sequence length = 2048)" # ) plt.ylabel("Latency (s)") # plt.xlabel('Bar Sets') plt.xticks([1, 2, 3], ["Real\nA100", "Simulated\nA100", "Roofline\nModel"]) # handles, labels = plt.gca().get_legend_handles_labels() # plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1)) plt.tight_layout() plt.savefig("figure5i.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300) plt.figure(figsize=(3, 2.8)) # Create the stacked bar graph # bottom = 0 # for i, (category, value) in enumerate(zip(categories, values_tpu)): # plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5) # bottom += value # value_gt = bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, values_simtpu)): plt.bar(2, value, bottom=bottom, color=colors[i], width=0.5) bottom += value value_sim = bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, values_tpu_roofline)): plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5) bottom += value value_roofline = bottom # print(f"tpu prefilling: {value_sim/value_gt}, {value_roofline/value_gt}") # Set the title, legend, and display the graph # plt.title( # "TPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Initial computation, batch size = 8, sequence length = 2048)" # ) plt.ylabel("Latency (s)") # plt.xlabel('Bar Sets') plt.xticks([1, 2, 3], ["Real\nTPUv3", "Simulated\nTPUv3", "Roofline\nModel"]) # handles, labels = 
plt.gca().get_legend_handles_labels() # plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1)) plt.tight_layout() plt.savefig("figure5j.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300) # values_simgpu = read_csv("transformerAR_A100_sim.csv") values_simgpu=pd.read_csv("transformerAR_A100_sim.csv",header=None,names=categories,index_col=None).iloc[0].tolist() values_gpu = read_csv("real_hardware/transformerAR_A100.csv") # values_gpu_roofline = read_csv("transformerAR_A100_roofline.csv") values_gpu_roofline=pd.read_csv("transformerAR_A100_roofline.csv",header=None,names=categories,index_col=None).iloc[0].tolist() plt.figure(figsize=(3, 2.8)) # Create the stacked bar graph bottom = 0 for i, (category, value) in enumerate(zip(categories, values_gpu)): value = value * 1e3 plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5) bottom += value value_gt = bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, values_simgpu)): value = value * 1e3 plt.bar(2, value, bottom=bottom, color=colors[i], width=0.5) bottom += value value_sim = bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, values_gpu_roofline)): value = value * 1e3 plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5) bottom += value value_roofline = bottom print(value_sim / value_gt, value_roofline / value_gt) # Set the title, legend, and display the graph # plt.title( # "GPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Auto regression, batch size = 8, sequence length = 2048)" # ) plt.ylabel("Latency (ms)") # plt.xlabel('Bar Sets') plt.xticks([1, 2, 3], ["Real\nA100", "Simulated\nA100", "Roofline\nModel"]) # handles, labels = plt.gca().get_legend_handles_labels() # plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1)) plt.tight_layout() plt.savefig("figure5k.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300) # values_simtpu = read_csv("transformerAR_TPUv3_sim.csv") values_simtpu = 
pd.read_csv("transformerAR_TPUv3_sim.csv", header=None, names=categories, index_col=None).iloc[0].tolist() # values_tpu = read_csv("real_hardware/transformerAR_TPUv3.csv") # values_tpu_roofline=read_csv("transformerAR_TPUv3_roofline.csv") values_tpu_roofline=pd.read_csv("transformerAR_TPUv3_roofline.csv",header=None,names=categories,index_col=None).iloc[0].tolist() plt.figure(figsize=(4.5, 2.8)) # Create the stacked bar graph # bottom = 0 # for i, (category, value) in enumerate(zip(categories, values_tpu)): # value=value*1e3 # plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5) # bottom += value # value_gt=bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, values_simtpu)): value=value*1e3 plt.bar(2, value, bottom=bottom, color=colors[i], label=category,width=0.5) bottom += value value_sim=bottom bottom = 0 for i, (category, value) in enumerate(zip(categories, values_tpu_roofline)): value=value*1e3 plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5) bottom += value value_roofline=bottom print(value_sim/value_gt,value_roofline/value_gt) # Set the title, legend, and display the graph # plt.title( # "GPU Runtime Breakdown of One Transformer Layer in GPT-3 \n(Auto regression, batch size = 8, input(output) sequence length = 2048(1024))" # ) plt.ylabel("Latency (ms)") # plt.xlabel('Bar Sets') plt.xticks([1, 2, 3], ["Real\nTPUv3", "Simulated\nTPUv3", "Roofline\nModel"]) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.05)) plt.tight_layout() plt.savefig("figure5l.pdf",bbox_inches="tight", pad_inches=0.01, dpi=300) ================================================ FILE: ae/figure5/ijkl/real_hardware/transformerAR_A100.csv ================================================ 0.0002124309539794922 0.00010609626770019531 0.0001386404037475586 6.890296936035156e-05 0.00018596649169921875 0.00018608570098876953 1.6689300537109375e-05 
4.8041343688964844e-05 4.8041343688964844e-05 4.792213439941406e-05 26.04e-06 26.04e-06 ================================================ FILE: ae/figure5/ijkl/real_hardware/transformer_A100.csv ================================================ 0.013721823692321777 0.0018811225891113281 0.001183152198791504 0.0045403242111206055 0.017464280128479004 0.017485618591308594 0.00280153751373291 0.0006816387176513672 0.0006816387176513672 0.0005242824554443359 0.0028909 0.0028909 ================================================ FILE: ae/figure5/ijkl/run.sh ================================================ rm *.csv rm *.pdf cd ../../.. python -m ae.figure5.ijkl.test_transformer --simgpu --roofline python -m ae.figure5.ijkl.test_transformer --simtpu --roofline python -m ae.figure5.ijkl.test_transformer --simgpu --init --roofline python -m ae.figure5.ijkl.test_transformer --simtpu --init --roofline python -m ae.figure5.ijkl.test_transformer --simgpu python -m ae.figure5.ijkl.test_transformer --simtpu python -m ae.figure5.ijkl.test_transformer --simgpu --init python -m ae.figure5.ijkl.test_transformer --simtpu --init cd ae/figure5/ijkl python plot_transformer.py ================================================ FILE: ae/figure5/ijkl/test_transformer.py ================================================ from software_model.transformer import ( TransformerBlockInitComputationTP, TransformerBlockAutoRegressionTP, ) from software_model.utils import data_type_dict, Tensor from hardware_model.system import system_dict import argparse if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--init", action="store_true", help="initial computation") parser.add_argument("--gpu", action="store_true", help="Enable GPU") parser.add_argument("--simgpu", action="store_true", help="Enable simulation") parser.add_argument("--simtpu", action="store_true", help="Enable simulation") parser.add_argument("--roofline", action="store_true", help="use roofline") args = 
parser.parse_args() bs = 8 s = 2048 if args.init: print("Initial computation") if args.simgpu: model = TransformerBlockInitComputationTP( d_model=12288, n_heads=96, device_count=4, data_type=data_type_dict["fp16"], ) A100_system = system_dict["A100_4_fp16"] # from design_space_exploration.dse import read_architecture_template, template_to_system # arch_specs = read_architecture_template("configs/template.json") # A100_system = template_to_system(arch_specs) _ = model(Tensor([bs, s, 12288], data_type_dict["fp16"])) if args.roofline: model.roofline_model(A100_system) file_name = "transformer_A100_roofline.csv" else: model.compile_and_simulate(A100_system, compile_mode="heuristic-GPU") file_name = "transformer_A100_sim.csv" if args.simtpu: model = TransformerBlockInitComputationTP( d_model=12288, n_heads=96, device_count=8, data_type=data_type_dict["fp16"], ) TPU_system = system_dict["TPUv3_8"] _ = model(Tensor([bs, s, 12288], data_type_dict["fp16"])) if args.roofline: model.roofline_model(TPU_system) file_name = "transformer_TPUv3_roofline.csv" else: model.compile_and_simulate(TPU_system, compile_mode="heuristic-TPU") file_name = "transformer_TPUv3_sim.csv" if args.gpu: model = TransformerBlockInitComputationTP( d_model=12288, n_heads=96, device_count=4, data_type=data_type_dict["fp16"], ) _ = model(Tensor([bs, s, 12288], data_type_dict["fp16"])) model.run_on_gpu() else: print("Auto-regression") output_token_length = 1024 if args.simgpu: model = TransformerBlockAutoRegressionTP( d_model=12288, n_heads=96, device_count=4, data_type=data_type_dict["fp16"], ) A100_system = system_dict["A100_4_fp16"] _ = model( Tensor([bs, 1, 12288], data_type_dict["fp16"]), s + output_token_length ) if args.roofline: model.roofline_model(A100_system) file_name = "transformerAR_A100_roofline.csv" else: model.compile_and_simulate(A100_system, compile_mode="heuristic-GPU") file_name = "transformerAR_A100_sim.csv" if args.simtpu: model = TransformerBlockAutoRegressionTP( d_model=12288, 
n_heads=96, device_count=8, data_type=data_type_dict["fp16"], ) TPU_system = system_dict["TPUv3_8"] _ = model( Tensor([bs, 1, 12288], data_type_dict["fp16"]), s + output_token_length ) if args.roofline: model.roofline_model(TPU_system) file_name = "transformerAR_TPUv3_roofline.csv" else: model.compile_and_simulate(TPU_system, compile_mode="heuristic-TPU") file_name = "transformerAR_TPUv3_sim.csv" if args.gpu: model = TransformerBlockAutoRegressionTP( d_model=12288, n_heads=96, device_count=4, data_type=data_type_dict["fp16"], ) _ = model( Tensor([bs, 1, 12288], data_type_dict["fp16"]), s + output_token_length ) model.run_on_gpu() with open(f"ae/figure5/ijkl/{file_name}", "w") as f: if args.roofline: f.write(model.roofline_log) else: f.write(model.simluate_log) ================================================ FILE: ae/figure5/run_figure5.sh ================================================ cd ab bash run.sh cd .. cd cf bash run.sh cd .. cd de bash run.sh cd .. cd g bash run.sh cd .. cd h bash run.sh cd .. cd ijkl bash run.sh cd .. ================================================ FILE: ae/figure6/real_hardware/die_area.csv ================================================ 476.25, 446.22 76.44, 33 119.31, 25.2 58, 83.26 31.77, 40.83 20.95, 45.52 0, 42 40, 4 ================================================ FILE: ae/figure6/run_figure6.sh ================================================ rm *.csv rm *.pdf cd ../.. 
python -m ae.figure6.test_cost_model ================================================ FILE: ae/figure6/test_cost_model.py ================================================ from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2 from design_space_exploration.dse import read_architecture_template import matplotlib.pyplot as plt import seaborn as sns import pandas as pd arch_specs = read_architecture_template("configs/GA100.json") compute_chiplet_area_mm2, A100_core_breakdown_map, compute_total_die_map = ( calc_compute_chiplet_area_mm2(arch_specs, verbose=True) ) io_die_area_mm2, io_total_die_map = calc_io_die_area_mm2(arch_specs, verbose=True) categories = [ "Cores", "On-chip interconnect", "Global buffer", "Memory(PHY)", "Memory(Control)", "Off-chip interconnect\n(PHY)", "Off-chip interconnect\n(Control)", "Other", ] die_area = pd.read_csv( "ae/figure6/real_hardware/die_area.csv", header=None, names=["A100", "MI210"] ) values_a100 = die_area["A100"].tolist() values_mi210 = die_area["MI210"].tolist() values_a100_sim = [ compute_total_die_map["cores_area"], compute_total_die_map["crossbar_area"], io_total_die_map["global_buffer_area"], io_total_die_map["mem_phy_area"], io_total_die_map["mem_controller_area"], io_total_die_map["device_phy_area"], io_total_die_map["device_controller_area"], 0, ] arch_specs = read_architecture_template("configs/mi210_template.json") compute_chiplet_area_mm2, MI210_core_breakdown_map, compute_total_die_map = ( calc_compute_chiplet_area_mm2(arch_specs, verbose=True) ) io_die_area_mm2, io_total_die_map = calc_io_die_area_mm2(arch_specs, verbose=True) values_mi210_sim = [ compute_total_die_map["cores_area"], compute_total_die_map["crossbar_area"], io_total_die_map["global_buffer_area"], io_total_die_map["mem_phy_area"], io_total_die_map["mem_controller_area"], io_total_die_map["device_phy_area"], io_total_die_map["device_controller_area"], 0, ] plt.figure(figsize=(4, 2)) colors_matmul = 
sns.color_palette("flare_r", 7)[5:6] colors_normalization = sns.color_palette("summer", 2) colors_gelu = sns.color_palette("pink", 5)[2:4] colors_allreduce = sns.color_palette("Blues_r", 2) colors = ( colors_matmul + colors_normalization + colors_gelu + colors_allreduce + sns.color_palette("Greys_r", 1) ) bottom = 0 for i, (category, value) in enumerate(zip(categories, values_a100)): plt.bar(1, value, bottom=bottom, color=colors[i], label=category, width=0.5) bottom += value bottom = 0 for i, (category, value) in enumerate(zip(categories, values_a100_sim)): plt.bar(2, value, bottom=bottom, color=colors[i], width=0.5) bottom += value bottom = 0 for i, (category, value) in enumerate(zip(categories, values_mi210)): plt.bar(3, value, bottom=bottom, color=colors[i], width=0.5) bottom += value bottom = 0 for i, (category, value) in enumerate(zip(categories, values_mi210_sim)): plt.bar(4, value, bottom=bottom, color=colors[i], width=0.5) bottom += value plt.ylabel("Area ($mm^2$)") plt.xticks( [1, 2, 3, 4], ["Real\nGA100", "Simulated\nGA100", "Real\nAldebaran", "Simulated\nAldebaran"], ) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.1)) plt.savefig("ae/figure6/figure6a.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300) values_a100 = [3.75] values_mi210 = [4.02] values_a100_sim = [ A100_core_breakdown_map["control_area"], A100_core_breakdown_map["alu_area"], A100_core_breakdown_map["sa_area"], A100_core_breakdown_map["regfile_area"], A100_core_breakdown_map["local_buffer_area"], ] values_mi210_sim = [ MI210_core_breakdown_map["control_area"], MI210_core_breakdown_map["alu_area"], MI210_core_breakdown_map["sa_area"], MI210_core_breakdown_map["regfile_area"], MI210_core_breakdown_map["local_buffer_area"], ] categories = [ "Control logic", "ALUs", "Systolic array", "Register file", "Local buffer", ] colors = colors_matmul + colors_normalization + colors_allreduce color_gt = 
sns.color_palette("Greys_r", 1)[0] plt.figure(figsize=(4, 1.5)) bottom = 0 for i, (category, value) in enumerate(zip(categories, values_a100)): plt.bar(1, value, bottom=bottom, color=color_gt, width=0.5) bottom += value bottom = 0 for i, (category, value) in enumerate(zip(categories, values_a100_sim)): plt.bar(2, value, bottom=bottom, color=colors[i], label=category, width=0.5) bottom += value bottom = 0 for i, (category, value) in enumerate(zip(categories, values_mi210)): plt.bar(3, value, bottom=bottom, color=color_gt, width=0.5) bottom += value bottom = 0 for i, (category, value) in enumerate(zip(categories, values_mi210_sim)): plt.bar(4, value, bottom=bottom, color=colors[i], width=0.5) bottom += value plt.ylabel("Area ($mm^2$)") plt.xticks( [1, 2, 3, 4], ["Real\nGA100", "Simulated\nGA100", "Real\nAldebaran", "Simulated\nAldebaran"], ) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1)) plt.savefig("ae/figure6/figure6b.pdf", bbox_inches="tight", pad_inches=0.01, dpi=300) ================================================ FILE: ae/figure7/__init__.py ================================================ ================================================ FILE: ae/figure7/change_core_size.py ================================================ from software_model.transformer import ( TransformerBlockInitComputationTP, TransformerBlockAutoRegressionTP, ) from software_model.utils import data_type_dict, Tensor from design_space_exploration.dse import template_to_system, read_architecture_template from multiprocessing import Process, Lock from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2 import time input_seq_length = 2048 batch_size = 8 output_seq_length = 1024 arch_specs = read_architecture_template("configs/GA100.json") device_count = arch_specs["device_count"] model_init = TransformerBlockInitComputationTP( d_model=12288, n_heads=96, 
device_count=device_count, data_type=data_type_dict["fp16"], ) model_auto_regression = TransformerBlockAutoRegressionTP( d_model=12288, n_heads=96, device_count=device_count, data_type=data_type_dict["fp16"], ) _ = model_init( Tensor([batch_size, input_seq_length, model_init.d_model], data_type_dict["fp16"]) ) _ = model_auto_regression( Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]), input_seq_length + output_seq_length, ) def test_core_size(core_configs, lock): name, core_count, sublane_count, array_width, vector_width, sram_KB = core_configs arch_specs["device"]["compute_chiplet"]["core_count"] = core_count arch_specs["device"]["compute_chiplet"]["core"]["sublane_count"] = sublane_count arch_specs["device"]["compute_chiplet"]["core"]["systolic_array"][ "array_width" ] = array_width arch_specs["device"]["compute_chiplet"]["core"]["systolic_array"][ "array_height" ] = array_width arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"][ "vector_width" ] = vector_width arch_specs["device"]["compute_chiplet"]["core"]["SRAM_KB"] = sram_KB # for area arch_specs["device"]["compute_chiplet"]["physical_core_count"] = core_count arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"]["int32_count"] = ( vector_width // 2 ) arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"]["fp32_count"] = ( vector_width // 2 ) arch_specs["device"]["compute_chiplet"]["core"]["vector_unit"]["fp64_count"] = ( vector_width // 4 ) if vector_width <= 32: arch_specs["device"]["compute_chiplet"]["core"]["register_file"][ "num_registers" ] = (vector_width * 512) else: arch_specs["device"]["compute_chiplet"]["core"]["register_file"][ "num_reg_files" ] = (vector_width // 32) compute_area_mm2 = calc_compute_chiplet_area_mm2(arch_specs) io_area_mm2 = calc_io_die_area_mm2(arch_specs) print(f"{name}, {compute_area_mm2}, {io_area_mm2}, {compute_area_mm2+io_area_mm2}") # exit() system = template_to_system(arch_specs) auto_regression_latency_simulated = 
model_auto_regression.compile_and_simulate( system, "heuristic-GPU" ) init_latency_simulated = model_init.compile_and_simulate(system, "heuristic-GPU") print(f"{name}, {init_latency_simulated}, {auto_regression_latency_simulated}") with lock: with open(f"ae/figure7/core_size_results_init.csv", "a") as f: f.write( f"{name}, {compute_area_mm2+io_area_mm2}, {init_latency_simulated}, {model_init.simluate_log}\n" ) with open(f"ae/figure7/core_size_results_ar.csv", "a") as f: f.write( f"{name}, {compute_area_mm2+io_area_mm2}, {auto_regression_latency_simulated}, {model_auto_regression.simluate_log}\n" ) lock = Lock() configs = [ ("A", 128, 4, 8, 8, 192), ("B", 128, 4, 16, 32, 192), ("C", 128, 1, 32, 128, 192), ("D", 32, 1, 64, 512, 768), ("E", 8, 1, 128, 2048, 3072), ] processes = [Process(target=test_core_size, args=(i, lock)) for i in configs] try: for p in processes: p.start() while any(p.is_alive() for p in processes): time.sleep(1) except KeyboardInterrupt: print("Terminating processes...") for p in processes: p.terminate() p.join() print("All processes have finished.") ================================================ FILE: ae/figure7/plot_core_size.py ================================================ import matplotlib.pyplot as plt import seaborn as sns import csv import pandas as pd categories = [ "Q_K_V", "Q_mul_K", "A_mul_V", "Wo_proj", "W1_proj", "W2_proj", "Softmax", "LayerNorm_MHA", "LayerNorm_FFN", "GeLU", "AllReduce_MHA", "AllReduce_FFN", ] col_names = ["area", "latency"] + categories colors_matmul = sns.color_palette("flare_r", 6) colors_normalization = sns.color_palette("summer", 3) colors_gelu = sns.color_palette("pink", 1) colors_allreduce = sns.color_palette("Blues_r", 2) colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce core_size_init = pd.read_csv( "core_size_results_init.csv", header=None, names=col_names, index_col=0 ) core_size_init.index.astype(str) core_size_ar = pd.read_csv( "core_size_results_ar.csv", header=None, 
names=col_names, index_col=0 ) core_size_ar.index.astype(str) df_sorted = core_size_init.sort_index() areas = df_sorted["area"].tolist() # print(areas) # exit() # areas = [ # 475.52039916931585, # 826.76355498007, # 826.76355498007, # 793.3380639020086, # 763.3465573533286, # ] plt.figure(figsize=(7, 3)) # Create the stacked bar graph x = 0 for row_index in ["A", "B", "C", "D", "E"]: x = x + 1 values = core_size_init.loc[row_index].tolist() bottom = 0 for i, (category, value) in enumerate(zip(categories, values[2:])): if row_index == "A": plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5) else: plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5) bottom += value plt.ylabel("Latency (s)") plt.xlabel("Configurations") plt.xticks([1, 2, 3, 4, 5], ["A", "B", "C", "D", "E"]) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.05)) plt.tight_layout() xticks = plt.gca().get_xticks() xticklabels = plt.gca().get_xticklabels() index_to_color_red = list(xticks).index(2) xticklabels[index_to_color_red].set_color("#76B900") ax1 = plt.gca() ax2 = ax1.twinx() ax2.plot( [1, 2, 3, 4, 5], areas, color="dimgray", linestyle="dashed", marker="x", label="Area", ) ax2.set_ylabel("Area ($mm^2$)") ax2.set_ylim([0, 1000]) plt.legend(loc="upper right") plt.savefig( "figure7a.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01, ) plt.show() plt.figure(figsize=(7, 3)) x = 0 for row_index in ["A", "B", "C", "D", "E"]: x = x + 1 values = core_size_ar.loc[row_index].tolist() bottom = 0 for i, (category, value) in enumerate(zip(categories, values[2:])): value = value * 1e3 if row_index == "A": plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5) else: plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5) bottom += value # Set the title, legend, and display the graph # plt.title( # "Generation latency under different organization" # ) 
plt.ylabel("Latency (ms)") plt.xlabel("Configurations") plt.xticks([1, 2, 3, 4, 5], ["A", "B", "C", "D", "E"]) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.05)) plt.tight_layout() xticks = plt.gca().get_xticks() xticklabels = plt.gca().get_xticklabels() index_to_color_red = list(xticks).index(2) xticklabels[index_to_color_red].set_color("#76B900") ax1 = plt.gca() ax2 = ax1.twinx() ax2.plot( [1, 2, 3, 4, 5], areas, color="dimgrey", linestyle="dashed", marker="x", label="Area", ) ax2.set_ylabel("Area ($mm^2$)") ax2.set_ylim([0, 1000]) plt.legend(loc="upper left") plt.savefig( "figure7b.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01, ) ================================================ FILE: ae/figure7/run_figure7.sh ================================================ rm *.csv rm *.pdf cd ../.. python -m ae.figure7.change_core_size cd ae/figure7 python plot_core_size.py ================================================ FILE: ae/figure8/__init__.py ================================================ ================================================ FILE: ae/figure8/change_memory_bw.py ================================================ import json, re from hardware_model.compute_module import ( VectorUnit, SystolicArray, Core, ComputeModule, overhead_dict, ) from hardware_model.io_module import IOModule from hardware_model.memory_module import MemoryModule from hardware_model.device import Device from hardware_model.interconnect import LinkModule, InterConnectModule, TopologyType from hardware_model.system import System from software_model.transformer import ( TransformerBlockInitComputationTP, TransformerBlockAutoRegressionTP, ) from software_model.utils import data_type_dict, Tensor from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2 from math import ceil from design_space_exploration.dse import template_to_system, read_architecture_template from 
multiprocessing import Process, Lock import time from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2 input_seq_length = 2048 batch_size = 8 output_seq_length = 1024 arch_specs = read_architecture_template("configs/template.json") device_count = arch_specs["device_count"] model_init = TransformerBlockInitComputationTP( d_model=12288, n_heads=96, device_count=device_count, data_type=data_type_dict["fp16"], ) model_auto_regression = TransformerBlockAutoRegressionTP( d_model=12288, n_heads=96, device_count=device_count, data_type=data_type_dict["fp16"], ) _ = model_init( Tensor([batch_size, input_seq_length, model_init.d_model], data_type_dict["fp16"]) ) _ = model_auto_regression( Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]), input_seq_length + output_seq_length, ) def test_memory_bandwidth(memory_bandwidth, lock): arch_specs["device"]["io"]["memory_channel_physical_count"] = memory_bandwidth arch_specs["device"]["io"]["memory_channel_active_count"] = memory_bandwidth compute_area_mm2 = calc_compute_chiplet_area_mm2(arch_specs) io_area_mm2 = calc_io_die_area_mm2(arch_specs) print( f"{memory_bandwidth}, {compute_area_mm2}, {io_area_mm2}, {compute_area_mm2+io_area_mm2}" ) system = template_to_system(arch_specs) auto_regression_latency_simulated = model_auto_regression.compile_and_simulate( system, "heuristic-GPU" ) init_latency_simulated = model_init.compile_and_simulate(system, "heuristic-GPU") print( f"{memory_bandwidth}, {init_latency_simulated}, {auto_regression_latency_simulated}" ) with lock: with open(f"ae/figure8/memory_bw_results_bs{batch_size}_init.csv", "a") as f: f.write( f"{memory_bandwidth*400}, {compute_area_mm2+io_area_mm2}, {init_latency_simulated}, {model_init.simluate_log}\n" ) with open(f"ae/figure8/memory_bw_results_bs{batch_size}_ar.csv", "a") as f: f.write( f"{memory_bandwidth*400}, {compute_area_mm2+io_area_mm2}, {auto_regression_latency_simulated}, {model_auto_regression.simluate_log}\n" ) 
lock = Lock() processes = [ Process(target=test_memory_bandwidth, args=(i, lock)) for i in [1, 2, 3, 4, 5, 6, 7, 8] ] try: for p in processes: p.start() while any(p.is_alive() for p in processes): time.sleep(1) except KeyboardInterrupt: print("Terminating processes...") for p in processes: p.terminate() p.join() print("All processes have finished.") ================================================ FILE: ae/figure8/plot_memory_bw.py ================================================ import matplotlib.pyplot as plt import seaborn as sns import csv import pandas as pd categories = [ "Q_K_V", "Q_mul_K", "A_mul_V", "Wo_proj", "W1_proj", "W2_proj", "Softmax", "LayerNorm_MHA", "LayerNorm_FFN", "GeLU", "AllReduce_MHA", "AllReduce_FFN", ] col_names = ["area", "latency"] + categories colors_matmul = sns.color_palette("flare_r", 6) colors_normalization = sns.color_palette("summer", 3) colors_gelu = sns.color_palette("pink", 1) colors_allreduce = sns.color_palette("Blues_r", 2) colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce batch_size = 8 results_init = pd.read_csv( f"memory_bw_results_bs{batch_size}_init.csv", header=None, names=col_names, index_col=0, ) results_init.index.astype(int) results_ar = pd.read_csv( f"memory_bw_results_bs{batch_size}_ar.csv", header=None, names=col_names, index_col=0, ) results_ar.index.astype(int) plt.figure(figsize=(7, 3)) # Create the stacked bar graph x = 0 x_labels = [i * 400 for i in [1, 2, 3, 4, 5, 6, 7, 8]] for row_index in x_labels: x = x + 1 values = results_init.loc[row_index].tolist() bottom = 0 for i, (category, value) in enumerate(zip(categories, values[2:])): if row_index == x_labels[0]: plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5) else: plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5) bottom += value # Set the title, legend, and display the graph # plt.title( # "Prefilling Latency per Layer" # ) plt.ylabel("Latency (s)") plt.xlabel("Memory bandwidth (GB/s)") 
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8], x_labels) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.05)) plt.tight_layout() xticks = plt.gca().get_xticks() xticklabels = plt.gca().get_xticklabels() index_to_color_red = list(xticks).index(5) xticklabels[index_to_color_red].set_color("red") plt.savefig(f"figure8a.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) plt.show() plt.figure(figsize=(7, 3)) x = 0 for row_index in x_labels: x = x + 1 values = results_ar.loc[row_index].tolist() bottom = 0 for i, (category, value) in enumerate(zip(categories, values[2:])): value = value * 1e3 if row_index == x_labels[0]: plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5) else: plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5) bottom += value # Set the title, legend, and display the graph # plt.title( # "Generation Latency per Layer per Token" # ) plt.ylabel("Latency (ms)") plt.xlabel("Memory bandwidth (GB/s)") plt.xticks([1, 2, 3, 4, 5, 6, 7, 8], x_labels) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1, 1.05)) plt.tight_layout() xticks = plt.gca().get_xticks() xticklabels = plt.gca().get_xticklabels() index_to_color_red = list(xticks).index(5) xticklabels[index_to_color_red].set_color("red") plt.savefig(f"figure8b.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) ================================================ FILE: ae/figure8/run_figure8.sh ================================================ rm *.csv rm *.pdf cd ../.. 
python -m ae.figure8.change_memory_bw cd ae/figure8 python plot_memory_bw.py ================================================ FILE: ae/figure9/__init__.py ================================================ ================================================ FILE: ae/figure9/change_l1_cache.py ================================================ from software_model.transformer import ( TransformerBlockInitComputationTP, TransformerBlockAutoRegressionTP, ) from software_model.utils import data_type_dict, Tensor from design_space_exploration.dse import template_to_system, read_architecture_template from multiprocessing import Process, Lock from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2 import time input_seq_length = 2048 batch_size = 8 output_seq_length = 1024 arch_specs = read_architecture_template("configs/template.json") device_count = arch_specs["device_count"] model_init = TransformerBlockInitComputationTP( d_model=12288, n_heads=96, device_count=device_count, data_type=data_type_dict["fp16"], ) model_auto_regression = TransformerBlockAutoRegressionTP( d_model=12288, n_heads=96, device_count=device_count, data_type=data_type_dict["fp16"], ) _ = model_init( Tensor([batch_size, input_seq_length, model_init.d_model], data_type_dict["fp16"]) ) _ = model_auto_regression( Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]), input_seq_length + output_seq_length, ) def test_SRAM_KB(SRAM_KB, lock): arch_specs["device"]["compute_chiplet"]["core"]["SRAM_KB"] = SRAM_KB compute_area_mm2 = calc_compute_chiplet_area_mm2(arch_specs) io_area_mm2 = calc_io_die_area_mm2(arch_specs) print( f"{SRAM_KB}, {compute_area_mm2}, {io_area_mm2}, {compute_area_mm2+io_area_mm2}" ) system = template_to_system(arch_specs) auto_regression_latency_simulated = model_auto_regression.compile_and_simulate( system, "heuristic-GPU" ) init_latency_simulated = model_init.compile_and_simulate(system, "heuristic-GPU") print(f"{SRAM_KB}, {init_latency_simulated}, 
{auto_regression_latency_simulated}") with lock: with open(f"ae/figure9/l1_cache_results_init.csv", "a") as f: f.write( f"{SRAM_KB}, {compute_area_mm2+io_area_mm2}, {init_latency_simulated}, {model_init.simluate_log}\n" ) with open(f"ae/figure9/l1_cache_results_ar.csv", "a") as f: f.write( f"{SRAM_KB}, {compute_area_mm2+io_area_mm2}, {auto_regression_latency_simulated}, {model_auto_regression.simluate_log}\n" ) # for SRAM_KB in [64, 128, 192, 256, 512, 1024]: # test_SRAM_KB(SRAM_KB, None) lock = Lock() processes = [ Process(target=test_SRAM_KB, args=(i, lock)) for i in [64, 128, 192, 256, 512, 1024] ] try: for p in processes: p.start() while any(p.is_alive() for p in processes): time.sleep(1) except KeyboardInterrupt: print("Terminating processes...") for p in processes: p.terminate() p.join() print("All processes have finished.") # for SRAM_KB in [64, 128, 192, 256, 512, 1024]: # arch_specs["device"]["compute_chiplet"]["core"][ # "SRAM_KB" # ] = SRAM_KB # system=template_to_system(arch_specs) # auto_regression_latency_simulated = model_auto_regression.compile_and_simulate(system, 'heuristic-GPU') # init_latency_simulated = model_init.compile_and_simulate(system, 'heuristic-GPU') # print(f'{SRAM_KB}, {init_latency_simulated}, {auto_regression_latency_simulated}') # with open(f'test/case_study/l1_cache/l1_cache_results.csv', 'a') as f: # f.write(f'{SRAM_KB}, {init_latency_simulated}, {auto_regression_latency_simulated}\n') ================================================ FILE: ae/figure9/plot_l1_cache.py ================================================ import matplotlib.pyplot as plt import seaborn as sns import csv import pandas as pd categories = [ "Q_K_V", "Q_mul_K", "A_mul_V", "Wo_proj", "W1_proj", "W2_proj", "Softmax", "LayerNorm_MHA", "LayerNorm_FFN", "GeLU", "AllReduce_MHA", "AllReduce_FFN", ] col_names = ["area", "latency"] + categories colors_matmul = sns.color_palette("flare_r", 6) colors_normalization = sns.color_palette("summer", 3) colors_gelu = 
sns.color_palette("pink", 1) colors_allreduce = sns.color_palette("Blues_r", 2) colors = colors_matmul + colors_normalization + colors_gelu + colors_allreduce results_init = pd.read_csv( "l1_cache_results_init.csv", header=None, names=col_names, index_col=0 ) results_init.index.astype(int) results_ar = pd.read_csv( "l1_cache_results_ar.csv", header=None, names=col_names, index_col=0 ) results_ar.index.astype(int) areas = [ 782.1048032068737, 794.1065561553206, 826.76355498007, 848.4527315580167, 913.304090096728, 1064.9121549472263, ] plt.figure(figsize=(7, 3)) # Create the stacked bar graph x = 0 x_labels = [64, 128, 192, 256, 512, 1024] for row_index in x_labels: x = x + 1 values = results_init.loc[row_index].tolist() bottom = 0 for i, (category, value) in enumerate(zip(categories, values[2:])): if row_index == x_labels[0]: plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5) else: plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5) bottom += value # Set the title, legend, and display the graph # plt.title( # "Prefilling Latency per Layer" # ) plt.ylabel("Latency (s)") plt.xlabel("Local buffer size (KB)") plt.xticks([1, 2, 3, 4, 5, 6], x_labels) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.03)) plt.tight_layout() xticks = plt.gca().get_xticks() xticklabels = plt.gca().get_xticklabels() index_to_color_red = list(xticks).index(3) xticklabels[index_to_color_red].set_color("#76B900") ax1 = plt.gca() ax2 = ax1.twinx() ax2.plot( [1, 2, 3, 4, 5, 6], areas, color="dimgray", linestyle="dashed", marker="x", label="Area", ) ax2.set_ylabel("Area ($mm^2$)") ax2.set_ylim([0, 1200]) plt.legend(loc="upper center") plt.savefig("figure9a.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) plt.show() plt.figure(figsize=(7, 3)) x = 0 for row_index in x_labels: x = x + 1 values = results_ar.loc[row_index].tolist() bottom = 0 for i, (category, value) in 
enumerate(zip(categories, values[2:])): value = value * 1e3 if row_index == x_labels[0]: plt.bar(x, value, bottom=bottom, color=colors[i], label=category, width=0.5) else: plt.bar(x, value, bottom=bottom, color=colors[i], width=0.5) bottom += value # Set the title, legend, and display the graph # plt.title( # "Generation Latency per Layer per Token" # ) plt.ylabel("Latency (ms)") plt.xlabel("Local buffer size (KB)") plt.xticks([1, 2, 3, 4, 5, 6], x_labels) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles[::-1], labels[::-1], loc="upper left", bbox_to_anchor=(1.2, 1.03)) plt.tight_layout() xticks = plt.gca().get_xticks() xticklabels = plt.gca().get_xticklabels() index_to_color_red = list(xticks).index(3) xticklabels[index_to_color_red].set_color("#76B900") ax1 = plt.gca() ax1.set_ylim([0, 1.2]) ax2 = ax1.twinx() ax2.plot( [1, 2, 3, 4, 5, 6], areas, color="dimgray", linestyle="dashed", marker="x", label="Area", ) ax2.set_ylabel("Area ($mm^2$)") ax2.set_ylim([0, 1200]) plt.legend(loc="upper center") plt.savefig("figure9b.pdf", dpi=300, bbox_inches="tight", pad_inches=0.01) ================================================ FILE: ae/figure9/run_figure9.sh ================================================ rm *.csv rm *.pdf cd ../.. 
python -m ae.figure9.change_l1_cache cd ae/figure9 python plot_l1_cache.py ================================================ FILE: configs/GA100.json ================================================ { "name": "NVIDIA A100(80GB)x4", "device_count": 4, "interconnect": { "link": { "name": "NVLink3", "bandwidth_per_direction_byte": 25e9, "bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 12, "topology": "FC" }, "device": { "frequency_Hz": 1410e6, "compute_chiplet_count": 1, "compute_chiplet": { "physical_core_count": 128, "core_count": 128, "process_node": "7nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 16, "array_height": 16, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 32, "flop_per_cycle": 4, "data_type": "fp16", "int32_count": 16, "fp16_count": 0, "fp32_count": 16, "fp64_count": 8 }, "register_file": { "num_reg_files": 1, "num_registers": 16384, "register_bitwidth":32, "num_rdwr_ports":4 }, "SRAM_KB": 192 } }, "memory_protocol": "HBM2e", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { "process_node": "7nm", "global_buffer_MB": 48, "physical_global_buffer_MB": 48, "global_buffer_bandwidth_per_cycle_byte": 5120, "memory_channel_physical_count": 6, "memory_channel_active_count": 5, "pin_count_per_channel": 1024, "bandwidth_per_pin_bit": 3.2e9 }, "memory": { "total_capacity_GB": 80 } } } ================================================ FILE: configs/ga102_template.json ================================================ { "name": "NVIDIA A100(80GB)x4", "device_count": 4, "interconnect": { "link": { "name": "NVLink3", "bandwidth_per_direction_byte": 25e9, "bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 12, "topology": "FC" }, "device": { 
"frequency_Hz": 1410e6, "compute_chiplet_count": 1, "compute_chiplet": { "physical_core_count": 128, "core_count": 108, "process_node": "7nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 16, "array_height": 16, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 32, "flop_per_cycle": 4, "data_type": "fp16", "int32_count": 16, "fp16_count": 0, "fp32_count": 32, "fp64_count": 0.5 }, "SRAM_KB": 128 } }, "memory_protocol": "HBM2e", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { "process_node": "7nm", "global_buffer_MB": 40, "physical_global_buffer_MB": 48, "global_buffer_bandwidth_per_cycle_byte": 5120, "memory_channel_physical_count": 6, "memory_channel_active_count": 5, "pin_count_per_channel": 1024, "bandwidth_per_pin_bit": 3.2e9 }, "memory": { "total_capacity_GB": 80 } } } ================================================ FILE: configs/generation_system.json ================================================ { "name": "NVIDIA A100(80GB)x4", "device_count": 4, "interconnect": { "link": { "name": "NVLink3", "bandwidth_per_direction_byte": 25e9, "bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 12, "topology": "FC" }, "device": { "frequency_Hz": 1410e6, "compute_chiplet_count": 1, "compute_chiplet": { "physical_core_count": 128, "core_count": 128, "process_node": "7nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 8, "array_height": 8, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 8, "flop_per_cycle": 4, "data_type": "fp16", "int32_count": 4, "fp16_count": 32, "fp32_count": 0, "fp64_count": 0 }, "register_file": { "num_reg_files": 1, "num_registers": 4096, "register_bitwidth": 32, "num_rdwr_ports": 4 }, "SRAM_KB": 48 } }, "memory_protocol": "HBM2e", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { 
"process_node": "7nm", "global_buffer_MB": 24, "physical_global_buffer_MB": 24, "global_buffer_bandwidth_per_cycle_byte": 2560, "memory_channel_physical_count": 6, "memory_channel_active_count": 5, "pin_count_per_channel": 1024, "bandwidth_per_pin_bit": 3.2e9 }, "memory": { "total_capacity_GB": 80 } } } ================================================ FILE: configs/latency_design.json ================================================ { "name": "NVIDIA A100(80GB)x4", "device_count": 4, "interconnect": { "link": { "name": "NVLink3", "bandwidth_per_direction_byte": 25e9, "bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 12, "topology": "FC" }, "device": { "frequency_Hz": 1410e6, "compute_chiplet_count": 1, "compute_chiplet": { "physical_core_count": 64, "core_count": 64, "process_node": "7nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 16, "array_height": 16, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 32, "flop_per_cycle": 4, "data_type": "fp16", "int32_count": 16, "fp16_count": 0, "fp32_count": 16, "fp64_count": 8 }, "register_file": { "num_reg_files": 1, "num_registers": 16384, "register_bitwidth": 32, "num_rdwr_ports": 4 }, "SRAM_KB": 192 } }, "memory_protocol": "HBM2e", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { "process_node": "7nm", "global_buffer_MB": 24, "physical_global_buffer_MB": 24, "global_buffer_bandwidth_per_cycle_byte": 2560, "memory_channel_physical_count": 6, "memory_channel_active_count": 5, "pin_count_per_channel": 1024, "bandwidth_per_pin_bit": 3.2e9 }, "memory": { "total_capacity_GB": 80 } } } ================================================ FILE: configs/mi210.json ================================================ { "name": "AMD MI210", "device_count": 4, "interconnect": { "link": { "name": "NVLink3", "bandwidth_per_direction_byte": 25e9, 
"bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 12, "topology": "FC" }, "device": { "frequency_Hz": 1400e6, "compute_chiplet_count": 1, "compute_chiplet": { "core_count": 104, "process_node": "7nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 16, "array_height": 16, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 16, "flop_per_cycle": 2, "data_type": "fp32", "int32_count": 16, "fp32_count": 16, "fp64_count": 8 }, "SRAM_KB": 128 } }, "memory_protocol": "HBM2e", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { "process_node": "7nm", "global_buffer_MB": 8, "global_buffer_bandwidth_per_cycle_byte": 4096, "memory_channel_physical_count": 6, "memory_channel_active_count": 4, "pin_count_per_channel": 1024, "bandwidth_per_pin_bit": 3.2e9 }, "memory": { "total_capacity_GB": 64 } } } ================================================ FILE: configs/mi210_template.json ================================================ { "name": "AMD MI210", "device_count": 4, "interconnect": { "link": { "name": "InfinityFabric", "bandwidth_per_direction_byte": 25e9, "bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 8, "topology": "FC" }, "device": { "frequency_Hz": 1410e6, "compute_chiplet_count": 1, "compute_chiplet": { "physical_core_count": 112, "core_count": 108, "process_node": "6nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 16, "array_height": 16, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 16, "flop_per_cycle": 4, "data_type": "fp16", "int32_count": 16, "fp16_count": 0, "fp32_count": 0, "fp64_count": 16 }, "register_file": { "num_reg_files": 64, "num_registers": 512, "register_bitwidth":32, "num_rdwr_ports":4 }, 
"SRAM_KB": 80 } }, "memory_protocol": "HBM2e", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { "process_node": "6nm", "global_buffer_MB": 8, "physical_global_buffer_MB": 8, "global_buffer_bandwidth_per_cycle_byte": 5120, "memory_channel_physical_count": 8, "memory_channel_active_count": 8, "pin_count_per_channel": 1024, "bandwidth_per_pin_bit": 3.2e9 }, "memory": { "total_capacity_GB": 80 } } } ================================================ FILE: configs/prefilling_system.json ================================================ { "name": "NVIDIA A100(80GB)x4", "device_count": 4, "interconnect": { "link": { "name": "NVLink3", "bandwidth_per_direction_byte": 25e9, "bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 12, "topology": "FC" }, "device": { "frequency_Hz": 1410e6, "compute_chiplet_count": 1, "compute_chiplet": { "physical_core_count": 64, "core_count": 64, "process_node": "7nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 32, "array_height": 32, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 32, "flop_per_cycle": 4, "data_type": "fp16", "int32_count": 16, "fp16_count": 0, "fp32_count": 16, "fp64_count": 8 }, "register_file": { "num_reg_files": 1, "num_registers": 16384, "register_bitwidth": 32, "num_rdwr_ports": 4 }, "SRAM_KB": 768 } }, "memory_protocol": "PCIe5", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { "process_node": "7nm", "global_buffer_MB": 48, "physical_global_buffer_MB": 48, "global_buffer_bandwidth_per_cycle_byte": 5120, "memory_channel_physical_count": 256, "memory_channel_active_count": 256, "pin_count_per_channel": 1, "bandwidth_per_pin_bit": 32e9 }, "memory": { "total_capacity_GB": 160 } } } ================================================ FILE: configs/template.json 
================================================ { "name": "NVIDIA A100(80GB)x4", "device_count": 4, "interconnect": { "link": { "name": "NVLink3", "bandwidth_per_direction_byte": 25e9, "bandwidth_both_directions_byte": 50e9, "latency_second": 8.92e-6, "flit_size_byte": 16, "header_size_byte": 16, "max_payload_size_byte": 256 }, "link_count_per_device": 12, "topology": "FC" }, "device": { "frequency_Hz": 1410e6, "compute_chiplet_count": 1, "compute_chiplet": { "physical_core_count": 128, "core_count": 108, "process_node": "7nm", "core": { "sublane_count": 4, "systolic_array": { "array_width": 16, "array_height": 16, "data_type": "fp16", "mac_per_cycle": 1 }, "vector_unit": { "vector_width": 32, "flop_per_cycle": 4, "data_type": "fp16", "int32_count": 16, "fp16_count": 0, "fp32_count": 16, "fp64_count": 8 }, "register_file": { "num_reg_files": 1, "num_registers": 16384, "register_bitwidth":32, "num_rdwr_ports":4 }, "SRAM_KB": 192 } }, "memory_protocol": "HBM2e", "_memory_protocol_list": [ "HBM2e", "DDR4", "DDR5", "PCIe4", "PCIe5" ], "io": { "process_node": "7nm", "global_buffer_MB": 40, "physical_global_buffer_MB": 48, "global_buffer_bandwidth_per_cycle_byte": 5120, "memory_channel_physical_count": 6, "memory_channel_active_count": 5, "pin_count_per_channel": 1024, "bandwidth_per_pin_bit": 3.2e9 }, "memory": { "total_capacity_GB": 80 } } } ================================================ FILE: cost_model/__init__.py ================================================ ## made this for first commit ================================================ FILE: cost_model/cost_examples.py ================================================ import cost_model.cost_model as cost_model import json # example chip with a 32 wide vector, 16x16 SA, 256kb cache core, 8 cores per die # io die with 64 mb cache, 8 nvlinks, 32 pcie phys # all at 5nm and 7nm with open("./configs/prefilling_system.json", "r") as f: # with open('../configs/mi210_template.json', 'r') as f: configs_dict = 
json.load(f) # print(configs_dict['device']) # print(data['device']['compute_chiplet_count']) compute_area = cost_model.calc_compute_chiplet_area_mm2(configs_dict) io_area = cost_model.calc_io_die_area_mm2(configs_dict) print( f"compute area: {compute_area}, io area: {io_area}, total area: {compute_area+io_area}" ) exit(0) core_compute_area_mm2 = cost_model.calc_compute_core_area_mm2( 32, 16, 2**18, cost_model.transistor_density_7nm, cost_model.sram_bit_cell_density_7nm, ) io_die_area_mm2 = cost_model.calc_io_die_area_mm2( 2**25, cost_model.PCIE5, 32, 8, cost_model.transistor_density_7nm, cost_model.sram_bit_cell_density_7nm, ) print(core_compute_area_mm2) print(io_die_area_mm2) ================================================ FILE: cost_model/cost_model.py ================================================ # Author: August Ning aning@princeton.edu # Date started: 12 October 2023 # This file is the cost model for Naivesim import numpy as np import math # import supply_chain.supply_chain_model as scm import cost_model.supply_chain.supply_chain_model as scm # lots of parameters required for calculating silicon die area cost # these are in terms of million transistors per mm2 transistor_density_7nm = scm.transistor_density_arr[scm.PN_7_INDEX] transistor_density_6nm = 114.2 transistor_density_5nm = scm.transistor_density_arr[scm.PN_5_INDEX] sram_bit_cell_density_7nm = 1.70e-07 sram_bit_cell_density_6nm = 1.40e-07 sram_bit_cell_density_5nm = 1.25e-07 # cache size overheads derived from cacti for cache sizes # 4096, 8192, 16384, ..., 1 MB cache_area_efficiency_arr = [0.076, 0.142, 0.247, 0.393, 0.559, \ 0.704, 0.526, 0.602, 0.561] # fpu transistor counts are for 64 bit FPU, based off Ariane and OpenPiton's SPARC T1 # assume that fp32 are half the transistors # int32 transistor count is based off of Ariane's Mult and OpenPiton's SPARC T1 # systolic array is for 1x1 area # scale FPU area by mantissa bits quadratically fpu64_transistor_count = 685300 fpu32_transistor_count = 
fpu64_transistor_count * ((23 / 52) ** 2) fpu16_transistor_count = fpu64_transistor_count * ((10 / 52) ** 2) int32_transistor_count = 177690 # based off of A100 SM and MI 210 CU # these overheads are per sublane, per vector width # (ex 32 for A100, 16 for MI 210) per_sublane_control_transistor_count = 996200 nvidia_per_sublane_control_transistor_count = 725650 amd_per_sublane_control_transistor_count = 1534500 per_sublane_control_dict = {'nvidia':per_sublane_control_transistor_count, \ 'amd':per_sublane_control_transistor_count} per_core_comm_transistor_count = 44300000 nvidia_per_core_comm_transistor_count = 55000000 amd_per_core_comm_transistor_count = 33600000 per_core_comm_dict = {'nvidia':per_core_comm_transistor_count, \ 'amd':per_core_comm_transistor_count} # memory controllers scale with process node, but PHYs do not # pcie, ddr, hbm # note: DDR link unit is 32 bits pcie5_phy_mm2_per_lane = 0.64 pcie4_phy_mm2_per_lane = 0.48 ddr5_phy_mm2_per_link_unit = 1.45 hbm2e_phy_mm2_per_link_unit = 10.45 nvlink3_phy_mm2_per_link_unit = 1.888 nvlink4_phy_mm2_per_link_unit = 0.965 infinity_fabric_phy_mm2_per_link_unit = 5.69 pcie5_ctrl_transistors_per_lane = 5372100 pcie4_ctrl_transistors_per_lane = 3962500 ddr5_ctrl_transistors_per_link_unit = 90446400 hbm2e_ctrl_transistors_per_link_unit = 552743000 nvlink3_ctrl_transistors_per_link_unit = 74632000 nvlink4_ctrl_transistors_per_link_unit = 86628000 infinity_fabric_ctrl_transistors_per_link_unit = 348148000 # mem tech keywords PCIE5 = 'PCIe5' PCIE4 = 'PCIe4' DDR5 = 'DDR5' HBM = 'HBM2e' NVLINK3 = 'NVLink3' NVLINK4 = 'NVLink4' INFINITYFABRIC = 'InfinityFabric' # average via dramexchange spot price, Oct 2023 ddr5_cost_per_gb = 2.4 hbm_cost_per_gb = 7 # return die area for a dimension x dimension SA with a # give bitwidth FPU at a given process node # right now, we model each PE's MAC as a FPU def calc_systolic_array_area_mm2(dimension_x, dimension_y, bitwidth, transistor_density_mil_mm2): if bitwidth == 'fp64': 
total_transistor_count = fpu64_transistor_count * dimension_x * dimension_y elif bitwidth == 'fp32': total_transistor_count = fpu32_transistor_count * dimension_x * dimension_y elif bitwidth == 'fp16': total_transistor_count = fpu16_transistor_count * dimension_x * dimension_y return total_transistor_count / 1e6 / transistor_density_mil_mm2 # vector width corresponds to number of FPUs you have def calc_vector_area_mm2(int32_count, fp16_count, fp32_count, fp64_count, transistor_density_mil_mm2): total_transistor_count = 0 total_transistor_count += int32_count * int32_transistor_count total_transistor_count += fp16_count * fpu16_transistor_count total_transistor_count += fp32_count * fpu32_transistor_count total_transistor_count += fp64_count * fpu64_transistor_count return total_transistor_count / 1e6 / transistor_density_mil_mm2 # for cache designs, if the desired capacity is larger than the max cache unit # split them up into multiple units of the max capacity # min cache size is 4096 bytes def calc_cache_sram_area_mm2(capacity_bytes, sram_bitcell_area_mm2, max_cache_unit_bytes=(2**19)): if capacity_bytes > max_cache_unit_bytes: num_cache_units = math.ceil(capacity_bytes / max_cache_unit_bytes) unit_size_bytes = max_cache_unit_bytes else: num_cache_units = 1 unit_size_bytes = capacity_bytes # cache size model is for capacity of 4096 bytes to 1 MB if unit_size_bytes < 2 ** 12: unit_size_bytes = 2 ** 12 area_efficiency_index = math.ceil(math.log(unit_size_bytes, 2)) - 12 area_efficiency_factor = cache_area_efficiency_arr[area_efficiency_index] unit_cache_area = unit_size_bytes * 8 * sram_bitcell_area_mm2 / area_efficiency_factor cache_area = num_cache_units * unit_cache_area return cache_area # area model comes from EMPIRE # num_reg_files: how many distinct register files each sublanes has # D: how many registers there are in each RF # W: bits per register # P: number of read/write ports def calc_reg_file_area(num_reg_files, D, W, P, transistor_density_mil_mm2): 
area_90nm_um2 = (3.29 * 10**4) - (1.09 * 10**3 * D) - (8.83 * 10**2 * W) - (5.55 * 10**3 * P) \ + (5.35 * 10**1 * D * W) + (1.50 * 10**-2 * D**2) + (1.08 * 10**-2 * W**2) \ + (5.86 * 10**-1 * P**2) + (1.42 * 10**2 * D * P) + (3.68 * 10**2 * W * P) # need to convert um2 to mm2, convert to 7nm area_90nm_mm2 = area_90nm_um2 / 1e6 area_mm2 = area_90nm_mm2 * (scm.transistor_density_arr[scm.PN_90_INDEX] / transistor_density_mil_mm2) total_reg_file_area = num_reg_files * area_mm2 return total_reg_file_area # for width, for PCIe and NVLink, it is the whole lane # for DDR and HBM, it's 128 bits and 1024 bits respectively def calc_mem_controller_area_mm2(mem_tech, width, transistor_density_mil_mm2): controller_transistor_count = -1 if mem_tech == PCIE5: controller_transistor_count = pcie5_ctrl_transistors_per_lane * width elif mem_tech == PCIE4: controller_transistor_count = pcie4_ctrl_transistors_per_lane * width elif mem_tech == DDR5: controller_transistor_count = ddr5_ctrl_transistors_per_link_unit * width elif mem_tech == HBM: controller_transistor_count = hbm2e_ctrl_transistors_per_link_unit * width elif mem_tech == NVLINK3: controller_transistor_count = nvlink3_ctrl_transistors_per_link_unit * width elif mem_tech == NVLINK4: controller_transistor_count = nvlink4_ctrl_transistors_per_link_unit * width elif mem_tech == INFINITYFABRIC: controller_transistor_count = infinity_fabric_ctrl_transistors_per_link_unit * width return (controller_transistor_count / 1e6) / transistor_density_mil_mm2 def calc_mem_phy_area_mm2(mem_tech, width): if mem_tech == PCIE5: return pcie5_phy_mm2_per_lane * width elif mem_tech == PCIE4: return pcie4_phy_mm2_per_lane * width elif mem_tech == DDR5: return ddr5_phy_mm2_per_link_unit * width elif mem_tech == HBM: return hbm2e_phy_mm2_per_link_unit * width elif mem_tech == NVLINK3: return nvlink3_phy_mm2_per_link_unit * width elif mem_tech == NVLINK4: return nvlink4_phy_mm2_per_link_unit * width elif mem_tech == INFINITYFABRIC: return 
infinity_fabric_phy_mm2_per_link_unit * width else: return -1 def find_logic_sram_transistor_density(process_node): if '7' in process_node: return transistor_density_7nm, sram_bit_cell_density_7nm elif '6' in process_node: return transistor_density_6nm, sram_bit_cell_density_6nm elif '5' in process_node: return transistor_density_5nm, sram_bit_cell_density_5nm raise Exception("Invalid Process Node") # a compute core consists of a fixed control overhead # a specified width fp32 vector engine # a specified dimmension fp16 systolic array # a specified L1 cache # at a specified process node # NB: you can fit multiple cores onto a single die for chiplet systems def calc_compute_chiplet_area_mm2(configs_dict, verbose=False): total_die_map = {} core_breakdown_map = {} device_name = configs_dict['name'] device_brand = 'nvidia' if 'nvidia' in device_name.lower() else 'amd' vector_width = configs_dict['device']['compute_chiplet']['core']['vector_unit']['vector_width'] vector_int32_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['int32_count'] vector_fp16_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['fp16_count'] vector_fp32_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['fp32_count'] vector_fp64_count = configs_dict['device']['compute_chiplet']['core']['vector_unit']['fp64_count'] sa_dim_x = configs_dict['device']['compute_chiplet']['core']['systolic_array']['array_width'] sa_dim_y = configs_dict['device']['compute_chiplet']['core']['systolic_array']['array_height'] sa_bitwidth = configs_dict['device']['compute_chiplet']['core']['systolic_array']['data_type'] num_reg_files = configs_dict['device']['compute_chiplet']['core']['register_file']['num_reg_files'] num_registers = configs_dict['device']['compute_chiplet']['core']['register_file']['num_registers'] register_bitwidth = configs_dict['device']['compute_chiplet']['core']['register_file']['register_bitwidth'] num_rdwr_ports = 
configs_dict['device']['compute_chiplet']['core']['register_file']['num_rdwr_ports'] sublane_count = configs_dict['device']['compute_chiplet']['core']['sublane_count'] cache_size_bytes = configs_dict['device']['compute_chiplet']['core']['SRAM_KB'] * (2 ** 10) process_node = configs_dict['device']['compute_chiplet']['process_node'] cores_per_chiplet = configs_dict['device']['compute_chiplet']['physical_core_count'] # each sublane has a SA and vector unit. a core is made up of sublanes. a chiplet has multiple cores transistor_density_mil_mm2, sram_density_bitcell_mm2 = find_logic_sram_transistor_density(process_node) per_sublane_area_mm2 = 0 per_sublane_control_area_mm2 = per_sublane_control_dict[device_brand] / 1e6 / transistor_density_mil_mm2 per_sublane_area_mm2 += (vector_width * per_sublane_control_area_mm2) control_logic_area = per_sublane_area_mm2 * sublane_count per_lane_vector_area = calc_vector_area_mm2(vector_int32_count, vector_fp16_count, vector_fp32_count, vector_fp64_count, transistor_density_mil_mm2) per_sublane_area_mm2 += per_lane_vector_area per_lane_sa_area = calc_systolic_array_area_mm2(sa_dim_x, sa_dim_y, sa_bitwidth, transistor_density_mil_mm2) per_sublane_area_mm2 += per_lane_sa_area per_lane_regfile_area = calc_reg_file_area(num_reg_files, num_registers, register_bitwidth, num_rdwr_ports, transistor_density_mil_mm2) per_sublane_area_mm2 += per_lane_regfile_area per_core_compute_area_mm2 = per_sublane_area_mm2 * sublane_count cache_area_mm2 = calc_cache_sram_area_mm2(cache_size_bytes, sram_density_bitcell_mm2) per_core_area_mm2 = per_core_compute_area_mm2 + cache_area_mm2 core_breakdown_map['total_core_area'] = per_core_area_mm2 core_breakdown_map['control_area'] = control_logic_area core_breakdown_map['alu_area'] = per_lane_vector_area * sublane_count core_breakdown_map['sa_area'] = per_lane_sa_area * sublane_count core_breakdown_map['regfile_area'] = per_lane_regfile_area * sublane_count core_breakdown_map['local_buffer_area'] = 
cache_area_mm2 total_cores_area = per_core_area_mm2 * cores_per_chiplet total_crossbar_area = (per_core_comm_dict[device_brand] / 1e6 / transistor_density_mil_mm2) * cores_per_chiplet # each core has an area overhead to connect to the xbar compute_chiplet_area_mm2 = total_cores_area + total_crossbar_area total_die_map['total_area'] = compute_chiplet_area_mm2 total_die_map['cores_area'] = total_cores_area total_die_map['crossbar_area'] = total_crossbar_area if verbose: return compute_chiplet_area_mm2, core_breakdown_map, total_die_map else: return compute_chiplet_area_mm2 # NB: for mem_tech, if you are using DDR or HBM, it will be 128 bits and 1024 bits respectively per lane # for PCIe and NVLink, specify the number of lanes (128bits per lane) # def calc_io_die_area_mm2(cache_size_bytes, mem_tech, mem_tech_width, num_nvlink_phys, \ # transistor_density_mil_mm2, sram_density_bitcell_mm2): def calc_io_die_area_mm2(config_dict, verbose=False): total_die_map = {} cache_size_bytes = config_dict['device']['io']['physical_global_buffer_MB'] * (2 ** 20) mem_tech = config_dict['device']['memory_protocol'] num_mem_tech_units = config_dict['device']['io']['memory_channel_physical_count'] gpu_gpu_comm_tech = config_dict['interconnect']['link']['name'] num_gpu_gpu_comm_phy = config_dict['interconnect']['link_count_per_device'] process_node = config_dict['device']['io']['process_node'] transistor_density_mil_mm2, sram_density_bitcell_mm2 = find_logic_sram_transistor_density(process_node) io_die_area_mm2 = 0 io_die_area_mm2 += calc_cache_sram_area_mm2(cache_size_bytes, sram_density_bitcell_mm2) global_buffer_area = io_die_area_mm2 # mem tech for communicating to off chip memory mem_phy_area = calc_mem_phy_area_mm2(mem_tech, num_mem_tech_units) mem_controller_area = calc_mem_controller_area_mm2(mem_tech, num_mem_tech_units, transistor_density_mil_mm2) io_die_area_mm2 += mem_phy_area io_die_area_mm2 += mem_controller_area # every IO die has a few NV links for chip to chip 
communication device_phy_area = calc_mem_phy_area_mm2(gpu_gpu_comm_tech, num_gpu_gpu_comm_phy) device_controller_area = calc_mem_controller_area_mm2(gpu_gpu_comm_tech, num_gpu_gpu_comm_phy, transistor_density_mil_mm2) io_die_area_mm2 += device_phy_area io_die_area_mm2 += device_controller_area total_die_map['total_die_area'] = io_die_area_mm2 total_die_map['global_buffer_area'] = global_buffer_area total_die_map['mem_phy_area'] = mem_phy_area total_die_map['mem_controller_area'] = mem_controller_area total_die_map['device_phy_area'] = device_phy_area total_die_map['device_controller_area'] = device_controller_area if verbose: return io_die_area_mm2, total_die_map else: return io_die_area_mm2 ================================================ FILE: cost_model/regfile_area.py ================================================ def calculate_regfile_area(D, W, P): area_90nm_um2 = (3.29 * 10**4) - (1.09 * 10**3 * D) - (8.83 * 10**2 * W) - (5.55 * 10**3 * P) \ + (5.35 * 10**1 * D * W) + (1.50 * 10**-2 * D**2) + (1.08 * 10**-2 * W**2) \ + (5.86 * 10**-1 * P**2) + (1.42 * 10**2 * D * P) + (3.68 * 10**2 * W * P) # need to convert um2 to mm2, convert to 7nm area_90nm_mm2 = area_90nm_um2 / 1e6 area_7nm_mm2 = area_90nm_mm2 * (1.6 / 96.3) return area_7nm_mm2 reg_area = calculate_regfile_area(16384, 32, 4) print(reg_area) reg_area = 64 * calculate_regfile_area(512, 32, 4) print(reg_area) reg_area = calculate_regfile_area(800, 32, 4) print(reg_area) ================================================ FILE: design_space_exploration/__init__.py ================================================ ================================================ FILE: design_space_exploration/dse.py ================================================ import json, re from hardware_model.compute_module import ( VectorUnit, SystolicArray, Core, ComputeModule, overhead_dict, ) from hardware_model.io_module import IOModule from hardware_model.memory_module import MemoryModule from hardware_model.device import Device 
from hardware_model.interconnect import LinkModule, InterConnectModule, TopologyType
from hardware_model.system import System
from software_model.transformer import (
    TransformerBlockInitComputationTP,
    TransformerBlockAutoRegressionTP,
)
from software_model.utils import data_type_dict, Tensor

# from cost_model.cost_model import calc_compute_chiplet_area_mm2, calc_io_die_area_mm2
from math import ceil


def read_architecture_template(file_path):
    """Read a JSON architecture template from disk and return it as a dict."""
    with open(file_path, "r") as f:
        arch_specs = json.load(f)
    return arch_specs


def template_to_system(arch_specs):
    """Build a simulatable System object from an architecture-template dict."""
    device_specs = arch_specs["device"]
    compute_chiplet_specs = device_specs["compute_chiplet"]
    io_specs = device_specs["io"]
    core_specs = compute_chiplet_specs["core"]
    sublane_count = core_specs["sublane_count"]

    # vector unit: total flop/cycle across sublanes; word size is parsed out of
    # the dtype name (e.g. "fp16" -> 16 bits -> 2 bytes)
    vector_unit_specs = core_specs["vector_unit"]
    vector_dtype_byte = int(re.search(r"(\d+)", vector_unit_specs["data_type"]).group(1)) // 8
    vector_unit = VectorUnit(
        sublane_count
        * vector_unit_specs["vector_width"]
        * vector_unit_specs["flop_per_cycle"],
        vector_dtype_byte,
        35,  # NOTE(review): meaning of this constant not shown here -- confirm
        vector_unit_specs["vector_width"],
        sublane_count,
    )

    # systolic array: same dtype byte-width is passed for both operand slots
    systolic_array_specs = core_specs["systolic_array"]
    sa_dtype_byte = int(re.search(r"(\d+)", systolic_array_specs["data_type"]).group(1)) // 8
    systolic_array = SystolicArray(
        systolic_array_specs["array_height"],
        systolic_array_specs["array_width"],
        systolic_array_specs["mac_per_cycle"],
        sa_dtype_byte,
        sa_dtype_byte,
    )

    # core: sublanes sharing one local SRAM
    core = Core(
        vector_unit,
        systolic_array,
        sublane_count,
        core_specs["SRAM_KB"] * 1024,
    )

    # compute module: active cores across all chiplets at device frequency
    compute_module = ComputeModule(
        core,
        compute_chiplet_specs["core_count"] * device_specs["compute_chiplet_count"],
        device_specs["frequency_Hz"],
        io_specs["global_buffer_MB"] * 1024 * 1024,
        io_specs["global_buffer_bandwidth_per_cycle_byte"],
        overhead_dict["A100"],
    )

    # io module: aggregate DRAM bandwidth in bytes/s, fixed 1us latency
    io_module = IOModule(
        io_specs["memory_channel_active_count"]
        * io_specs["pin_count_per_channel"]
        * io_specs["bandwidth_per_pin_bit"]
        // 8,
        1e-6,
    )

    # memory module: capacity in bytes
    memory_module = MemoryModule(
        device_specs["memory"]["total_capacity_GB"] * 1024 * 1024 * 1024
    )

    # device
    device = Device(compute_module, io_module, memory_module)

    # interconnect: link parameters plus topology (FC or RING)
    interconnect_specs = arch_specs["interconnect"]
    link_specs = interconnect_specs["link"]
    link_module = LinkModule(
        link_specs["bandwidth_per_direction_byte"],
        link_specs["bandwidth_both_directions_byte"],
        link_specs["latency_second"],
        link_specs["flit_size_byte"],
        link_specs["max_payload_size_byte"],
        link_specs["header_size_byte"],
    )
    interconnect_module = InterConnectModule(
        arch_specs["device_count"],
        TopologyType.FC if interconnect_specs["topology"] == "FC" else TopologyType.RING,
        link_module,
        interconnect_specs["link_count_per_device"],
    )

    # system
    return System(device, interconnect_module)


def test_template_to_system():
    """Smoke test: build the A100 system from the template and roofline one block."""
    arch_specs = read_architecture_template("configs/template.json")
    A100_system = template_to_system(arch_specs)
    bs = 8
    s = 2048
    model = TransformerBlockInitComputationTP(
        d_model=12288,
        n_heads=96,
        device_count=4,
        data_type=data_type_dict["fp16"],
    )
    _ = model(Tensor([bs, s, 12288], data_type_dict["fp16"]))
    model.roofline_model(A100_system)


def find_cheapest_design(
    d_model,
    n_heads,
    n_layers,
    batch_size,
    input_seq_length,
    init_latency,
    output_seq_length,
    auto_regression_latency,
):
    """Sweep the design space for the smallest-area system meeting latency targets."""
    i = 0
    smallest_total_area_mm2 = float('inf')
    best_arch_specs = None
    arch_specs = read_architecture_template("configs/template.json")
    for device_count in [4, 8, 12, 16]:
        # NOTE(review): the d_model/n_heads parameters are ignored here; the
        # workload is hard-coded to 12288/96 -- confirm whether intended.
        model_init = TransformerBlockInitComputationTP(
            d_model=12288,
            n_heads=96,
            device_count=device_count,
            data_type=data_type_dict["fp16"],
        )
        model_auto_regression = TransformerBlockAutoRegressionTP(
            d_model=12288,
            n_heads=96,
            device_count=device_count,
            data_type=data_type_dict["fp16"],
        )
        _ = model_init(Tensor([batch_size, input_seq_length, model_init.d_model], data_type_dict["fp16"]))
        _ = model_auto_regression(Tensor([batch_size, 1, model_init.d_model], data_type_dict["fp16"]), input_seq_length + output_seq_length)
arch_specs["device_count"] = device_count if device_count <= 4: topology = "FC" else: topology = "RING" arch_specs["interconnect"]["topology"] = topology for link_count_per_device in [6, 12, 18, 24]: arch_specs["interconnect"]["link_count_per_device"] = link_count_per_device # device for core_count in [32, 64, 128, 256]: arch_specs["device"]["compute_chiplet"]["core_count"] = core_count # core for sublane_count in [1, 2, 4, 8]: arch_specs["device"]["compute_chiplet"]["core"][ "sublane_count" ] = sublane_count # systolic array for array_height in [16, 32, 64, 128]: arch_specs["device"]["compute_chiplet"]["core"][ "systolic_array" ]["array_height"] = array_height arch_specs["device"]["compute_chiplet"]["core"][ "systolic_array" ]["array_width"] = array_height # vector unit for vector_width in [16, 32, 64, 128]: arch_specs["device"]["compute_chiplet"]["core"][ "vector_unit" ]["vector_width"] = vector_width for SRAM_KB in [64, 128, 256, 512, 1024]: arch_specs["device"]["compute_chiplet"]["core"][ "SRAM_KB" ] = SRAM_KB # global buffer for total_global_buffer_MB in [ 80, 160, 240, 320, 400, 480, 640, 800, 960, ]: global_buffer_MB = ( total_global_buffer_MB // device_count ) global_buffer_bandwidth_per_cycle_byte = ( 5120 * global_buffer_MB // 40 ) arch_specs["device"]["io"][ "global_buffer_MB" ] = global_buffer_MB arch_specs["device"]["io"][ "global_buffer_bandwidth_per_cycle_byte" ] = global_buffer_bandwidth_per_cycle_byte # memory memory_capacity_requirement_GB = ceil(model_auto_regression.memory_requirement*n_layers/1e9/16)*16 # print(f"memory_capacity_requirement_GB={model_auto_regression.memory_requirement*n_layers/1e9}") # exit() for memory_protocol in [ "HBM2e", "DDR5", "PCIe5", # "GDDR6X" ]: arch_specs['device']['memory_protocol']=memory_protocol if memory_protocol == "HBM2e": # 400 GB/s per channel, 16 GB channel_count=memory_capacity_requirement_GB // 16 if channel_count>8: continue channel_count_list = [channel_count] pin_count_per_channel=1024 
bandwidth_per_pin_bit=3.2e9 elif memory_protocol == "DDR5": # 19.2 GB/s per channel, 2 channel per dimm channel_count_list = [16, 24, 32] pin_count_per_channel=32 bandwidth_per_pin_bit=4.8e9 elif memory_protocol == "PCIe5": # 4 GB/s per channel channel_count_list = [64, 96, 128] pin_count_per_channel=1 bandwidth_per_pin_bit=32e9 # elif memory_protocol == "GDDR6X": # # 84 GB/s per channel, 2 GB # channel_count_list= memo for channel_count in channel_count_list: arch_specs['device']['memory']['total_capacity_GB'] = memory_capacity_requirement_GB arch_specs['device']['io']['memory_channel_active_count'] = channel_count arch_specs['device']['io']['memory_channel_physical_count'] = channel_count arch_specs['device']['io']['pin_count_per_channel'] = pin_count_per_channel arch_specs['device']['io']['bandwidth_per_pin_bit'] = bandwidth_per_pin_bit total_area_mm2=calc_compute_chiplet_area_mm2(arch_specs)+calc_io_die_area_mm2(arch_specs) # print(f"channel_count={arch_specs['device']['io']['memory_channel_active_count']},total area={total_area_mm2}") if total_area_mm2>900: continue system=template_to_system(arch_specs) init_roofline_latency=model_init.roofline_model(system)*n_layers if init_roofline_latency>init_latency: continue auto_regression_roofline_latency=model_auto_regression.roofline_model(system)*n_layers if auto_regression_roofline_latency>auto_regression_latency: continue auto_regression_latency_simulated = model_auto_regression.compile_and_simulate(system, 'heuristic-GPU') if auto_regression_latency_simulated>auto_regression_latency: continue init_latency_simulated = model_init.compile_and_simulate(system, 'heuristic-GPU') if init_latency_simulated>init_latency: continue if total_area_mm2*device_count