[
  {
    "path": ".git/HEAD",
    "content": "ref: refs/heads/master\n"
  },
  {
    "path": ".git/config",
    "content": "[core]\n\trepositoryformatversion = 1\n\tfilemode = true\n\tbare = false\n\tlogallrefupdates = true\n[remote \"origin\"]\n\turl = https://github.com/xuqiantong/CUDA-Winograd\n\ttagOpt = --no-tags\n\tfetch = +refs/heads/master:refs/remotes/origin/master\n\tpromisor = true\n\tpartialclonefilter = blob:limit=1048576\n[branch \"master\"]\n\tremote = origin\n\tmerge = refs/heads/master\n"
  },
  {
    "path": ".git/description",
    "content": "Unnamed repository; edit this file 'description' to name the repository.\n"
  },
  {
    "path": ".git/hooks/applypatch-msg.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to check the commit log message taken by\n# applypatch from an e-mail message.\n#\n# The hook should exit with non-zero status after issuing an\n# appropriate message if it wants to stop the commit.  The hook is\n# allowed to edit the commit message file.\n#\n# To enable this hook, rename this file to \"applypatch-msg\".\n\n. git-sh-setup\ncommitmsg=\"$(git rev-parse --git-path hooks/commit-msg)\"\ntest -x \"$commitmsg\" && exec \"$commitmsg\" ${1+\"$@\"}\n:\n"
  },
  {
    "path": ".git/hooks/commit-msg.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to check the commit log message.\n# Called by \"git commit\" with one argument, the name of the file\n# that has the commit message.  The hook should exit with non-zero\n# status after issuing an appropriate message if it wants to stop the\n# commit.  The hook is allowed to edit the commit message file.\n#\n# To enable this hook, rename this file to \"commit-msg\".\n\n# Uncomment the below to add a Signed-off-by line to the message.\n# Doing this in a hook is a bad idea in general, but the prepare-commit-msg\n# hook is more suited to it.\n#\n# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\\(.*>\\).*$/Signed-off-by: \\1/p')\n# grep -qs \"^$SOB\" \"$1\" || echo \"$SOB\" >> \"$1\"\n\n# This example catches duplicate Signed-off-by lines.\n\ntest \"\" = \"$(grep '^Signed-off-by: ' \"$1\" |\n\t sort | uniq -c | sed -e '/^[ \t]*1[ \t]/d')\" || {\n\techo >&2 Duplicate Signed-off-by lines.\n\texit 1\n}\n"
  },
  {
    "path": ".git/hooks/fsmonitor-watchman.sample",
    "content": "#!/usr/bin/perl\n\nuse strict;\nuse warnings;\nuse IPC::Open2;\n\n# An example hook script to integrate Watchman\n# (https://facebook.github.io/watchman/) with git to speed up detecting\n# new and modified files.\n#\n# The hook is passed a version (currently 2) and last update token\n# formatted as a string and outputs to stdout a new update token and\n# all files that have been modified since the update token. Paths must\n# be relative to the root of the working tree and separated by a single NUL.\n#\n# To enable this hook, rename this file to \"query-watchman\" and set\n# 'git config core.fsmonitor .git/hooks/query-watchman'\n#\nmy ($version, $last_update_token) = @ARGV;\n\n# Uncomment for debugging\n# print STDERR \"$0 $version $last_update_token\\n\";\n\n# Check the hook interface version\nif ($version ne 2) {\n\tdie \"Unsupported query-fsmonitor hook version '$version'.\\n\" .\n\t    \"Falling back to scanning...\\n\";\n}\n\nmy $git_work_tree = get_working_dir();\n\nmy $retry = 1;\n\nmy $json_pkg;\neval {\n\trequire JSON::XS;\n\t$json_pkg = \"JSON::XS\";\n\t1;\n} or do {\n\trequire JSON::PP;\n\t$json_pkg = \"JSON::PP\";\n};\n\nlaunch_watchman();\n\nsub launch_watchman {\n\tmy $o = watchman_query();\n\tif (is_work_tree_watched($o)) {\n\t\toutput_result($o->{clock}, @{$o->{files}});\n\t}\n}\n\nsub output_result {\n\tmy ($clockid, @files) = @_;\n\n\t# Uncomment for debugging watchman output\n\t# open (my $fh, \">\", \".git/watchman-output.out\");\n\t# binmode $fh, \":utf8\";\n\t# print $fh \"$clockid\\n@files\\n\";\n\t# close $fh;\n\n\tbinmode STDOUT, \":utf8\";\n\tprint $clockid;\n\tprint \"\\0\";\n\tlocal $, = \"\\0\";\n\tprint @files;\n}\n\nsub watchman_clock {\n\tmy $response = qx/watchman clock \"$git_work_tree\"/;\n\tdie \"Failed to get clock id on '$git_work_tree'.\\n\" .\n\t\t\"Falling back to scanning...\\n\" if $? != 0;\n\n\treturn $json_pkg->new->utf8->decode($response);\n}\n\nsub watchman_query {\n\tmy $pid = open2(\\*CHLD_OUT, \\*CHLD_IN, 'watchman -j --no-pretty')\n\tor die \"open2() failed: $!\\n\" .\n\t\"Falling back to scanning...\\n\";\n\n\t# In the query expression below we're asking for names of files that\n\t# changed since $last_update_token but not from the .git folder.\n\t#\n\t# To accomplish this, we're using the \"since\" generator to use the\n\t# recency index to select candidate nodes and \"fields\" to limit the\n\t# output to file names only. Then we're using the \"expression\" term to\n\t# further constrain the results.\n\tmy $last_update_line = \"\";\n\tif (substr($last_update_token, 0, 1) eq \"c\") {\n\t\t$last_update_token = \"\\\"$last_update_token\\\"\";\n\t\t$last_update_line = qq[\\n\"since\": $last_update_token,];\n\t}\n\tmy $query = <<\"\tEND\";\n\t\t[\"query\", \"$git_work_tree\", {$last_update_line\n\t\t\t\"fields\": [\"name\"],\n\t\t\t\"expression\": [\"not\", [\"dirname\", \".git\"]]\n\t\t}]\n\tEND\n\n\t# Uncomment for debugging the watchman query\n\t# open (my $fh, \">\", \".git/watchman-query.json\");\n\t# print $fh $query;\n\t# close $fh;\n\n\tprint CHLD_IN $query;\n\tclose CHLD_IN;\n\tmy $response = do {local $/; <CHLD_OUT>};\n\n\t# Uncomment for debugging the watch response\n\t# open ($fh, \">\", \".git/watchman-response.json\");\n\t# print $fh $response;\n\t# close $fh;\n\n\tdie \"Watchman: command returned no output.\\n\" .\n\t\"Falling back to scanning...\\n\" if $response eq \"\";\n\tdie \"Watchman: command returned invalid output: $response\\n\" .\n\t\"Falling back to scanning...\\n\" unless $response =~ /^\\{/;\n\n\treturn $json_pkg->new->utf8->decode($response);\n}\n\nsub is_work_tree_watched {\n\tmy ($output) = @_;\n\tmy $error = $output->{error};\n\tif ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {\n\t\t$retry--;\n\t\tmy $response = qx/watchman watch \"$git_work_tree\"/;\n\t\tdie \"Failed to make watchman watch '$git_work_tree'.\\n\" .\n\t\t    \"Falling back to scanning...\\n\" if $? != 0;\n\t\t$output = $json_pkg->new->utf8->decode($response);\n\t\t$error = $output->{error};\n\t\tdie \"Watchman: $error.\\n\" .\n\t\t\"Falling back to scanning...\\n\" if $error;\n\n\t\t# Uncomment for debugging watchman output\n\t\t# open (my $fh, \">\", \".git/watchman-output.out\");\n\t\t# close $fh;\n\n\t\t# Watchman will always return all files on the first query so\n\t\t# return the fast \"everything is dirty\" flag to git and do the\n\t\t# Watchman query just to get it over with now so we won't pay\n\t\t# the cost in git to look up each individual file.\n\t\tmy $o = watchman_clock();\n\t\t$error = $output->{error};\n\n\t\tdie \"Watchman: $error.\\n\" .\n\t\t\"Falling back to scanning...\\n\" if $error;\n\n\t\toutput_result($o->{clock}, (\"/\"));\n\t\t$last_update_token = $o->{clock};\n\n\t\teval { launch_watchman() };\n\t\treturn 0;\n\t}\n\n\tdie \"Watchman: $error.\\n\" .\n\t\"Falling back to scanning...\\n\" if $error;\n\n\treturn 1;\n}\n\nsub get_working_dir {\n\tmy $working_dir;\n\tif ($^O =~ 'msys' || $^O =~ 'cygwin') {\n\t\t$working_dir = Win32::GetCwd();\n\t\t$working_dir =~ tr/\\\\/\\//;\n\t} else {\n\t\trequire Cwd;\n\t\t$working_dir = Cwd::cwd();\n\t}\n\n\treturn $working_dir;\n}\n"
  },
  {
    "path": ".git/hooks/post-update.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to prepare a packed repository for use over\n# dumb transports.\n#\n# To enable this hook, rename this file to \"post-update\".\n\nexec git update-server-info\n"
  },
  {
    "path": ".git/hooks/pre-applypatch.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to verify what is about to be committed\n# by applypatch from an e-mail message.\n#\n# The hook should exit with non-zero status after issuing an\n# appropriate message if it wants to stop the commit.\n#\n# To enable this hook, rename this file to \"pre-applypatch\".\n\n. git-sh-setup\nprecommit=\"$(git rev-parse --git-path hooks/pre-commit)\"\ntest -x \"$precommit\" && exec \"$precommit\" ${1+\"$@\"}\n:\n"
  },
  {
    "path": ".git/hooks/pre-commit.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to verify what is about to be committed.\n# Called by \"git commit\" with no arguments.  The hook should\n# exit with non-zero status after issuing an appropriate message if\n# it wants to stop the commit.\n#\n# To enable this hook, rename this file to \"pre-commit\".\n\nif git rev-parse --verify HEAD >/dev/null 2>&1\nthen\n\tagainst=HEAD\nelse\n\t# Initial commit: diff against an empty tree object\n\tagainst=$(git hash-object -t tree /dev/null)\nfi\n\n# If you want to allow non-ASCII filenames set this variable to true.\nallownonascii=$(git config --type=bool hooks.allownonascii)\n\n# Redirect output to stderr.\nexec 1>&2\n\n# Cross platform projects tend to avoid non-ASCII filenames; prevent\n# them from being added to the repository. We exploit the fact that the\n# printable range starts at the space character and ends with tilde.\nif [ \"$allownonascii\" != \"true\" ] &&\n\t# Note that the use of brackets around a tr range is ok here, (it's\n\t# even required, for portability to Solaris 10's /usr/bin/tr), since\n\t# the square bracket bytes happen to fall in the designated range.\n\ttest $(git diff-index --cached --name-only --diff-filter=A -z $against |\n\t  LC_ALL=C tr -d '[ -~]\\0' | wc -c) != 0\nthen\n\tcat <<\\EOF\nError: Attempt to add a non-ASCII file name.\n\nThis can cause problems if you want to work with people on other platforms.\n\nTo be portable it is advisable to rename the file.\n\nIf you know what you are doing you can disable this check using:\n\n  git config hooks.allownonascii true\nEOF\n\texit 1\nfi\n\n# If there are whitespace errors, print the offending file names and fail.\nexec git diff-index --check --cached $against --\n"
  },
  {
    "path": ".git/hooks/pre-merge-commit.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to verify what is about to be committed.\n# Called by \"git merge\" with no arguments.  The hook should\n# exit with non-zero status after issuing an appropriate message to\n# stderr if it wants to stop the merge commit.\n#\n# To enable this hook, rename this file to \"pre-merge-commit\".\n\n. git-sh-setup\ntest -x \"$GIT_DIR/hooks/pre-commit\" &&\n        exec \"$GIT_DIR/hooks/pre-commit\"\n:\n"
  },
  {
    "path": ".git/hooks/pre-push.sample",
    "content": "#!/bin/sh\n\n# An example hook script to verify what is about to be pushed.  Called by \"git\n# push\" after it has checked the remote status, but before anything has been\n# pushed.  If this script exits with a non-zero status nothing will be pushed.\n#\n# This hook is called with the following parameters:\n#\n# $1 -- Name of the remote to which the push is being done\n# $2 -- URL to which the push is being done\n#\n# If pushing without using a named remote those arguments will be equal.\n#\n# Information about the commits which are being pushed is supplied as lines to\n# the standard input in the form:\n#\n#   <local ref> <local oid> <remote ref> <remote oid>\n#\n# This sample shows how to prevent push of commits where the log message starts\n# with \"WIP\" (work in progress).\n\nremote=\"$1\"\nurl=\"$2\"\n\nzero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')\n\nwhile read local_ref local_oid remote_ref remote_oid\ndo\n\tif test \"$local_oid\" = \"$zero\"\n\tthen\n\t\t# Handle delete\n\t\t:\n\telse\n\t\tif test \"$remote_oid\" = \"$zero\"\n\t\tthen\n\t\t\t# New branch, examine all commits\n\t\t\trange=\"$local_oid\"\n\t\telse\n\t\t\t# Update to existing branch, examine new commits\n\t\t\trange=\"$remote_oid..$local_oid\"\n\t\tfi\n\n\t\t# Check for WIP commit\n\t\tcommit=$(git rev-list -n 1 --grep '^WIP' \"$range\")\n\t\tif test -n \"$commit\"\n\t\tthen\n\t\t\techo >&2 \"Found WIP commit in $local_ref, not pushing\"\n\t\t\texit 1\n\t\tfi\n\tfi\ndone\n\nexit 0\n"
  },
  {
    "path": ".git/hooks/pre-rebase.sample",
    "content": "#!/bin/sh\n#\n# Copyright (c) 2006, 2008 Junio C Hamano\n#\n# The \"pre-rebase\" hook is run just before \"git rebase\" starts doing\n# its job, and can prevent the command from running by exiting with\n# non-zero status.\n#\n# The hook is called with the following parameters:\n#\n# $1 -- the upstream the series was forked from.\n# $2 -- the branch being rebased (or empty when rebasing the current branch).\n#\n# This sample shows how to prevent topic branches that are already\n# merged to 'next' branch from getting rebased, because allowing it\n# would result in rebasing already published history.\n\npublish=next\nbasebranch=\"$1\"\nif test \"$#\" = 2\nthen\n\ttopic=\"refs/heads/$2\"\nelse\n\ttopic=`git symbolic-ref HEAD` ||\n\texit 0 ;# we do not interrupt rebasing detached HEAD\nfi\n\ncase \"$topic\" in\nrefs/heads/??/*)\n\t;;\n*)\n\texit 0 ;# we do not interrupt others.\n\t;;\nesac\n\n# Now we are dealing with a topic branch being rebased\n# on top of master.  Is it OK to rebase it?\n\n# Does the topic really exist?\ngit show-ref -q \"$topic\" || {\n\techo >&2 \"No such branch $topic\"\n\texit 1\n}\n\n# Is topic fully merged to master?\nnot_in_master=`git rev-list --pretty=oneline ^master \"$topic\"`\nif test -z \"$not_in_master\"\nthen\n\techo >&2 \"$topic is fully merged to master; better remove it.\"\n\texit 1 ;# we could allow it, but there is no point.\nfi\n\n# Is topic ever merged to next?  If so you should not be rebasing it.\nonly_next_1=`git rev-list ^master \"^$topic\" ${publish} | sort`\nonly_next_2=`git rev-list ^master           ${publish} | sort`\nif test \"$only_next_1\" = \"$only_next_2\"\nthen\n\tnot_in_topic=`git rev-list \"^$topic\" master`\n\tif test -z \"$not_in_topic\"\n\tthen\n\t\techo >&2 \"$topic is already up to date with master\"\n\t\texit 1 ;# we could allow it, but there is no point.\n\telse\n\t\texit 0\n\tfi\nelse\n\tnot_in_next=`git rev-list --pretty=oneline ^${publish} \"$topic\"`\n\t/usr/bin/perl -e '\n\t\tmy $topic = $ARGV[0];\n\t\tmy $msg = \"* $topic has commits already merged to public branch:\\n\";\n\t\tmy (%not_in_next) = map {\n\t\t\t/^([0-9a-f]+) /;\n\t\t\t($1 => 1);\n\t\t} split(/\\n/, $ARGV[1]);\n\t\tfor my $elem (map {\n\t\t\t\t/^([0-9a-f]+) (.*)$/;\n\t\t\t\t[$1 => $2];\n\t\t\t} split(/\\n/, $ARGV[2])) {\n\t\t\tif (!exists $not_in_next{$elem->[0]}) {\n\t\t\t\tif ($msg) {\n\t\t\t\t\tprint STDERR $msg;\n\t\t\t\t\tundef $msg;\n\t\t\t\t}\n\t\t\t\tprint STDERR \" $elem->[1]\\n\";\n\t\t\t}\n\t\t}\n\t' \"$topic\" \"$not_in_next\" \"$not_in_master\"\n\texit 1\nfi\n\n<<\\DOC_END\n\nThis sample hook safeguards topic branches that have been\npublished from being rewound.\n\nThe workflow assumed here is:\n\n * Once a topic branch forks from \"master\", \"master\" is never\n   merged into it again (either directly or indirectly).\n\n * Once a topic branch is fully cooked and merged into \"master\",\n   it is deleted.  If you need to build on top of it to correct\n   earlier mistakes, a new topic branch is created by forking at\n   the tip of the \"master\".  This is not strictly necessary, but\n   it makes it easier to keep your history simple.\n\n * Whenever you need to test or publish your changes to topic\n   branches, merge them into \"next\" branch.\n\nThe script, being an example, hardcodes the publish branch name\nto be \"next\", but it is trivial to make it configurable via\n$GIT_DIR/config mechanism.\n\nWith this workflow, you would want to know:\n\n(1) ... if a topic branch has ever been merged to \"next\".  Young\n    topic branches can have stupid mistakes you would rather\n    clean up before publishing, and things that have not been\n    merged into other branches can be easily rebased without\n    affecting other people.  But once it is published, you would\n    not want to rewind it.\n\n(2) ... if a topic branch has been fully merged to \"master\".\n    Then you can delete it.  More importantly, you should not\n    build on top of it -- other people may already want to\n    change things related to the topic as patches against your\n    \"master\", so if you need further changes, it is better to\n    fork the topic (perhaps with the same name) afresh from the\n    tip of \"master\".\n\nLet's look at this example:\n\n\t\t   o---o---o---o---o---o---o---o---o---o \"next\"\n\t\t  /       /           /           /\n\t\t /   a---a---b A     /           /\n\t\t/   /               /           /\n\t       /   /   c---c---c---c B         /\n\t      /   /   /             \\         /\n\t     /   /   /   b---b C     \\       /\n\t    /   /   /   /             \\     /\n    ---o---o---o---o---o---o---o---o---o---o---o \"master\"\n\n\nA, B and C are topic branches.\n\n * A has one fix since it was merged up to \"next\".\n\n * B has finished.  It has been fully merged up to \"master\" and \"next\",\n   and is ready to be deleted.\n\n * C has not merged to \"next\" at all.\n\nWe would want to allow C to be rebased, refuse A, and encourage\nB to be deleted.\n\nTo compute (1):\n\n\tgit rev-list ^master ^topic next\n\tgit rev-list ^master        next\n\n\tif these match, topic has not merged in next at all.\n\nTo compute (2):\n\n\tgit rev-list master..topic\n\n\tif this is empty, it is fully merged to \"master\".\n\nDOC_END\n"
  },
  {
    "path": ".git/hooks/pre-receive.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to make use of push options.\n# The example simply echoes all push options that start with 'echoback='\n# and rejects all pushes when the \"reject\" push option is used.\n#\n# To enable this hook, rename this file to \"pre-receive\".\n\nif test -n \"$GIT_PUSH_OPTION_COUNT\"\nthen\n\ti=0\n\twhile test \"$i\" -lt \"$GIT_PUSH_OPTION_COUNT\"\n\tdo\n\t\teval \"value=\\$GIT_PUSH_OPTION_$i\"\n\t\tcase \"$value\" in\n\t\techoback=*)\n\t\t\techo \"echo from the pre-receive-hook: ${value#*=}\" >&2\n\t\t\t;;\n\t\treject)\n\t\t\texit 1\n\t\tesac\n\t\ti=$((i + 1))\n\tdone\nfi\n"
  },
  {
    "path": ".git/hooks/prepare-commit-msg.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to prepare the commit log message.\n# Called by \"git commit\" with the name of the file that has the\n# commit message, followed by the description of the commit\n# message's source.  The hook's purpose is to edit the commit\n# message file.  If the hook fails with a non-zero status,\n# the commit is aborted.\n#\n# To enable this hook, rename this file to \"prepare-commit-msg\".\n\n# This hook includes three examples. The first one removes the\n# \"# Please enter the commit message...\" help message.\n#\n# The second includes the output of \"git diff --name-status -r\"\n# into the message, just before the \"git status\" output.  It is\n# commented because it doesn't cope with --amend or with squashed\n# commits.\n#\n# The third example adds a Signed-off-by line to the message, that can\n# still be edited.  This is rarely a good idea.\n\nCOMMIT_MSG_FILE=$1\nCOMMIT_SOURCE=$2\nSHA1=$3\n\n/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' \"$COMMIT_MSG_FILE\"\n\n# case \"$COMMIT_SOURCE,$SHA1\" in\n#  ,|template,)\n#    /usr/bin/perl -i.bak -pe '\n#       print \"\\n\" . `git diff --cached --name-status -r`\n# \t if /^#/ && $first++ == 0' \"$COMMIT_MSG_FILE\" ;;\n#  *) ;;\n# esac\n\n# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\\(.*>\\).*$/Signed-off-by: \\1/p')\n# git interpret-trailers --in-place --trailer \"$SOB\" \"$COMMIT_MSG_FILE\"\n# if test -z \"$COMMIT_SOURCE\"\n# then\n#   /usr/bin/perl -i.bak -pe 'print \"\\n\" if !$first_line++' \"$COMMIT_MSG_FILE\"\n# fi\n"
  },
  {
    "path": ".git/hooks/push-to-checkout.sample",
    "content": "#!/bin/sh\n\n# An example hook script to update a checked-out tree on a git push.\n#\n# This hook is invoked by git-receive-pack(1) when it reacts to git\n# push and updates reference(s) in its repository, and when the push\n# tries to update the branch that is currently checked out and the\n# receive.denyCurrentBranch configuration variable is set to\n# updateInstead.\n#\n# By default, such a push is refused if the working tree and the index\n# of the remote repository has any difference from the currently\n# checked out commit; when both the working tree and the index match\n# the current commit, they are updated to match the newly pushed tip\n# of the branch. This hook is to be used to override the default\n# behaviour; however the code below reimplements the default behaviour\n# as a starting point for convenient modification.\n#\n# The hook receives the commit with which the tip of the current\n# branch is going to be updated:\ncommit=$1\n\n# It can exit with a non-zero status to refuse the push (when it does\n# so, it must not modify the index or the working tree).\ndie () {\n\techo >&2 \"$*\"\n\texit 1\n}\n\n# Or it can make any necessary changes to the working tree and to the\n# index to bring them to the desired state when the tip of the current\n# branch is updated to the new commit, and exit with a zero status.\n#\n# For example, the hook can simply run git read-tree -u -m HEAD \"$1\"\n# in order to emulate git fetch that is run in the reverse direction\n# with git push, as the two-tree form of git read-tree -u -m is\n# essentially the same as git switch or git checkout that switches\n# branches while keeping the local changes in the working tree that do\n# not interfere with the difference between the branches.\n\n# The below is a more-or-less exact translation to shell of the C code\n# for the default behaviour for git's push-to-checkout hook defined in\n# the push_to_deploy() function in builtin/receive-pack.c.\n#\n# Note that the hook will be executed from the repository directory,\n# not from the working tree, so if you want to perform operations on\n# the working tree, you will have to adapt your code accordingly, e.g.\n# by adding \"cd ..\" or using relative paths.\n\nif ! git update-index -q --ignore-submodules --refresh\nthen\n\tdie \"Up-to-date check failed\"\nfi\n\nif ! git diff-files --quiet --ignore-submodules --\nthen\n\tdie \"Working directory has unstaged changes\"\nfi\n\n# This is a rough translation of:\n#\n#   head_has_history() ? \"HEAD\" : EMPTY_TREE_SHA1_HEX\nif git cat-file -e HEAD 2>/dev/null\nthen\n\thead=HEAD\nelse\n\thead=$(git hash-object -t tree --stdin </dev/null)\nfi\n\nif ! git diff-index --quiet --cached --ignore-submodules $head --\nthen\n\tdie \"Working directory has staged changes\"\nfi\n\nif ! git read-tree -u -m \"$commit\"\nthen\n\tdie \"Could not update working tree to new HEAD\"\nfi\n"
  },
  {
    "path": ".git/hooks/sendemail-validate.sample",
    "content": "#!/bin/sh\n\n# An example hook script to validate a patch (and/or patch series) before\n# sending it via email.\n#\n# The hook should exit with non-zero status after issuing an appropriate\n# message if it wants to prevent the email(s) from being sent.\n#\n# To enable this hook, rename this file to \"sendemail-validate\".\n#\n# By default, it will only check that the patch(es) can be applied on top of\n# the default upstream branch without conflicts in a secondary worktree. After\n# validation (successful or not) of the last patch of a series, the worktree\n# will be deleted.\n#\n# The following config variables can be set to change the default remote and\n# remote ref that are used to apply the patches against:\n#\n#   sendemail.validateRemote (default: origin)\n#   sendemail.validateRemoteRef (default: HEAD)\n#\n# Replace the TODO placeholders with appropriate checks according to your\n# needs.\n\nvalidate_cover_letter () {\n\tfile=\"$1\"\n\t# TODO: Replace with appropriate checks (e.g. spell checking).\n\ttrue\n}\n\nvalidate_patch () {\n\tfile=\"$1\"\n\t# Ensure that the patch applies without conflicts.\n\tgit am -3 \"$file\" || return\n\t# TODO: Replace with appropriate checks for this patch\n\t# (e.g. checkpatch.pl).\n\ttrue\n}\n\nvalidate_series () {\n\t# TODO: Replace with appropriate checks for the whole series\n\t# (e.g. quick build, coding style checks, etc.).\n\ttrue\n}\n\n# main -------------------------------------------------------------------------\n\nif test \"$GIT_SENDEMAIL_FILE_COUNTER\" = 1\nthen\n\tremote=$(git config --default origin --get sendemail.validateRemote) &&\n\tref=$(git config --default HEAD --get sendemail.validateRemoteRef) &&\n\tworktree=$(mktemp --tmpdir -d sendemail-validate.XXXXXXX) &&\n\tgit worktree add -fd --checkout \"$worktree\" \"refs/remotes/$remote/$ref\" &&\n\tgit config --replace-all sendemail.validateWorktree \"$worktree\"\nelse\n\tworktree=$(git config --get sendemail.validateWorktree)\nfi || {\n\techo \"sendemail-validate: error: failed to prepare worktree\" >&2\n\texit 1\n}\n\nunset GIT_DIR GIT_WORK_TREE\ncd \"$worktree\" &&\n\nif grep -q \"^diff --git \" \"$1\"\nthen\n\tvalidate_patch \"$1\"\nelse\n\tvalidate_cover_letter \"$1\"\nfi &&\n\nif test \"$GIT_SENDEMAIL_FILE_COUNTER\" = \"$GIT_SENDEMAIL_FILE_TOTAL\"\nthen\n\tgit config --unset-all sendemail.validateWorktree &&\n\ttrap 'git worktree remove -ff \"$worktree\"' EXIT &&\n\tvalidate_series\nfi\n"
  },
  {
    "path": ".git/hooks/update.sample",
    "content": "#!/bin/sh\n#\n# An example hook script to block unannotated tags from entering.\n# Called by \"git receive-pack\" with arguments: refname sha1-old sha1-new\n#\n# To enable this hook, rename this file to \"update\".\n#\n# Config\n# ------\n# hooks.allowunannotated\n#   This boolean sets whether unannotated tags will be allowed into the\n#   repository.  By default they won't be.\n# hooks.allowdeletetag\n#   This boolean sets whether deleting tags will be allowed in the\n#   repository.  By default they won't be.\n# hooks.allowmodifytag\n#   This boolean sets whether a tag may be modified after creation. By default\n#   it won't be.\n# hooks.allowdeletebranch\n#   This boolean sets whether deleting branches will be allowed in the\n#   repository.  By default they won't be.\n# hooks.denycreatebranch\n#   This boolean sets whether remotely creating branches will be denied\n#   in the repository.  By default this is allowed.\n#\n\n# --- Command line\nrefname=\"$1\"\noldrev=\"$2\"\nnewrev=\"$3\"\n\n# --- Safety check\nif [ -z \"$GIT_DIR\" ]; then\n\techo \"Don't run this script from the command line.\" >&2\n\techo \" (if you want, you could supply GIT_DIR then run\" >&2\n\techo \"  $0 <ref> <oldrev> <newrev>)\" >&2\n\texit 1\nfi\n\nif [ -z \"$refname\" -o -z \"$oldrev\" -o -z \"$newrev\" ]; then\n\techo \"usage: $0 <ref> <oldrev> <newrev>\" >&2\n\texit 1\nfi\n\n# --- Config\nallowunannotated=$(git config --type=bool hooks.allowunannotated)\nallowdeletebranch=$(git config --type=bool hooks.allowdeletebranch)\ndenycreatebranch=$(git config --type=bool hooks.denycreatebranch)\nallowdeletetag=$(git config --type=bool hooks.allowdeletetag)\nallowmodifytag=$(git config --type=bool hooks.allowmodifytag)\n\n# check for no description\nprojectdesc=$(sed -e '1q' \"$GIT_DIR/description\")\ncase \"$projectdesc\" in\n\"Unnamed repository\"* | \"\")\n\techo \"*** Project description file hasn't been set\" >&2\n\texit 1\n\t;;\nesac\n\n# --- Check types\n# if $newrev is 0000...0000, it's a commit to delete a ref.\nzero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')\nif [ \"$newrev\" = \"$zero\" ]; then\n\tnewrev_type=delete\nelse\n\tnewrev_type=$(git cat-file -t $newrev)\nfi\n\ncase \"$refname\",\"$newrev_type\" in\n\trefs/tags/*,commit)\n\t\t# un-annotated tag\n\t\tshort_refname=${refname##refs/tags/}\n\t\tif [ \"$allowunannotated\" != \"true\" ]; then\n\t\t\techo \"*** The un-annotated tag, $short_refname, is not allowed in this repository\" >&2\n\t\t\techo \"*** Use 'git tag [ -a | -s ]' for tags you want to propagate.\" >&2\n\t\t\texit 1\n\t\tfi\n\t\t;;\n\trefs/tags/*,delete)\n\t\t# delete tag\n\t\tif [ \"$allowdeletetag\" != \"true\" ]; then\n\t\t\techo \"*** Deleting a tag is not allowed in this repository\" >&2\n\t\t\texit 1\n\t\tfi\n\t\t;;\n\trefs/tags/*,tag)\n\t\t# annotated tag\n\t\tif [ \"$allowmodifytag\" != \"true\" ] && git rev-parse $refname > /dev/null 2>&1\n\t\tthen\n\t\t\techo \"*** Tag '$refname' already exists.\" >&2\n\t\t\techo \"*** Modifying a tag is not allowed in this repository.\" >&2\n\t\t\texit 1\n\t\tfi\n\t\t;;\n\trefs/heads/*,commit)\n\t\t# branch\n\t\tif [ \"$oldrev\" = \"$zero\" -a \"$denycreatebranch\" = \"true\" ]; then\n\t\t\techo \"*** Creating a branch is not allowed in this repository\" >&2\n\t\t\texit 1\n\t\tfi\n\t\t;;\n\trefs/heads/*,delete)\n\t\t# delete branch\n\t\tif [ \"$allowdeletebranch\" != \"true\" ]; then\n\t\t\techo \"*** Deleting a branch is not allowed in this repository\" >&2\n\t\t\texit 1\n\t\tfi\n\t\t;;\n\trefs/remotes/*,commit)\n\t\t# tracking branch\n\t\t;;\n\trefs/remotes/*,delete)\n\t\t# delete tracking branch\n\t\tif [ \"$allowdeletebranch\" != \"true\" ]; then\n\t\t\techo \"*** Deleting a tracking branch is not allowed in this repository\" >&2\n\t\t\texit 1\n\t\tfi\n\t\t;;\n\t*)\n\t\t# Anything else (is there anything else?)\n\t\techo \"*** Update hook: unknown type of update to ref $refname of type $newrev_type\" >&2\n\t\texit 1\n\t\t;;\nesac\n\n# --- Finished\nexit 0\n"
  },
  {
    "path": ".git/info/exclude",
    "content": "# git ls-files --others --exclude-from=.git/info/exclude\n# Lines that start with '#' are comments.\n# For a project mostly in C, the following would be a good set of\n# exclude patterns (uncomment them if you want to use them):\n# *.[oa]\n# *~\n"
  },
  {
    "path": ".git/logs/HEAD",
    "content": "0000000000000000000000000000000000000000 7d5aec268304b92b90f3bb109ddf4cad80d5ea1a appuser <appuser@7c99e0e64a07.(none)> 1778362562 +0000\tclone: from https://github.com/xuqiantong/CUDA-Winograd\n"
  },
  {
    "path": ".git/logs/refs/heads/master",
    "content": "0000000000000000000000000000000000000000 7d5aec268304b92b90f3bb109ddf4cad80d5ea1a appuser <appuser@7c99e0e64a07.(none)> 1778362562 +0000\tclone: from https://github.com/xuqiantong/CUDA-Winograd\n"
  },
  {
    "path": ".git/logs/refs/remotes/origin/HEAD",
    "content": "0000000000000000000000000000000000000000 7d5aec268304b92b90f3bb109ddf4cad80d5ea1a appuser <appuser@7c99e0e64a07.(none)> 1778362562 +0000\tclone: from https://github.com/xuqiantong/CUDA-Winograd\n"
  },
  {
    "path": ".git/objects/pack/pack-49e0e400885c832ca9946c8b7ba9584079082307.promisor",
    "content": "7d5aec268304b92b90f3bb109ddf4cad80d5ea1a refs/heads/master\n"
  },
  {
    "path": ".git/packed-refs",
    "content": "# pack-refs with: peeled fully-peeled sorted \n7d5aec268304b92b90f3bb109ddf4cad80d5ea1a refs/remotes/origin/master\n"
  },
  {
    "path": ".git/refs/heads/master",
    "content": "7d5aec268304b92b90f3bb109ddf4cad80d5ea1a\n"
  },
  {
    "path": ".git/refs/remotes/origin/HEAD",
    "content": "ref: refs/remotes/origin/master\n"
  },
  {
    "path": ".git/shallow",
    "content": "7d5aec268304b92b90f3bb109ddf4cad80d5ea1a\n"
  },
  {
    "path": ".gitignore",
    "content": "#executable\n*\n!*.*\n!*/\n!Makefile\n\n# Compiled source #\n###################\n*.com\n*.class\n*.dll\n*.exe\n*.o\n*.so\n\n# Packages #\n############\n# it's better to unpack these files and commit the raw source\n# git has its own built in compression methods\n*.7z\n*.dmg\n*.gz\n*.iso\n*.jar\n*.rar\n*.tar\n*.zip\n*.bin\n\n# Logs and databases #\n######################\n*.log\n*.sql\n*.sqlite\n\n# OS generated files #\n######################\n.DS_Store\n.DS_Store?\n._*\n.Spotlight-V100\n.Trashes\nehthumbs.db\nThumbs.db\n\n"
  },
  {
    "path": "Kernel128_one.cu",
    "content": "#include <stdlib.h>\n#include <string.h>\n#include <stdio.h>\n#include <string.h>\n#include <float.h>\n#include <math.h>\n#include <assert.h>\n#include <xmmintrin.h>\n#include <immintrin.h>\n\n#include \"cudnn.h\"\n#include \"util.h\"\n#include \"Kernel128_one.h\"\n\n\n#define cudaCheckError() {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tcudaError_t e=cudaGetLastError();\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tif(e!=cudaSuccess) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t\tprintf(\"Cuda failure %s:%d:'%s'\\n\",__FILE__,__LINE__,cudaGetErrorString(e));\t\\\n\t\texit(EXIT_FAILURE);\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n}\n\n__global__ void kernel_512_one_128(float *A, float *B, float *bnBias, float *bnScale, float *C) {\n\tint tile = blockIdx.x, in_channel = threadIdx.x, line = threadIdx.y;\n\tint ind = line*128 + in_channel;\n\n\textern __shared__ float shared_[];\n\tfloat *weights = shared_ + 512*4, *output = weights + 128*64, *input = shared_;\n\tfloat *bias = output + 4*128, *scale = bias + 128;\n\n\tfor (int i = 0; i < 4; i++)\n\t\tinput[ind + i*512] = A[tile*2048 + i*512 + ind];\n\tbias[in_channel] = bnBias[in_channel];\n\tscale[in_channel] = bnScale[in_channel];\n\toutput[ind] = 0.0f;\n\t__syncthreads();\n\n\tfor (int k = 0; k < 512; k += 64) {\n\t\tfloat *B_start = B + k*128;\n\t\tfor (int i = 0; i < 16; i++)\n\t\t\tweights[ind + i*512] = B_start[i*512 + ind];\n\t\t__syncthreads();\n\n\t\tfloat *A_start = input + k;\n\t\tfor (int p = 0; p < 64; p++) {\n\t\t\toutput[ind] += A_start[line*512 + p] * weights[in_channel + p*128];\n\t\t}\n\t\t__syncthreads();\n\t}\n\n\tfloat *C_start = C + tile*512, res = scale[in_channel] * output[ind] + bias[in_channel];\n\tC_start[ind] = res > 0 ? res : 0;\n}\n\n\nint kernel_128_1_in() {\n\tfloat *input = get_parameter(inputName128one, 14*14*512);\n\tfloat *weight = get_parameter(weightName128one, 128*512);\n\n\tfloat *bnBias = get_parameter(bnBiasName128one, 128);\n\tfloat *bnScale = get_parameter(bnScaleName128one, 128);\n\tfloat *bnBias_myKernel = get_parameter(bnBias_myKernel_Name128one, 128);\n\tfloat *bnScale_myKernel = get_parameter(bnScale_myKernel_Name128one, 128);\n\tfloat *eMeanName = get_parameter(eMeanName128one, 128);\n\tfloat *eVarName = get_parameter(eVarName128one, 128);\n\n\tfloat *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;\n\n\tint nInput = 14*14*512, nOutput = 14*14*128, nWeights = 128*512;\n\tfloat tmp[nOutput], tmp_cudnn[nOutput];\n\n\tuint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;\n\tcudaError_t s;\n\n\t/////////////////////////////////\n\n\t// My Kernel\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &input_, nInput<<3);\n\tcudaMalloc((void **) &output_, nOutput<<2);\n\tcudaMalloc((void **) &weight_, nWeights<<2);\n\tcudaMalloc((void **) &bnBias_, 128<<2);\n\tcudaMalloc((void **) &bnScale_, 128<<2);\n\n\tcudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnBias_, bnBias_myKernel, 128<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale_myKernel, 128<<2, cudaMemcpyHostToDevice);\n\n\n\t/*  2. Computing  */\n\tnT1 = getTimeMicroseconds64();\n\n\tkernel_512_one_128 <<<dim3(49), dim3(128, 4), (4*512 + 64*128 + 4*128 + 2*128)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_);\n\n\t//cudaCheckError();\n\tcudaDeviceSynchronize();\n\n\tnT2 = getTimeMicroseconds64();\n\tprintf(\"TotalTime = %d us\\n\", nT2-nT1);\n\n\n\t/*  3. Copy back and free  */\n\ts = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\tcudaCheckError();\n\n\tfree(bnBias_myKernel);\n\tfree(bnScale_myKernel);\n\n\n\t/////////////////////////////////\n\n\t// cuDNN\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &eMeanName_, 128<<2);\n\tcudaMalloc((void **) &eVarName_, 128<<2);\n\n\tcudaMemcpy(bnBias_, bnBias, 128<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale, 128<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eMeanName_, eMeanName, 128<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eVarName_, eVarName, 128<<2, cudaMemcpyHostToDevice);\n\n\tweight = transpose(weight, 128, 512);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\n\t/*  2. cuDNN preparation  */\n\tcudnnStatus_t status;\n\tfloat one = 1.0, zero = 0.0;\n\tint size;\n\n\tcudnnHandle_t handle;\n\tstatus = cudnnCreate(&handle);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed1\\n\");\n\n\tcudnnTensorDescriptor_t xdesc, ydesc;\n\tcudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW\n\tstatus = cudnnCreateTensorDescriptor(&xdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed2\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 512, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed3\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&ydesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed4\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed5\\n\");\n\tstatus = cudnnCreateFilterDescriptor(&wdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed6\\n\");\n\tstatus = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 128, 512, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed7\\n\");\n\n\tcudnnConvolutionDescriptor_t conv_desc;\n\tstatus = cudnnCreateConvolutionDescriptor(&conv_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed10\\n\");\n\tstatus = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed11\\n\");\n\n\tcudnnActivationDescriptor_t act_desc;\n\tstatus = cudnnCreateActivationDescriptor(&act_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed12\\n\");\n\tstatus = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed13\\n\");\n\n\tcudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;\n\tstatus = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed14\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 128, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed15\\n\");\n\n\tcudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0;\n\n\tstatus = cudnnGetConvolutionForwardWorkspaceSize(handle,\n\t   xdesc,\n\t   wdesc,\n\t   conv_desc,\n\t   ydesc,\n\t   algo,\n\t   (size_t *)&(size));\n\tfloat *extra;\n\tcudaMalloc((void **) &extra, size);\n\n\n\t/*  3. Computing  */\n\tnT1_cudnn = getTimeMicroseconds64();\n\n\tstatus = cudnnConvolutionForward(handle, &one,\n\t\txdesc, input_, wdesc, weight_,\n\t\tconv_desc, algo,\n\t\textra, size, &zero,\n\t\tydesc, output_);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed1\\n\");\n\n\tstatus = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,\n\t\t&one, &zero,\n\t\tydesc, output_, ydesc, output_,\n\t\tbnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed2\\n\");\n\n\tstatus = cudnnActivationForward(handle, act_desc, &one,\n\t\tydesc, output_, &zero,\n\t\tydesc, output_);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed3\\n\");\n\n\tcudaDeviceSynchronize();\n\tnT2_cudnn = getTimeMicroseconds64();\n\tprintf(\"cuDNN TotalTime = %d us\\n\", nT2_cudnn-nT1_cudnn);\n\n\n\t/*  4. Copy back and free  */\n\ts = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\n\tcudaFree(extra);\n\tcudaFree(input_);\n\tcudaFree(output_);\n\tcudaFree(weight_);\n\n\tcudaFree(bnScale_);\n\tcudaFree(bnBias_);\n\tcudaFree(eMeanName_);\n\tcudaFree(eVarName_);\n\n\tfree(input);\n\tfree(weight);\n\n\tfree(bnScale);\n\tfree(bnBias);\n\tfree(eMeanName);\n\tfree(eVarName);\n\n\toutput_checker(tmp, tmp_cudnn, 14, 128, 0);\n\n\treturn ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);\n}\n\n\n\n__global__ void kernel_128_one_512(float *A, float *B, float *bnBias, float *bnScale, float *C) {\n\tint tile = blockIdx.x, part = blockIdx.y, in_channel = threadIdx.x, line = threadIdx.y;\n\tint ind = line*128 + in_channel;\n\n\textern __shared__ float shared_[];\n\tfloat *weights = shared_ + 128*4, *output = weights + 128*64, *input = shared_;\n\tfloat *bias = output + 4*128, *scale = bias + 128;\n\n\tinput[ind] = A[tile * 512 + ind];\n\tbias[in_channel] = bnBias[part*128 + in_channel];\n\tscale[in_channel] = bnScale[part*128+ in_channel];\n\toutput[ind] = 0.0f;\n\t__syncthreads();\n\n\tfor (int k = 0; k < 128; k += 64) {\n\t\tfor (int i = 0; i < 16; i++)\n\t\t\tweights[ind + 512*i] = B[(k + i*4 + line)*512 + part*128 + in_channel];\n\t\t__syncthreads();\n\n\t\tfloat *A_start = input + k;\n\t\tfor (int p = 0; p < 64; p++) {\n\t\t\toutput[ind] += A_start[line*128 + p] * weights[in_channel + p*128];\n\t\t}\n\t\t__syncthreads();\n\t}\n\n\tfloat *C_start = C + tile*2048 + part*128;\n\tfloat res = scale[in_channel] * output[ind] + bias[in_channel];\n\tC_start[line * 512 + in_channel] = res;\n}\n\n\nint kernel_128_1_out() {\n\tfloat *input = get_parameter(inputName128one, 14*14*128);\n\tfloat *weight = get_parameter(weightName128one, 128*512);\n\n\tfloat *bnBias = get_parameter(bnBiasName128one, 512);\n\tfloat *bnScale = get_parameter(bnScaleName128one, 512);\n\tfloat *bnBias_myKernel = get_parameter(bnBias_myKernel_Name128one, 512);\n\tfloat *bnScale_myKernel = get_parameter(bnScale_myKernel_Name128one, 512);\n\tfloat *eMeanName = get_parameter(eMeanName128one, 512);\n\tfloat *eVarName = get_parameter(eVarName128one, 512);\n\n\tfloat *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;\n\n\tint nInput = 14*14*128, nOutput = 14*14*512, nWeights = 128*512;\n\tfloat tmp[nOutput], tmp_cudnn[nOutput];\n\n\tuint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;\n\tcudaError_t s;\n\n\t/////////////////////////////////\n\n\t// My Kernel\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &input_, nInput<<3);\n\tcudaMalloc((void **) &output_, nOutput<<2);\n\tcudaMalloc((void **) &weight_, nWeights<<2);\n\tcudaMalloc((void **) &bnBias_, 512<<2);\n\tcudaMalloc((void **) &bnScale_, 512<<2);\n\n\tcudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnBias_, bnBias_myKernel, 512<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale_myKernel, 512<<2, cudaMemcpyHostToDevice);\n\n\t/*  2. Computing  */\n\tnT1 = getTimeMicroseconds64();\n\n\tkernel_128_one_512 <<<dim3(49, 4), dim3(128, 4), (4*128 + 64*128 + 4*128 + 2*128)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_);\n\n\t//cudaCheckError();\n\tcudaDeviceSynchronize();\n\n\tnT2 = getTimeMicroseconds64();\n\tprintf(\"TotalTime = %d us\\n\", nT2-nT1);\n\n\n\t/*  3. Copy back and free  */\n\ts = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\tcudaCheckError();\n\n\tfree(bnBias_myKernel);\n\tfree(bnScale_myKernel);\n\n\n\t/////////////////////////////////\n\n\t// cuDNN\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &eMeanName_, 512<<2);\n\tcudaMalloc((void **) &eVarName_, 512<<2);\n\n\tcudaMemcpy(bnBias_, bnBias, 512<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale, 512<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eMeanName_, eMeanName, 512<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eVarName_, eVarName, 512<<2, cudaMemcpyHostToDevice);\n\n\tweight = transpose(weight, 512, 128);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\n\t/*  2. cuDNN preparation  */\n\tcudnnStatus_t status;\n\tfloat one = 1.0, zero = 0.0;\n\tint size;\n\n\tcudnnHandle_t handle;\n\tstatus = cudnnCreate(&handle);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed1\\n\");\n\n\tcudnnTensorDescriptor_t xdesc, ydesc;\n\tcudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW\n\tstatus = cudnnCreateTensorDescriptor(&xdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed2\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed3\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&ydesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed4\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 512, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed5\\n\");\n\tstatus = cudnnCreateFilterDescriptor(&wdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed6\\n\");\n\tstatus = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 512, 128, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed7\\n\");\n\n\tcudnnConvolutionDescriptor_t conv_desc;\n\tstatus = cudnnCreateConvolutionDescriptor(&conv_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed10\\n\");\n\tstatus = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed11\\n\");\n\n\tcudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;\n\tstatus = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed14\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 512, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed15\\n\");\n\n\tcudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0;\n\n\tstatus = cudnnGetConvolutionForwardWorkspaceSize(handle,\n\t   xdesc,\n\t   wdesc,\n\t   conv_desc,\n\t   ydesc,\n\t   algo,\n\t   (size_t *)&(size));\n\tfloat *extra;\n\tcudaMalloc((void **) &extra, size);\n\n\n\t/*  3. Computing  */\n\tnT1_cudnn = getTimeMicroseconds64();\n\n\tstatus = cudnnConvolutionForward(handle, &one,\n\t\txdesc, input_, wdesc, weight_,\n\t\tconv_desc, algo,\n\t\textra, size, &zero,\n\t\tydesc, output_);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed1\\n\");\n\n\tstatus = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,\n\t\t&one, &zero,\n\t\tydesc, output_, ydesc, output_,\n\t\tbnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed2\\n\");\n\n\tcudaDeviceSynchronize();\n\tnT2_cudnn = getTimeMicroseconds64();\n\tprintf(\"cuDNN TotalTime = %d us\\n\", nT2_cudnn-nT1_cudnn);\n\n\n\t/*  4. Copy back and free  */\n\ts = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\n\tcudaFree(extra);\n\tcudaFree(input_);\n\tcudaFree(output_);\n\tcudaFree(weight_);\n\n\tcudaFree(bnScale_);\n\tcudaFree(bnBias_);\n\tcudaFree(eMeanName_);\n\tcudaFree(eVarName_);\n\n\tfree(input);\n\tfree(weight);\n\n\tfree(bnScale);\n\tfree(bnBias);\n\tfree(eMeanName);\n\tfree(eVarName);\n\n\toutput_checker(tmp, tmp_cudnn, 14, 512, 0);\n\n\treturn ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);\n}\n"
  },
  {
    "path": "Kernel128_one.h",
    "content": "#ifndef __KERNEL128_ONE_H__\n#define __KERNEL128_ONE_H__\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nconst char inputName128one[] = \"data/input_one_14_1024.bin\";\nconst char weightName128one[] = \"data/weight_one_1024.bin\";\n\nconst char bnBiasName128one[] = \"data/bnBias_one_1024.bin\";\nconst char bnScaleName128one[] = \"data/bnScale_one_1024.bin\";\nconst char bnBias_myKernel_Name128one[] = \"data/bnBias_myKernel_one_1024.bin\";\nconst char bnScale_myKernel_Name128one[] = \"data/bnScale_myKernel_one_1024.bin\";\nconst char eMeanName128one[] = \"data/eMean_one_1024.bin\";\nconst char eVarName128one[] = \"data/eVar_one_1024.bin\";\n\nint kernel_128_1_in();\nint kernel_128_1_out();\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif"
  },
  {
    "path": "Kernel128_winograd.cu",
    "content": "#include <stdlib.h>\n#include <string.h>\n#include <stdio.h>\n#include <string.h>\n#include <float.h>\n#include <math.h>\n#include <assert.h>\n#include <xmmintrin.h>\n#include <immintrin.h>\n\n#include \"cudnn.h\"\n#include \"util.h\"\n#include \"Kernel128_winograd.h\"\n\n\n#define cudaCheckError() {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tcudaError_t e=cudaGetLastError();\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tif(e!=cudaSuccess) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t\tprintf(\"Cuda failure %s:%d:'%s'\\n\",__FILE__,__LINE__,cudaGetErrorString(e));\t\\\n\t\texit(EXIT_FAILURE);\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n}\n\n#define MY_KERNEL 1\n\n#define d(input, i, j, Inz) ( input[Inz + i*768 + (j<<7)] )\n\n__global__ void kernel_128_winograd_BtdB(float *pInputs, float *pOutputs) {\n\tint Inx = blockIdx.x<<2, Iny0 = blockIdx.y<<2, Iny1 = threadIdx.y, Inz = threadIdx.x;\n\tint Iny = Iny0+Iny1, stride_r = 2048, stride_c = 128; // 2048 = 16*128\n\tint c_glb_start = Inx*stride_r + Iny*stride_c + Inz, c_input = Iny1*stride_c + Inz;\n\n\textern __shared__ float input[];\n\n\tint tmp[6] = {0, 768, 1536, 2304, 3072, 3840}; // 768 = 6*128\n\tfor (int i = 0; i < 6; i++) {\n\t\tinput[c_input + tmp[i]] = pInputs[c_glb_start + i*stride_r];\n\t}\n\t__syncthreads();\n\n\tfloat BTd[6];\n\tswitch(Iny1) {\n\t\tcase 0:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 0, j, Inz)*4 - d(input, 2, j, Inz)*5 + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = -d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 + d(input, 3, j, Inz) + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 - d(input, 3, j, Inz) + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = -d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) + d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 4:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) - d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 5:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 1, j, Inz)*4 - d(input, 3, j, Inz)*5 + d(input, 5, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t}\n\t__syncthreads();\n\n\tint tmp_offset = Iny1*768+Inz;\n\tfor (int i = 0; i < 6; i++) {\n\t\tinput[tmp_offset + i*stride_c] = BTd[i];\n\t}\n\t__syncthreads();\n\n\tfloat BTdB[6];\n\tswitch(Iny1) {\n\t\tcase 0:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 4*d(input, i, 0, Inz) - 5*d(input, i, 2, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = -4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) + d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) - d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = -2*d(input, i, 1, Inz) - d(input, i, 2, Inz) + 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 4:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 2*d(input, i, 1, Inz) - d(input, i, 2, Inz) - 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 5:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 4*d(input, i, 1, Inz) - 5*d(input, i, 3, Inz) + d(input, i, 5, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t}\n\t__syncthreads();\n\n\tfor (int i = 0; i < 6; i++) {\n\t\tpOutputs[(Iny1 + i*6)*2048 + (blockIdx.x*4+blockIdx.y)*128 + Inz] = BTdB[i];\n\t}\n}\n\n\n__global__ void kernel_128_winograd_AtIA(float *pInputs, float *pBiases, float *pScales, float *pOutputs) {\n\tint Tilex = blockIdx.x, Tiley = blockIdx.y, Iny = threadIdx.y, kz = blockIdx.z, Inx = threadIdx.x;\n\tint c_input = Inx*6 + Iny;\n\n\t__shared__ float bias, scale;\n\textern __shared__ float input[];\n\n\tinput[c_input] = pInputs[c_input*16*128 + (Tilex*4+Tiley)*128 + kz];\n\tbias = pBiases[kz];\n\tscale = pScales[kz];\n\t__syncthreads();\n\n\tfloat tmp = 0;\n\tswitch(Inx) {\n\t\tcase 0:\n\t\t\ttmp = input[Iny] + input[6+Iny] + input[12+Iny] + input[18+Iny] + input[24+Iny];\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\ttmp = input[6+Iny] - input[12+Iny] + 2*input[18+Iny] - 2*input[24+Iny];\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\ttmp = input[6+Iny] + input[12+Iny] + 4*input[18+Iny] + 4*input[24+Iny];\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\ttmp = input[6+Iny] - input[12+Iny] + 8*input[18+Iny] - 8*input[24+Iny] + input[30+Iny];\n\t\t\tbreak;\n\t}\n\t__syncthreads();\n\n\tinput[c_input] = tmp;\n\t__syncthreads();\n\n\tif (Inx > 3 || (Tilex == 3 && Inx > 1)) return;\n\t\n\tint x;\n\tfloat o;\n\tswitch(Iny) {\n\t\tcase 0:\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x]+input[x+1]+input[x+2]+input[x+3]+input[x+4])+ bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+1)*128 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x+1] - input[x+2] + 2*input[x+3] - 2*input[x+4]) + bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+2)*128 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\tif (Tiley == 3) break;\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x+1] + input[x+2] + 4*input[x+3] + 4*input[x+4]) + bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+3)*128 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\tif (Tiley == 3) break;\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x+1] - input[x+2] + 8*input[x+3] - 8*input[x+4] + input[x+5]) + bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+4)*128 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t}\n}\n\n\n__global__ void kernel_128_OuterProduct_128(float *A, float *B, float *C) {\n\tint Tile = blockIdx.x, Part = blockIdx.y, tX = threadIdx.x, tY = threadIdx.y;\n\tint c_input = tY*128 + tX, c_kernel = c_input, T_offset = (Tile<<11) + (Part<<10) + c_input, B_offset = (Tile<<14) + c_kernel;\n\t\n\textern __shared__ float input[];\n\tfloat *kernel = input + 1024, *out = kernel + 8192;\n\tint B_stride[32] = {0, 128, 256, 384, 512, 640, 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048, 2176, 2304, 2432, 2560, 2688, 2816, 2944, 3072, 3200, 3328, 3456, 3584, 3712, 3840, 3968};//, 4096, 4224, 4352, 4480, 4608, 4736, 4864, 4992, 5120, 5248, 5376, 5504, 5632, 5760, 5888, 6016, 6144, 6272, 6400, 6528, 6656, 6784, 6912, 7040, 7168, 7296, 7424, 7552, 7680, 7808, 7936, 8064};\n\tout[c_input] = 0.0f;\n\n\tinput[c_input] = A[T_offset];\n\n\tfor (int k = 0; k < 4; k++) {\n\t\tint B_start = B_offset + (k<<12); // 32*64\n\t\tkernel[c_kernel] = B[B_start], kernel[c_kernel+1024] = B[B_start+1024];\n\t\tkernel[c_kernel+2048] = B[B_start+2048], kernel[c_kernel+3072] = B[B_start+3072];\n\t\t__syncthreads();\n\n\t\tfloat sum = 0;\n\t\tint y_tmp = (tY<<7)+(k<<5);\n\t\tfor (int j = 0; j < 32; j++) {\n\t\t\tsum += input[y_tmp + j] * kernel[tX + B_stride[j]];\n\t\t}\n\t\tout[tY*128 + tX] += sum;\n\t\t__syncthreads();\n\t}\n\n\tC[T_offset] = out[c_input];\n}\n\nint kernel_128() {\n\tfloat *input_ = get_parameter(inputName128, 16*16*128);\n\tfloat *bias = get_parameter(biasName128, 128);\n\tfloat *input, *output, *l_weights, *l_bias;\n\tuint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;\n\tcudaError_t s;\n\n\t/////////////////////////////////\n\n\t// My Kernel\n\n\t/////////////////////////////////\n\n\n\t/*  1. Data preparation  */\n\tfloat *t_input, *ip;\n\t//float *kernel = get_Winograd_Kernel128(weight_winograd_Name128, 128);\n\tfloat *kernel = get_parameter(weight_winograd_Name128, 36*128*128);\n\tfloat *l_bnBias, *l_bnScale, *bnBias, *bnScale;\n\n\tint nInput = 16*16*128, nOutput = 16*16*128, nWeights = 36*128*128, nBias = 128, nTransInput = 16*6*6*128, nInnerProd = 16*6*6*128;\n\tcudaMalloc((void **) &input, nInput<<3);\n\tcudaMalloc((void **) &output, nOutput<<2);\n\tcudaMalloc((void **) &l_weights, nWeights<<2);\n\tcudaMalloc((void **) &l_bias, nBias<<2);\n\tcudaMalloc((void **) &t_input, nTransInput<<2);\n\tcudaMalloc((void **) &ip, nInnerProd<<2);\n\tcudaMemset((void *) input, 0, nInput<<3);\n\tcudaMemset((void *) output, 0, nOutput<<2);\n\tcudaMemset((void *) t_input, 0, nTransInput<<2);\n\tcudaMemset((void *) l_weights, 0, nWeights<<2);\n\tcudaMemset((void *) ip, 0, nInnerProd<<2);\n\tcudaMemcpy(input, input_, nInput<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);\n\t\n\tbnBias = get_parameter(bnBias_winograd_Name128, 128);\n\tbnScale = get_parameter(bnScale_winograd_Name128, 128);\n\tcudaMalloc((void **) &l_bnBias, nBias<<2);\n\tcudaMalloc((void **) &l_bnScale, nBias<<2);\n\tcudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);\n\tfloat tmp_winograd[nOutput];\n\n\t\n\t/*  2. Computing  */\n\tnT1 = getTimeMicroseconds64();\n\n\tkernel_128_winograd_BtdB <<<dim3(4, 4), dim3(128, 6), (6*6*128)<<2 >>> (input, t_input);\n\tkernel_128_OuterProduct_128<<<dim3(36, 2), dim3(128, 8), (8*128 + 64*128 + 8*128)<<2 >>> (t_input, l_weights, ip);\n\tkernel_128_winograd_AtIA <<<dim3(4, 4, 128), dim3(6, 6), ((6*6)<<2)>>> (ip, l_bnBias, l_bnScale, output);\n\t//cudaCheckError();\n\tcudaDeviceSynchronize();\n\t\n\tnT2 = getTimeMicroseconds64();\n\tprintf(\"TotalTime = %d us\\n\", nT2-nT1); \n\n\n\t/*  3. Copy back and free  */\n\ts = cudaMemcpy(tmp_winograd, output, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\t//cudaCheckError();\n\n\tcudaFree(t_input);\n\tcudaFree(output);\n\tcudaFree(l_weights);\n\tcudaFree(l_bias);\n\tcudaFree(ip);\n\n\tfree(kernel);\n\tfree(bnScale);\n\tfree(bnBias);\n\n\n\t/////////////////////////////////\n\n\t// cuDNN\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tkernel = get_parameter(weight_NCHW_Name128, 9*128*128);\n\tbnBias = get_parameter(bnBiasName128, 128);\n\tbnScale = get_parameter(bnScaleName128, 128);\n\tfloat* eMean = get_parameter(eMeanName128, 128);\n\tfloat* eVar = get_parameter(eVarName128, 128);\n\tfloat *l_eMean, *l_eVar;\n\tnInput = 16*16*128, nOutput = 14*14*128, nWeights = 3*3*128*128, nBias = 128;\n\n\tcudaMalloc((void **) &output, nOutput<<2);\n\tcudaMalloc((void **) &l_weights, nWeights<<2);\n\tcudaMalloc((void **) &l_bias, nBias<<2);\n\tcudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);\n\n\tcudaMalloc((void **) &l_eMean, nBias<<2);\n\tcudaMalloc((void **) &l_eVar, nBias<<2);\n\tcudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_eMean, eMean, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_eVar, eVar, nBias<<2, cudaMemcpyHostToDevice);\n\n\tcudaMemset((void *) output, 0, nOutput<<2);\n\n\tfloat tmp_cudnn[nOutput];\n\n\n\t/*  2. cuDNN preparation  */\n\tcudnnStatus_t status;\n\tfloat one = 1.0, zero = 0.0;\n\tint size;\n\n\tcudnnHandle_t handle;\n\tstatus = cudnnCreate(&handle);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed1\\n\");\n\n\tcudnnTensorDescriptor_t xdesc, ydesc, bdesc;\n\tcudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW\n\tstatus = cudnnCreateTensorDescriptor(&xdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed2\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 16, 16);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed3\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&ydesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed4\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed5\\n\");\n\tstatus = cudnnCreateFilterDescriptor(&wdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed6\\n\");\n\tstatus = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 128, 128, 3, 3);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed7\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&bdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed8\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed9\\n\");\n\tcudnnConvolutionDescriptor_t conv_desc;\n\tstatus = cudnnCreateConvolutionDescriptor(&conv_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed10\\n\");\n\tstatus = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed11\\n\");\n\n\tcudnnActivationDescriptor_t act_desc;\n\tstatus = cudnnCreateActivationDescriptor(&act_desc);  \n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed12\\n\");\n\tstatus = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed13\\n\");\n\n\tcudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;\n\tstatus = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed14\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 128, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed15\\n\");\n\n\tcudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)6;\n\n\tstatus = cudnnGetConvolutionForwardWorkspaceSize(handle,\n\t   xdesc,\n\t   wdesc,\n\t   conv_desc,\n\t   ydesc,\n\t   algo,\n\t   (size_t *)&(size));\n\n\tfloat *extra;\n\tcudaMalloc((void **) &extra, size);\n\n\n\t/*  3. Computing  */\n\tnT1_cudnn = getTimeMicroseconds64();\n\n\tstatus = cudnnConvolutionForward(handle, &one,\n\t\txdesc, input, wdesc, l_weights, \n\t\tconv_desc, algo, \n\t\textra, size, &zero,\n\t\tydesc, output);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed1\\n\");\n\n\tstatus = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,\n\t\t&one, &zero, \n\t\tydesc, output, ydesc, output,\n\t\tbnScaleBiasMeanVarDesc, l_bnScale, l_bnBias, l_eMean, l_eVar, CUDNN_BN_MIN_EPSILON);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed2\\n\");\n\n\tstatus = cudnnActivationForward(handle, act_desc, &one,\n\t\tydesc, output, &zero,\n\t\tydesc, output);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed3\\n\");\n\n\tcudaDeviceSynchronize();\n\tnT2_cudnn = getTimeMicroseconds64();\n\tprintf(\"cuDNN TotalTime = %d us\\n\", nT2_cudnn-nT1_cudnn);\n\n\n\t/*  4. Copy back and free  */\n\ts = cudaMemcpy(tmp_cudnn, output, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\n\tcudaFree(extra);\n\tcudaFree(input);\n\tcudaFree(output);\n\tcudaFree(l_weights);\n\tcudaFree(l_bias);\n\n\tcudaFree(l_bnScale);\n\tcudaFree(l_bnBias);\n\tcudaFree(l_eMean);\n\tcudaFree(l_eVar);\n\n\tfree(bias);\n\tfree(kernel);\n\n\tfree(bnScale);\n\tfree(bnBias);\n\tfree(eMean);\n\tfree(eVar);\n\tfree(input_);\n\n\toutput_checker(tmp_winograd, tmp_cudnn, 14, 128, 1);\n\n\treturn ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);\n}"
  },
  {
    "path": "Kernel128_winograd.h",
    "content": "#ifndef __KERNEL128_WINOGRAD_H__\n#define __KERNEL128_WINOGRAD_H__\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nconst char inputName128[] = \"data/input_14_1_128.bin\";\nconst char biasName128[] = \"data/bias_128.bin\";\nconst char weight_winograd_Name128[] = \"data/weight_winograd_128_128.bin\";\nconst char weight_NCHW_Name128[] = \"data/weight_NCHW_128_128.bin\";\n\nconst char bnBiasName128[] = \"data/bnBias_128.bin\";\nconst char bnScaleName128[] = \"data/bnScale_128.bin\";\nconst char bnBias_winograd_Name128[] = \"data/bnBias_winograd_128.bin\";\nconst char bnScale_winograd_Name128[] = \"data/bnScale_winograd_128.bin\";\nconst char eMeanName128[] = \"data/eMean_128.bin\";\nconst char eVarName128[] = \"data/eVar_128.bin\";\n\nint kernel_128();\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif"
  },
  {
    "path": "Kernel256_one.cu",
    "content": "#include <stdlib.h>\n#include <string.h>\n#include <stdio.h>\n#include <string.h>\n#include <float.h>\n#include <math.h>\n#include <assert.h>\n#include <xmmintrin.h>\n#include <immintrin.h>\n\n#include \"cudnn.h\"\n#include \"util.h\"\n#include \"Kernel256_one.h\"\n\n\n#define cudaCheckError() {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tcudaError_t e=cudaGetLastError();\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tif(e!=cudaSuccess) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t\tprintf(\"Cuda failure %s:%d:'%s'\\n\",__FILE__,__LINE__,cudaGetErrorString(e));\t\\\n\t\texit(EXIT_FAILURE);\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n}\n\n\n\n__global__ void kernel_1024_one_256(float *A, float *B, float *bnBias, float *bnScale, float *C) {\n\tint tile = blockIdx.x, in_channel = threadIdx.x, line = threadIdx.y;\n\tint ind = line*256 + in_channel;\n\n\textern __shared__ float shared_[];\n\tfloat *weights = shared_ + 1024*4, *output = weights + 256*16, *input = shared_;\n\tfloat *bias = output + 4*256, *scale = bias + 256;\n\n\tfor (int i = 0; i < 4; i++)\n\t\tinput[ind + i*1024] = A[tile*4096 + i*1024 + ind];\n\tbias[in_channel] = bnBias[in_channel];\n\tscale[in_channel] = bnScale[in_channel];\n\toutput[ind] = 0.0f;\n\t__syncthreads();\n\n\tfor (int k = 0; k < 1024; k += 16) {\n\t\tfloat *B_start = B + k*256;\n\t\tfor (int i = 0; i < 4; i++)\n\t\t\tweights[ind + i*1024] = B_start[i*1024 + ind];\n\t\t__syncthreads();\n\n\t\tfloat *A_start = input + k;\n\t\tfor (int p = 0; p < 16; p++) {\n\t\t\toutput[ind] += A_start[line*1024 + p] * weights[in_channel + p*256];\n\t\t}\n\t\t__syncthreads();\n\t}\n\n\tfloat *C_start = C + tile*1024, res = scale[in_channel] * output[ind] + bias[in_channel];\n\tC_start[ind] = res > 0 ? res : 0;\n}\n\n\nint kernel_256_1_in() {\n\tfloat *input = get_parameter(inputName256one, 14*14*1024);\n\tfloat *weight = get_parameter(weightName256one, 256*1024);\n\n\tfloat *bnBias = get_parameter(bnBiasName256one, 256);\n\tfloat *bnScale = get_parameter(bnScaleName256one, 256);\n\tfloat *bnBias_myKernel = get_parameter(bnBias_myKernel_Name256one, 256);\n\tfloat *bnScale_myKernel = get_parameter(bnScale_myKernel_Name256one, 256);\n\tfloat *eMeanName = get_parameter(eMeanName256one, 256);\n\tfloat *eVarName = get_parameter(eVarName256one, 256);\n\n\tfloat *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;\n\n\tint nInput = 14*14*1024, nOutput = 14*14*256, nWeights = 256*1024;\n\tfloat tmp[nOutput], tmp_cudnn[nOutput];\n\n\tuint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;\n\tcudaError_t s;\n\n\t/////////////////////////////////\n\n\t// My Kernel\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &input_, nInput<<3);\n\tcudaMalloc((void **) &output_, nOutput<<2);\n\tcudaMalloc((void **) &weight_, nWeights<<2);\n\tcudaMalloc((void **) &bnBias_, 256<<2);\n\tcudaMalloc((void **) &bnScale_, 256<<2);\n\n\tcudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnBias_, bnBias_myKernel, 256<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale_myKernel, 256<<2, cudaMemcpyHostToDevice);\n\n\n\t/*  2. Computing  */\n\tnT1 = getTimeMicroseconds64();\n\n\tkernel_1024_one_256 <<<dim3(49), dim3(256, 4), (4*1024 + 16*256 + 4*256 + 2*256)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_);\n\n\t//cudaCheckError();\n\tcudaDeviceSynchronize();\n\n\tnT2 = getTimeMicroseconds64();\n\tprintf(\"TotalTime = %d us\\n\", nT2-nT1);\n\n\n\t/*  3. Copy back and free  */\n\ts = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\tcudaCheckError();\n\n\tfree(bnBias_myKernel);\n\tfree(bnScale_myKernel);\n\n\n\t/////////////////////////////////\n\n\t// cuDNN\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &eMeanName_, 256<<2);\n\tcudaMalloc((void **) &eVarName_, 256<<2);\n\n\tcudaMemcpy(bnBias_, bnBias, 256<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale, 256<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eMeanName_, eMeanName, 256<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eVarName_, eVarName, 256<<2, cudaMemcpyHostToDevice);\n\n\tweight = transpose(weight, 256, 1024);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\n\t/*  2. cuDNN preparation  */\n\tcudnnStatus_t status;\n\tfloat one = 1.0, zero = 0.0;\n\tint size;\n\n\tcudnnHandle_t handle;\n\tstatus = cudnnCreate(&handle);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed1\\n\");\n\n\tcudnnTensorDescriptor_t xdesc, ydesc;\n\tcudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW\n\tstatus = cudnnCreateTensorDescriptor(&xdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed2\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 1024, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed3\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&ydesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed4\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed5\\n\");\n\tstatus = cudnnCreateFilterDescriptor(&wdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed6\\n\");\n\tstatus = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 256, 1024, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed7\\n\");\n\n\tcudnnConvolutionDescriptor_t conv_desc;\n\tstatus = cudnnCreateConvolutionDescriptor(&conv_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed10\\n\");\n\tstatus = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_DATA_FLOAT\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed11\\n\");\n\n\tcudnnActivationDescriptor_t act_desc;\n\tstatus = cudnnCreateActivationDescriptor(&act_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed12\\n\");\n\tstatus = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed13\\n\");\n\n\tcudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;\n\tstatus = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed14\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 256, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed15\\n\");\n\n\tcudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0;\n\n\tstatus = cudnnGetConvolutionForwardWorkspaceSize(handle,\n\t   xdesc,\n\t   wdesc,\n\t   conv_desc,\n\t   ydesc,\n\t   algo,\n\t   (size_t *)&(size));\n\tfloat *extra;\n\tcudaMalloc((void **) &extra, size);\n\n\n\t/*  3. Computing  */\n\tnT1_cudnn = getTimeMicroseconds64();\n\n\tstatus = cudnnConvolutionForward(handle, &one,\n\t\txdesc, input_, wdesc, weight_,\n\t\tconv_desc, algo,\n\t\textra, size, &zero,\n\t\tydesc, output_);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed1\\n\");\n\n\tstatus = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,\n\t\t&one, &zero,\n\t\tydesc, output_, ydesc, output_,\n\t\tbnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed2\\n\");\n\n\tstatus = cudnnActivationForward(handle, act_desc, &one,\n\t\tydesc, output_, &zero,\n\t\tydesc, output_);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed3\\n\");\n\n\tcudaDeviceSynchronize();\n\tnT2_cudnn = getTimeMicroseconds64();\n\tprintf(\"cuDNN TotalTime = %d us\\n\", nT2_cudnn-nT1_cudnn);\n\n\n\t/*  4. Copy back and free  */\n\ts = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\n\tcudaFree(extra);\n\tcudaFree(input_);\n\tcudaFree(output_);\n\tcudaFree(weight_);\n\n\tcudaFree(bnScale_);\n\tcudaFree(bnBias_);\n\tcudaFree(eMeanName_);\n\tcudaFree(eVarName_);\n\n\tfree(input);\n\tfree(weight);\n\n\tfree(bnScale);\n\tfree(bnBias);\n\tfree(eMeanName);\n\tfree(eVarName);\n\n\toutput_checker(tmp, tmp_cudnn, 14, 256, 0);\n\n\treturn ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);\n}\n\n\n\n__global__ void kernel_256_one_1024(float *A, float *B, float *bnBias, float *bnScale, float *C) {\n\tint tile = blockIdx.x, part = blockIdx.y, in_channel = threadIdx.x, line = threadIdx.y;\n\tint ind = line*256 + in_channel;\n\n\textern __shared__ float shared_[];\n\tfloat *weights = shared_ + 256*4, *output = weights + 256*32, *input = shared_;\n\tfloat *bias = output + 4*256, *scale = bias + 256;\n\n\tinput[ind] = A[tile * 1024 + ind];\n\tbias[in_channel] = bnBias[part*256 + in_channel];\n\tscale[in_channel] = bnScale[part*256+ in_channel];\n\toutput[ind] = 0.0f;\n\t__syncthreads();\n\n\tfor (int k = 0; k < 256; k += 32) {\n\t\tfor (int i = 0; i < 8; i++)\n\t\t\tweights[ind + 1024*i] = B[(k + i*4 + line)*1024 + part*256 + in_channel];\n\t\t__syncthreads();\n\n\t\tfloat *A_start = input + k;\n\t\tfor (int p = 0; p < 32; p++) {\n\t\t\toutput[ind] += A_start[line*256 + p] * weights[in_channel + p*256];\n\t\t}\n\t\t__syncthreads();\n\t}\n\n\tfloat *C_start = C + tile*4096 + part*256;\n\tC_start[line * 1024 + in_channel] = scale[in_channel] * output[ind] + bias[in_channel];\n}\n\n\nint kernel_256_1_out() {\n\tfloat *input = get_parameter(inputName256one, 14*14*256);\n\tfloat *weight = get_parameter(weightName256one, 256*1024);\n\n\tfloat *bnBias = get_parameter(bnBiasName256one, 1024);\n\tfloat *bnScale = get_parameter(bnScaleName256one, 1024);\n\tfloat *bnBias_myKernel = get_parameter(bnBias_myKernel_Name256one, 1024);\n\tfloat *bnScale_myKernel = get_parameter(bnScale_myKernel_Name256one, 1024);\n\tfloat *eMeanName = get_parameter(eMeanName256one, 1024);\n\tfloat *eVarName = get_parameter(eVarName256one, 1024);\n\n\tfloat *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;\n\n\tint nInput = 14*14*256, nOutput = 14*14*1024, nWeights = 256*1024;\n\tfloat tmp[nOutput], tmp_cudnn[nOutput];\n\n\tuint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;\n\tcudaError_t s;\n\n\t/////////////////////////////////\n\n\t// My Kernel\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &input_, nInput<<3);\n\tcudaMalloc((void **) &output_, nOutput<<2);\n\tcudaMalloc((void **) &weight_, nWeights<<2);\n\tcudaMalloc((void **) &bnBias_, 1024<<2);\n\tcudaMalloc((void **) &bnScale_, 1024<<2);\n\n\tcudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnBias_, bnBias_myKernel, 1024<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale_myKernel, 1024<<2, cudaMemcpyHostToDevice);\n\n\n\t/*  2. Computing  */\n\tnT1 = getTimeMicroseconds64();\n\n\tkernel_256_one_1024 <<<dim3(49, 4), dim3(256, 4), (4*256 + 32*256 + 4*256 + 2*256)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_);\n\n\tcudaCheckError();\n\tcudaDeviceSynchronize();\n\n\tnT2 = getTimeMicroseconds64();\n\tprintf(\"TotalTime = %d us\\n\", nT2-nT1);\n\n\n\t/*  3. Copy back and free  */\n\ts = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\tcudaCheckError();\n\n\tfree(bnBias_myKernel);\n\tfree(bnScale_myKernel);\n\n\n\t/////////////////////////////////\n\n\t// cuDNN\n\n\t/////////////////////////////////\n\n\t/*  1. Data preparation  */\n\tcudaMalloc((void **) &eMeanName_, 1024<<2);\n\tcudaMalloc((void **) &eVarName_, 1024<<2);\n\n\tcudaMemcpy(bnBias_, bnBias, 1024<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(bnScale_, bnScale, 1024<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eMeanName_, eMeanName, 1024<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(eVarName_, eVarName, 1024<<2, cudaMemcpyHostToDevice);\n\n\tweight = transpose(weight, 1024, 256);\n\tcudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);\n\n\t/*  2. cuDNN preparation  */\n\tcudnnStatus_t status;\n\tfloat one = 1.0, zero = 0.0;\n\tint size;\n\n\tcudnnHandle_t handle;\n\tstatus = cudnnCreate(&handle);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed1\\n\");\n\n\tcudnnTensorDescriptor_t xdesc, ydesc;\n\tcudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW\n\tstatus = cudnnCreateTensorDescriptor(&xdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed2\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed3\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&ydesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed4\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 1024, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed5\\n\");\n\tstatus = cudnnCreateFilterDescriptor(&wdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed6\\n\");\n\tstatus = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1024, 256, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed7\\n\");\n\n\tcudnnConvolutionDescriptor_t conv_desc;\n\tstatus = cudnnCreateConvolutionDescriptor(&conv_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed10\\n\");\n\tstatus = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed11\\n\");\n\n\tcudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;\n\tstatus = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed14\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1024, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed15\\n\");\n\n\tcudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0;\n\n\tstatus = cudnnGetConvolutionForwardWorkspaceSize(handle,\n\t   xdesc,\n\t   wdesc,\n\t   conv_desc,\n\t   ydesc,\n\t   algo,\n\t   (size_t *)&(size));\n\tfloat *extra;\n\tcudaMalloc((void **) &extra, size);\n\n\n\t/*  3. Computing  */\n\tnT1_cudnn = getTimeMicroseconds64();\n\n\tstatus = cudnnConvolutionForward(handle, &one,\n\t\txdesc, input_, wdesc, weight_,\n\t\tconv_desc, algo,\n\t\textra, size, &zero,\n\t\tydesc, output_);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed1\\n\");\n\n\tstatus = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,\n\t\t&one, &zero,\n\t\tydesc, output_, ydesc, output_,\n\t\tbnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed2\\n\");\n\n\tcudaDeviceSynchronize();\n\tnT2_cudnn = getTimeMicroseconds64();\n\tprintf(\"cuDNN TotalTime = %d us\\n\", nT2_cudnn-nT1_cudnn);\n\n\n\t/*  4. Copy back and free  */\n\ts = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\n\tcudaFree(extra);\n\tcudaFree(input_);\n\tcudaFree(output_);\n\tcudaFree(weight_);\n\n\tcudaFree(bnScale_);\n\tcudaFree(bnBias_);\n\tcudaFree(eMeanName_);\n\tcudaFree(eVarName_);\n\n\tfree(input);\n\tfree(weight);\n\n\tfree(bnScale);\n\tfree(bnBias);\n\tfree(eMeanName);\n\tfree(eVarName);\n\n\toutput_checker(tmp, tmp_cudnn, 14, 1024, 0);\n\n\treturn ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);\n}\n"
  },
  {
    "path": "Kernel256_one.h",
    "content": "#ifndef __KERNEL256_ONE_H__\n#define __KERNEL256_ONE_H__\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nconst char inputName256one[] = \"data/input_one_14_1024.bin\";\nconst char weightName256one[] = \"data/weight_one_1024.bin\";\n\nconst char bnBiasName256one[] = \"data/bnBias_one_1024.bin\";\nconst char bnScaleName256one[] = \"data/bnScale_one_1024.bin\";\nconst char bnBias_myKernel_Name256one[] = \"data/bnBias_myKernel_one_1024.bin\";\nconst char bnScale_myKernel_Name256one[] = \"data/bnScale_myKernel_one_1024.bin\";\nconst char eMeanName256one[] = \"data/eMean_one_1024.bin\";\nconst char eVarName256one[] = \"data/eVar_one_1024.bin\";\n\nint kernel_256_1_in();\nint kernel_256_1_out();\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif"
  },
  {
    "path": "Kernel256_winograd.cu",
    "content": "#include <stdlib.h>\n#include <string.h>\n#include <stdio.h>\n#include <string.h>\n#include <float.h>\n#include <math.h>\n#include <assert.h>\n#include <xmmintrin.h>\n#include <immintrin.h>\n\n#include \"cudnn.h\"\n#include \"util.h\"\n#include \"Kernel256_winograd.h\"\n\n\n#define cudaCheckError() {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tcudaError_t e=cudaGetLastError();\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\tif(e!=cudaSuccess) {\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t\tprintf(\"Cuda failure %s:%d:'%s'\\n\",__FILE__,__LINE__,cudaGetErrorString(e));\t\\\n\t\texit(EXIT_FAILURE);\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n\t}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\\\n}\n\n#define MY_KERNEL 0\n\n#define d(input, i, j, Inz) ( input[Inz + i*768 + (j<<7)] )\n__global__ void kernel_256_winograd_BtdB(float *pInputs, float *pOutputs) {\n\tint Inx = blockIdx.x<<2, Iny0 = blockIdx.y<<2, Part = blockIdx.z, Iny1 = threadIdx.y, Inz = threadIdx.x;\n\tint Iny = Iny0+Iny1, stride_r = 4096, stride_c = 256; // 4096 = 16*256\n\tint c_glb_start = Inx*stride_r + Iny*stride_c + Inz + (Part<<7), c_input = Iny1*128 + Inz;\n\n\textern __shared__ float input[];\n\n\tint stride_768[6] = {0, 768, 1536, 2304, 3072, 3840}; // 768 = 6*128\n\tfor (int i = 0; i < 6; i++) {\n\t\tinput[c_input + stride_768[i]] = pInputs[c_glb_start + i*stride_r];\n\t}\n\t__syncthreads();\n\n\tfloat BTd[6];\n\tswitch(Iny1) {\n\t\tcase 0:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 0, j, Inz)*4 - d(input, 2, j, Inz)*5 + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = -d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 + d(input, 3, j, Inz) + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 - d(input, 3, j, Inz) + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = -d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) + d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 4:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) - d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 5:\n\t\t\tfor (int j = 0; j < 6; j++) {\n\t\t\t\tBTd[j] = d(input, 1, j, Inz)*4 - d(input, 3, j, Inz)*5 + d(input, 5, j, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t}\n\t__syncthreads();\n\n\tint tmp_offset = Iny1*768+Inz;\n\tfor (int i = 0; i < 6; i++) {\n\t\tinput[tmp_offset + i*128] = BTd[i];\n\t}\n\t__syncthreads();\n\n\tfloat BTdB[6];\n\tswitch(Iny1) {\n\t\tcase 0:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 4*d(input, i, 0, Inz) - 5*d(input, i, 2, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = -4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) + d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) - d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = -2*d(input, i, 1, Inz) - d(input, i, 2, Inz) + 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 4:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 2*d(input, i, 1, Inz) - d(input, i, 2, Inz) - 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t\tcase 5:\n\t\t\tfor (int i = 0; i < 6; i++) {\n\t\t\t\tBTdB[i] = 4*d(input, i, 1, Inz) - 5*d(input, i, 3, Inz) + d(input, i, 5, Inz);\n\t\t\t}\n\t\t\tbreak;\n\t}\n\t__syncthreads();\n\n\tfor (int i = 0; i < 6; i++) {\n\t\tpOutputs[(Iny1 + i*6)*4096 + (blockIdx.x*4+blockIdx.y)*256 + Inz + (Part<<7)] = BTdB[i];\n\t}\n}\n\n__global__ void kernel_256_winograd_AtIA(float *pInputs, float *pBiases, float *pScales, float *pOutputs) {\n\tint Tilex = blockIdx.x, Tiley = blockIdx.y, Iny = threadIdx.y, kz = blockIdx.z, Inx = threadIdx.x;\n\tint c_input = Inx*6 + Iny;\n\n\t__shared__ float bias, scale;\n\textern __shared__ float input[];\n\n\tinput[c_input] = pInputs[c_input*16*256 + (Tilex*4+Tiley)*256 + kz];\n\tbias = pBiases[kz];\n\tscale = pScales[kz];\n\t__syncthreads();\n\n\tfloat tmp = 0;\n\tswitch(Inx) {\n\t\tcase 0:\n\t\t\ttmp = input[Iny] + input[6+Iny] + input[12+Iny] + input[18+Iny] + input[24+Iny];\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\ttmp = input[6+Iny] - input[12+Iny] + 2*input[18+Iny] - 2*input[24+Iny];\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\ttmp = input[6+Iny] + input[12+Iny] + 4*input[18+Iny] + 4*input[24+Iny];\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\ttmp = input[6+Iny] - input[12+Iny] + 8*input[18+Iny] - 8*input[24+Iny] + input[30+Iny];\n\t\t\tbreak;\n\t}\n\t__syncthreads();\n\n\tinput[c_input] = tmp;\n\t__syncthreads();\n\n\tif (Inx > 3 || (Tilex == 3 && Inx > 1)) return;\n\t\n\tint x;\n\tfloat o;\n\tswitch(Iny) {\n\t\tcase 0:\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x]+input[x+1]+input[x+2]+input[x+3]+input[x+4]) + bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+1)*256 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t\tcase 1:\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x+1] - input[x+2] + 2*input[x+3] - 2*input[x+4]) + bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+2)*256 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t\tcase 2:\n\t\t\tif (Tiley == 3) break;\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x+1] + input[x+2] + 4*input[x+3] + 4*input[x+4]) + bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+3)*256 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t\tcase 3:\n\t\t\tif (Tiley == 3) break;\n\t\t\tx = Inx*6;\n\t\t\to = scale*(input[x+1] - input[x+2] + 8*input[x+3] - 8*input[x+4] + input[x+5]) + bias;\n\t\t\tpOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+4)*256 + kz] = o > 0 ? o : 0;\n\t\t\tbreak;\n\t}\n}\n\n__global__ void kernel_256_OuterProduct_256(float *A, float *B, float *C) {\n\tint Tile = blockIdx.x, Part = blockIdx.y, tX = threadIdx.x, tY = threadIdx.y;\n\tint c_input = tY*256 + tX, c_kernel = c_input, T_offset = (Tile<<12) + (Part<<11) + c_input, B_offset = (Tile<<16) + c_kernel;\n\t\n\textern __shared__ float input[];\n\tfloat *kernel = input + 2048, *out = kernel + 8192;\n\tint B_stride[32] = {0, 256, 512, 768, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632, 5888, 6144, 6400, 6656, 6912, 7168, 7424, 7680, 7936};\n\tout[c_input] = 0.0f;\n\tout[c_input+1024] = 0;\n\n\tinput[c_input] = A[T_offset];\n\tinput[c_input+1024] = A[T_offset+1024];\n\n\tfor (int k = 0; k < 8; k++) {\n\t\tint B_start = B_offset + (k<<13); // 32*64\n\t\tkernel[c_kernel] = B[B_start], kernel[c_kernel+1024] = B[B_start+1024];\n\t\tkernel[c_kernel+2048] = B[B_start+2048], kernel[c_kernel+3072] = B[B_start+3072];\n\t\tkernel[c_kernel+4096] = B[B_start+4096], kernel[c_kernel+5120] = B[B_start+5120];\n\t\tkernel[c_kernel+6144] = B[B_start+6144], kernel[c_kernel+7168] = B[B_start+7168];\n\n\t\t__syncthreads();\n\n\t\tfloat sum = 0, sum1 = 0;\n\t\tint y_tmp = (tY<<8)+(k<<5), y_tmp1 = y_tmp+1024;\n\t\tfor (int j = 0; j < 32; j++) {\n\t\t\tsum += input[y_tmp + j] * kernel[tX + B_stride[j]];\n\t\t\tsum1 += input[y_tmp1 + j] * kernel[tX + B_stride[j]];\n\t\t}\n\t\tout[c_input] += sum;\n\t\tout[c_input+1024] += sum1;\n\t\t__syncthreads();\n\t}\n\n\tC[T_offset] = out[c_input];\n\tC[T_offset+1024] = out[c_input+1024];\n}\n\nint kernel_256() {\n\tfloat *input_ = get_parameter(inputName256, 16*16*256);\n\tfloat *bias = get_parameter(biasName256, 256);\n\tfloat *input, *output, *l_weights, *l_bias;\n\tuint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;\n\tcudaError_t s;\n\n\n\n\t/////////////////////////////////\n\n\t// My Kernel\n\n\t/////////////////////////////////\n\tfloat *kernel = get_parameter(weight_winograd_Name256, 36*256*256), *t_input, *ip;\n\tint nInput = 16*16*256, nOutput = 16*16*256, nWeights = 36*256*256, nBias = 256, nTransInput = 16*6*6*256, nInnerProd = 16*6*6*256;\n\tfloat *l_bnBias, *l_bnScale, *bnBias, *bnScale;\n\n\tcudaMalloc((void **) &input, nInput<<3);\n\tcudaMalloc((void **) &output, nOutput<<2);\n\tcudaMalloc((void **) &l_weights, nWeights<<2);\n\tcudaMalloc((void **) &l_bias, nBias<<2);\n\tcudaMalloc((void **) &t_input, nTransInput<<2);\n\tcudaMalloc((void **) &ip, nInnerProd<<2);\n\n\tcudaMemset((void *) input, 0, nInput<<3);\n\tcudaMemset((void *) output, 0, nOutput<<2);\n\tcudaMemset((void *) t_input, 0, nTransInput<<2);\n\tcudaMemset((void *) l_weights, 0, nWeights<<2);\n\tcudaMemset((void *) ip, 0, nInnerProd<<2);\n\n\tcudaMemcpy(input, input_, nInput<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);\n\n\tbnBias = get_parameter(bnBias_winograd_Name256, 256);\n\tbnScale = get_parameter(bnScale_winograd_Name256, 256);\n\tcudaMalloc((void **) &l_bnBias, nBias<<2);\n\tcudaMalloc((void **) &l_bnScale, nBias<<2);\n\tcudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);\n\n\tfloat tmp[nOutput];\n\n\tnT1 = getTimeMicroseconds64();\n\n\tkernel_256_winograd_BtdB <<<dim3(4, 4, 2), dim3(128, 6), (6*6*128)<<2 >>> (input, t_input);\n\tkernel_256_OuterProduct_256<<<dim3(36, 2), dim3(256, 4), (8*256 + 32*256 + 8*256)<<2 >>> (t_input, l_weights, ip);\n\tkernel_256_winograd_AtIA <<<dim3(4, 4, 256), dim3(6, 6), ((6*6)<<2)>>> (ip, l_bnBias, l_bnScale, output);\n\t//cudaCheckError();\n\tcudaDeviceSynchronize();\n\t\n\tnT2 = getTimeMicroseconds64();\n\tprintf(\"TotalTime = %d us\\n\", nT2-nT1); \n\n\ts = cudaMemcpy(tmp, output, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\t//cudaCheckError();\n\n\tcudaFree(t_input);\n\tcudaFree(output);\n\tcudaFree(l_weights);\n\tcudaFree(l_bias);\n\tcudaFree(ip);\n\n\tfree(kernel);\n\tfree(bnScale);\n\tfree(bnBias);\n\n\n\n\t/////////////////////////////////\n\n\t// cuDNN\n\n\t/////////////////////////////////\n\tkernel = get_parameter(weight_NCHW_Name256, 9*256*256);\n\tbnBias = get_parameter(bnBiasName256, 256);\n\tbnScale = get_parameter(bnScaleName256, 256);\n\tfloat* eMean = get_parameter(eMeanName256, 256);\n\tfloat* eVar = get_parameter(eVarName256, 256);\n\tfloat *l_eMean, *l_eVar;\n\tnInput = 16*16*256, nOutput = 14*14*256, nWeights = 3*3*256*256, nBias = 256;\n\n\tcudaMalloc((void **) &output, nOutput<<2);\n\tcudaMalloc((void **) &l_weights, nWeights<<2);\n\tcudaMalloc((void **) &l_bias, nBias<<2);\n\tcudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);\n\n\tcudaMalloc((void **) &l_eMean, nBias<<2);\n\tcudaMalloc((void **) &l_eVar, nBias<<2);\n\tcudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_eMean, eMean, nBias<<2, cudaMemcpyHostToDevice);\n\tcudaMemcpy(l_eVar, eVar, nBias<<2, cudaMemcpyHostToDevice);\n\n\tcudaMemset((void *) output, 0, nOutput<<2);\n\n\tfloat tmp_cudnn[nOutput];\n\n\tcudnnStatus_t status;\n\tfloat one = 1.0, zero = 0.0;\n\tint size;\n\n\tcudnnHandle_t handle;\n\tstatus = cudnnCreate(&handle);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed1\\n\");\n\n\tcudnnTensorDescriptor_t xdesc, ydesc, bdesc;\n\tcudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW\n\tstatus = cudnnCreateTensorDescriptor(&xdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed2\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 16, 16);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed3\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&ydesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed4\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 14, 14);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed5\\n\");\n\tstatus = cudnnCreateFilterDescriptor(&wdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed6\\n\");\n\tstatus = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 256, 256, 3, 3);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed7\\n\");\n\tstatus = cudnnCreateTensorDescriptor(&bdesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed8\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed9\\n\");\n\tcudnnConvolutionDescriptor_t conv_desc;\n\tstatus = cudnnCreateConvolutionDescriptor(&conv_desc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed10\\n\");\n\tstatus = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed11\\n\");\n\n\tcudnnActivationDescriptor_t act_desc;\n\tstatus = cudnnCreateActivationDescriptor(&act_desc);  \n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed12\\n\");\n\tstatus = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed13\\n\");\n\n\tcudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;\n\tstatus = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed14\\n\");\n\tstatus = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 256, 1, 1);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"failed15\\n\");\n\n\tcudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)6;\n\n\tstatus = cudnnGetConvolutionForwardWorkspaceSize(handle,\n\t   xdesc,\n\t   wdesc,\n\t   conv_desc,\n\t   ydesc,\n\t   algo,\n\t   (size_t *)&(size));\n\n\tfloat *extra;\n\tcudaMalloc((void **) &extra, size);\n\t\n\tnT1_cudnn = getTimeMicroseconds64();\n\n\tstatus = cudnnConvolutionForward(handle, &one,\n\t\txdesc, input, wdesc, l_weights, \n\t\tconv_desc, algo, \n\t\textra, size, &zero,\n\t\tydesc, output);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed1\\n\");\n\n\tstatus = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,\n\t\t&one, &zero, \n\t\tydesc, output, ydesc, output,\n\t\tbnScaleBiasMeanVarDesc, l_bnScale, l_bnBias, l_eMean, l_eVar, CUDNN_BN_MIN_EPSILON);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed2\\n\");\n\n\tstatus = cudnnActivationForward(handle, act_desc, &one,\n\t\tydesc, output, &zero,\n\t\tydesc, output);\n\tif (status != CUDNN_STATUS_SUCCESS) printf(\"Not Successed3\\n\");\n\n\tcudaDeviceSynchronize();\n\tnT2_cudnn = getTimeMicroseconds64();\n\tprintf(\"cuDNN TotalTime = %d us\\n\", nT2_cudnn-nT1_cudnn);\n\t\n\ts = cudaMemcpy(tmp_cudnn, output, nOutput<<2, cudaMemcpyDeviceToHost);\n\tprintf(\"%s\\n\", cudaGetErrorName(s));\n\n\n\tcudaFree(extra);\n\tcudaFree(input);\n\tcudaFree(output);\n\tcudaFree(l_weights);\n\tcudaFree(l_bias);\n\n\tcudaFree(l_bnScale);\n\tcudaFree(l_bnBias);\n\tcudaFree(l_eMean);\n\tcudaFree(l_eVar);\n\n\tfree(bias);\n\tfree(kernel);\n\n\tfree(bnScale);\n\tfree(bnBias);\n\tfree(eMean);\n\tfree(eVar);\n\tfree(input_);\n\n\toutput_checker(tmp, tmp_cudnn, 14, 256, 1);\n\n\treturn ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);\n}"
  },
  {
    "path": "Kernel256_winograd.h",
    "content": "#ifndef __KERNEL256_WINOGRAD_H__\n#define __KERNEL256_WINOGRAD_H__\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nconst char inputName256[] = \"data/input_14_1_256.bin\";\nconst char biasName256[] = \"data/bias_256.bin\";\nconst char weight_winograd_Name256[] = \"data/weight_winograd_256_256.bin\";\nconst char weight_NCHW_Name256[] = \"data/weight_NCHW_256_256.bin\";\n\nconst char bnBiasName256[] = \"data/bnBias_256.bin\";\nconst char bnScaleName256[] = \"data/bnScale_256.bin\";\nconst char bnBias_winograd_Name256[] = \"data/bnBias_winograd_256.bin\";\nconst char bnScale_winograd_Name256[] = \"data/bnScale_winograd_256.bin\";\nconst char eMeanName256[] = \"data/eMean_256.bin\";\nconst char eVarName256[] = \"data/eVar_256.bin\";\n\nint kernel_256();\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif\n"
  },
  {
    "path": "Makefile",
    "content": "CC=gcc\nCPP=g++\nAR=ar\nNVCC=nvcc\n\nCSRCS := $(shell find . -name '*.c' -not -name '._*')\nCOBJS := $(subst .c,.o,$(CSRCS))\n\nCUSRCS := $(shell find . -name '*.cu' -not -name '._*')\nCUOBJS := $(subst .cu,.o,$(CUSRCS))\n\nLIBDIR := -L/usr/local/cuda/lib64\n\nCUFLAGS= \\\n-I. \\\n-Xcompiler \\\n-fPIC\n\nLDFLAGS=-L. -lm -lpthread -lrt\n\nall: Test\n\n%.o: %.c\n\t$(NVCC) $(CUFLAGS) -c $< -o $(basename $@).o\n\n%.o: %.cu\n\t$(NVCC) $(CUFLAGS) -c $< -o $(basename $@).o\n\nTest: $(CUOBJS) $(COBJS)\n\t$(NVCC) -o Test $(CUOBJS) $(COBJS) $(LIBDIR) $(LDFLAGS) -lcudart -lcuda -lcublas -lcudnn\n\nclean:\n\tfind . -name \"*.o\" -exec rm -f '{}' ';'\n\trm -f Test\n"
  },
  {
    "path": "README.md",
    "content": "## Introduction\n\nThis code implements fast cuda kernels for DNN inference, especially for convolution layers / residule blocks in ResNet. Specifically, the kernels combine three parts into one piece:\n- Convolution\n- Batch Nomalization (BN + Scale)\n- Activation (ReLU)\n\nFor implementation details, please refer to the technical report included in this repo. Winograd algorithm is used for 3 * 3 convolutional kernels.\n\n## Usage\n``` sh\nmkdir data\npython data_generator.py\nmake\n./Test 0\n```\n- Set parameters in `data_generator.py`\n- Run 6 test cases with changing numbers from 0 to 5 after `./Test`\n\n## Results\n\n### 3 * 3 Kernels\nKernals | Operations | 128 / 128 | 256 / 256\n--- | --- | --- | ---\nCudnn | Gemm + BN + ReLU | 214us | 384us\nCudnn | Winograd + BN + ReLU  | 95us | 155us\nOur Kernel | Winograd + BN + ReLU | 59us | 117us\n\n### 1 * 1 Kernels [BUGGY NUMBERS]\nKernals | 512 / 128 | 128 / 512 | 1024 / 256 | 256 / 1024\n--- | --- | --- | --- | ---\nOperations | Gemm + BN + ReLU | Gemm + BN | Gemm + BN + ReLU | Gemm + BN + ReLU\nCudnn  | 119us | 115us | 219us | 214us\nOur Kernel | 58us | 55us | 186us | 181us\n"
  },
  {
    "path": "Test.c",
    "content": "#include <assert.h>\n#include <pthread.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n\n#include \"Kernel128_one.h\"\n#include \"Kernel128_winograd.h\"\n#include \"Kernel256_one.h\"\n#include \"Kernel256_winograd.h\"\n#include \"util.h\"\n\nint main(int argc, char** argv) {\n  int nTest = 100, sum = 0, sum_cudnn = 0, i;\n  cudaSetDevice(0);\n\n  int mode = 0;\n  if (argc == 2) {\n    mode = atoi(argv[1]);\n  }\n\n  for (i = 0; i < nTest; i++) {\n    printf(\"---- Iter: %d ----\\n\", i);\n    int res = -1;\n    switch (mode) {\n      case 0:\n        res = kernel_128();\n        break;\n      case 1:\n        res = kernel_256();\n        break;\n      case 2:\n        res = kernel_128_1_in();\n        break;\n      case 3:\n        res = kernel_128_1_out();\n        break;\n      case 4:\n        res = kernel_256_1_in();\n        break;\n      case 5:\n        res = kernel_256_1_out();\n        break;\n    }\n    if (i > 1) {\n      sum += res >> 16;\n      sum_cudnn += res & 0xFFFF;\n    }\n  }\n  printf(\n      \"Average Total Time: [Mine: %d us], [cuDNN: %d us]\\n\",\n      sum / (nTest - 2),\n      sum_cudnn / (nTest - 2));\n\n  return 0;\n}\n"
  },
  {
    "path": "data_generator.py",
    "content": "from __future__ import print_function\nimport sys\nimport numpy as np\nimport re\nimport time\nimport difflib\nimport array\nimport requests\nimport os\nimport shutil\nimport scipy.spatial.distance as spd\nimport numpy as np\nfrom numpy.random import *\nfrom random import randint\nimport random\nimport matplotlib.pyplot as plt\nfrom scipy import misc\n\n\ndef bias_generator(output_channel = 128):\n\tbias = (np.array(rand(output_channel))-0.5).astype(np.float32)\n\tdes = open(\"data/bias_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bias)\n\n\tbnScale = (np.array(rand(output_channel))-0.5).astype(np.float32)\n\tdes = open(\"data/bnScale_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnScale)\n\n\tbnBias = (np.array(rand(output_channel))-0.5).astype(np.float32)\n\tdes = open(\"data/bnBias_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnBias)\n\n\teMean = (np.array(rand(output_channel))-0.5).astype(np.float32)\n\tdes = open(\"data/eMean_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(eMean)\n\n\teVar = (np.array(rand(output_channel))*3 + 5).astype(np.float32)\n\tdes = open(\"data/eVar_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(eVar)\n\n\teps = 1e-5\n\tbnScale_winograd = bnScale / np.sqrt(eVar + eps)\n\tdes = open(\"data/bnScale_winograd_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnScale_winograd)\n\tbnBias_winograd = bnBias - bnScale*eMean / np.sqrt(eVar + eps)\n\tdes = open(\"data/bnBias_winograd_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnBias_winograd)\n\ndef input_generator(input_channel = 128, feature_map_size = 14, padding = 1):\n\tparameters = (feature_map_size + 2*padding)*(feature_map_size + 2*padding) * input_channel\n\ta = (np.array(rand(parameters))-0.5).astype(np.float32)\n\tdes = open(\"data/input_\" + str(feature_map_size) + '_' + str(padding) + '_' + str(input_channel) + \".bin\", \"wb\")\n\tdes.write(a)\n\ndef weight_generator(input_channel = 128, output_channel = 128):\n\t### Weights_NCHW\n\tparameters = input_channel*output_channel * 3*3\n\tin_ = (np.array(rand(parameters))-0.5).astype(np.float32)\n\n\tdes = open(\"data/weight_NCHW_\" + str(input_channel) + '_' + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(in_)\n\n\t### Weights_Winograd\n\tin_ = in_.reshape(input_channel*output_channel, 3,3)\n\tG = np.array([[0.25,0,0], [-1.0/6,-1.0/6,-1.0/6], [-1.0/6,1.0/6,-1.0/6], [1.0/24,1.0/12,1.0/6], [1.0/24,-1.0/12,1.0/6], [0,0,1]])\n\n\tout_ = [0] * input_channel*output_channel * 6*6\n\tfor i in range(output_channel):\n\t\tfor j in range(input_channel):\n\t\t\tb = np.dot(G, in_[i*input_channel+j])\n\t\t\tb = np.dot(b, G.transpose())\n\t\t\toffset = j*output_channel+i\n\t\t\tfor x in range(6):\n\t\t\t\tfor y in range(6):\n\t\t\t\t\tout_[((x*6+y) * input_channel*output_channel) + offset] = b[x][y]\n\n\tdes = open(\"data/weight_winograd_\" + str(input_channel) + '_' + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(np.array(out_).astype(np.float32))\n\ndef onebyone_generator(input_channel = 256, output_channel = 1024, feature_map_size = 14):\n\tparameters = feature_map_size*feature_map_size * output_channel\n\ta = ((np.array(rand(parameters))-0.5)*40).astype(np.float32)\n\tdes = open(\"data/input_one_\" + str(feature_map_size) + '_' + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(a)\n\n\tparameters = input_channel * output_channel\n\ta = ((np.array(rand(parameters))-0.5)*40).astype(np.float32)\n\tdes = open(\"data/weight_one_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(a)\n\n\tbnScale = ((np.array(rand(output_channel))-0.5)*40).astype(np.float32)\n\tdes = open(\"data/bnScale_one_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnScale)\n\n\tbnBias = ((np.array(rand(output_channel))-0.5)*40).astype(np.float32)\n\tdes = open(\"data/bnBias_one_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnBias)\n\n\teMean = ((np.array(rand(output_channel))-0.5)*40).astype(np.float32)\n\tdes = open(\"data/eMean_one_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(eMean)\n\n\teVar = (np.array(rand(output_channel))*20 + 5).astype(np.float32)\n\tdes = open(\"data/eVar_one_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(eVar)\n\n\teps = 1e-5\n\tbnScale_winograd = bnScale / np.sqrt(eVar + eps)\n\tdes = open(\"data/bnScale_myKernel_one_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnScale_winograd)\n\tbnBias_winograd = bnBias - bnScale*eMean / np.sqrt(eVar + eps)\n\tdes = open(\"data/bnBias_myKernel_one_\" + str(output_channel) + \".bin\", \"wb\")\n\tdes.write(bnBias_winograd)\n\n\nif __name__ == '__main__':\n\tbias_generator(output_channel = 128)\n\tprint('Biases generated')\n\n\tinput_generator(input_channel = 128)\n\tprint('Input generated')\n\n\tweight_generator(128, 128)\n\tprint('Weights generated')\n\n\tonebyone_generator()\n\tprint('Parameters for 1*1 conv generated')\n"
  },
  {
    "path": "util.c",
    "content": "#include \"util.h\"\n#include <time.h>\n#include \"math.h\"\n\nuint64_t getTimeMicroseconds64() {\n  uint64_t nTime;\n  struct timespec tSpec;\n\n  clock_gettime(CLOCK_REALTIME, &tSpec);\n\n  nTime = (uint64_t)tSpec.tv_sec * 1000000 + (uint64_t)tSpec.tv_nsec / 1000;\n  return nTime;\n}\n\nfloat* transpose(float* weight, int h, int w) {\n  float* new_weight = (float*)malloc(w * h * 4);\n  int i, j;\n  for (i = 0; i < w; ++i) {\n    for (j = 0; j < h; ++j) {\n      new_weight[j * w + i] = weight[i * h + j];\n    }\n  }\n\n  free(weight);\n  return new_weight;\n}\n\nfloat* get_parameter(const char* filename, int size) {\n  float* parameter = (float*)malloc(size * 4);\n  if (!parameter) {\n    printf(\"Bad Malloc\\n\");\n    exit(0);\n  }\n  FILE* ptr = fopen(filename, \"rb\");\n\n  if (!ptr) {\n    printf(\"Bad file path: %p, %s\\n\", ptr, strerror(errno));\n    exit(0);\n  }\n  fread(parameter, size * 4, 1, ptr);\n\n  fclose(ptr);\n  return parameter;\n}\n\nfloat output_checker(float* A, float* B, int len, int channel, int shift) {\n  int error_cnt = 0, i, j, k;\n  float max_error = 0;\n  for (i = 0; i < len; i++) {\n    for (j = 0; j < len; j++) {\n      for (k = 0; k < channel; k++) {\n        float diff = fabs(\n            A[((i + shift) * (len + 2 * shift) + j + shift) * channel + k] -\n            B[(i * len + j) * channel + k]);\n        if (diff > 1e-5)\n          error_cnt++;\n        if (diff > max_error)\n          max_error = diff;\n      }\n    }\n  }\n  printf(\"[max_error: %f][error_cnt: %d]\\n\", max_error, error_cnt);\n}\n"
  },
  {
    "path": "util.h",
    "content": "#ifndef __UTIL_H__\n#define __UTIL_H__\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#include <assert.h>\n#include <errno.h>\n#include <float.h>\n#include <immintrin.h>\n#include <inttypes.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <xmmintrin.h>\n\nfloat* get_parameter(const char* filename, int size);\n\nfloat* transpose(float* weight, int h, int w);\n\nuint64_t getTimeMicroseconds64();\n\nfloat output_checker(float* A, float* B, int len, int channel, int shift);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif\n"
  }
]