Repository: xuqiantong/CUDA-Winograd
Branch: master
Commit: 7d5aec268304
Files: 45
Total size: 95.5 KB
Directory structure:
gitextract_1urew_xc/
├── .gitignore
├── Kernel128_one.cu
├── Kernel128_one.h
├── Kernel128_winograd.cu
├── Kernel128_winograd.h
├── Kernel256_one.cu
├── Kernel256_one.h
├── Kernel256_winograd.cu
├── Kernel256_winograd.h
├── Makefile
├── README.md
├── Test.c
├── data_generator.py
├── util.c
└── util.h
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
#executable
*
!*.*
!*/
!Makefile
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
*.bin
# Logs and databases #
######################
*.log
*.sql
*.sqlite
# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
================================================
FILE: Kernel128_one.cu
================================================
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <float.h>
#include <math.h>
#include <assert.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include "cudnn.h"
#include "util.h"
#include "Kernel128_one.h"
#define cudaCheckError() { \
cudaError_t e=cudaGetLastError(); \
if(e!=cudaSuccess) { \
printf("Cuda failure %s:%d:'%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
}
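// Note: kernel launches are asynchronous, so at a call site this macro only
// catches launch/configuration errors; errors raised while a kernel is
// executing surface only after a cudaDeviceSynchronize().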
__global__ void kernel_512_one_128(float *A, float *B, float *bnBias, float *bnScale, float *C) {
int tile = blockIdx.x, in_channel = threadIdx.x, line = threadIdx.y;
int ind = line*128 + in_channel;
extern __shared__ float shared_[];
float *weights = shared_ + 512*4, *output = weights + 128*64, *input = shared_;
float *bias = output + 4*128, *scale = bias + 128;
for (int i = 0; i < 4; i++)
input[ind + i*512] = A[tile*2048 + i*512 + ind];
bias[in_channel] = bnBias[in_channel];
scale[in_channel] = bnScale[in_channel];
output[ind] = 0.0f;
__syncthreads();
for (int k = 0; k < 512; k += 64) {
float *B_start = B + k*128;
for (int i = 0; i < 16; i++)
weights[ind + i*512] = B_start[i*512 + ind];
__syncthreads();
float *A_start = input + k;
for (int p = 0; p < 64; p++) {
output[ind] += A_start[line*512 + p] * weights[in_channel + p*128];
}
__syncthreads();
}
float *C_start = C + tile*512;
float res = scale[in_channel] * output[ind] + bias[in_channel];
C_start[ind] = res > 0 ? res : 0;
}
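#if 0
/* Host-side reference for kernel_512_one_128 (illustrative sketch, not part of
 * the original build; reference_512_one_128 is a hypothetical helper): a 1x1
 * convolution over NHWC data is a plain GEMM, C[196][128] = A[196][512] x
 * B[512][128], followed by the folded batch-norm affine and ReLU. */
static void reference_512_one_128(const float *A, const float *B,
                                  const float *bnBias, const float *bnScale,
                                  float *C) {
	for (int pixel = 0; pixel < 14*14; pixel++) {       // 49 tiles x 4 pixels
		for (int oc = 0; oc < 128; oc++) {              // output channels
			float sum = 0.0f;
			for (int ic = 0; ic < 512; ic++)            // input channels
				sum += A[pixel*512 + ic] * B[ic*128 + oc];
			float res = bnScale[oc] * sum + bnBias[oc]; // folded BN
			C[pixel*128 + oc] = res > 0 ? res : 0;      // ReLU
		}
	}
}
#endif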
int kernel_128_1_in() {
float *input = get_parameter(inputName128one, 14*14*512);
float *weight = get_parameter(weightName128one, 128*512);
float *bnBias = get_parameter(bnBiasName128one, 128);
float *bnScale = get_parameter(bnScaleName128one, 128);
float *bnBias_myKernel = get_parameter(bnBias_myKernel_Name128one, 128);
float *bnScale_myKernel = get_parameter(bnScale_myKernel_Name128one, 128);
float *eMeanName = get_parameter(eMeanName128one, 128);
float *eVarName = get_parameter(eVarName128one, 128);
float *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;
int nInput = 14*14*512, nOutput = 14*14*128, nWeights = 128*512;
float tmp[nOutput], tmp_cudnn[nOutput];
uint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;
cudaError_t s;
/////////////////////////////////
// My Kernel
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &input_, nInput<<3); // 2x over-allocation; only nInput*sizeof(float) bytes are copied and read below
cudaMalloc((void **) &output_, nOutput<<2);
cudaMalloc((void **) &weight_, nWeights<<2);
cudaMalloc((void **) &bnBias_, 128<<2);
cudaMalloc((void **) &bnScale_, 128<<2);
cudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnBias_, bnBias_myKernel, 128<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale_myKernel, 128<<2, cudaMemcpyHostToDevice);
/* 2. Computing */
nT1 = getTimeMicroseconds64();
kernel_512_one_128 <<<dim3(49), dim3(128, 4), (4*512 + 64*128 + 4*128 + 2*128)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_); // shared bytes: input tile + 64x128 weight slab + output tile + bias + scale
//cudaCheckError();
cudaDeviceSynchronize();
nT2 = getTimeMicroseconds64();
printf("TotalTime = %d us\n", nT2-nT1);
/* 3. Copy back and free */
s = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaCheckError();
free(bnBias_myKernel);
free(bnScale_myKernel);
/////////////////////////////////
// cuDNN
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &eMeanName_, 128<<2);
cudaMalloc((void **) &eVarName_, 128<<2);
cudaMemcpy(bnBias_, bnBias, 128<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale, 128<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eMeanName_, eMeanName, 128<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eVarName_, eVarName, 128<<2, cudaMemcpyHostToDevice);
weight = transpose(weight, 128, 512);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
/* 2. cuDNN preparation */
cudnnStatus_t status;
float one = 1.0, zero = 0.0;
size_t size; // workspace size in bytes; cudnnGetConvolutionForwardWorkspaceSize writes a size_t
cudnnHandle_t handle;
status = cudnnCreate(&handle);
if (status != CUDNN_STATUS_SUCCESS) printf("failed1\n");
cudnnTensorDescriptor_t xdesc, ydesc;
cudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW
status = cudnnCreateTensorDescriptor(&xdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed2\n");
status = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 512, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed3\n");
status = cudnnCreateTensorDescriptor(&ydesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed4\n");
status = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed5\n");
status = cudnnCreateFilterDescriptor(&wdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed6\n");
status = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 128, 512, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed7\n");
cudnnConvolutionDescriptor_t conv_desc;
status = cudnnCreateConvolutionDescriptor(&conv_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed10\n");
status = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION
if (status != CUDNN_STATUS_SUCCESS) printf("failed11\n");
cudnnActivationDescriptor_t act_desc;
status = cudnnCreateActivationDescriptor(&act_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed12\n");
status = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);
if (status != CUDNN_STATUS_SUCCESS) printf("failed13\n");
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;
status = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed14\n");
status = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 128, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed15\n");
cudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0; // 0 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
status = cudnnGetConvolutionForwardWorkspaceSize(handle,
xdesc,
wdesc,
conv_desc,
ydesc,
algo,
&size);
float *extra;
cudaMalloc((void **) &extra, size);
/* 3. Computing */
nT1_cudnn = getTimeMicroseconds64();
status = cudnnConvolutionForward(handle, &one,
xdesc, input_, wdesc, weight_,
conv_desc, algo,
extra, size, &zero,
ydesc, output_);
if (status != CUDNN_STATUS_SUCCESS) printf("Not Successed1\n");
status = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,
&one, &zero,
ydesc, output_, ydesc, output_,
bnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);
if (status != CUDNN_STATUS_SUCCESS) printf("Not Successed2\n");
status = cudnnActivationForward(handle, act_desc, &one,
ydesc, output_, &zero,
ydesc, output_);
if (status != CUDNN_STATUS_SUCCESS) printf("Not Successed3\n");
cudaDeviceSynchronize();
nT2_cudnn = getTimeMicroseconds64();
printf("cuDNN TotalTime = %d us\n", nT2_cudnn-nT1_cudnn);
/* 4. Copy back and free */
s = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaFree(extra);
cudaFree(input_);
cudaFree(output_);
cudaFree(weight_);
cudaFree(bnScale_);
cudaFree(bnBias_);
cudaFree(eMeanName_);
cudaFree(eVarName_);
free(input);
free(weight);
free(bnScale);
free(bnBias);
free(eMeanName);
free(eVarName);
output_checker(tmp, tmp_cudnn, 14, 128, 0);
return ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn); // pack both runtimes (us) into one int: custom kernel in the high bits, cuDNN in the low 16
}
__global__ void kernel_128_one_512(float *A, float *B, float *bnBias, float *bnScale, float *C) {
int tile = blockIdx.x, part = blockIdx.y, in_channel = threadIdx.x, line = threadIdx.y;
int ind = line*128 + in_channel;
extern __shared__ float shared_[];
float *weights = shared_ + 128*4, *output = weights + 128*64, *input = shared_;
float *bias = output + 4*128, *scale = bias + 128;
input[ind] = A[tile * 512 + ind];
bias[in_channel] = bnBias[part*128 + in_channel];
scale[in_channel] = bnScale[part*128 + in_channel];
output[ind] = 0.0f;
__syncthreads();
for (int k = 0; k < 128; k += 64) {
for (int i = 0; i < 16; i++)
weights[ind + 512*i] = B[(k + i*4 + line)*512 + part*128 + in_channel];
__syncthreads();
float *A_start = input + k;
for (int p = 0; p < 64; p++) {
output[ind] += A_start[line*128 + p] * weights[in_channel + p*128];
}
__syncthreads();
}
float *C_start = C + tile*2048 + part*128;
float res = scale[in_channel] * output[ind] + bias[in_channel];
C_start[line * 512 + in_channel] = res;
}
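// Note: unlike kernel_512_one_128 above, this kernel stores the batch-norm
// result without applying ReLU; the cuDNN reference path in kernel_128_1_out
// below likewise omits cudnnActivationForward.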
int kernel_128_1_out() {
float *input = get_parameter(inputName128one, 14*14*128);
float *weight = get_parameter(weightName128one, 128*512);
float *bnBias = get_parameter(bnBiasName128one, 512);
float *bnScale = get_parameter(bnScaleName128one, 512);
float *bnBias_myKernel = get_parameter(bnBias_myKernel_Name128one, 512);
float *bnScale_myKernel = get_parameter(bnScale_myKernel_Name128one, 512);
float *eMeanName = get_parameter(eMeanName128one, 512);
float *eVarName = get_parameter(eVarName128one, 512);
float *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;
int nInput = 14*14*128, nOutput = 14*14*512, nWeights = 128*512;
float tmp[nOutput], tmp_cudnn[nOutput];
uint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;
cudaError_t s;
/////////////////////////////////
// My Kernel
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &input_, nInput<<3); // 2x over-allocation; only nInput*sizeof(float) bytes are copied and read below
cudaMalloc((void **) &output_, nOutput<<2);
cudaMalloc((void **) &weight_, nWeights<<2);
cudaMalloc((void **) &bnBias_, 512<<2);
cudaMalloc((void **) &bnScale_, 512<<2);
cudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnBias_, bnBias_myKernel, 512<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale_myKernel, 512<<2, cudaMemcpyHostToDevice);
/* 2. Computing */
nT1 = getTimeMicroseconds64();
kernel_128_one_512 <<<dim3(49, 4), dim3(128, 4), (4*128 + 64*128 + 4*128 + 2*128)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_); // shared bytes: input tile + 64x128 weight slab + output tile + bias + scale
//cudaCheckError();
cudaDeviceSynchronize();
nT2 = getTimeMicroseconds64();
printf("TotalTime = %d us\n", nT2-nT1);
/* 3. Copy back and free */
s = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaCheckError();
free(bnBias_myKernel);
free(bnScale_myKernel);
/////////////////////////////////
// cuDNN
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &eMeanName_, 512<<2);
cudaMalloc((void **) &eVarName_, 512<<2);
cudaMemcpy(bnBias_, bnBias, 512<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale, 512<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eMeanName_, eMeanName, 512<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eVarName_, eVarName, 512<<2, cudaMemcpyHostToDevice);
weight = transpose(weight, 512, 128);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
/* 2. cuDNN preparation */
cudnnStatus_t status;
float one = 1.0, zero = 0.0;
size_t size; // workspace size in bytes; cudnnGetConvolutionForwardWorkspaceSize writes a size_t
cudnnHandle_t handle;
status = cudnnCreate(&handle);
if (status != CUDNN_STATUS_SUCCESS) printf("failed1\n");
cudnnTensorDescriptor_t xdesc, ydesc;
cudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW
status = cudnnCreateTensorDescriptor(&xdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed2\n");
status = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed3\n");
status = cudnnCreateTensorDescriptor(&ydesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed4\n");
status = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 512, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed5\n");
status = cudnnCreateFilterDescriptor(&wdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed6\n");
status = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 512, 128, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed7\n");
cudnnConvolutionDescriptor_t conv_desc;
status = cudnnCreateConvolutionDescriptor(&conv_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed10\n");
status = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION
if (status != CUDNN_STATUS_SUCCESS) printf("failed11\n");
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;
status = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed14\n");
status = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 512, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed15\n");
cudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0; // 0 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
status = cudnnGetConvolutionForwardWorkspaceSize(handle,
xdesc,
wdesc,
conv_desc,
ydesc,
algo,
&size);
float *extra;
cudaMalloc((void **) &extra, size);
/* 3. Computing */
nT1_cudnn = getTimeMicroseconds64();
status = cudnnConvolutionForward(handle, &one,
xdesc, input_, wdesc, weight_,
conv_desc, algo,
extra, size, &zero,
ydesc, output_);
if (status != CUDNN_STATUS_SUCCESS) printf("Not Successed1\n");
status = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,
&one, &zero,
ydesc, output_, ydesc, output_,
bnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);
if (status != CUDNN_STATUS_SUCCESS) printf("Not Successed2\n");
cudaDeviceSynchronize();
nT2_cudnn = getTimeMicroseconds64();
printf("cuDNN TotalTime = %d us\n", nT2_cudnn-nT1_cudnn);
/* 4. Copy back and free */
s = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaFree(extra);
cudaFree(input_);
cudaFree(output_);
cudaFree(weight_);
cudaFree(bnScale_);
cudaFree(bnBias_);
cudaFree(eMeanName_);
cudaFree(eVarName_);
free(input);
free(weight);
free(bnScale);
free(bnBias);
free(eMeanName);
free(eVarName);
output_checker(tmp, tmp_cudnn, 14, 512, 0);
return ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn); // pack both runtimes (us) into one int: custom kernel in the high bits, cuDNN in the low 16
}
================================================
FILE: Kernel128_one.h
================================================
#ifndef __KERNEL128_ONE_H__
#define __KERNEL128_ONE_H__
#ifdef __cplusplus
extern "C" {
#endif
const char inputName128one[] = "data/input_one_14_1024.bin";
const char weightName128one[] = "data/weight_one_1024.bin";
const char bnBiasName128one[] = "data/bnBias_one_1024.bin";
const char bnScaleName128one[] = "data/bnScale_one_1024.bin";
const char bnBias_myKernel_Name128one[] = "data/bnBias_myKernel_one_1024.bin";
const char bnScale_myKernel_Name128one[] = "data/bnScale_myKernel_one_1024.bin";
const char eMeanName128one[] = "data/eMean_one_1024.bin";
const char eVarName128one[] = "data/eVar_one_1024.bin";
int kernel_128_1_in();
int kernel_128_1_out();
#ifdef __cplusplus
}
#endif
#endif
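/* Usage sketch (hypothetical driver; the repository's actual entry point,
 * Test.c, is not reproduced in this extract):
 *
 *   int t_in  = kernel_128_1_in();   // 512 -> 128 1x1 conv + BN + ReLU
 *   int t_out = kernel_128_1_out();  // 128 -> 512 1x1 conv + BN
 *
 * Each call benchmarks the hand-written kernel against cuDNN, checks the two
 * outputs against each other, and returns both runtimes packed into one int
 * (custom kernel in the high bits, cuDNN in the low 16).
 */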
================================================
FILE: Kernel128_winograd.cu
================================================
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <float.h>
#include <math.h>
#include <assert.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include "cudnn.h"
#include "util.h"
#include "Kernel128_winograd.h"
#define cudaCheckError() { \
cudaError_t e=cudaGetLastError(); \
if(e!=cudaSuccess) { \
printf("Cuda failure %s:%d:'%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
}
#define MY_KERNEL 1
#define d(input, i, j, Inz) ( input[Inz + i*768 + (j<<7)] )
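// The two switch blocks in kernel_128_winograd_BtdB compute B^T * d * B, the
// input transform of Winograd F(4x4, 3x3) with interpolation points
// {0, +-1, +-2}; the rows of B^T, as implemented case by case below, are
//
//         [ 4   0  -5   0   1   0 ]
//         [ 0  -4  -4   1   1   0 ]
//   B^T = [ 0   4  -4  -1   1   0 ]
//         [ 0  -2  -1   2   1   0 ]
//         [ 0   2  -1  -2   1   0 ]
//         [ 0   4   0  -5   0   1 ]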
__global__ void kernel_128_winograd_BtdB(float *pInputs, float *pOutputs) {
int Inx = blockIdx.x<<2, Iny0 = blockIdx.y<<2, Iny1 = threadIdx.y, Inz = threadIdx.x;
int Iny = Iny0+Iny1, stride_r = 2048, stride_c = 128; // 2048 = 16*128
int c_glb_start = Inx*stride_r + Iny*stride_c + Inz, c_input = Iny1*stride_c + Inz;
extern __shared__ float input[];
int tmp[6] = {0, 768, 1536, 2304, 3072, 3840}; // 768 = 6*128
for (int i = 0; i < 6; i++) {
input[c_input + tmp[i]] = pInputs[c_glb_start + i*stride_r];
}
__syncthreads();
float BTd[6];
switch(Iny1) {
case 0:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 0, j, Inz)*4 - d(input, 2, j, Inz)*5 + d(input, 4, j, Inz);
}
break;
case 1:
for (int j = 0; j < 6; j++) {
BTd[j] = -d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 + d(input, 3, j, Inz) + d(input, 4, j, Inz);
}
break;
case 2:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 - d(input, 3, j, Inz) + d(input, 4, j, Inz);
}
break;
case 3:
for (int j = 0; j < 6; j++) {
BTd[j] = -d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) + d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);
}
break;
case 4:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) - d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);
}
break;
case 5:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 1, j, Inz)*4 - d(input, 3, j, Inz)*5 + d(input, 5, j, Inz);
}
break;
}
__syncthreads();
int tmp_offset = Iny1*768+Inz;
for (int i = 0; i < 6; i++) {
input[tmp_offset + i*stride_c] = BTd[i];
}
__syncthreads();
float BTdB[6];
switch(Iny1) {
case 0:
for (int i = 0; i < 6; i++) {
BTdB[i] = 4*d(input, i, 0, Inz) - 5*d(input, i, 2, Inz) + d(input, i, 4, Inz);
}
break;
case 1:
for (int i = 0; i < 6; i++) {
BTdB[i] = -4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) + d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 2:
for (int i = 0; i < 6; i++) {
BTdB[i] = 4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) - d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 3:
for (int i = 0; i < 6; i++) {
BTdB[i] = -2*d(input, i, 1, Inz) - d(input, i, 2, Inz) + 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 4:
for (int i = 0; i < 6; i++) {
BTdB[i] = 2*d(input, i, 1, Inz) - d(input, i, 2, Inz) - 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 5:
for (int i = 0; i < 6; i++) {
BTdB[i] = 4*d(input, i, 1, Inz) - 5*d(input, i, 3, Inz) + d(input, i, 5, Inz);
}
break;
}
__syncthreads();
for (int i = 0; i < 6; i++) {
pOutputs[(Iny1 + i*6)*2048 + (blockIdx.x*4+blockIdx.y)*128 + Inz] = BTdB[i];
}
}
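// BtdB stores its result as [36 Winograd coords][16 tiles][128 channels]
// (stride 2048 = 16*128 per coordinate). kernel_128_winograd_AtIA below
// applies the inverse transform A^T * I * A, where for F(4x4, 3x3)
//
//         [ 1   1   1   1   1   0 ]
//   A^T = [ 0   1  -1   2  -2   0 ]
//         [ 0   1   1   4   4   0 ]
//         [ 0   1  -1   8  -8   1 ]
//
// and fuses the batch-norm scale/bias and the ReLU into the final store.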
__global__ void kernel_128_winograd_AtIA(float *pInputs, float *pBiases, float *pScales, float *pOutputs) {
int Tilex = blockIdx.x, Tiley = blockIdx.y, Iny = threadIdx.y, kz = blockIdx.z, Inx = threadIdx.x;
int c_input = Inx*6 + Iny;
__shared__ float bias, scale;
extern __shared__ float input[];
input[c_input] = pInputs[c_input*16*128 + (Tilex*4+Tiley)*128 + kz];
bias = pBiases[kz];
scale = pScales[kz];
__syncthreads();
float tmp = 0;
switch(Inx) {
case 0:
tmp = input[Iny] + input[6+Iny] + input[12+Iny] + input[18+Iny] + input[24+Iny];
break;
case 1:
tmp = input[6+Iny] - input[12+Iny] + 2*input[18+Iny] - 2*input[24+Iny];
break;
case 2:
tmp = input[6+Iny] + input[12+Iny] + 4*input[18+Iny] + 4*input[24+Iny];
break;
case 3:
tmp = input[6+Iny] - input[12+Iny] + 8*input[18+Iny] - 8*input[24+Iny] + input[30+Iny];
break;
}
__syncthreads();
input[c_input] = tmp;
__syncthreads();
if (Inx > 3 || (Tilex == 3 && Inx > 1)) return;
int x;
float o;
switch(Iny) {
case 0:
x = Inx*6;
o = scale*(input[x]+input[x+1]+input[x+2]+input[x+3]+input[x+4])+ bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+1)*128 + kz] = o > 0 ? o : 0;
break;
case 1:
x = Inx*6;
o = scale*(input[x+1] - input[x+2] + 2*input[x+3] - 2*input[x+4]) + bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+2)*128 + kz] = o > 0 ? o : 0;
break;
case 2:
if (Tiley == 3) break;
x = Inx*6;
o = scale*(input[x+1] + input[x+2] + 4*input[x+3] + 4*input[x+4]) + bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+3)*128 + kz] = o > 0 ? o : 0;
break;
case 3:
if (Tiley == 3) break;
x = Inx*6;
o = scale*(input[x+1] - input[x+2] + 8*input[x+3] - 8*input[x+4] + input[x+5]) + bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+4)*128 + kz] = o > 0 ? o : 0;
break;
}
}
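// The early return and the Tilex/Tiley == 3 special cases above clip the
// 16x16 zero-padded frame back to the valid 14x14 region: only rows and
// columns 1..14 of the padded output are ever written.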
__global__ void kernel_128_OuterProduct_128(float *A, float *B, float *C) {
int Tile = blockIdx.x, Part = blockIdx.y, tX = threadIdx.x, tY = threadIdx.y;
int c_input = tY*128 + tX, c_kernel = c_input, T_offset = (Tile<<11) + (Part<<10) + c_input, B_offset = (Tile<<14) + c_kernel;
extern __shared__ float input[];
float *kernel = input + 1024, *out = kernel + 8192;
int B_stride[32] = {0, 128, 256, 384, 512, 640, 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048, 2176, 2304, 2432, 2560, 2688, 2816, 2944, 3072, 3200, 3328, 3456, 3584, 3712, 3840, 3968};//, 4096, 4224, 4352, 4480, 4608, 4736, 4864, 4992, 5120, 5248, 5376, 5504, 5632, 5760, 5888, 6016, 6144, 6272, 6400, 6528, 6656, 6784, 6912, 7040, 7168, 7296, 7424, 7552, 7680, 7808, 7936, 8064};
out[c_input] = 0.0f;
input[c_input] = A[T_offset];
for (int k = 0; k < 4; k++) {
int B_start = B_offset + (k<<12); // 4096 = 32*128: k-th 32-row slab of this coordinate's 128x128 kernel
kernel[c_kernel] = B[B_start], kernel[c_kernel+1024] = B[B_start+1024];
kernel[c_kernel+2048] = B[B_start+2048], kernel[c_kernel+3072] = B[B_start+3072];
__syncthreads();
float sum = 0;
int y_tmp = (tY<<7)+(k<<5);
for (int j = 0; j < 32; j++) {
sum += input[y_tmp + j] * kernel[tX + B_stride[j]];
}
out[tY*128 + tX] += sum;
__syncthreads();
}
C[T_offset] = out[c_input];
}
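// For each of the 36 Winograd coordinates, kernel_128_OuterProduct_128
// computes ip[16][128] = t_input[16][128] * U[128][128]: blockIdx.x selects
// the coordinate, blockIdx.y one half of the 16 tiles, and the kernel matrix
// is staged through shared memory in 32x128 slabs (B_stride[j] is just j*128,
// the row stride within a slab).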
int kernel_128() {
float *input_ = get_parameter(inputName128, 16*16*128);
float *bias = get_parameter(biasName128, 128);
float *input, *output, *l_weights, *l_bias;
uint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;
cudaError_t s;
/////////////////////////////////
// My Kernel
/////////////////////////////////
/* 1. Data preparation */
float *t_input, *ip;
//float *kernel = get_Winograd_Kernel128(weight_winograd_Name128, 128);
float *kernel = get_parameter(weight_winograd_Name128, 36*128*128);
float *l_bnBias, *l_bnScale, *bnBias, *bnScale;
int nInput = 16*16*128, nOutput = 16*16*128, nWeights = 36*128*128, nBias = 128, nTransInput = 16*6*6*128, nInnerProd = 16*6*6*128;
cudaMalloc((void **) &input, nInput<<3); // 2x allocation: BtdB's edge tiles read up to two rows/columns past the 16x16 input; the zeroed tail supplies that padding
cudaMalloc((void **) &output, nOutput<<2);
cudaMalloc((void **) &l_weights, nWeights<<2);
cudaMalloc((void **) &l_bias, nBias<<2);
cudaMalloc((void **) &t_input, nTransInput<<2);
cudaMalloc((void **) &ip, nInnerProd<<2);
cudaMemset((void *) input, 0, nInput<<3); // zero everything, including the padding region read by BtdB
cudaMemset((void *) output, 0, nOutput<<2);
cudaMemset((void *) t_input, 0, nTransInput<<2);
cudaMemset((void *) l_weights, 0, nWeights<<2);
cudaMemset((void *) ip, 0, nInnerProd<<2);
cudaMemcpy(input, input_, nInput<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);
bnBias = get_parameter(bnBias_winograd_Name128, 128);
bnScale = get_parameter(bnScale_winograd_Name128, 128);
cudaMalloc((void **) &l_bnBias, nBias<<2);
cudaMalloc((void **) &l_bnScale, nBias<<2);
cudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);
float tmp_winograd[nOutput];
/* 2. Computing */
nT1 = getTimeMicroseconds64();
kernel_128_winograd_BtdB <<<dim3(4, 4), dim3(128, 6), (6*6*128)<<2 >>> (input, t_input); // shared: one 6x6x128 input tile
kernel_128_OuterProduct_128<<<dim3(36, 2), dim3(128, 8), (8*128 + 64*128 + 8*128)<<2 >>> (t_input, l_weights, ip); // shared: input rows + staged kernel slab + output rows
kernel_128_winograd_AtIA <<<dim3(4, 4, 128), dim3(6, 6), ((6*6)<<2)>>> (ip, l_bnBias, l_bnScale, output); // shared: one 6x6 tile
//cudaCheckError();
cudaDeviceSynchronize();
nT2 = getTimeMicroseconds64();
printf("TotalTime = %d us\n", nT2-nT1);
/* 3. Copy back and free */
s = cudaMemcpy(tmp_winograd, output, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
//cudaCheckError();
cudaFree(t_input);
cudaFree(output);
cudaFree(l_weights);
cudaFree(l_bias);
cudaFree(ip);
free(kernel);
free(bnScale);
free(bnBias);
/////////////////////////////////
// cuDNN
/////////////////////////////////
/* 1. Data preparation */
kernel = get_parameter(weight_NCHW_Name128, 9*128*128);
bnBias = get_parameter(bnBiasName128, 128);
bnScale = get_parameter(bnScaleName128, 128);
float* eMean = get_parameter(eMeanName128, 128);
float* eVar = get_parameter(eVarName128, 128);
float *l_eMean, *l_eVar;
nInput = 16*16*128, nOutput = 14*14*128, nWeights = 3*3*128*128, nBias = 128;
cudaMalloc((void **) &output, nOutput<<2);
cudaMalloc((void **) &l_weights, nWeights<<2);
cudaMalloc((void **) &l_bias, nBias<<2);
cudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);
cudaMalloc((void **) &l_eMean, nBias<<2);
cudaMalloc((void **) &l_eVar, nBias<<2);
cudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_eMean, eMean, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_eVar, eVar, nBias<<2, cudaMemcpyHostToDevice);
cudaMemset((void *) output, 0, nOutput<<2);
float tmp_cudnn[nOutput];
/* 2. cuDNN preparation */
cudnnStatus_t status;
float one = 1.0, zero = 0.0;
size_t size; // workspace size in bytes; cudnnGetConvolutionForwardWorkspaceSize writes a size_t
cudnnHandle_t handle;
status = cudnnCreate(&handle);
if (status != CUDNN_STATUS_SUCCESS) printf("failed1\n");
cudnnTensorDescriptor_t xdesc, ydesc, bdesc;
cudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW
status = cudnnCreateTensorDescriptor(&xdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed2\n");
status = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 16, 16);
if (status != CUDNN_STATUS_SUCCESS) printf("failed3\n");
status = cudnnCreateTensorDescriptor(&ydesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed4\n");
status = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed5\n");
status = cudnnCreateFilterDescriptor(&wdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed6\n");
status = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 128, 128, 3, 3);
if (status != CUDNN_STATUS_SUCCESS) printf("failed7\n");
status = cudnnCreateTensorDescriptor(&bdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed8\n");
status = cudnnSetTensor4dDescriptor(bdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 128, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed9\n");
cudnnConvolutionDescriptor_t conv_desc;
status = cudnnCreateConvolutionDescriptor(&conv_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed10\n");
status = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION
if (status != CUDNN_STATUS_SUCCESS) printf("failed11\n");
cudnnActivationDescriptor_t act_desc;
status = cudnnCreateActivationDescriptor(&act_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed12\n");
status = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);
if (status != CUDNN_STATUS_SUCCESS) printf("failed13\n");
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;
status = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed14\n");
status = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 128, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed15\n");
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; // algo 6
status = cudnnGetConvolutionForwardWorkspaceSize(handle,
xdesc,
wdesc,
conv_desc,
ydesc,
algo,
(size_t *)&(size));
float *extra;
cudaMalloc((void **) &extra, size);
/* 3. Computing */
nT1_cudnn = getTimeMicroseconds64();
status = cudnnConvolutionForward(handle, &one,
xdesc, input, wdesc, l_weights,
conv_desc, algo,
extra, size, &zero,
ydesc, output);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnConvolutionForward failed\n");
status = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,
&one, &zero,
ydesc, output, ydesc, output,
bnScaleBiasMeanVarDesc, l_bnScale, l_bnBias, l_eMean, l_eVar, CUDNN_BN_MIN_EPSILON);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnBatchNormalizationForwardInference failed\n");
status = cudnnActivationForward(handle, act_desc, &one,
ydesc, output, &zero,
ydesc, output);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnActivationForward failed\n");
cudaDeviceSynchronize();
nT2_cudnn = getTimeMicroseconds64();
printf("cuDNN TotalTime = %d us\n", nT2_cudnn-nT1_cudnn);
/* 4. Copy back and free */
s = cudaMemcpy(tmp_cudnn, output, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaFree(extra);
cudaFree(input);
cudaFree(output);
cudaFree(l_weights);
cudaFree(l_bias);
cudaFree(l_bnScale);
cudaFree(l_bnBias);
cudaFree(l_eMean);
cudaFree(l_eVar);
free(bias);
free(kernel);
free(bnScale);
free(bnBias);
free(eMean);
free(eVar);
free(input_);
output_checker(tmp_winograd, tmp_cudnn, 14, 128, 1);
return ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);
}
================================================
FILE: Kernel128_winograd.h
================================================
#ifndef __KERNEL128_WINOGRAD_H__
#define __KERNEL128_WINOGRAD_H__
#ifdef __cplusplus
extern "C" {
#endif
const char inputName128[] = "data/input_14_1_128.bin";
const char biasName128[] = "data/bias_128.bin";
const char weight_winograd_Name128[] = "data/weight_winograd_128_128.bin";
const char weight_NCHW_Name128[] = "data/weight_NCHW_128_128.bin";
const char bnBiasName128[] = "data/bnBias_128.bin";
const char bnScaleName128[] = "data/bnScale_128.bin";
const char bnBias_winograd_Name128[] = "data/bnBias_winograd_128.bin";
const char bnScale_winograd_Name128[] = "data/bnScale_winograd_128.bin";
const char eMeanName128[] = "data/eMean_128.bin";
const char eVarName128[] = "data/eVar_128.bin";
int kernel_128();
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: Kernel256_one.cu
================================================
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include <assert.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include "cudnn.h"
#include "util.h"
#include "Kernel256_one.h"
#define cudaCheckError() { \
cudaError_t e=cudaGetLastError(); \
if(e!=cudaSuccess) { \
printf("Cuda failure %s:%d:'%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
}
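/*
 * 1x1 convolution (1024 -> 256 channels) expressed as a GEMM and fused with
 * the folded BN scale/bias and ReLU. Each of the 49 blocks handles 4 pixels
 * of the 14x14 map: the 4x1024 input slab, a 16x256 weight panel and the
 * 4x256 partial sums are staged in shared memory while the k-loop walks the
 * 1024 input channels 16 at a time.
 */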
__global__ void kernel_1024_one_256(float *A, float *B, float *bnBias, float *bnScale, float *C) {
int tile = blockIdx.x, in_channel = threadIdx.x, line = threadIdx.y;
int ind = line*256 + in_channel;
extern __shared__ float shared_[];
float *weights = shared_ + 1024*4, *output = weights + 256*16, *input = shared_;
float *bias = output + 4*256, *scale = bias + 256;
for (int i = 0; i < 4; i++)
input[ind + i*1024] = A[tile*4096 + i*1024 + ind];
bias[in_channel] = bnBias[in_channel];
scale[in_channel] = bnScale[in_channel];
output[ind] = 0.0f;
__syncthreads();
for (int k = 0; k < 1024; k += 16) {
float *B_start = B + k*256;
for (int i = 0; i < 4; i++)
weights[ind + i*1024] = B_start[i*1024 + ind];
__syncthreads();
float *A_start = input + k;
for (int p = 0; p < 16; p++) {
output[ind] += A_start[line*1024 + p] * weights[in_channel + p*256];
}
__syncthreads();
}
float *C_start = C + tile*1024, res = scale[in_channel] * output[ind] + bias[in_channel];
C_start[ind] = res > 0 ? res : 0;
}
int kernel_256_1_in() {
float *input = get_parameter(inputName256one, 14*14*1024);
float *weight = get_parameter(weightName256one, 256*1024);
float *bnBias = get_parameter(bnBiasName256one, 256);
float *bnScale = get_parameter(bnScaleName256one, 256);
float *bnBias_myKernel = get_parameter(bnBias_myKernel_Name256one, 256);
float *bnScale_myKernel = get_parameter(bnScale_myKernel_Name256one, 256);
float *eMeanName = get_parameter(eMeanName256one, 256);
float *eVarName = get_parameter(eVarName256one, 256);
float *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;
int nInput = 14*14*1024, nOutput = 14*14*256, nWeights = 256*1024;
float tmp[nOutput], tmp_cudnn[nOutput];
uint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;
cudaError_t s;
/////////////////////////////////
// My Kernel
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &input_, nInput<<3);
cudaMalloc((void **) &output_, nOutput<<2);
cudaMalloc((void **) &weight_, nWeights<<2);
cudaMalloc((void **) &bnBias_, 256<<2);
cudaMalloc((void **) &bnScale_, 256<<2);
cudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnBias_, bnBias_myKernel, 256<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale_myKernel, 256<<2, cudaMemcpyHostToDevice);
/* 2. Computing */
nT1 = getTimeMicroseconds64();
kernel_1024_one_256 <<<dim3(49), dim3(256, 4), (4*1024 + 16*256 + 4*256 + 2*256)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_);
//cudaCheckError();
cudaDeviceSynchronize();
nT2 = getTimeMicroseconds64();
printf("TotalTime = %d us\n", nT2-nT1);
/* 3. Copy back and free */
s = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaCheckError();
free(bnBias_myKernel);
free(bnScale_myKernel);
/////////////////////////////////
// cuDNN
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &eMeanName_, 256<<2);
cudaMalloc((void **) &eVarName_, 256<<2);
cudaMemcpy(bnBias_, bnBias, 256<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale, 256<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eMeanName_, eMeanName, 256<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eVarName_, eVarName, 256<<2, cudaMemcpyHostToDevice);
weight = transpose(weight, 256, 1024);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
/* 2. cuDNN preparation */
cudnnStatus_t status;
float one = 1.0, zero = 0.0;
size_t size;
cudnnHandle_t handle;
status = cudnnCreate(&handle);
if (status != CUDNN_STATUS_SUCCESS) printf("failed1\n");
cudnnTensorDescriptor_t xdesc, ydesc;
cudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW
status = cudnnCreateTensorDescriptor(&xdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed2\n");
status = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 1024, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed3\n");
status = cudnnCreateTensorDescriptor(&ydesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed4\n");
status = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed5\n");
status = cudnnCreateFilterDescriptor(&wdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed6\n");
status = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 256, 1024, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed7\n");
cudnnConvolutionDescriptor_t conv_desc;
status = cudnnCreateConvolutionDescriptor(&conv_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed10\n");
status = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_DATA_FLOAT
if (status != CUDNN_STATUS_SUCCESS) printf("failed11\n");
cudnnActivationDescriptor_t act_desc;
status = cudnnCreateActivationDescriptor(&act_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed12\n");
status = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);
if (status != CUDNN_STATUS_SUCCESS) printf("failed13\n");
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;
status = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed14\n");
status = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 256, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed15\n");
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; // algo 0
status = cudnnGetConvolutionForwardWorkspaceSize(handle,
xdesc,
wdesc,
conv_desc,
ydesc,
algo,
(size_t *)&(size));
float *extra;
cudaMalloc((void **) &extra, size);
/* 3. Computing */
nT1_cudnn = getTimeMicroseconds64();
status = cudnnConvolutionForward(handle, &one,
xdesc, input_, wdesc, weight_,
conv_desc, algo,
extra, size, &zero,
ydesc, output_);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnConvolutionForward failed\n");
status = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,
&one, &zero,
ydesc, output_, ydesc, output_,
bnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnBatchNormalizationForwardInference failed\n");
status = cudnnActivationForward(handle, act_desc, &one,
ydesc, output_, &zero,
ydesc, output_);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnActivationForward failed\n");
cudaDeviceSynchronize();
nT2_cudnn = getTimeMicroseconds64();
printf("cuDNN TotalTime = %d us\n", nT2_cudnn-nT1_cudnn);
/* 4. Copy back and free */
s = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaFree(extra);
cudaFree(input_);
cudaFree(output_);
cudaFree(weight_);
cudaFree(bnScale_);
cudaFree(bnBias_);
cudaFree(eMeanName_);
cudaFree(eVarName_);
free(input);
free(weight);
free(bnScale);
free(bnBias);
free(eMeanName);
free(eVarName);
output_checker(tmp, tmp_cudnn, 14, 256, 0);
return ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);
}
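/*
 * 1x1 convolution (256 -> 1024 channels) as a GEMM, fused with the folded BN
 * scale/bias (note: no ReLU in this kernel). The grid is (49, 4): 49 groups
 * of 4 pixels times 4 output-channel parts of 256; the k-loop walks the 256
 * input channels 32 at a time through shared memory.
 */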
__global__ void kernel_256_one_1024(float *A, float *B, float *bnBias, float *bnScale, float *C) {
int tile = blockIdx.x, part = blockIdx.y, in_channel = threadIdx.x, line = threadIdx.y;
int ind = line*256 + in_channel;
extern __shared__ float shared_[];
float *weights = shared_ + 256*4, *output = weights + 256*32, *input = shared_;
float *bias = output + 4*256, *scale = bias + 256;
input[ind] = A[tile * 1024 + ind];
bias[in_channel] = bnBias[part*256 + in_channel];
scale[in_channel] = bnScale[part*256+ in_channel];
output[ind] = 0.0f;
__syncthreads();
for (int k = 0; k < 256; k += 32) {
for (int i = 0; i < 8; i++)
weights[ind + 1024*i] = B[(k + i*4 + line)*1024 + part*256 + in_channel];
__syncthreads();
float *A_start = input + k;
for (int p = 0; p < 32; p++) {
output[ind] += A_start[line*256 + p] * weights[in_channel + p*256];
}
__syncthreads();
}
float *C_start = C + tile*4096 + part*256;
C_start[line * 1024 + in_channel] = scale[in_channel] * output[ind] + bias[in_channel];
}
int kernel_256_1_out() {
float *input = get_parameter(inputName256one, 14*14*256);
float *weight = get_parameter(weightName256one, 256*1024);
float *bnBias = get_parameter(bnBiasName256one, 1024);
float *bnScale = get_parameter(bnScaleName256one, 1024);
float *bnBias_myKernel = get_parameter(bnBias_myKernel_Name256one, 1024);
float *bnScale_myKernel = get_parameter(bnScale_myKernel_Name256one, 1024);
float *eMeanName = get_parameter(eMeanName256one, 1024);
float *eVarName = get_parameter(eVarName256one, 1024);
float *input_, *output_, *weight_, *bnBias_, *bnScale_, *eMeanName_, *eVarName_;
int nInput = 14*14*256, nOutput = 14*14*1024, nWeights = 256*1024;
float tmp[nOutput], tmp_cudnn[nOutput];
uint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;
cudaError_t s;
/////////////////////////////////
// My Kernel
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &input_, nInput<<3);
cudaMalloc((void **) &output_, nOutput<<2);
cudaMalloc((void **) &weight_, nWeights<<2);
cudaMalloc((void **) &bnBias_, 1024<<2);
cudaMalloc((void **) &bnScale_, 1024<<2);
cudaMemcpy(input_, input, nInput<<2, cudaMemcpyHostToDevice);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnBias_, bnBias_myKernel, 1024<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale_myKernel, 1024<<2, cudaMemcpyHostToDevice);
/* 2. Computing */
nT1 = getTimeMicroseconds64();
kernel_256_one_1024 <<<dim3(49, 4), dim3(256, 4), (4*256 + 32*256 + 4*256 + 2*256)<<2 >>> (input_, weight_, bnBias_, bnScale_, output_);
cudaCheckError();
cudaDeviceSynchronize();
nT2 = getTimeMicroseconds64();
printf("TotalTime = %d us\n", nT2-nT1);
/* 3. Copy back and free */
s = cudaMemcpy(tmp, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaCheckError();
free(bnBias_myKernel);
free(bnScale_myKernel);
/////////////////////////////////
// cuDNN
/////////////////////////////////
/* 1. Data preparation */
cudaMalloc((void **) &eMeanName_, 1024<<2);
cudaMalloc((void **) &eVarName_, 1024<<2);
cudaMemcpy(bnBias_, bnBias, 1024<<2, cudaMemcpyHostToDevice);
cudaMemcpy(bnScale_, bnScale, 1024<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eMeanName_, eMeanName, 1024<<2, cudaMemcpyHostToDevice);
cudaMemcpy(eVarName_, eVarName, 1024<<2, cudaMemcpyHostToDevice);
weight = transpose(weight, 1024, 256);
cudaMemcpy(weight_, weight, nWeights<<2, cudaMemcpyHostToDevice);
/* 2. cuDNN preparation */
cudnnStatus_t status;
float one = 1.0, zero = 0.0;
size_t size;
cudnnHandle_t handle;
status = cudnnCreate(&handle);
if (status != CUDNN_STATUS_SUCCESS) printf("failed1\n");
cudnnTensorDescriptor_t xdesc, ydesc;
cudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW
status = cudnnCreateTensorDescriptor(&xdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed2\n");
status = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed3\n");
status = cudnnCreateTensorDescriptor(&ydesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed4\n");
status = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 1024, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed5\n");
status = cudnnCreateFilterDescriptor(&wdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed6\n");
status = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1024, 256, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed7\n");
cudnnConvolutionDescriptor_t conv_desc;
status = cudnnCreateConvolutionDescriptor(&conv_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed10\n");
status = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT
if (status != CUDNN_STATUS_SUCCESS) printf("failed11\n");
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;
status = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed14\n");
status = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1024, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed15\n");
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; // algo 0
status = cudnnGetConvolutionForwardWorkspaceSize(handle,
xdesc,
wdesc,
conv_desc,
ydesc,
algo,
(size_t *)&(size));
float *extra;
cudaMalloc((void **) &extra, size);
/* 3. Computing */
nT1_cudnn = getTimeMicroseconds64();
status = cudnnConvolutionForward(handle, &one,
xdesc, input_, wdesc, weight_,
conv_desc, algo,
extra, size, &zero,
ydesc, output_);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnConvolutionForward failed\n");
status = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,
&one, &zero,
ydesc, output_, ydesc, output_,
bnScaleBiasMeanVarDesc, bnScale_, bnBias_, eMeanName_, eVarName_, CUDNN_BN_MIN_EPSILON);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnBatchNormalizationForwardInference failed\n");
cudaDeviceSynchronize();
nT2_cudnn = getTimeMicroseconds64();
printf("cuDNN TotalTime = %d us\n", nT2_cudnn-nT1_cudnn);
/* 4. Copy back and free */
s = cudaMemcpy(tmp_cudnn, output_, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaFree(extra);
cudaFree(input_);
cudaFree(output_);
cudaFree(weight_);
cudaFree(bnScale_);
cudaFree(bnBias_);
cudaFree(eMeanName_);
cudaFree(eVarName_);
free(input);
free(weight);
free(bnScale);
free(bnBias);
free(eMeanName);
free(eVarName);
output_checker(tmp, tmp_cudnn, 14, 1024, 0);
return ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);
}
================================================
FILE: Kernel256_one.h
================================================
#ifndef __KERNEL256_ONE_H__
#define __KERNEL256_ONE_H__
#ifdef __cplusplus
extern "C" {
#endif
const char inputName256one[] = "data/input_one_14_1024.bin";
const char weightName256one[] = "data/weight_one_1024.bin";
const char bnBiasName256one[] = "data/bnBias_one_1024.bin";
const char bnScaleName256one[] = "data/bnScale_one_1024.bin";
const char bnBias_myKernel_Name256one[] = "data/bnBias_myKernel_one_1024.bin";
const char bnScale_myKernel_Name256one[] = "data/bnScale_myKernel_one_1024.bin";
const char eMeanName256one[] = "data/eMean_one_1024.bin";
const char eVarName256one[] = "data/eVar_one_1024.bin";
int kernel_256_1_in();
int kernel_256_1_out();
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: Kernel256_winograd.cu
================================================
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include <assert.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include "cudnn.h"
#include "util.h"
#include "Kernel256_winograd.h"
#define cudaCheckError() { \
cudaError_t e=cudaGetLastError(); \
if(e!=cudaSuccess) { \
printf("Cuda failure %s:%d:'%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
}
#define MY_KERNEL 0
#define d(input, i, j, Inz) ( input[Inz + i*768 + (j<<7)] )
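/*
 * Input transform V = B^T d B of Winograd F(4x4, 3x3). Each switch case
 * below applies one row of the 6x6 matrix B^T:
 *   [ 4,  0, -5,  0, 1, 0]
 *   [ 0, -4, -4,  1, 1, 0]
 *   [ 0,  4, -4, -1, 1, 0]
 *   [ 0, -2, -1,  2, 1, 0]
 *   [ 0,  2, -1, -2, 1, 0]
 *   [ 0,  4,  0, -5, 0, 1]
 */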
__global__ void kernel_256_winograd_BtdB(float *pInputs, float *pOutputs) {
int Inx = blockIdx.x<<2, Iny0 = blockIdx.y<<2, Part = blockIdx.z, Iny1 = threadIdx.y, Inz = threadIdx.x;
int Iny = Iny0+Iny1, stride_r = 4096, stride_c = 256; // 4096 = 16*256
int c_glb_start = Inx*stride_r + Iny*stride_c + Inz + (Part<<7), c_input = Iny1*128 + Inz;
extern __shared__ float input[];
int stride_768[6] = {0, 768, 1536, 2304, 3072, 3840}; // 768 = 6*128
for (int i = 0; i < 6; i++) {
input[c_input + stride_768[i]] = pInputs[c_glb_start + i*stride_r];
}
__syncthreads();
float BTd[6];
switch(Iny1) {
case 0:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 0, j, Inz)*4 - d(input, 2, j, Inz)*5 + d(input, 4, j, Inz);
}
break;
case 1:
for (int j = 0; j < 6; j++) {
BTd[j] = -d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 + d(input, 3, j, Inz) + d(input, 4, j, Inz);
}
break;
case 2:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 1, j, Inz)*4 - d(input, 2, j, Inz)*4 - d(input, 3, j, Inz) + d(input, 4, j, Inz);
}
break;
case 3:
for (int j = 0; j < 6; j++) {
BTd[j] = -d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) + d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);
}
break;
case 4:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 1, j, Inz)*2 - d(input, 2, j, Inz) - d(input, 3, j, Inz)*2 + d(input, 4, j, Inz);
}
break;
case 5:
for (int j = 0; j < 6; j++) {
BTd[j] = d(input, 1, j, Inz)*4 - d(input, 3, j, Inz)*5 + d(input, 5, j, Inz);
}
break;
}
__syncthreads();
int tmp_offset = Iny1*768+Inz;
for (int i = 0; i < 6; i++) {
input[tmp_offset + i*128] = BTd[i];
}
__syncthreads();
float BTdB[6];
switch(Iny1) {
case 0:
for (int i = 0; i < 6; i++) {
BTdB[i] = 4*d(input, i, 0, Inz) - 5*d(input, i, 2, Inz) + d(input, i, 4, Inz);
}
break;
case 1:
for (int i = 0; i < 6; i++) {
BTdB[i] = -4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) + d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 2:
for (int i = 0; i < 6; i++) {
BTdB[i] = 4*d(input, i, 1, Inz) - 4*d(input, i, 2, Inz) - d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 3:
for (int i = 0; i < 6; i++) {
BTdB[i] = -2*d(input, i, 1, Inz) - d(input, i, 2, Inz) + 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 4:
for (int i = 0; i < 6; i++) {
BTdB[i] = 2*d(input, i, 1, Inz) - d(input, i, 2, Inz) - 2*d(input, i, 3, Inz) + d(input, i, 4, Inz);
}
break;
case 5:
for (int i = 0; i < 6; i++) {
BTdB[i] = 4*d(input, i, 1, Inz) - 5*d(input, i, 3, Inz) + d(input, i, 5, Inz);
}
break;
}
__syncthreads();
for (int i = 0; i < 6; i++) {
pOutputs[(Iny1 + i*6)*4096 + (blockIdx.x*4+blockIdx.y)*256 + Inz + (Part<<7)] = BTdB[i];
}
}
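/*
 * Output transform A^T M A of Winograd F(4x4, 3x3), fused with the folded
 * BN scale/bias and ReLU. Each switch case applies one row of the 4x6
 * matrix A^T:
 *   [1, 1,  1, 1,  1, 0]
 *   [0, 1, -1, 2, -2, 0]
 *   [0, 1,  1, 4,  4, 0]
 *   [0, 1, -1, 8, -8, 1]
 * Tiles on the right/bottom border (Tilex/Tiley == 3) write only the part
 * of the 4x4 output that falls inside the 14x14 map.
 */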
__global__ void kernel_256_winograd_AtIA(float *pInputs, float *pBiases, float *pScales, float *pOutputs) {
int Tilex = blockIdx.x, Tiley = blockIdx.y, Iny = threadIdx.y, kz = blockIdx.z, Inx = threadIdx.x;
int c_input = Inx*6 + Iny;
__shared__ float bias, scale;
extern __shared__ float input[];
input[c_input] = pInputs[c_input*16*256 + (Tilex*4+Tiley)*256 + kz];
bias = pBiases[kz];
scale = pScales[kz];
__syncthreads();
float tmp = 0;
switch(Inx) {
case 0:
tmp = input[Iny] + input[6+Iny] + input[12+Iny] + input[18+Iny] + input[24+Iny];
break;
case 1:
tmp = input[6+Iny] - input[12+Iny] + 2*input[18+Iny] - 2*input[24+Iny];
break;
case 2:
tmp = input[6+Iny] + input[12+Iny] + 4*input[18+Iny] + 4*input[24+Iny];
break;
case 3:
tmp = input[6+Iny] - input[12+Iny] + 8*input[18+Iny] - 8*input[24+Iny] + input[30+Iny];
break;
}
__syncthreads();
input[c_input] = tmp;
__syncthreads();
if (Inx > 3 || (Tilex == 3 && Inx > 1)) return;
int x;
float o;
switch(Iny) {
case 0:
x = Inx*6;
o = scale*(input[x]+input[x+1]+input[x+2]+input[x+3]+input[x+4]) + bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+1)*256 + kz] = o > 0 ? o : 0;
break;
case 1:
x = Inx*6;
o = scale*(input[x+1] - input[x+2] + 2*input[x+3] - 2*input[x+4]) + bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+2)*256 + kz] = o > 0 ? o : 0;
break;
case 2:
if (Tiley == 3) break;
x = Inx*6;
o = scale*(input[x+1] + input[x+2] + 4*input[x+3] + 4*input[x+4]) + bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+3)*256 + kz] = o > 0 ? o : 0;
break;
case 3:
if (Tiley == 3) break;
x = Inx*6;
o = scale*(input[x+1] - input[x+2] + 8*input[x+3] - 8*input[x+4] + input[x+5]) + bias;
pOutputs[(((Tilex<<2)+1+Inx)*16 + (Tiley<<2)+4)*256 + kz] = o > 0 ? o : 0;
break;
}
}
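/*
 * Element-wise stage of the Winograd pipeline as a batched GEMM: for each of
 * the 36 transform positions, multiply the 16x256 transformed-input tile by
 * the 256x256 transformed-weight matrix. Each block covers 8 of the 16 tile
 * rows; the k-loop stages 32x256 weight panels in shared memory.
 */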
__global__ void kernel_256_OuterProduct_256(float *A, float *B, float *C) {
int Tile = blockIdx.x, Part = blockIdx.y, tX = threadIdx.x, tY = threadIdx.y;
int c_input = tY*256 + tX, c_kernel = c_input, T_offset = (Tile<<12) + (Part<<11) + c_input, B_offset = (Tile<<16) + c_kernel;
extern __shared__ float input[];
float *kernel = input + 2048, *out = kernel + 8192;
int B_stride[32] = {0, 256, 512, 768, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632, 5888, 6144, 6400, 6656, 6912, 7168, 7424, 7680, 7936};
out[c_input] = 0.0f;
out[c_input+1024] = 0;
input[c_input] = A[T_offset];
input[c_input+1024] = A[T_offset+1024];
for (int k = 0; k < 8; k++) {
int B_start = B_offset + (k<<13); // 32*64
kernel[c_kernel] = B[B_start], kernel[c_kernel+1024] = B[B_start+1024];
kernel[c_kernel+2048] = B[B_start+2048], kernel[c_kernel+3072] = B[B_start+3072];
kernel[c_kernel+4096] = B[B_start+4096], kernel[c_kernel+5120] = B[B_start+5120];
kernel[c_kernel+6144] = B[B_start+6144], kernel[c_kernel+7168] = B[B_start+7168];
__syncthreads();
float sum = 0, sum1 = 0;
int y_tmp = (tY<<8)+(k<<5), y_tmp1 = y_tmp+1024;
for (int j = 0; j < 32; j++) {
sum += input[y_tmp + j] * kernel[tX + B_stride[j]];
sum1 += input[y_tmp1 + j] * kernel[tX + B_stride[j]];
}
out[c_input] += sum;
out[c_input+1024] += sum1;
__syncthreads();
}
C[T_offset] = out[c_input];
C[T_offset+1024] = out[c_input+1024];
}
int kernel_256() {
float *input_ = get_parameter(inputName256, 16*16*256);
float *bias = get_parameter(biasName256, 256);
float *input, *output, *l_weights, *l_bias;
uint64_t nT1 = 0, nT2 = 0, nT1_cudnn = 0, nT2_cudnn = 0;
cudaError_t s;
/////////////////////////////////
// My Kernel
/////////////////////////////////
float *kernel = get_parameter(weight_winograd_Name256, 36*256*256), *t_input, *ip;
int nInput = 16*16*256, nOutput = 16*16*256, nWeights = 36*256*256, nBias = 256, nTransInput = 16*6*6*256, nInnerProd = 16*6*6*256;
float *l_bnBias, *l_bnScale, *bnBias, *bnScale;
cudaMalloc((void **) &input, nInput<<3);
cudaMalloc((void **) &output, nOutput<<2);
cudaMalloc((void **) &l_weights, nWeights<<2);
cudaMalloc((void **) &l_bias, nBias<<2);
cudaMalloc((void **) &t_input, nTransInput<<2);
cudaMalloc((void **) &ip, nInnerProd<<2);
cudaMemset((void *) input, 0, nInput<<3);
cudaMemset((void *) output, 0, nOutput<<2);
cudaMemset((void *) t_input, 0, nTransInput<<2);
cudaMemset((void *) l_weights, 0, nWeights<<2);
cudaMemset((void *) ip, 0, nInnerProd<<2);
cudaMemcpy(input, input_, nInput<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);
bnBias = get_parameter(bnBias_winograd_Name256, 256);
bnScale = get_parameter(bnScale_winograd_Name256, 256);
cudaMalloc((void **) &l_bnBias, nBias<<2);
cudaMalloc((void **) &l_bnScale, nBias<<2);
cudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);
float tmp[nOutput];
nT1 = getTimeMicroseconds64();
kernel_256_winograd_BtdB <<<dim3(4, 4, 2), dim3(128, 6), (6*6*128)<<2 >>> (input, t_input);
kernel_256_OuterProduct_256<<<dim3(36, 2), dim3(256, 4), (8*256 + 32*256 + 8*256)<<2 >>> (t_input, l_weights, ip);
kernel_256_winograd_AtIA <<<dim3(4, 4, 256), dim3(6, 6), ((6*6)<<2)>>> (ip, l_bnBias, l_bnScale, output);
//cudaCheckError();
cudaDeviceSynchronize();
nT2 = getTimeMicroseconds64();
printf("TotalTime = %d us\n", nT2-nT1);
s = cudaMemcpy(tmp, output, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
//cudaCheckError();
cudaFree(t_input);
cudaFree(output);
cudaFree(l_weights);
cudaFree(l_bias);
cudaFree(ip);
free(kernel);
free(bnScale);
free(bnBias);
/////////////////////////////////
// cuDNN
/////////////////////////////////
kernel = get_parameter(weight_NCHW_Name256, 9*256*256);
bnBias = get_parameter(bnBiasName256, 256);
bnScale = get_parameter(bnScaleName256, 256);
float* eMean = get_parameter(eMeanName256, 256);
float* eVar = get_parameter(eVarName256, 256);
float *l_eMean, *l_eVar;
nInput = 16*16*256, nOutput = 14*14*256, nWeights = 3*3*256*256, nBias = 256;
cudaMalloc((void **) &output, nOutput<<2);
cudaMalloc((void **) &l_weights, nWeights<<2);
cudaMalloc((void **) &l_bias, nBias<<2);
cudaMemcpy(l_weights, kernel, nWeights<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bias, bias, nBias<<2, cudaMemcpyHostToDevice);
cudaMalloc((void **) &l_eMean, nBias<<2);
cudaMalloc((void **) &l_eVar, nBias<<2);
cudaMemcpy(l_bnBias, bnBias, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_bnScale, bnScale, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_eMean, eMean, nBias<<2, cudaMemcpyHostToDevice);
cudaMemcpy(l_eVar, eVar, nBias<<2, cudaMemcpyHostToDevice);
cudaMemset((void *) output, 0, nOutput<<2);
float tmp_cudnn[nOutput];
cudnnStatus_t status;
float one = 1.0, zero = 0.0;
size_t size;
cudnnHandle_t handle;
status = cudnnCreate(&handle);
if (status != CUDNN_STATUS_SUCCESS) printf("failed1\n");
cudnnTensorDescriptor_t xdesc, ydesc, bdesc;
cudnnFilterDescriptor_t wdesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW
status = cudnnCreateTensorDescriptor(&xdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed2\n");
status = cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 16, 16);
if (status != CUDNN_STATUS_SUCCESS) printf("failed3\n");
status = cudnnCreateTensorDescriptor(&ydesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed4\n");
status = cudnnSetTensor4dDescriptor(ydesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 14, 14);
if (status != CUDNN_STATUS_SUCCESS) printf("failed5\n");
status = cudnnCreateFilterDescriptor(&wdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed6\n");
status = cudnnSetFilter4dDescriptor(wdesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 256, 256, 3, 3);
if (status != CUDNN_STATUS_SUCCESS) printf("failed7\n");
status = cudnnCreateTensorDescriptor(&bdesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed8\n");
status = cudnnSetTensor4dDescriptor(bdesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 256, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed9\n");
cudnnConvolutionDescriptor_t conv_desc;
status = cudnnCreateConvolutionDescriptor(&conv_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed10\n");
status = cudnnSetConvolution2dDescriptor(conv_desc, 0,0, 1,1,1,1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); //CUDNN_CONVOLUTION
if (status != CUDNN_STATUS_SUCCESS) printf("failed11\n");
cudnnActivationDescriptor_t act_desc;
status = cudnnCreateActivationDescriptor(&act_desc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed12\n");
status = cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0);
if (status != CUDNN_STATUS_SUCCESS) printf("failed13\n");
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;
status = cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);
if (status != CUDNN_STATUS_SUCCESS) printf("failed14\n");
status = cudnnSetTensor4dDescriptor(bnScaleBiasMeanVarDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 256, 1, 1);
if (status != CUDNN_STATUS_SUCCESS) printf("failed15\n");
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; // algo 6
status = cudnnGetConvolutionForwardWorkspaceSize(handle,
xdesc,
wdesc,
conv_desc,
ydesc,
algo,
(size_t *)&(size));
float *extra;
cudaMalloc((void **) &extra, size);
nT1_cudnn = getTimeMicroseconds64();
status = cudnnConvolutionForward(handle, &one,
xdesc, input, wdesc, l_weights,
conv_desc, algo,
extra, size, &zero,
ydesc, output);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnConvolutionForward failed\n");
status = cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL,
&one, &zero,
ydesc, output, ydesc, output,
bnScaleBiasMeanVarDesc, l_bnScale, l_bnBias, l_eMean, l_eVar, CUDNN_BN_MIN_EPSILON);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnBatchNormalizationForwardInference failed\n");
status = cudnnActivationForward(handle, act_desc, &one,
ydesc, output, &zero,
ydesc, output);
if (status != CUDNN_STATUS_SUCCESS) printf("cudnnActivationForward failed\n");
cudaDeviceSynchronize();
nT2_cudnn = getTimeMicroseconds64();
printf("cuDNN TotalTime = %d us\n", nT2_cudnn-nT1_cudnn);
s = cudaMemcpy(tmp_cudnn, output, nOutput<<2, cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorName(s));
cudaFree(extra);
cudaFree(input);
cudaFree(output);
cudaFree(l_weights);
cudaFree(l_bias);
cudaFree(l_bnScale);
cudaFree(l_bnBias);
cudaFree(l_eMean);
cudaFree(l_eVar);
free(bias);
free(kernel);
free(bnScale);
free(bnBias);
free(eMean);
free(eVar);
free(input_);
output_checker(tmp, tmp_cudnn, 14, 256, 1);
return ((nT2-nT1) << 16) | (nT2_cudnn-nT1_cudnn);
}
================================================
FILE: Kernel256_winograd.h
================================================
#ifndef __KERNEL256_WINOGRAD_H__
#define __KERNEL256_WINOGRAD_H__
#ifdef __cplusplus
extern "C" {
#endif
const char inputName256[] = "data/input_14_1_256.bin";
const char biasName256[] = "data/bias_256.bin";
const char weight_winograd_Name256[] = "data/weight_winograd_256_256.bin";
const char weight_NCHW_Name256[] = "data/weight_NCHW_256_256.bin";
const char bnBiasName256[] = "data/bnBias_256.bin";
const char bnScaleName256[] = "data/bnScale_256.bin";
const char bnBias_winograd_Name256[] = "data/bnBias_winograd_256.bin";
const char bnScale_winograd_Name256[] = "data/bnScale_winograd_256.bin";
const char eMeanName256[] = "data/eMean_256.bin";
const char eVarName256[] = "data/eVar_256.bin";
int kernel_256();
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: Makefile
================================================
CC=gcc
CPP=g++
AR=ar
NVCC=nvcc
CSRCS := $(shell find . -name '*.c' -not -name '._*')
COBJS := $(subst .c,.o,$(CSRCS))
CUSRCS := $(shell find . -name '*.cu' -not -name '._*')
CUOBJS := $(subst .cu,.o,$(CUSRCS))
LIBDIR := -L/usr/local/cuda/lib64
CUFLAGS= \
-I. \
-Xcompiler \
-fPIC
LDFLAGS=-L. -lm -lpthread -lrt
all: Test
%.o: %.c
$(NVCC) $(CUFLAGS) -c $< -o $(basename $@).o
%.o: %.cu
$(NVCC) $(CUFLAGS) -c $< -o $(basename $@).o
Test: $(CUOBJS) $(COBJS)
$(NVCC) -o Test $(CUOBJS) $(COBJS) $(LIBDIR) $(LDFLAGS) -lcudart -lcuda -lcublas -lcudnn
clean:
find . -name "*.o" -exec rm -f '{}' ';'
rm -f Test
================================================
FILE: README.md
================================================
## Introduction
This code implements fast CUDA kernels for DNN inference, especially for convolution layers / residual blocks in ResNet. Specifically, each kernel fuses three operations into a single launch:
- Convolution
- Batch Normalization (BN + Scale)
- Activation (ReLU)
For implementation details, please refer to the technical report included in this repo. The Winograd algorithm (F(4x4, 3x3)) is used for the 3 * 3 convolutional kernels.
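At inference time BN is a per-channel affine map, so it folds into a single scale/bias pair that the fused kernels apply right after the convolution. Below is a minimal sketch of that folding, using the same formulas `data_generator.py` applies when precomputing the `*_winograd_*` and `*_myKernel_*` parameter files; the function and argument names are illustrative, not part of this repo:
``` c
#include <math.h>
/* y = gamma * (x - mean) / sqrt(var + eps) + beta  ==  y = scale * x + bias */
void fold_batchnorm(const float *gamma, const float *beta,
                    const float *mean, const float *var,
                    float *scale, float *bias, int channels) {
  const float eps = 1e-5f; /* same epsilon as data_generator.py */
  for (int c = 0; c < channels; c++) {
    float inv_std = 1.0f / sqrtf(var[c] + eps);
    scale[c] = gamma[c] * inv_std;                    /* bnScale_winograd */
    bias[c] = beta[c] - gamma[c] * mean[c] * inv_std; /* bnBias_winograd */
  }
}
```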
## Usage
``` sh
mkdir data
python data_generator.py
make
./Test 0
```
- Set parameters in `data_generator.py`
- Run the 6 test cases by passing a mode from 0 to 5 to `./Test`: 0 = `kernel_128`, 1 = `kernel_256` (3 * 3 Winograd), 2 = `kernel_128_1_in`, 3 = `kernel_128_1_out`, 4 = `kernel_256_1_in`, 5 = `kernel_256_1_out` (1 * 1 kernels)
## Results
### 3 * 3 Kernels
Kernels | Operations | 128 / 128 | 256 / 256
--- | --- | --- | ---
cuDNN | Gemm + BN + ReLU | 214us | 384us
cuDNN | Winograd + BN + ReLU | 95us | 155us
Our Kernel | Winograd + BN + ReLU | 59us | 117us
### 1 * 1 Kernels [BUGGY NUMBERS]
Kernels | 512 / 128 | 128 / 512 | 1024 / 256 | 256 / 1024
--- | --- | --- | --- | ---
Operations | Gemm + BN + ReLU | Gemm + BN | Gemm + BN + ReLU | Gemm + BN + ReLU
cuDNN | 119us | 115us | 219us | 214us
Our Kernel | 58us | 55us | 186us | 181us
================================================
FILE: Test.c
================================================
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "Kernel128_one.h"
#include "Kernel128_winograd.h"
#include "Kernel256_one.h"
#include "Kernel256_winograd.h"
#include "util.h"
int main(int argc, char** argv) {
int nTest = 100, sum = 0, sum_cudnn = 0, i;
cudaSetDevice(0);
int mode = 0;
if (argc == 2) {
mode = atoi(argv[1]);
}
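/* Each kernel_* call returns its own kernel time in the high 16 bits and
   the cuDNN time in the low 16 bits (both in microseconds); the first two
   iterations are treated as warm-up and excluded from the averages. */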
for (i = 0; i < nTest; i++) {
printf("---- Iter: %d ----\n", i);
int res = -1;
switch (mode) {
case 0:
res = kernel_128();
break;
case 1:
res = kernel_256();
break;
case 2:
res = kernel_128_1_in();
break;
case 3:
res = kernel_128_1_out();
break;
case 4:
res = kernel_256_1_in();
break;
case 5:
res = kernel_256_1_out();
break;
}
if (i > 1) {
sum += res >> 16;
sum_cudnn += res & 0xFFFF;
}
}
printf(
"Average Total Time: [Mine: %d us], [cuDNN: %d us]\n",
sum / (nTest - 2),
sum_cudnn / (nTest - 2));
return 0;
}
================================================
FILE: data_generator.py
================================================
from __future__ import print_function
import numpy as np
from numpy.random import rand
def bias_generator(output_channel = 128):
bias = (np.array(rand(output_channel))-0.5).astype(np.float32)
des = open("data/bias_" + str(output_channel) + ".bin", "wb")
des.write(bias)
bnScale = (np.array(rand(output_channel))-0.5).astype(np.float32)
des = open("data/bnScale_" + str(output_channel) + ".bin", "wb")
des.write(bnScale)
bnBias = (np.array(rand(output_channel))-0.5).astype(np.float32)
des = open("data/bnBias_" + str(output_channel) + ".bin", "wb")
des.write(bnBias)
eMean = (np.array(rand(output_channel))-0.5).astype(np.float32)
des = open("data/eMean_" + str(output_channel) + ".bin", "wb")
des.write(eMean)
eVar = (np.array(rand(output_channel))*3 + 5).astype(np.float32)
des = open("data/eVar_" + str(output_channel) + ".bin", "wb")
des.write(eVar)
eps = 1e-5
bnScale_winograd = bnScale / np.sqrt(eVar + eps)
des = open("data/bnScale_winograd_" + str(output_channel) + ".bin", "wb")
des.write(bnScale_winograd)
bnBias_winograd = bnBias - bnScale*eMean / np.sqrt(eVar + eps)
des = open("data/bnBias_winograd_" + str(output_channel) + ".bin", "wb")
des.write(bnBias_winograd)
def input_generator(input_channel = 128, feature_map_size = 14, padding = 1):
parameters = (feature_map_size + 2*padding)*(feature_map_size + 2*padding) * input_channel
a = (np.array(rand(parameters))-0.5).astype(np.float32)
des = open("data/input_" + str(feature_map_size) + '_' + str(padding) + '_' + str(input_channel) + ".bin", "wb")
des.write(a)
def weight_generator(input_channel = 128, output_channel = 128):
### Weights_NCHW
parameters = input_channel*output_channel * 3*3
in_ = (np.array(rand(parameters))-0.5).astype(np.float32)
des = open("data/weight_NCHW_" + str(input_channel) + '_' + str(output_channel) + ".bin", "wb")
des.write(in_)
### Weights_Winograd
in_ = in_.reshape(input_channel*output_channel, 3,3)
G = np.array([[0.25,0,0], [-1.0/6,-1.0/6,-1.0/6], [-1.0/6,1.0/6,-1.0/6], [1.0/24,1.0/12,1.0/6], [1.0/24,-1.0/12,1.0/6], [0,0,1]])
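    # G is the 6x3 filter-transform matrix of Winograd F(4x4, 3x3): each 3x3
    # kernel g becomes a 6x6 tile U = G g G^T. The transformed weights are
    # stored position-major, at index (x*6+y)*Cin*Cout + j*Cout + i, which is
    # the layout the OuterProduct kernels read.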
out_ = [0] * input_channel*output_channel * 6*6
for i in range(output_channel):
for j in range(input_channel):
b = np.dot(G, in_[i*input_channel+j])
b = np.dot(b, G.transpose())
offset = j*output_channel+i
for x in range(6):
for y in range(6):
out_[((x*6+y) * input_channel*output_channel) + offset] = b[x][y]
des = open("data/weight_winograd_" + str(input_channel) + '_' + str(output_channel) + ".bin", "wb")
des.write(np.array(out_).astype(np.float32))
def onebyone_generator(input_channel = 256, output_channel = 1024, feature_map_size = 14):
parameters = feature_map_size*feature_map_size * output_channel
a = ((np.array(rand(parameters))-0.5)*40).astype(np.float32)
des = open("data/input_one_" + str(feature_map_size) + '_' + str(output_channel) + ".bin", "wb")
des.write(a)
parameters = input_channel * output_channel
a = ((np.array(rand(parameters))-0.5)*40).astype(np.float32)
des = open("data/weight_one_" + str(output_channel) + ".bin", "wb")
des.write(a)
bnScale = ((np.array(rand(output_channel))-0.5)*40).astype(np.float32)
des = open("data/bnScale_one_" + str(output_channel) + ".bin", "wb")
des.write(bnScale)
bnBias = ((np.array(rand(output_channel))-0.5)*40).astype(np.float32)
des = open("data/bnBias_one_" + str(output_channel) + ".bin", "wb")
des.write(bnBias)
eMean = ((np.array(rand(output_channel))-0.5)*40).astype(np.float32)
des = open("data/eMean_one_" + str(output_channel) + ".bin", "wb")
des.write(eMean)
eVar = (np.array(rand(output_channel))*20 + 5).astype(np.float32)
des = open("data/eVar_one_" + str(output_channel) + ".bin", "wb")
des.write(eVar)
eps = 1e-5
bnScale_winograd = bnScale / np.sqrt(eVar + eps)
des = open("data/bnScale_myKernel_one_" + str(output_channel) + ".bin", "wb")
des.write(bnScale_winograd)
bnBias_winograd = bnBias - bnScale*eMean / np.sqrt(eVar + eps)
des = open("data/bnBias_myKernel_one_" + str(output_channel) + ".bin", "wb")
des.write(bnBias_winograd)
if __name__ == '__main__':
bias_generator(output_channel = 128)
print('Biases generated')
input_generator(input_channel = 128)
print('Input generated')
weight_generator(128, 128)
print('Weights generated')
onebyone_generator()
print('Parameters for 1*1 conv generated')
================================================
FILE: util.c
================================================
#include "util.h"
#include <time.h>
#include "math.h"
uint64_t getTimeMicroseconds64() {
uint64_t nTime;
struct timespec tSpec;
clock_gettime(CLOCK_REALTIME, &tSpec);
nTime = (uint64_t)tSpec.tv_sec * 1000000 + (uint64_t)tSpec.tv_nsec / 1000;
return nTime;
}
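/* Returns the transpose of `weight`, viewed as a w x h row-major matrix
   (the result is h x w); used to reorder the 1x1 convolution weights into
   cuDNN's output-channel-major filter layout. Frees the input buffer. */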
float* transpose(float* weight, int h, int w) {
float* new_weight = (float*)malloc(w * h * 4);
int i, j;
for (i = 0; i < w; ++i) {
for (j = 0; j < h; ++j) {
new_weight[j * w + i] = weight[i * h + j];
}
}
free(weight);
return new_weight;
}
float* get_parameter(const char* filename, int size) {
  float* parameter = (float*)malloc(size * 4);
  if (!parameter) {
    printf("Bad Malloc\n");
    exit(0);
  }
  FILE* ptr = fopen(filename, "rb");
  if (!ptr) {
    printf("Bad file path: %s, %s\n", filename, strerror(errno));
    exit(0);
  }
  if (fread(parameter, size * 4, 1, ptr) != 1) {
    printf("Bad read: %s\n", filename);
    exit(0);
  }
  fclose(ptr);
  return parameter;
}
/* Compares feature maps A (padded by `shift` pixels on each border) and B
   (unpadded), both NHWC; returns the maximum absolute difference. */
float output_checker(float* A, float* B, int len, int channel, int shift) {
  int error_cnt = 0, i, j, k;
  float max_error = 0;
  for (i = 0; i < len; i++) {
    for (j = 0; j < len; j++) {
      for (k = 0; k < channel; k++) {
        float diff = fabs(
            A[((i + shift) * (len + 2 * shift) + j + shift) * channel + k] -
            B[(i * len + j) * channel + k]);
        if (diff > 1e-5)
          error_cnt++;
        if (diff > max_error)
          max_error = diff;
      }
    }
  }
  printf("[max_error: %f][error_cnt: %d]\n", max_error, error_cnt);
  return max_error;
}
================================================
FILE: util.h
================================================
#ifndef __UTIL_H__
#define __UTIL_H__
#ifdef __cplusplus
extern "C" {
#endif
#include <assert.h>
#include <errno.h>
#include <float.h>
#include <immintrin.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>
float* get_parameter(const char* filename, int size);
float* transpose(float* weight, int h, int w);
uint64_t getTimeMicroseconds64();
float output_checker(float* A, float* B, int len, int channel, int shift);
#ifdef __cplusplus
}
#endif
#endif