Repository: hypelib/Hype
Branch: master
Commit: e467e926da4c
Files: 42
Total size: 398.3 KB
Directory structure:
gitextract__bdmbe28/
├── .gitattributes
├── .gitignore
├── .paket/
│ ├── Paket.Restore.targets
│ └── paket.targets
├── Hype.sln
├── LICENSE.txt
├── README.md
├── Roadmap.txt
├── docs/
│ ├── .gitignore
│ ├── BuildDocs.fsx
│ └── input/
│ ├── FeedforwardNets.fsx
│ ├── HMC.fsx
│ ├── Optimization.fsx
│ ├── RecurrentNets.fsx
│ ├── Regression.fsx
│ ├── Training.fsx
│ ├── download.fsx
│ ├── files/
│ │ └── misc/
│ │ ├── style.css
│ │ ├── style_light.css
│ │ └── tips.js
│ ├── housing.data
│ ├── index.fsx
│ ├── resources/
│ │ └── Hype.pspimage
│ └── templates/
│ ├── docpage.cshtml
│ ├── reference/
│ │ ├── module.cshtml
│ │ ├── namespaces.cshtml
│ │ ├── part-members.cshtml
│ │ ├── part-nested.cshtml
│ │ └── type.cshtml
│ ├── template.cshtml
│ └── template.html
├── paket.dependencies
└── src/
└── Hype/
├── AssemblyInfo.fs
├── Classifier.fs
├── Hype.fs
├── Hype.fsproj
├── Inference.fs
├── NLP.fs
├── Neural.fs
├── Optimize.fs
├── app.config
└── paket.references
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
###############################################################################
# Set default behavior to automatically normalize line endings.
###############################################################################
* text=auto
###############################################################################
# Set default behavior for command prompt diff.
#
# This is need for earlier builds of msysgit that does not have it on by
# default for csharp files.
# Note: This is only used by command line
###############################################################################
#*.cs diff=csharp
###############################################################################
# Set the merge driver for project and solution files
#
# Merging from the command prompt will add diff markers to the files if there
# are conflicts (Merging from VS is not affected by the settings below, in VS
# the diff markers are never inserted). Diff markers may cause the following
# file extensions to fail to load in VS. An alternative would be to treat
# these files as binary and thus will always conflict and require user
# intervention with every merge. To do so, just uncomment the entries below
###############################################################################
#*.sln merge=binary
#*.csproj merge=binary
#*.vbproj merge=binary
#*.vcxproj merge=binary
#*.vcproj merge=binary
#*.dbproj merge=binary
#*.fsproj merge=binary
#*.lsproj merge=binary
#*.wixproj merge=binary
#*.modelproj merge=binary
#*.sqlproj merge=binary
#*.wwaproj merge=binary
###############################################################################
# behavior for image files
#
# image files are treated as binary by default.
###############################################################################
#*.jpg binary
#*.png binary
#*.gif binary
###############################################################################
# diff behavior for common document formats
#
# Convert binary document formats to text before diffing them. This feature
# is only available from the command line. Turn it on by uncommenting the
# entries below.
###############################################################################
#*.doc diff=astextplain
#*.DOC diff=astextplain
#*.docx diff=astextplain
#*.DOCX diff=astextplain
#*.dot diff=astextplain
#*.DOT diff=astextplain
#*.pdf diff=astextplain
#*.PDF diff=astextplain
#*.rtf diff=astextplain
#*.RTF diff=astextplain
================================================
FILE: .gitignore
================================================
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.sln.docstates
# Build results
[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/
# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
!packages/*/build/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile
# Visual Studio profiler
*.psess
*.vsp
*.vspx
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
*.ncrunch*
.*crunch*.local.xml
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.Publish.xml
# NuGet Packages Directory
## TODO: If you have NuGet Package Restore enabled, uncomment the next line
packages/
# Windows Azure Build Output
csx
*.build.csdef
# Windows Store app package directory
AppPackages/
# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
App_Data/*.mdf
App_Data/*.ldf
#LightSwitch generated files
GeneratedArtifacts/
_Pvt_Extensions/
ModelManifest.xml
# =========================
# Windows detritus
# =========================
# Windows image file caches
Thumbs.db
ehthumbs.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Mac desktop service store files
.DS_Store
================================================
FILE: .paket/Paket.Restore.targets
================================================
$(MSBuildAllProjects);$(MSBuildThisFileFullPath)$(MSBuildVersion)15.0.0falsetruetrue$(MSBuildThisFileDirectory)$(MSBuildThisFileDirectory)..\$(PaketRootPath)paket-files\paket.restore.cached$(PaketRootPath)paket.lockclassicprojassemblynative/Library/Frameworks/Mono.framework/Commands/monomono$(PaketRootPath)paket.bootstrapper.exe$(PaketToolsPath)paket.bootstrapper.exe$([System.IO.Path]::GetDirectoryName("$(PaketBootStrapperExePath)"))\"$(PaketBootStrapperExePath)"$(MonoPath) --runtime=v4.0.30319 "$(PaketBootStrapperExePath)"truetrueTrueFalse$(BaseIntermediateOutputPath.TrimEnd('\').TrimEnd('\/'))$(PaketRootPath)paket$(PaketToolsPath)paket$(PaketRootPath)paket.exe$(PaketToolsPath)paket.exe
<_DotnetToolsJson Condition="Exists('$(PaketRootPath)/.config/dotnet-tools.json')">$([System.IO.File]::ReadAllText("$(PaketRootPath)/.config/dotnet-tools.json"))
<_ConfigContainsPaket Condition=" '$(_DotnetToolsJson)' != ''">$(_DotnetToolsJson.Contains('"paket"'))
<_ConfigContainsPaket Condition=" '$(_ConfigContainsPaket)' == ''">false
<_PaketCommand>dotnet paket
$(PaketToolsPath)paket$(PaketBootStrapperExeDir)paketpaket
<_PaketExeExtension>$([System.IO.Path]::GetExtension("$(PaketExePath)"))
<_PaketCommand Condition=" '$(_PaketCommand)' == '' AND '$(_PaketExeExtension)' == '.dll' ">dotnet "$(PaketExePath)"
<_PaketCommand Condition=" '$(_PaketCommand)' == '' AND '$(OS)' != 'Windows_NT' AND '$(_PaketExeExtension)' == '.exe' ">$(MonoPath) --runtime=v4.0.30319 "$(PaketExePath)"
<_PaketCommand Condition=" '$(_PaketCommand)' == '' ">"$(PaketExePath)"
true$(NoWarn);NU1603;NU1604;NU1605;NU1608falsetrue$([System.IO.File]::ReadAllText('$(PaketRestoreCacheFile)'))$([System.Text.RegularExpressions.Regex]::Split(`%(Identity)`, `": "`)[0].Replace(`"`, ``).Replace(` `, ``))$([System.Text.RegularExpressions.Regex]::Split(`%(Identity)`, `": "`)[1].Replace(`"`, ``).Replace(` `, ``))%(PaketRestoreCachedKeyValue.Value)%(PaketRestoreCachedKeyValue.Value)truefalsetruetrue$(PaketIntermediateOutputPath)\$(MSBuildProjectFile).paket.references.cached$(MSBuildProjectFullPath).paket.references$(MSBuildProjectDirectory)\$(MSBuildProjectName).paket.references$(MSBuildProjectDirectory)\paket.referencesfalsetruetruereferences-file-or-cache-not-found$([System.IO.File]::ReadAllText('$(PaketReferencesCachedFilePath)'))$([System.IO.File]::ReadAllText('$(PaketOriginalReferencesFilePath)'))references-filefalsefalsetruetarget-framework '$(TargetFramework)' or '$(TargetFrameworks)' files @(PaketResolvedFilePaths)falsetrue$([System.String]::Copy('%(PaketReferencesFileLines.Identity)').Split(',').Length)$([System.String]::Copy('%(PaketReferencesFileLines.Identity)').Split(',')[0])$([System.String]::Copy('%(PaketReferencesFileLines.Identity)').Split(',')[1])$([System.String]::Copy('%(PaketReferencesFileLines.Identity)').Split(',')[4])$([System.String]::Copy('%(PaketReferencesFileLines.Identity)').Split(',')[5])%(PaketReferencesFileLinesInfo.PackageVersion)Allruntimeruntimetruetrue$(PaketIntermediateOutputPath)/$(MSBuildProjectFile).paket.clitools$([System.String]::Copy('%(PaketCliToolFileLines.Identity)').Split(',')[0])$([System.String]::Copy('%(PaketCliToolFileLines.Identity)').Split(',')[1])%(PaketCliToolFileLinesInfo.PackageVersion)false
<_NuspecFilesNewLocation Include="$(PaketIntermediateOutputPath)\$(Configuration)\*.nuspec"/>
$(MSBuildProjectDirectory)/$(MSBuildProjectFile)truefalsetruefalsetruefalsetruefalsetrue$(PaketIntermediateOutputPath)\$(Configuration)$(PaketIntermediateOutputPath)
<_NuspecFiles Include="$(AdjustedNuspecOutputPath)\*.$(PackageVersion.Split(`+`)[0]).nuspec"/>
================================================
FILE: .paket/paket.targets
================================================
truetrue$(MSBuildThisFileDirectory)$(MSBuildThisFileDirectory)..\$(PaketToolsPath)paket.exe$(PaketToolsPath)paket.bootstrapper.exe"$(PaketExePath)"mono --runtime=v4.0.30319 "$(PaketExePath)""$(PaketBootStrapperExePath)"mono --runtime=v4.0.30319 $(PaketBootStrapperExePath)$(PaketCommand) restore$(PaketBootStrapperCommand)RestorePackages; $(BuildDependsOn);
================================================
FILE: Hype.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.29009.5
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".paket", ".paket", "{B7FB3383-EF19-4645-986C-72D50C08F292}"
ProjectSection(SolutionItems) = preProject
paket.dependencies = paket.dependencies
EndProjectSection
EndProject
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "Hype", "src\Hype\Hype.fsproj", "{C923664D-182E-48D5-BB30-F1505D7D28DF}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{56DA870A-0ED4-47A2-B78B-34A8D4D6AD28}"
ProjectSection(SolutionItems) = preProject
docs\BuildDocs.fsx = docs\BuildDocs.fsx
docs\input\download.fsx = docs\input\download.fsx
docs\input\FeedforwardNets.fsx = docs\input\FeedforwardNets.fsx
docs\input\HMC.fsx = docs\input\HMC.fsx
docs\input\index.fsx = docs\input\index.fsx
docs\input\Optimization.fsx = docs\input\Optimization.fsx
docs\input\RecurrentNets.fsx = docs\input\RecurrentNets.fsx
docs\input\Regression.fsx = docs\input\Regression.fsx
docs\input\Training.fsx = docs\input\Training.fsx
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Debug|x64.ActiveCfg = Debug|Any CPU
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Debug|x64.Build.0 = Debug|Any CPU
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Release|Any CPU.Build.0 = Release|Any CPU
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Release|x64.ActiveCfg = Release|Any CPU
{C923664D-182E-48D5-BB30-F1505D7D28DF}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {028AF435-B43C-4E8E-8A82-4A65AF666086}
EndGlobalSection
EndGlobal
================================================
FILE: LICENSE.txt
================================================
The MIT License (MIT)
Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
Hype: Compositional Machine Learning and Hyperparameter Optimization
--------------------------------------------------------------------
Hype is a proof-of-concept deep learning library, where you can perform optimization on compositional machine learning systems of many components, even when such components themselves internally perform optimization.
It is developed by [Atılım Güneş Baydin](http://www.cs.nuim.ie/~gunes/) and [Barak A. Pearlmutter](http://bcl.hamilton.ie/~barak/), at the [Brain and Computation Lab](http://www.bcl.hamilton.ie/), National University of Ireland Maynooth.
This work is supported by Science Foundation Ireland grant 09/IN.1/I2637.
Please visit the [project website](http://hypelib.github.io/Hype/) for documentation and tutorials.
You can come and join the Gitter chat room, if you want to chat with us:
[](https://gitter.im/hypelib/Hype?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
### Project statistics
[](http://issuestats.com/github/hypelib/Hype)
[](http://issuestats.com/github/hypelib/Hype)
### Current build status
[](https://ci.appveyor.com/project/gbaydin/hype)
### License
Hype is released under the MIT license.
================================================
FILE: Roadmap.txt
================================================
- CUDA backend (DiffSharp)
- Example for Hamiltonian MCMC
- Probabilistic inference
- Convolutional neural networks (ideally with DiffSharp tensor)
- Saving and loading models using a standard format
- Improve code comments
- Add references to research papers where relevant
- Add ability to read and write MATLAB files (scipy.io loadmat, savemat)
- Add ability to read and write FSL nifti files for fMRI (PyMVPA2, SampleAttributes, fmri_dataset, poly_detrend, zscore)
- Add ability to read and write standard image/video formats (OpenCV, MATLAB)
- Better integration with graph libraries (box plots, bar graphs, confusion matrix plots, write to .png support)
================================================
FILE: docs/.gitignore
================================================
output/
================================================
FILE: docs/BuildDocs.fsx
================================================
//
// This file is part of
// Hype: Compositional Machine Learning and Hyperparameter Optimization
//
// Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
//
// Hype is released under the MIT license.
// (See accompanying LICENSE file.)
//
// Written by:
//
// Atilim Gunes Baydin
// atilimgunes.baydin@nuim.ie
//
// Barak A. Pearlmutter
// barak@cs.nuim.ie
//
// Brain and Computation Lab
// Hamilton Institute & Department of Computer Science
// National University of Ireland Maynooth
// Maynooth, Co. Kildare
// Ireland
//
// www.bcl.hamilton.ie
//
#r "../packages/FSharp.Compiler.Service/lib/net40/FSharp.Compiler.Service.dll"
#r "../packages/FSharpVSPowerTools.Core/lib/net45/FSharpVSPowerTools.Core.dll"
#r "../packages/FSharp.Formatting/lib/net40/CSharpFormat.dll"
#r "../packages/FSharp.Formatting/lib/net40/FSharp.CodeFormat.dll"
#r "../packages/FSharp.Formatting/lib/net40/FSharp.Literate.dll"
#r "../packages/FSharp.Formatting/lib/net40/FSharp.MetadataFormat.dll"
#r "../packages/FSharp.Formatting/lib/net40/FSharp.Markdown.dll"
open System.IO
open FSharp.Literate
open FSharp.MetadataFormat
//
// Setup output directory structure and copy static files
//
let source = __SOURCE_DIRECTORY__
let docs = Path.Combine(source, "")
let relative subdir = Path.Combine(docs, subdir)
if not (Directory.Exists(relative "output")) then
Directory.CreateDirectory(relative "output") |> ignore
if not (Directory.Exists(relative "output/img")) then
Directory.CreateDirectory (relative "output/img") |> ignore
if not (Directory.Exists(relative "output/misc")) then
Directory.CreateDirectory (relative "output/misc") |> ignore
if not (Directory.Exists(relative "output/reference")) then
Directory.CreateDirectory (relative "output/reference") |> ignore
for fileInfo in DirectoryInfo(relative "input/files/misc").EnumerateFiles() do
fileInfo.CopyTo(Path.Combine(relative "output/misc", fileInfo.Name), true) |> ignore
for fileInfo in DirectoryInfo(relative "input/files/img").EnumerateFiles() do
fileInfo.CopyTo(Path.Combine(relative "output/img", fileInfo.Name), true) |> ignore
//
// Generate documentation
//
let tags = ["project-name", "Hype"; "project-author", "Atılım Güneş Baydin"; "project-github", "http://github.com/hypelib/Hype"; "project-nuget", "https://www.nuget.org/packages/hype"; "root", ""]
Literate.ProcessScriptFile(relative "input/index.fsx", relative "input/templates/template.html", relative "output/index.html", replacements = tags)
Literate.ProcessScriptFile(relative "input/download.fsx", relative "input/templates/template.html", relative "output/download.html", replacements = tags)
Literate.ProcessScriptFile(relative "input/Optimization.fsx", relative "input/templates/template.html", relative "output/optimization.html", replacements = tags)
Literate.ProcessScriptFile(relative "input/Training.fsx", relative "input/templates/template.html", relative "output/training.html", replacements = tags)
Literate.ProcessScriptFile(relative "input/Regression.fsx", relative "input/templates/template.html", relative "output/regression.html", replacements = tags)
Literate.ProcessScriptFile(relative "input/FeedforwardNets.fsx", relative "input/templates/template.html", relative "output/feedforwardnets.html", replacements = tags)
Literate.ProcessScriptFile(relative "input/RecurrentNets.fsx", relative "input/templates/template.html", relative "output/recurrentnets.html", replacements = tags)
Literate.ProcessScriptFile(relative "input/HMC.fsx", relative "input/templates/template.html", relative "output/hmc.html", replacements = tags)
//
// Generate API reference
//
let library = relative "../src/Hype/bin/Debug/Hype.dll"
let layoutRoots = [relative "input/templates"; relative "input/templates/reference" ]
MetadataFormat.Generate(library, relative "output/reference", layoutRoots, tags, markDownComments = true, libDirs = [relative "../src/Hype/bin/Debug/"])
================================================
FILE: docs/input/FeedforwardNets.fsx
================================================
(*** hide ***)
#r "../../src/Hype/bin/Release/netstandard2.0/DiffSharp.dll"
#r "../../src/Hype/bin/Release/netstandard2.0/Hype.dll"
#I "../../packages/R.NET.Community/lib/net40/"
#I "../../packages/R.NET.Community.FSharp/lib/net40/"
#I "../../packages/RProvider"
#load "RProvider.fsx"
fsi.ShowDeclarationValues <- true
(**
Feedforward neural networks
===========================
In this example, we implement a softmax classifier network with several hidden layers. Also see the [regression example](regression.html) for some relevant basics.
We again demonstrate the library with the [MNIST](http://yann.lecun.com/exdb/mnist/) database, this time using the full training set of 60,000 examples for building a classifier with 10 outputs representing the class probabilities of an input image belonging to one of the ten categories.
### Loading the data
We load the data and form the training, validation, and test datasets. The datasets are shuffled and the input data are normalized.
*)
open Hype
open Hype.Neural
open DiffSharp.AD.Float32
open DiffSharp.Util
let MNIST = Dataset(Util.LoadMNISTPixels("C:/datasets/MNIST/train-images.idx3-ubyte", 60000),
Util.LoadMNISTLabels("C:/datasets/MNIST/train-labels.idx1-ubyte", 60000)).NormalizeX()
let MNISTtrain = MNIST.[..58999].Shuffle()
let MNISTvalid = MNIST.[59000..].Shuffle()
let MNISTtest =
Dataset(Util.LoadMNISTPixels("C:/datasets/MNIST/t10k-images.idx3-ubyte", 10000),
Util.LoadMNISTLabels("C:/datasets/MNIST/t10k-labels.idx1-ubyte", 10000)).NormalizeX().Shuffle()
(**
val MNISTtrain : Dataset = Hype.Dataset
X: 784 x 59000
Y: 1 x 59000
val MNISTvalid : Dataset = Hype.Dataset
X: 784 x 1000
Y: 1 x 1000
val MNISTtest : Dataset = Hype.Dataset
X: 784 x 10000
Y: 1 x 10000
*)
MNISTtrain.[..5].VisualizeXColsAsImageGrid(28) |> printfn "%s"
(**
[lang=cs]
Hype.Dataset
X: 784 x 6
Y: 1 x 6
X's columns reshaped to (28 x 28), presented in a (2 x 3) grid:
DM : 56 x 84
·▴█
■■♦█· █■
▪███■▪ -██■-
·■███♦● ·●██■▪
-♦▪ ·████♦ -♦█♦
-♦■▪· █■█● ■█·
·●██♦ ██●· ██·
·▴ ·●██▪ -■██▪ ■█▪
·■▪ ·▪■■▪ ·███▪ ·▴· ♦█▪
♦■▪ ·■■▴ ♦███●▴▴♦██● ██▴
█ ■█· ■█■█■██████■ ■█▪
■█· - ██▴█████████▴ ·██·
●█▪ █▪●■████■███▴ ·███♦·
·■■ █▴●- · ●█· ●♦♦♦█▪
■█· █●■ ▪█ ▴█-
·█● ███· ·●■■██ ■█
·█● █●■▴▴██████● ██
●█▴ ♦█████████▴ ▴█▪
-♦██♦ ▪█████♦▪· ● █■
▴■■▴ ▪▪▪ █ ·■♦
■♦▪♦█▪
·■ ·♦ ▪■♦- ·●●█■·
██· ♦█● ██■■█▪-●· ▪■·▴●
██▴ ♦██▪ ■█- ▴███■ █♦
-██ ♦██● ▪█· ▪██▪ ▪█♦ ■▴
-██ ♦██■ █● ♦██▴ ♦█- ██■
▴█♦ ▴██■ ▴█ ·██■· ♦█- ███
♦█♦ -███ ▴█·▴██♦ ▪██■███·
♦█· ·██● ████▴ ■██●■█·
♦█▴ ··-███● ▪██♦ · ♦█·
♦██■■■■██████● ·■███ ██
▪█████████■♦█● ♦██■●█ █■
●██████▴- █■ ▴██■- ●█ ▴█▪
·███♦· ██ ♦█■▴ █● ██▴
▪▪ ██▪ ███ ■█· ██▴
●██ ●████■■██- ██
▴██ -▪▪▪▪▪▪▪- ██
██ -██
■█ -█●
●█· -█▪
·█ █
### Defining the model
We define a neural network with 3 layers: (1) a hidden layer with 300 units, followed by ReLU activation, (2) a hidden layer with 100 units, followed by ReLU activation, (3) a final layer with 10 units, followed by softmax transformation.
*)
let n = FeedForward()
n.Add(Linear(28 * 28, 300, Initializer.InitReLU))
n.Add(reLU)
n.Add(Linear(300, 100, Initializer.InitReLU))
n.Add(reLU)
n.Add(Linear(100, 10))
n.Add(fun m -> m |> DM.mapCols softmax) // Note the free inline implementation of the layer
n.ToString() |> printfn "%s"
(**
[lang=cs]
Hype.Neural.FeedForward
Learnable parameters: 266610
(0) -> (1) -> (2) -> (3) -> (4) -> (5)
(0): Hype.Neural.Linear
784 -> 300
Learnable parameters: 235500
Init: ReLU
W : 300 x 784
b : 300
(1): Hype.Neural.Activation
(2): Hype.Neural.Linear
300 -> 100
Learnable parameters: 30100
Init: ReLU
W : 100 x 300
b : 100
(3): Hype.Neural.Activation
(4): Hype.Neural.Linear
100 -> 10
Learnable parameters: 1010
Init: Standard
W : 10 x 100
b : 10
(5): Hype.Neural.Activation
*)
(**
### Freely implementing transformation layers
Now let's have a closer look at how we implemented the nonlinear transformations between the linear layers.
You might think that the instances of **reLU** in **n.Add(reLU)** above refer to a particular layer structure previously implemented as a layer module within the library. They don't. **reLU** is just a matrix-to-matrix elementwise function.
**An important thing to note** here is that the activation/transformation layers added with, for example, **n.Add(reLU)**, can be **any matrix-to-matrix function that you can express in the language,** unlike commonly seen in many machine learning frameworks where you are asked to select a particular layer type that has been implemented beforehand with it's (1) forward evaluation code and (2) reverse gradient code w.r.t. layer inputs, and (3) reverse gradient code w.r.t. any layer parameters. In such a setting, a new layer design would require you to add a new layer type to the system and carefully implement these components.
Here, because the system is based on nested AD, you can freely use any matrix-to-matrix transformation as a layer, and the forward and/or reverse AD operations of your code will be handled automatically by the underlying system. For example, you can write a layer like this:
*)
n.Add(fun w ->
let min = DM.Min(w)
let range = DM.Max(w) - min
(w - min) / range)
(**
which will be a normalization layer, scaling the values to be between 0 and 1.
In the above model, this is how the softmax layer is implemented as a mapping of the vector-to-vector **softmax** function to the columns of a matrix.
*)
n.Add(fun m -> m |> DM.mapCols softmax)
(**
In this particular example, the output matrix has 10 rows (for the 10 target classes) and each column (a vector of size 10) is individually passed through the **softmax** function. The output matrix would have as many columns as the input matrix, representing the class probabilities of each input.
*)
(**
### Weight initialization schemes
When layers with learnable weights are created, the weights are initialized using one of the following schemes. The correct initialization would depend on the activation function immediately following the layer and would take the fan-in/fan-out of the layer into account. If a specific scheme is not specified, the **InitStandard** scheme is used by default. These implementations are based on existing machine learning literature, such as _"Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." International conference on artificial intelligence and statistics. 2010"_.
*)
type Initializer =
| InitUniform of D * D
| InitNormal of D * D
| InitRBM of D
| InitReLU
| InitSigmoid
| InitTanh
| InitStandard
| InitCustom of (int->int->D)
override i.ToString() =
match i with
| InitUniform(min, max) -> sprintf "Uniform min=%A max=%A" min max
| InitNormal(mu, sigma) -> sprintf "Normal mu=%A sigma=%A" mu sigma
| InitRBM sigma -> sprintf "RBM sigma=%A" sigma
| InitReLU -> "ReLU"
| InitSigmoid -> "Sigmoid"
| InitTanh -> "Tanh"
| InitStandard -> "Standard"
| InitCustom f -> "Custom"
member i.InitDM(m, n) =
let fanOut, fanIn = m, n
match i with
| InitUniform(min, max) -> Rnd.UniformDM(m, n, min, max)
| InitNormal(mu, sigma) -> Rnd.NormalDM(m, n, mu, sigma)
| InitRBM sigma -> Rnd.NormalDM(m, n, D 0.f, sigma)
| InitReLU -> Rnd.NormalDM(m, n, D 0.f, sqrt (D 2.f / (float32 fanIn)))
| InitSigmoid -> let r = D 4.f * sqrt (D 6.f / (fanIn + fanOut)) in Rnd.UniformDM(m, n, -r, r)
| InitTanh -> let r = sqrt (D 6.f / (fanIn + fanOut)) in Rnd.UniformDM(m, n, -r, r)
| InitStandard -> let r = (D 1.f) / sqrt (float32 fanIn) in Rnd.UniformDM(m, n, -r, r)
| InitCustom f -> DM.init m n (fun _ _ -> f fanIn fanOut)
member i.InitDM(m:DM) = i.InitDM(m.Rows, m.Cols)
(**
### Training
Before training, let's visualize the weights of the first layer in a grid where each row of the weight matrix of the first layer is shown as a 28-by-28 image. It is an image of random weights, as expected.
*)
let l = (n.[0] :?> Linear)
l.VisualizeWRowsAsImageGrid(28) |> printfn "%s"
(**
Hype.Neural.Linear
784 -> 300
Learnable parameters: 235500
Init: ReLU
W's rows reshaped to (28 x 28), presented in a (17 x 18) grid:
Now let's train the network with the training and validation datasets we've prepared, using RMSProp, Nesterov momentum, and cross-entropy loss.
*)
let p = {Params.Default with
Epochs = 2
EarlyStopping = Early (400, 100)
ValidationInterval = 10
Batch = Minibatch 100
Loss = CrossEntropyOnSoftmax
Momentum = Nesterov (D 0.9f)
LearningRate = RMSProp (D 0.001f, D 0.9f)}
let _, lhist = n.Train(MNISTtrain, MNISTvalid, p)
(**
[12/11/2015 22:42:07] --- Training started
[12/11/2015 22:42:07] Parameters : 266610
[12/11/2015 22:42:07] Iterations : 1180
[12/11/2015 22:42:07] Epochs : 2
[12/11/2015 22:42:07] Batches : Minibatches of 100 (590 per epoch)
[12/11/2015 22:42:07] Training data : 59000
[12/11/2015 22:42:07] Validation data: 1000
[12/11/2015 22:42:07] Valid. interval: 10
[12/11/2015 22:42:07] Method : Gradient descent
[12/11/2015 22:42:07] Learning rate : RMSProp a0 = D 0.00100000005f, k = D 0.899999976f
[12/11/2015 22:42:07] Momentum : Nesterov D 0.899999976f
[12/11/2015 22:42:07] Loss : Cross entropy after softmax layer
[12/11/2015 22:42:07] Regularizer : L2 lambda = D 9.99999975e-05f
[12/11/2015 22:42:07] Gradient clip. : None
[12/11/2015 22:42:07] Early stopping : Stagnation thresh. = 400, overfit. thresh. = 100
[12/11/2015 22:42:07] Improv. thresh.: D 0.995000005f
[12/11/2015 22:42:07] Return best : true
[12/11/2015 22:42:07] 1/2 | Batch 1/590 | D 2.383214e+000 [- ] | Valid D 2.411374e+000 [- ] | Stag: 0 Ovfit: 0
[12/11/2015 22:42:08] 1/2 | Batch 11/590 | D 6.371681e-001 [↓▼] | Valid D 6.128169e-001 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 22:42:08] 1/2 | Batch 21/590 | D 4.729548e-001 [↓▼] | Valid D 4.779414e-001 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 22:42:09] 1/2 | Batch 31/590 | D 4.792733e-001 [↑ ] | Valid D 3.651254e-001 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 22:42:10] 1/2 | Batch 41/590 | D 2.977416e-001 [↓▼] | Valid D 3.680202e-001 [↑ ] | Stag: 10 Ovfit: 0
[12/11/2015 22:42:10] 1/2 | Batch 51/590 | D 4.242567e-001 [↑ ] | Valid D 3.525212e-001 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 22:42:11] 1/2 | Batch 61/590 | D 2.464822e-001 [↓▼] | Valid D 3.365663e-001 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 22:42:11] 1/2 | Batch 71/590 | D 6.299557e-001 [↑ ] | Valid D 3.981607e-001 [↑ ] | Stag: 10 Ovfit: 0
...
[12/11/2015 22:43:21] 2/2 | Batch 521/590 | D 1.163270e-001 [↓ ] | Valid D 2.264248e-001 [↓ ] | Stag: 50 Ovfit: 0
[12/11/2015 22:43:21] 2/2 | Batch 531/590 | D 2.169427e-001 [↑ ] | Valid D 2.203927e-001 [↓ ] | Stag: 60 Ovfit: 0
[12/11/2015 22:43:22] 2/2 | Batch 541/590 | D 2.233351e-001 [↑ ] | Valid D 2.353653e-001 [↑ ] | Stag: 70 Ovfit: 0
[12/11/2015 22:43:22] 2/2 | Batch 551/590 | D 3.425132e-001 [↑ ] | Valid D 2.559682e-001 [↑ ] | Stag: 80 Ovfit: 0
[12/11/2015 22:43:23] 2/2 | Batch 561/590 | D 2.768238e-001 [↓ ] | Valid D 2.412431e-001 [↓ ] | Stag: 90 Ovfit: 0
[12/11/2015 22:43:24] 2/2 | Batch 571/590 | D 2.550858e-001 [↓ ] | Valid D 2.726600e-001 [↑ ] | Stag:100 Ovfit: 0
[12/11/2015 22:43:24] 2/2 | Batch 581/590 | D 2.308137e-001 [↓ ] | Valid D 2.466903e-001 [↓ ] | Stag:110 Ovfit: 0
[12/11/2015 22:43:25] Duration : 00:01:17.5011734
[12/11/2015 22:43:25] Loss initial : D 2.383214e+000
[12/11/2015 22:43:25] Loss final : D 1.087980e-001 (Best)
[12/11/2015 22:43:25] Loss change : D -2.274415e+000 (-95.43 %)
[12/11/2015 22:43:25] Loss chg. / s : D -2.934685e-002
[12/11/2015 22:43:25] Epochs / s : 0.02580606089
[12/11/2015 22:43:25] Epochs / min : 1.548363654
[12/11/2015 22:43:25] --- Training finished
*)
(*** hide ***)
open RProvider
open RProvider.graphics
open RProvider.grDevices
let ll = lhist |> Array.map (float32>>float)
namedParams[
"x", box ll
"pch", box 19
"col", box "darkblue"
"type", box "l"
"xlab", box "Iteration"
"ylab", box "Loss"
"width", box 700
"height", box 500
]
|> R.plot|> ignore
(**
Now let's visualize the weights of the first layer in the grid. We see that the network has learned the problem domain.
*)
let l = (n.[0] :?> Linear)
l.VisualizeWRowsAsImageGrid(28) |> printfn "%s"
(**
*)
(**
### Building the softmax classifier
As explained in [regression](regression.html), we just construct an instance of **SoftmaxClassifier** with the trained neural network as its parameter. Please see the [API reference](reference/index.html) and the [source code](https://github.com/hypelib/Hype/blob/master/src/Hype/Classifier.fs) for a better understanding of how classifiers are implemented.
*)
let cc = SoftmaxClassifier(n)
(**
Testing class predictions for 10 random elements from the MNIST test set.
*)
let pred = cc.Classify(MNISTtest.X.[*,0..9]);;
let real = MNISTtest.Yi.[0..9]
(**
val pred : int [] = [|5; 1; 9; 2; 6; 0; 0; 5; 7; 6|]
val real : int [] = [|5; 1; 9; 2; 6; 0; 0; 5; 7; 6|]
Let's compute the classification error for the whole MNIST test set of 10,000 examples.
*)
cc.ClassificationError(MNISTtest)
(**
val it : float32 = 0.0502999984f
The classification error is around 5%. This can be lowered some more by training the model for more than 2 epochs as we did.
Classifying a single digit:
*)
let cls = cc.Classify(MNISTtest.X.[*,0]);;
MNISTtest.X.[*,0] |> DV.visualizeAsDM 28 |> printfn "%s"
(**
[lang=cs]
val cls : int = 5
DM : 28 x 28
·
▴●██♦-
▴♦██■▴-
♦█■■███▪·
■████■-
♦███▪
♦██♦
██●
■█▪
██· -▴■●-
▴██████■███-
♦██♦▪ ▪█■-
▪· ▴█●
-██
♦█●
■█■
-●██■·
-▴▪■███▪
███████●-
Classifying many digits at the same time:
*)
let clss = cc.Classify(MNISTtest.X.[*,0..4]);;
MNISTtest.[0..4].VisualizeXColsAsImageGrid(28) |> printfn "%s"
(**
[lang=cs]
val clss : int [] = [|5; 1; 9; 2; 6|]
Hype.Dataset
X: 784 x 5
Y: 1 x 5
X's columns reshaped to (28 x 28), presented in a (2 x 3) grid:
DM : 56 x 84
██♦
· ██
▴●██♦- ██▴ -♦█▪
▴♦██■▴- ♦██ ●█████●
♦█■■███▪· ██♦ ■███♦♦██
■████■- ███ ■██♦ ■█▴
♦███▪ ▴███ ·██♦ ●██
♦██♦ ███ ▪██ ■█■
██● ▴██▴ ·██· ·♦██▴
■█▪ ███ ███♦♦████▴
██· -▴■●- ███♦ ▴████████·
▴██████■███- ███ ▴ ·-●- ■██
♦██♦▪ ▪█■- ♦██▴ ██■
▪· ▴█● ▴██♦ -██▴
-██ ███▴ -██·
♦█● ♦██▴ ■██·
■█■ ███ ███
-●██■· ♦██▴ ▴██●
-▴▪■███▪ ██♦ ███
███████●- ♦█ -██■
-██♦
-██·
▴●█♦
●██ -████▴
▪████● ▴████
▴██████▴ ▴███■
■██▪▴██▴ ███▪
▴██● ▴█■ ■██▴
·███ ██- ·♦██▴
♦██● ▪█▪ -███▴
███ ██ ███♦
███ █♦ ███▪
█♦· █♦ ●██■ ▴▴▴
· ██ ███ -██-■█████▪
- ██ ██■ ●███████████-
·██■♦- ██ ▴██▴ ███●- ▪██▴
♦█████■███ ▪██ ·██- ·██▪
■█████████ ███▪·██▴ ♦██
♦█████████♦▪ ▪██████▴ ·♦██·
-███████████■●●●· ▪███████████████▴
■██████■■■█████▴ -▪██████████♦-
·████■ ▴████■ ·▴▴▴▴▴▴·
-■█- ■■▴
Nested optimization of training hyperparameters
-----------------------------------------------
As we've seen in [optimization](optimization.html), nested AD allows us to apply gradient-based optimization to functions that also internally perform optimization.
This gives us the possibility of optimizing the hyperparameters of training. We can, for example, compute the gradient of the final loss of a training procedure with respect to the continuous hyperparameters of the training such as learning rates, momentum parameters, regularization coefficients, or initialization conditions.
As an example, let's train a neural network with a learning rate schedule of 50 elements, and optimize this schedule vector with another level of optimization on top of the training.
*)
let train lrschedule =
Rnd.Seed(123)
n.Init()
let p = {Params.Default with
LearningRate = Schedule lrschedule
Loss = CrossEntropyOnSoftmax
ValidationInterval = 1
Silent = true
ReturnBest = false
Batch = Full}
let loss, _ = n.Train(MNISTvalid.[..20], p)
loss
let hypertrain epochs =
let p = {Params.Default with
Epochs = epochs
LearningRate = RMSProp(D 0.01f, D 0.9f)
ValidationInterval = 1}
let lr, _, _, _ = Optimize.Minimize(train, DV.create 50 (D 0.1f), p)
lr
let lr = hypertrain 50
(*** hide ***)
open RProvider
open RProvider.graphics
open RProvider.grDevices
let lrlr = lr |> DV.toArray |> Array.map (float32>>float)
namedParams[
"x", box lrlr
"pch", box 19
"col", box "darkblue"
"type", box "o"
"xlab", box "Iteration"
"ylab", box "Learning rate"
"width", box 700
"height", box 500
]
|> R.plot|> ignore
(**
*)
================================================
FILE: docs/input/HMC.fsx
================================================
(*** hide ***)
#r "../../src/Hype/bin/Release/netstandard2.0/DiffSharp.dll"
#r "../../src/Hype/bin/Release/netstandard2.0/Hype.dll"
#I "../../packages/R.NET.Community/lib/net40/"
#I "../../packages/R.NET.Community.FSharp/lib/net40/"
#I "../../packages/RProvider"
#load "RProvider.fsx"
fsi.ShowDeclarationValues <- false
(**
Markov Chain Monte Carlo
========================
Documentation coming soon.
*)
================================================
FILE: docs/input/Optimization.fsx
================================================
(*** hide ***)
#r "../../src/Hype/bin/Release/netstandard2.0/DiffSharp.dll"
#r "../../src/Hype/bin/Release/netstandard2.0/Hype.dll"
#I "../../packages/R.NET.Community/lib/net40/"
#I "../../packages/R.NET.Community.FSharp/lib/net40/"
#I "../../packages/RProvider"
#load "RProvider.fsx"
(**
Optimization
============
Hype provides a highly configurable and modular gradient-based optimization functionality. This works similar to many other machine learning libraries.
**Here's the novelty:**
Thanks to nested AD, gradient-based optimization can be combined with any code, including code which internally takes derivatives of a function to produce its output. In other words, you can optimize the value of a function that is internally optimizing another function, or using derivatives for any other purpose (e.g. running particle simulations, adaptive control), up to any level.
In such a compositional optimization setting, all arising higher-order derivatives are handled for you through **nested instantiations of forward and/or reverse AD**. In any case, you only need to write your algorithms as usual, **only implementing a regular forward algorithm**.
Let's explain this through a basic example from the article _"Jeffrey Mark Siskind and Barak A. Pearlmutter. Nesting forward-mode AD in a functional framework. Higher Order and Symbolic Computation 21(4):361-76, 2008. doi:10.1007/s10990-008-9037-1"_, where a parameter of a physics simulation using the gradient of an electric potential is optimized with Newton's method using the Hessian of an error, requiring third-order nesting of derivatives.
Optimizing a physics simulation
-------------------------------
Consider a charged particle traveling in a plane with position $\mathbf{x}(t)$, velocity $\dot{\mathbf{x}}(t)$, initial position $\mathbf{x}(0)=(0, 8)$, and initial velocity $\dot{\mathbf{x}}(0)=(0.75, 0)$. The particle is accelerated by an electric field formed by a pair of repulsive bodies,
$$$
p(\mathbf{x}; w) = \| \mathbf{x} - (10, 10 - w)\|^{-1} + \| \mathbf{x} - (10, 0)\|^{-1}
where $w$ is a parameter of this simple particle simulation, adjusting the location of one of the repulsive bodies.
We can simulate the time evolution of this system by using a naive Euler ODE integration
$$$
\begin{eqnarray*}
\ddot{\mathbf{x}}(t) &=& \left. -\nabla_{\mathbf{x}} p(\mathbf{x}) \right|_{\mathbf{x}=\mathbf{x}(t)}\\
\dot{\mathbf{x}}(t + \Delta t) &=& \dot{\mathbf{x}}(t) + \Delta t \ddot{\mathbf{x}}(t)\\
\mathbf{x}(t + \Delta t) &=& \mathbf{x}(t) + \Delta t \dot{\mathbf{x}}(t)
\end{eqnarray*}
where $\Delta t$ is an integration time step.
For a given parameter $w$, the simulation starts with $t=0$ and finishes when the particle hits the $x$-axis, at position $\mathbf{x}(t_f)$ at time $t_f$. When the particle hits the $x$-axis, we calculate an error $E(w) = x_0 (t_f)^2$, the squared horizontal distance of the particle from the origin. We then minimize this error using Newton's method, which finds the optimal value of $w$ so that the particle eventually hits the $x$-axis at the origin.
$$$
w^{(i+1)} = w^{(i)} - \frac{E'(w^{(i)})}{E''(w^{(i)})}
In other words, the code calculating the trajectory of the particle internally computes the gradient of the electric potential $p(\mathbf{x}; w)$, and, at the same time, the final position of the trajectory $\mathbf{x}(t_f)$ is used to compute an error, and the gradient and Hessian of this error are computed during the optimization procedure.
Here's how it goes.
*)
open Hype
open DiffSharp.AD.Float32
let dt = D 0.1f
let x0 = toDV [0.; 8.]
let v0 = toDV [0.75; 0.]
let p w (x:DV) = (1.f / DV.norm (x - toDV [D 10.f + w * D 0.f; D 10.f - w]))
+ (1.f / DV.norm (x - toDV [10.; 0.]))
let trajectory (w:D) =
(x0, v0)
|> Seq.unfold (fun (x, v) ->
let a = -grad (p w) x
let v = v + dt * a
let x = x + dt * v
Some(x, (x, v)))
|> Seq.takeWhile (fun x -> x.[1] > D 0.f)
let error (w:DV) =
let xf = trajectory w.[0] |> Seq.last
xf.[0] * xf.[0]
let w, l, whist, lhist = Optimize.Minimize(error, toDV [0.],
{Params.Default with
Method = Newton;
LearningRate = Constant (D 1.f)
ValidationInterval = 1;
Epochs = 10})
(**
[25/12/2015 23:53:10] --- Minimization started
[25/12/2015 23:53:10] Parameters : 1
[25/12/2015 23:53:10] Iterations : 10
[25/12/2015 23:53:10] Valid. interval: 1
[25/12/2015 23:53:10] Method : Exact Newton
[25/12/2015 23:53:10] Learning rate : Constant a = D 1.0f
[25/12/2015 23:53:10] Momentum : None
[25/12/2015 23:53:10] Gradient clip. : None
[25/12/2015 23:53:10] Early stopping : None
[25/12/2015 23:53:10] Improv. thresh.: D 0.995000005f
[25/12/2015 23:53:10] Return best : true
[25/12/2015 23:53:10] 1/10 | D 2.535113e+000 [- ]
[25/12/2015 23:53:10] 2/10 | D 7.528733e-002 [↓▼]
[25/12/2015 23:53:10] 3/10 | D 1.592970e-002 [↓▼]
[25/12/2015 23:53:10] 4/10 | D 4.178338e-003 [↓▼]
[25/12/2015 23:53:10] 5/10 | D 1.382800e-008 [↓▼]
[25/12/2015 23:53:11] 6/10 | D 3.274181e-011 [↓▼]
[25/12/2015 23:53:11] 7/10 | D 1.151079e-012 [↓▼]
[25/12/2015 23:53:11] 8/10 | D 1.151079e-012 [- ]
[25/12/2015 23:53:11] 9/10 | D 1.151079e-012 [- ]
[25/12/2015 23:53:11] 10/10 | D 3.274181e-011 [↑ ]
[25/12/2015 23:53:11] Duration : 00:00:00.9201285
[25/12/2015 23:53:11] Value initial : D 2.535113e+000
[25/12/2015 23:53:11] Value final : D 1.151079e-012 (Best)
[25/12/2015 23:53:11] Value change : D -2.535113e+000 (-100.00 %)
[25/12/2015 23:53:11] Value chg. / s : D -2.755173e+000
[25/12/2015 23:53:11] Iter. / s : 10.86804723
[25/12/2015 23:53:11] Iter. / min : 652.0828341
[25/12/2015 23:53:11] --- Minimization finished
val whist : DV [] =
[|DV [|0.0f|]; DV [|0.20767726f|]; DV [|0.17457059f|]; DV [|0.190040559f|];
DV [|0.182180524f|]; DV [|0.182166189f|]; DV [|0.182166889f|];
DV [|0.182166755f|]; DV [|0.182166621f|]; DV [|0.182166487f|]|]
val w : DV = DV [|0.182166889f|]
val lhist : D [] =
[|D 2.5351131f; D 2.5351131f; D 0.0752873272f; D 0.0159297027f;
D 0.00417833822f; D 1.38279992e-08f; D 3.27418093e-11f; D 1.15107923e-12f;
D 1.15107923e-12f; D 1.15107923e-12f|]
val l : D = D 1.15107923e-12f
*)
(*** hide ***)
open RProvider
open RProvider.graphics
open RProvider.grDevices
R.plot_new (namedParams [ ])
let t = trajectory (whist.[1].[0])
let tx, ty = t |> Seq.toArray |> Array.map (fun v -> v.[0] |> float32 |> float, v.[1] |> float32 |> float) |> Array.unzip
namedParams[
"x", box tx
"y", box ty
"pch", box 1
"xlab", box ""
"ylab", box ""
"col", box "darkblue"
"type", box "l"
"lty", box 4
"width", box 700
"height", box 500
]
|> R.lines |> ignore
(**
Optimization parameters
-----------------------
As another example, let's optimize the Beale function
$$$
f(\mathbf{x}) = (1.5 - x_1 + x_1 x_2)^2 + (2.25 - x_1 + x_1 x_2^2)^2 + (2.625 - x_1 + x_1 x_2^3)^2
starting from $\mathbf{x} = (1, 1.5)$, using RMSProp. The optimum is at $(3, 0.5)$
*)
let beale (x:DV) = (1.5f - x.[0] + (x.[0] * x.[1])) ** 2.f
+ (2.25f - x.[0] + x.[0] * x.[1] ** 2.f) ** 2.f
+ (2.625f - x.[0] + x.[0] * x.[1] ** 3.f) ** 2.f
let wopt, lopt, whist, lhist = Optimize.Minimize(beale, toDV [1.; 1.5],
{Params.Default with
Epochs = 3000;
LearningRate = RMSProp (D 0.01f, D 0.9f)})
(**
[12/11/2015 01:22:59] --- Minimization started
[12/11/2015 01:22:59] Parameters : 2
[12/11/2015 01:22:59] Iterations : 3000
[12/11/2015 01:22:59] Valid. interval: 10
[12/11/2015 01:22:59] Method : Gradient descent
[12/11/2015 01:22:59] Learning rate : RMSProp a0 = D 0.00999999978f, k = D 0.899999976f
[12/11/2015 01:22:59] Momentum : None
[12/11/2015 01:22:59] Gradient clip. : None
[12/11/2015 01:22:59] Early stopping : None
[12/11/2015 01:22:59] Improv. thresh.: D 0.995000005f
[12/11/2015 01:22:59] Return best : true
[12/11/2015 01:22:59] 1/3000 | D 4.125000e+001 [- ]
[12/11/2015 01:22:59] 11/3000 | D 2.655878e+001 [↓▼]
[12/11/2015 01:22:59] 21/3000 | D 2.154373e+001 [↓▼]
[12/11/2015 01:22:59] 31/3000 | D 1.841705e+001 [↓▼]
[12/11/2015 01:22:59] 41/3000 | D 1.624916e+001 [↓▼]
[12/11/2015 01:22:59] 51/3000 | D 1.465973e+001 [↓▼]
[12/11/2015 01:22:59] 61/3000 | D 1.334291e+001 [↓▼]
...
[12/11/2015 01:22:59] 2921/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] 2931/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] 2941/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] 2951/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] 2961/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] 2971/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] 2981/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] 2991/3000 | D 9.084024e-004 [- ]
[12/11/2015 01:22:59] Duration : 00:00:00.3142646
[12/11/2015 01:22:59] Value initial : D 4.125000e+001
[12/11/2015 01:22:59] Value final : D 8.948371e-004 (Best)
[12/11/2015 01:22:59] Value change : D -4.124910e+001 (-100.00 %)
[12/11/2015 01:22:59] Value chg. / s : D -1.312560e+002
[12/11/2015 01:22:59] Iter. / s : 9546.09587
[12/11/2015 01:22:59] Iter. / min : 572765.7522
[12/11/2015 01:22:59] --- Minimization finished
val wopt : DV = DV [|2.99909306f; 0.50039643f|]
*)
(*** hide ***)
let contourplot3d (f:DV->D) (xmin, xmax) (ymin, ymax) =
let res = 100
let xstep = ((xmax - xmin) / float res)
let ystep = ((ymax - ymin) / float res)
let x = [|xmin .. xstep .. xmax|]
let y = [|ymin .. ystep .. ymax|]
let z = Array2D.init x.Length y.Length (fun i j -> f (toDV [x.[i]; y.[j]])) |> Array2D.map (float32>>float)
namedParams [
"x", box x
"y", box y
"z", box z
"labels", box ""
"levels", box [|0..5..200|]]
|> R.contour
contourplot3d beale (-4.5,4.5) (-4.5,4.5)
let xx, yy = whist |> Array.map (fun v -> v.[0] |> float32 |> float, v.[1] |> float32 |> float) |> Array.unzip
namedParams[
"x", box xx
"y", box yy
"col", box "blue"]
|> R.lines
namedParams[
"x", box (xx |>Array.last)
"y", box (yy |> Array.last)
"pch", box 16
"col", box "blue"]
|> R.points
(**
Each instantiation of gradient-based optimization is controlled through a collection of parameters, using the **Hype.Params** type.
If you do not supply any parameters to optimization, the default parameter set **Params.Default** is used. The default parameters look like this:
*)
module Params =
let Default = {Epochs = 100
LearningRate = LearningRate.DefaultRMSProp
Momentum = NoMomentum
Loss = L2Loss
Regularization = Regularization.DefaultL2Reg
GradientClipping = NoClip
Method = GD
Batch = Full
EarlyStopping = NoEarly
ImprovementThreshold = D 0.995f
Silent = false
ReturnBest = true
ValidationInterval = 10
LoggingFunction = fun _ _ _ -> ()}
(**
If you want to change only a specific element of the parameter type, you can do so by extending the **Params.Default** value and overwriting only the parts you need to change, such as this:
*)
let p = {Params.Default with
Epochs = 5000
LearningRate = LearningRate.AdaGrad (D 0.001f)
Momentum = Nesterov (D 0.9f)}
(**
### Optimization method
*)
type Method =
| GD // Gradient descent
| CG // Conjugate gradient
| CD // Conjugate descent
| NonlinearCG // Nonlinear conjugate gradient
| DaiYuanCG // Dai & Yuan conjugate gradient
| NewtonCG // Newton conjugate gradient
| Newton // Exact Newton
(**
### Learning rate
*)
type LearningRate =
| Constant of D // Constant
| Decay of D * D // 1 / t decay, a = a0 / (1 + kt). Initial value, decay rate
| ExpDecay of D * D // Exponential decay, a = a0 * Exp(-kt). Initial value, decay rate
| Schedule of DV // Scheduled learning rate vector, its length overrides Params.Epochs
| Backtrack of D * D * D // Backtracking line search. Initial value, c, rho
| StrongWolfe of D * D * D // Strong Wolfe line search. lmax, c1, c2
| AdaGrad of D // Adagrad. Initial value
| RMSProp of D * D // RMSProp. Initial value, decay rate
static member DefaultConstant = Constant (D 0.001f)
static member DefaultDecay = Decay (D 1.f, D 0.1f)
static member DefaultExpDecay = ExpDecay (D 1.f, D 0.1f)
static member DefaultBacktrack = Backtrack (D 1.f, D 0.0001f, D 0.5f)
static member DefaultStrongWolfe = StrongWolfe (D 1.f, D 0.0001f, D 0.5f)
static member DefaultAdaGrad = AdaGrad (D 0.001f)
static member DefaultRMSProp = RMSProp (D 0.001f, D 0.9f)
(**
### Momentum
*)
type Momentum =
| Momentum of D // Default momentum
| Nesterov of D // Nesterov momentum
| NoMomentum
static member DefaultMomentum = Momentum (D 0.9f)
static member DefaultNesterov = Nesterov (D 0.9f)
(**
### Gradient clipping
*)
type GradientClipping =
| NormClip of D // Norm clipping
| NoClip
static member DefaultNormClip = NormClip (D 1.f)
(**
Finally, looking at the [API reference](reference/index.html) and the [source code](https://github.com/hypelib/Hype/blob/master/src/Hype/Optimize.fs) of the optimization module can give you a better idea of the optimization algorithms currently implemented.
*)
================================================
FILE: docs/input/RecurrentNets.fsx
================================================
(*** hide ***)
#r "../../src/Hype/bin/Release/netstandard2.0/DiffSharp.dll"
#r "../../src/Hype/bin/Release/netstandard2.0/Hype.dll"
#I "../../packages/R.NET.Community/lib/net40/"
#I "../../packages/R.NET.Community.FSharp/lib/net40/"
#I "../../packages/RProvider"
#load "RProvider.fsx"
fsi.ShowDeclarationValues <- false
(**
Recurrent neural networks
=========================
In this example we build a recurrent neural network (RNN) for a language modeling task and train it with a short passage of text for a quick demonstration. Hype currently has three RNN models implemented as **Hype.Neural** layers, which can be combined freely with other layer types, explained, for example, in the [neural networks](feedforwardnets.html) page. **Hype.Neural.Recurrent** implements the "vanilla" RNN layer, **Hype.Neural.LSTM** implements the LSTM layer, and **Hype.Neural.GRU** implements the gated recurrent unit (GRU) layer.
### Language modeling
RNNs are well suited for constructing [language models,](https://en.wikipedia.org/wiki/Language_model) where we need to predict the probability of a word (or token) given the history of the tokens that came before it. Here, we will use an LSTM-based RNN to construct a word-level language model from a short passage of text, for a basic demonstration of usage. This model can be scaled to larger problems. State-of-the-art models of this type can require considerable computing resources and training time.
The text is from the beginning of Virgil's Aeneid, Book I.
*)
let text = "I sing of arms and the man, he who, exiled by fate, first came from the coast of Troy to Italy, and to Lavinian shores – hurled about endlessly by land and sea, by the will of the gods, by cruel Juno’s remorseless anger, long suffering also in war, until he founded a city and brought his gods to Latium: from that the Latin people came, the lords of Alba Longa, the walls of noble Rome. Muse, tell me the cause: how was she offended in her divinity, how was she grieved, the Queen of Heaven, to drive a man, noted for virtue, to endure such dangers, to face so many trials? Can there be such anger in the minds of the gods?"
(**
Hype provides a simple **Hype.NLP.Language** type for tokenizing text. You can look at the [API reference](reference/index.html) and the [source code](https://github.com/hypelib/Hype/blob/master/src/Hype/NLP.fs) for a better understanding of its usage.
*)
open Hype
open Hype.Neural
open Hype.NLP
open DiffSharp.AD.Float32
open DiffSharp.Util
let lang = Language(text)
lang.Tokens |> printfn "%A"
lang.Length |> printfn "%A"
(**
These are the tokens extracted from the text, including some of the punctuation marks. When we are sampling from the RNN language model, we will make use of the "." token for signaling the end of a sentence. The puncutation marks are configurable when you are constructing the **Language** instance. If they are not provided, a default set is used.
There are 86 tokens in this language instance.
Now let's transform the full text to a dataset, using the **Language** instance holding these tokens. The text will be encoded in a matrix where each column is a representation of each word as a _one-hot_ vector.
*)
let text' = lang.EncodeOneHot(text)
text'.Visualize() |> printfn "%s"
(**
DM : 86 x 145
Out of these 145 words, we will construct a dataset where the inputs are the first 144 words and the target outputs are the 144 words starting with a one word shift. This means that, for each word, we want the output (the prediction) to be the following word in our text passage.
*)
let data = Dataset(text'.[*, 0..(text'.Cols - 2)],
text'.[*, 1..(text'.Cols - 1)])
(**
val data : Dataset = Hype.Dataset
X: 86 x 144
Y: 86 x 144
RNNs, and especially the LSTM variety that we will use, can make predictions that take long-term dependencies and contextual information into account. When the language model is trained with a large enough text corpus and the network has enough capacity, state-of-the-art RNN language models are able to learn complex grammatical relations.
For our quick demonstration, we use a linear word embedding layer of 20 units, an LSTM of 100 units and a final linear layer of 86 units (the size of our vocabulary) followed by **softmax** activation.
*)
let dim = lang.Length // Vocabulary size, here 86
let n = FeedForward()
n.Add(Linear(dim, 20))
n.Add(LSTM(20, 100))
n.Add(Linear(100, dim))
n.Add(DM.mapCols softmax)
(**
You can also easily stack multiple RNNs on top of each other.
*)
let n = FeedForward()
n.Add(Linear(dim, 20))
n.Add(LSTM(20, 100))
n.Add(LSTM(100, 100))
n.Add(Linear(100, dim))
n.Add(DM.mapCols softmax)
(**
We will observe the the performance of our RNN during training by sampling random sentences from the language model.
Remember that the final output of the network, through the softmax activation, is a vector of word probabilities. When we are sampling, we start with a word, supply this to the network, and use the resulting probabilities at the output to sample from the vocabulary where words with higher probability are more likely to be selected. We then continue by giving the network the last sampled word and repeating this until we hit an "end of sentence" token (we use "." here) or reach a limit of maximum sentence length.
This is how we would sample a sentence starting with a specific word.
*)
n.Reset()
for i = 0 to 5 do
lang.Sample(n.Run, "I", [|"."|], 30) // Use "." as the stop token, limit maximum sentence length to 30.
|> printfn "%s"
(**
Because the model is not trained, we get sequences of random words from the vocabulary.
I be: she dangers Latium endlessly gods remorseless divinity tell and his offended lords trials? about war trials and anger shores so anger Alba a Alba sing her
I? came exiled – suffering shores anger came Latium people sing sing remorseless who brought war walls endlessly anger me founded his.
I – will long of in offended cruel until Queen Italy who anger lords Queen in Longa Muse who people about suffering Italy also grieved cruel hurled who me about
I endlessly city first by face, a Heaven me hurled sea such long noted she noted many sea city anger I noted remorseless cause Queen to remorseless Italy coast
I sea noted noble me minds long sing cause people in walls Italy by Longa first, for grieved sea many walls Troy came was endlessly of in Latium Latium
I and Latin of many suffering Alba Latium war.
We set a training cycle where we run one epoch of training followed by sampling one sentence starting with the word "I". In each epoch, we run through the whole training dataset. With a larger training corpus, we could also run the training with minibatches by stating this in the parameter set (commented out below).
Like the sample sentences above, at the beginning of training, we see mostly random orderings of words. As the training progresses, the cross-entropy loss for our dataset is decreasing and the sentences start exhibiting meaningful word patterns.
*)
for i = 0 to 1000 do
let par = {Params.Default with
//Batch = Minibatch 10
LearningRate = LearningRate.RMSProp(D 0.01f, D 0.9f)
Loss = CrossEntropyOnSoftmax
Epochs = 1
Silent = true // Suppress the regular printing of training progress
ReturnBest = false}
let loss, _ = Layer.Train(n, data, par)
printfn "Epoch: %*i | Loss: %O | Sample: %s" 3 i loss (lang.Sample(n.Run, "I", [|"."|], 30))
(**
Here is a selection of sentences demonstrating the progress of training.
Epoch: 0 | Loss: D 4.478101e+000 | Sample: I Queen drive she Alba endlessly Queen the by how tell his from grieved war her there drive people – lords coast he.
Epoch: 10 | Loss: D 4.102071e+000 | Sample: I people to,, Rome how the he of – sing fate, Muse, by,, Muse the of man Queen Latin and in her cause:
Epoch: 30 | Loss: D 3.438288e+000 | Sample: I walls long to first dangers she her, to founded to virtue sea first Can dangers a founded about Can Queen lords from sea by remorseless founded endlessly Latium
Epoch: 40 | Loss: D 2.007577e+000 | Sample: I Alba gods Alba Rome, the walls Alba Muse Rome anger me the the of the gods to who man me first founded offended endlessly until also grieved long
Epoch: 50 | Loss: D 9.753818e-001 | Sample: I sing people cruel: me the of Rome.
Epoch: 60 | Loss: D 3.944587e-001 | Sample: I sing sing Troy to so hurled endlessly by land sea, by to – hurled about by the of arms, by Juno’s such anger long also in her
Epoch: 70 | Loss: D 2.131431e-001 | Sample: I sing of and the of Longa, by Juno’s anger was in her of Heaven, to a city brought his gods to a gods to Lavinian hurled to
Epoch: 80 | Loss: D 1.895453e-001 | Sample: I sing, by will the of Rome.
Epoch: 90 | Loss: D 1.799535e-001 | Sample: I sing? there Muse the of the of the of arms by the: how she offended in the of? a, he shores hurled by land to
Epoch: 100 | Loss: D 1.733837e-001 | Sample: I sing arms the of Alba gods who, by Juno’s Rome such anger the of the of arms and, by, by from the coast Rome.
Epoch: 110 | Loss: D 1.682917e-001 | Sample: I sing Troy by, by from the of arms and, by, by from came, by Juno’s anger long in the of the of arms cruel Muse
Epoch: 120 | Loss: D 1.639529e-001 | Sample: I sing arms the of Rome.
Epoch: 130 | Loss: D 1.600647e-001 | Sample: I sing arms and, by Juno’s remorseless there and the of the of arms and, by Alba coast Troy to a – his gods by of the of
Epoch: 140 | Loss: D 1.564835e-001 | Sample: I sing arms by the of Rome.
Epoch: 150 | Loss: D 1.531392e-001 | Sample: I sing arms cruel, exiled by coast, he a city in the of the of arms.
Epoch: 160 | Loss: D 1.499920e-001 | Sample: I sing arms cruel man, by the trials arms to shores hurled endlessly by the of gods Italy, me the of Rome.
Epoch: 200 | Loss: D 1.390327e-001 | Sample: I sing arms and, by Juno’s such of the of the of arms Italy, by from the sing arms walls of the of Rome.
Epoch: 230 | Loss: D 1.322940e-001 | Sample: I sing arms the man he, tell from the of arms Italy, by fate, by the of Troy Italy, by fate first from the of the
Epoch: 260 | Loss: D 1.264137e-001 | Sample: I sing brought Muse Muse the of Heaven, by shores remorseless there he in the of arms cruel, by fate, he from the gods to Italy,
Epoch: 420 | Loss: D 1.131158e-001 | Sample: I sing of arms the of Heaven, by Juno’s remorseless hurled such in the of arms.
Epoch: 680 | Loss: D 9.938217e-002 | Sample: I of arms the man he, exiled fate, he virtue, to a? Can be such in the of the of of the of arms.
Epoch: 923 | Loss: D 9.283429e-002 | Sample: I sing of arms and the man he, by fate came from the of to Italy, by the, by Juno’s anger of Rome.
*)
================================================
FILE: docs/input/Regression.fsx
================================================
(*** hide ***)
#r "../../src/Hype/bin/Release/netstandard2.0/DiffSharp.dll"
#r "../../src/Hype/bin/Release/netstandard2.0/Hype.dll"
#I "../../packages/R.NET.Community/lib/net40/"
#I "../../packages/R.NET.Community.FSharp/lib/net40/"
#I "../../packages/RProvider"
#load "RProvider.fsx"
fsi.ShowDeclarationValues <- true
(**
Regression
==========
In this example we implement a logistic regression based binary classifier and train it to distinguish between the [MNIST](http://yann.lecun.com/exdb/mnist/) digits of 0 and 1.
### Loading the data
First, let's start by loading the MNIST training and testing data and arranging these into training, validation, and testing sets.
*)
open Hype
open Hype.Neural
open DiffSharp.AD.Float32
open DiffSharp.Util
let MNIST = Dataset(Util.LoadMNISTPixels("C:/datasets/MNIST/train-images.idx3-ubyte", 60000),
Util.LoadMNISTLabels("C:/datasets/MNIST/train-labels.idx1-ubyte", 60000) |> toDV |> DM.ofDV 1).NormalizeX()
let MNISTtrain = MNIST.[..58999]
let MNISTvalid = MNIST.[59000..]
let MNISTtest = Dataset(Util.LoadMNISTPixels("C:/datasets/MNIST/t10k-images.idx3-ubyte", 10000),
Util.LoadMNISTLabels("C:/datasets/MNIST/t10k-labels.idx1-ubyte", 10000) |> toDV |> DM.ofDV 1).NormalizeX()
(**
We shuffle the columns of the datasets and filter them to only keep the digits of 0 and 1.
*)
let MNISTtrain01 = MNISTtrain.Shuffle().Filter(fun (x, y) -> y.[0] <= D 1.f)
let MNISTvalid01 = MNISTvalid.Shuffle().Filter(fun (x, y) -> y.[0] <= D 1.f)
let MNISTtest01 = MNISTtest.Shuffle().Filter(fun (x, y) -> y.[0] <= D 1.f)
(**
val MNISTtrain01 : Dataset = Hype.Dataset
X: 784 x 12465
Y: 1 x 12465
val MNISTvalid01 : Dataset = Hype.Dataset
X: 784 x 200
Y: 1 x 200
val MNISTtest01 : Dataset = Hype.Dataset
X: 784 x 2115
Y: 1 x 2115
We can visualize individual digits from the dataset.
*)
MNISTtrain.X.[*,9] |> DV.visualizeAsDM 28 |> printfn "%s"
MNISTtrain.Y.[*,9]
(**
[lang=cs]
DM : 28 x 28
♦♦
▪█▪
▴██·
♦█♦
● ·█■
■█ ■█·
♦█ ██·
▴█■ ●█♦
■█ ▪█■
■█▪ ▴██-
-███♦▴ ♦█▪
·███■██♦■█■
·██■ ♦█████■
♦■- ♦████▪
- ██·
▪█■
▴█■
■█▴
■█▪
▴█·
val it : DV = DV [|4.0f|]
We can also visualize a series of digits in grid layout.
*)
MNISTtrain.[..5].VisualizeXColsAsImageGrid(28) |> printfn "%s"
(**
[lang=cs]
Hype.Dataset
X: 784 x 6
Y: 1 x 6
X's columns reshaped to (28 x 28), presented in a (2 x 3) grid:
DM : 56 x 84
▪█▪
▴▴● ●██▴ █████ ■
-▪●█████■●██♦ ■███■█ · ▴●
██████████-·· ■███♦·██▴ ▴● ▪♦
■█████♦●██ ●██████-♦█● ■● █▪
·▪-██♦ ▪ ███♦-█■ ·█● ■● ●█▴
▪█· ███● ·▴ ██ █● ♦█
▴█♦ ●█■♦· ██● ▴█● ■█
♦█· ●██· ██♦ ▪█▴ ●█■
█■▪- ██ ██♦ ▪█ ·●██·
·███▴ ♦█♦ ██♦ ▪█· ▴▪▪███●██
●██▪ ·██- ██▪ ▪██♦♦♦████♦▪· ■█
-██♦ ·█■ ▴█● ▴●●●●●- -█■
███ ·█■ ▴█■· ●█▴
▴●██♦ ·█▪ ●█● ●█
▪■████● ·█■ -██▪ ●█
-■████♦· ·██▪ ·●■█■● ●█-
■████♦· ·███■■███♦▴ ●█-
●■████♦· ♦█████■▪ ●█▪
●■█████▴ ▴███▪ ●█▪
▴███■▴▴ -█▪
▴██ -▴
-███ ▪♦███▪
▴███ ▪♦██-·▪ ▪███■■█■
██■ ·■██♦♦███● ▪████■ ██
■██- ██♦ ●██▴ -████■ ██
▪██♦ -██● ·██■ ●██■● ·██
███ ▴██▪ ■██· ▴ -██
♦██▴ ▴██● ·██▴ ▪██
-██● ■█● ♦██● -▴▴▴♦█♦
·██♦ ██ ▴♦████· ●█████■
███▪ ■█████■■█■ ■██■●███■▴
▪███ ██■▴ ♦█▪ ·██▴ ♦████·
■██● ██- ▴██● ♦██▴●██●
███♦ ·██ ▴██- ♦█■ ·●███■
███· ██ -██· ·●██▴ ·●●
▪███ ·██ ■██♦■███▴
■██▪ -██ ♦████●▴
██■ ██ -▪▴
██■ ■█
♦█■ -█♦
●█●
▪█
*)
MNISTtrain01.[..5].VisualizeXColsAsImageGrid(28) |> printfn "%s"
(**
[lang=cs]
Hype.Dataset
X: 784 x 6
Y: 1 x 6
X's columns reshaped to (28 x 28), presented in a (2 x 3) grid:
DM : 56 x 84
▴●███- ·♦██
·▪■█████■ ████● -▪██████
▪████████- ●████■ ▪██████████
●████▪ ●███▴ ■████● ■███■▪▪▴-███●
████● ▴███· ██████ -███■ ██████
███▪ ██▴ -██████ ■██▴ ████▴♦-
■██· ●█● ▪█████■ ■██ ■█▴█■
●██· ▴█● ██████ ▪██● - ■█-
-██▪ ▴█● ██████ ██■ ▪█▴
██■ ▴█● ■█████▴ ▴██- ■█▴
·██ ▴█● ██████- ▴██ ██▴
■█■ ▴█● █████▴ ▴█■ ██
■█▪ ▴█● ♦████- ▴█▪ ▴█▪
██- ●█● ♦█████ ▴█▪ ●█·
██▴ ██· █████♦ -██ -█▪
███ ●██· ●█████· ██■ █-
-███● ▴██● ·█████ ███ ■■
■█████♦●●■███■ ♦█████ ██- -■█·
▴■█████████♦ █████♦ ●██● -■██▪
·▪▪●█♦▪▴ ▴████· ●████████■-
▪█████▪-
■▴ ·····
▴■ ▪■█████·
♦♦♦♦♦-· ■█ ▴█████████-
▪████████■▪ ██ ███████████·
■███████████■ ▪█■ ▴■████♦♦ ▴████
█████████ ♦███ ██- ▴████♦▴ ■███·
▪███● ● ·██■ █● ▴████■ ♦███-
██■ -██· ▪█▴ ████▪ ████
▪██ ██♦ ██ ███- ████
●█♦ ▪██ ▪██ ·███· ████
██▴ ▴██ ●██ ·███♦ ▪████
██ ♦█▪ ██● ·███ ▪████·
██- ██ ██· ●███ ·●█████·
♦█● ■██ ·██- ████· ■██████▴
▪█■ ■██- █♦ ████■▴▴▴■████████▪
-██● ▴███♦ -█♦ ■██████████████♦
♦██♦-▪ ▪♦██■- ■█♦ ·████████████▪▴
♦███████████● ■█- -█████████▴
▴■█■■■■■■· █■ ▪ ▪▪
♦▴
### Defining the model
Let's now create our linear regression model. We implement this using the **Hype.Neural** module, as a linear layer with $28 \times 28 = 784$ inputs and one output. The output of the layer is passed through the sigmoid function.
*)
let n = Neural.FeedForward()
n.Add(Linear(28 * 28, 1))
n.Add(sigmoid)
(**
We can visualize the initial state of the linear model weights before the training. For information of about weight initialization parameters, please see the [neural networks example](feedforwardnets.html).
*)
let l = (n.[0] :?> Linear)
l.VisualizeWRowsAsImageGrid(28) |> printfn "%s"
(**
[lang=cs]
Hype.Neural.Linear
784 -> 1
Learnable parameters: 785
Init: Standard
W's rows reshaped to (28 x 28), presented in a (1 x 1) grid:
DM : 28 x 28
▴▪●●-█▴♦♦● ·▴█● ● ▴· ●●●●▪·
■ █- ▴●●▪ ■♦· ■▪■▪ █ ♦■●■
♦■ █♦●▪●♦ ♦■ ♦ ■ ▪- ■
■▪ ■♦■♦ █ ▪● ♦▪▴··■█ -▴●▪▪●
██··▴●●█▪♦■ -·█■ ▪- ··▪· ██
- ▪ ♦ ▪● ▪■█♦- ▴▪ ▴· ▪·●
- ●●▴▴ ▪■ ▴█ ▪▴·▴▴·♦■■♦·■■
♦▴ ▪■ ▪▪▴■·■--▪♦- ·♦▪■ ♦·●
·▴·♦▪♦●▪··▴·▪ ● ▪ █ ▴▪·♦▪
■ ▴ ♦█▴ - ♦●■ █▪■●▪█■▴●--█
♦■ ●■▴♦ ●· █· ▴· -█-▪●■■-■
█-·▪▴-▴█ ♦ █●·♦█▪▪●●■ - ·
- █ ■♦·●▪▴♦ -▴ - ■♦· ♦ -
■█ ▪- ▪■●♦█▴-█▪■ ■♦▪█■▪■ -
●♦█▴♦♦ ♦ ▴▪▴▴♦-▴♦♦█ ▴ ▪·●
·█▪■■█ ●· ●· -●■●·· ▴ --▴
·♦█▴ ♦♦■ ▴▪●▪- · -♦●♦ ■ · ■
■■▪---♦■·●▴▪-▪▴· ▪●● ·♦■ ▪♦▴
▴ -♦●■█·█ ● ♦▪●■- ·■♦-▪▴■▴
●-■● ···●█▴▪ -█·▪ ♦▴ ● ●
·█ █▴ ·♦---■▴·█■■▴ ▴■ - █
- ▪ ●█·▴♦▪ ■ ▪■ ■··· ▴
■ ♦♦- █▪♦-- ▴ ▴ ··█▴● ■♦
■·■■▪▴-·█♦●■ ▴ ♦ ♦▴■♦ ■ ●♦▪
·█▪- ■●▴▪▴▪ ▪ ▴▪ · ▪▴▴··♦
▪█♦■ ·♦ ■▪ ♦ ▴·●█▪· ·▪▴
· ■♦▪■ ▪■● ♦ ··· ·▪█■· ▪■●
●▴▪ ·■● -█●█·▪■▴ ▴▴♦ ■ ■ ▴
b:
DV : 1
### Training
Let's train the model for 10 epochs (full passes through the training data), with a minibatch size of 100, using the training and validation sets we've defined. The validation set will make sure that we're not overfitting the model.
*)
let p = {Params.Default with
Epochs = 10;
Batch = Minibatch 100;
EarlyStopping = EarlyStopping.DefaultEarly}
n.Train(MNISTtrain01, MNISTvalid01, p)
(**
After a 5-second training, we can see that the characteristics of the problem domain (distinguishing between the digits of 0 and 1) is captured in the model weights.
*)
let l = (n.[0] :?> Linear)
l.VisualizeWRowsAsImageGrid(28) |> printfn "%s"
(**
[lang=cs]
Hype.Neural.Linear
784 -> 1
Learnable parameters: 785
Init: Standard
W's rows reshaped to (28 x 28), presented in a (1 x 1) grid:
DM : 28 x 28
----------------------------
----------------------------
------------▴▴▴▴▴-----------
---------▴--▴▴▴▴▴▴-▴--------
--------▴▴▪▴▪▪▪▴-▴▴▴▪▪▪▴----
------▴-▴▴▴▴▪▪▴▴-·-▴▪▪▪▪▴---
------▴--▴▴-▴▴--···▴▪▴▴▴▴---
----▴---------▴▴-· ---------
---------··---▴▪--·-·····---
-------······▴▪▪▪▴-······---
------····· ▴●●●▴· ·---
-----·· · ▪♦■♦▪ ·---
-----· · ●■■♦▴ ·---
-----· ·♦██♦· ·---
-----· ▴■██● ·---
----· ▪██■▪ ·---
----· -●█■♦- ·---
----· ▴♦█■●· ···---
----· ·▴♦♦♦●· ····----
----· ·▴▪●●●▪·· ····--▴--
----······-▴▪▪▪▴--------▴---
-----▴▴----·--▴▴-▴▴-▴▴------
-----▴▪▪▴-· ··--▴▴▪▴▴▴▴-----
----▴▪▪▪▪▴-· ·-▴▴▪▴▴▴▴------
-----▴▪▴▴▴▴·---▴▴▴▴▴--------
------------▴▴▴▴------------
----------------------------
----------------------------
b:
DV : 1
### Classifier
You can create classifiers by instantiating types such as **LogisticClassifier** or **SoftmaxClassifier**, and passing a classification function of the form **DM->DM**in the constructor. Alternatively, you can directly pass the model we have just trained.
Please see the [API reference](reference/index.html) and the [source code](https://github.com/hypelib/Hype/blob/master/src/Hype/Classifier.fs) for a better understanding of how classifiers are implemented.
*)
let cc = LogisticClassifier(n)
(**
Let's test the class predictions for 10 random elements from the MNIST test set, which, if you remember, we've filtered to have only 0s and 1s.
*)
let pred = cc.Classify(MNISTtest01.X.[*,0..9]);;
let real = MNISTtest01.Y.[*, 0..9] |> DM.toDV |> DV.toArray |> Array.map (float32>>int)
(**
val pred : int [] = [|1; 0; 1; 0; 1; 0; 0; 1; 1; 1|]
val real : int [] = [|1; 0; 1; 0; 1; 0; 0; 1; 1; 1|]
The classifier seems to be working well. We can compute the classification error for a given dataset.
*)
let error = cc.ClassificationError(MNISTtest01);;
(**
val error : float32 = 0.000472813234f
The classification error is 0.047%.
Finally, this is how you would classify single digits.
*)
let cls = cc.Classify(MNISTtest01.X.[*,0]);;
MNISTtest01.X.[*,0] |> DV.visualizeAsDM 28 |> printfn "%s"
(**
[lang=cs]
val cls : int = 1
DM : 28 x 28
♦
●♦
█
■·
▪█-
▴█-
■♦
♦█·
-█▪
█▪
●▪
▪█
▪█-
▪█▴
▪█■
█■
██
▪█
█▴
█●
And this is how you would classify many digits efficiently at the same time, by running them through the model together as the columns of an input matrix.
*)
let clss = cc.Classify(MNISTtest01.X.[*,5..9]);;
MNISTtest01.[5..9].VisualizeXColsAsImageGrid(28) |> printfn "%s"
(**
[lang=cs]
val clss : int [] = [|0; 0; 1; 1; 1|]
Hype.Dataset
X: 784 x 5
Y: 1 x 5
X's columns reshaped to (28 x 28), presented in a (2 x 3) grid:
DM : 56 x 84
██·
●███♦- ·████♦· -█▴
██████■- ♦███████- ▪██·
●███████■ ♦███████████▴ ●██·
▪███● -███- ♦█████████████■ ▴♦█·
♦██▪ -██■ ████♦ ●●████████ ▪█·
▪███ ·██■ ●█████· ··██████ ▪█·
■██▪ ·██■ ▪██████· ██████ ▪█·
·██■ ▪██■ -██████♦ ██████ ▪█·
♦██▪ ■██■ ▪███████ ·█████♦ ●█·
·███- ■██■ ██████♦· ♦██████· ██·
♦██■ ·███■ ██████- ▪███████- ██·
■██- -███- ██████ ●███████▴ ██·
■██· ■██♦ ██████·♦████████■ ██·
■██· ■███● ████████████████ ██
■██● ████ ██████████████- ██
▴███· -■███- ▴████████████▴ ·██
·█████████♦ ■████████■● ██
▴███████♦ ████████ ·█♦·
▪███♦· ●●■●●- ·██▴
█■
·██
███ -██■
●██■ ▪████-
♦██ ♦███■
·███ ·████■
■██♦ ▴███■
▪██■ -████●
███- ▴████▪
▪██■ ·████♦
-███▴ █████·
♦██● ████♦
-███ ▪████·
███▴ ■███■
-███ ▴████·
███● ■███■
████ ■███▪
●███- -███♦
▴███● ●███▪
●██♦ ████▪
●██· ■███▴
■■-
*)
================================================
FILE: docs/input/Training.fsx
================================================
(*** hide ***)
#r "../../src/Hype/bin/Release/netstandard2.0/DiffSharp.dll"
#r "../../src/Hype/bin/Release/netstandard2.0/Hype.dll"
#I "../../packages/R.NET.Community/lib/net40/"
#I "../../packages/R.NET.Community.FSharp/lib/net40/"
#I "../../packages/RProvider"
#load "RProvider.fsx"
//fsi.ShowDeclarationValues <- false
System.Environment.CurrentDirectory <- __SOURCE_DIRECTORY__
(**
Training
========
In [optimization,](optimization.html) we've seen how nested AD and gradient-based optimization work together.
Training a model is the optimization of model parameters to minimize a loss function, or equivalently, to maximize the likelihood of a given set of data under the model parameters. In addition to the _optimization method_, _learning rate_, _momentum_, and _gradient clipping_ parameters we've seen, this introduces parameters for the _loss function_, _regularization_, _training batches_, and _validation and early stopping_.
But let's start with the **Dataset** type, which we will use for keeping the training, validation, and test data for the training procedure.
Dataset
-------
For supervised training, data consists of pairs of input vectors $\mathbf{x}_i \in \mathbb{R}^{d_x}$ and output vectors $\mathbf{y}_i \in \mathbb{R}^{d_y}$. We represent data using the **Dataset** type, which is basically a pair of matrices
$$$
\begin{eqnarray*}
\mathbf{X} &\in& \mathbb{R}^{d_x \times n}\\
\mathbf{Y} &\in& \mathbb{R}^{d_y \times n}\\
\end{eqnarray*}
holding these vectors, where $n$ is the number of input–output pairs, $d_x$ is the number of input features and $d_y$ is the number of output features. In other words, each of the $n$ columns of the matrix $\mathbf{X}$ is an input vector of length $d_x$ and each of the $n$ columns of matrix $\mathbf{Y}$ is the corresponding output vector of length $d_y$.
Keeping data in matrix form is essential for harnessing high-performance linear algebra engines tailored for your CPU or GPU. Hype, by default, uses a high-performance CPU backend using OpenBLAS for BLAS/LAPACK operations, and parallel implementations of non-BLAS operations such as elementwise functions.
*)
open Hype
open DiffSharp.AD.Float32
let x = toDM [[0; 0; 1; 1]
[0; 1; 0; 1]]
let y = toDM [[0; 1; 1; 0]]
let XORdata = Dataset(x, y)
(**
Hype provides several utility functions for loading data into matrices from images, delimited text files (e.g., CSV), or commonly used dataset files such as the MNIST.
*)
let MNIST = Dataset(Util.LoadMNISTPixels("train-images.idx3-ubyte", 60000),
Util.LoadMNISTLabels("train-labels.idx1-ubyte", 60000) |> toDV |> DM.ofDV 1).NormalizeX()
let MNISTtest = Dataset(Util.LoadMNISTPixels("t10k-images.idx3-ubyte", 10000),
Util.LoadMNISTLabels("t10k-labels.idx1-ubyte", 10000) |> toDV |> DM.ofDV 1).NormalizeX()
(**
You can see the [API reference](reference/index.html) and the [source code](https://github.com/hypelib/Hype/blob/master/src/Hype/Hype.fs) for various ways of constructing Datasets.
Training parameters
-------------------
Let's load the housing prices dataset from the [Stanford UFLDL Tutorial](http://ufldl.stanford.edu/tutorial/supervised/LinearRegression/) and divide it into input and output pairs. We will later train a simple linear regression model, to demonstrate the use of training parameters.
*)
let h = Util.LoadDelimited("housing.data") |> DM.Transpose
h.ToString() |> printfn "%s"
(**
The data has 14 rows and 506 columns, where each column represents the 14 features of one house. The values in the last row represent the price of the property, and we will train a model to predict this value, given the remaining 13 features.
We also add a row of ones to the input matrix that will account for the bias (intercept) of our model and simplify the implementation.
*)
let hx = h.[0..12, *]
let hy = h.[13..13, *]
let housing = Dataset(hx, hy).AppendBiasRowX()
(**
Our linear regression model is of the form
$$$
h_{\mathbf{w}} (\mathbf{x}) = \sum_j w_j x_j = \mathbf{w}^{T} \mathbf{x}
which represents a family of linear functions parameterized by the vector $\mathbf{w}$.
*)
let model (w:DV) (x:DV) = w * x
(**
For training the model, we minimize a loss function
$$$
J(\mathbf{w}) = \frac{1}{2} \sum_{i=1}^{n} \left(h_{\mathbf{w}} (\mathbf{x}^{(i)}) - y^{(i)} \right)^2 = \frac{1}{2} \sum_{i=1}^{n} \left( \mathbf{w}^{T} \mathbf{x}^{(i)} - y^{(i)} \right)^2
where $\mathbf{x}^{(i)}$ are vectors holding the 13 input features plus the bias input (the constant 1) and $y^{(i)}$ are the target values (which are here scalar).
*)
let wopt, lopt, whist, lhist = Optimize.Train(model, Rnd.UniformDV(14), housing,
{Params.Default with Epochs = 1000
Loss = Loss.Quadratic})
let trainedmodel = model wopt
(*** hide ***)
open RProvider
open RProvider.graphics
open RProvider.grDevices
let px, py = housing.Y.[0,*] |> DV.toArray |> Array.mapi (fun i v -> float i, (v |> float32 |> float)) |> Array.unzip
let ppx, ppy = housing.X |> DM.mapCols (fun v -> toDV [trainedmodel v]) |> DM.toDV |> DV.toArray |> Array.mapi (fun i v -> float i, (v |> float32 |> float)) |> Array.unzip
let ll = lhist |> Array.map (float32>>float)
namedParams[
"x", box px
"y", box py
"pch", box 19
"col", box "darkblue"
"type", box "p"
"xlab", box "House number"
"ylab", box "Price"
"width", box 700
"height", box 500
]
|> R.plot|> ignore
namedParams[
"x", box ppx
"y", box ppy
"pch", box 19
"col", box "red"
"type", box "p"
"width", box 700
"height", box 500
]
|> R.points|> ignore
(**
[12/11/2015 14:41:04] --- Training started
[12/11/2015 14:41:04] Parameters : 14
[12/11/2015 14:41:04] Iterations : 1000
[12/11/2015 14:41:04] Epochs : 1000
[12/11/2015 14:41:04] Batches : Full (1 per epoch)
[12/11/2015 14:41:04] Training data : 506
[12/11/2015 14:41:04] Validation data: None
[12/11/2015 14:41:04] Valid. interval: 10
[12/11/2015 14:41:04] Method : Gradient descent
[12/11/2015 14:41:04] Learning rate : RMSProp a0 = D 0.00100000005f, k = D 0.899999976f
[12/11/2015 14:41:04] Momentum : None
[12/11/2015 14:41:04] Loss : L2 norm
[12/11/2015 14:41:04] Regularizer : L2 lambda = D 9.99999975e-05f
[12/11/2015 14:41:04] Gradient clip. : None
[12/11/2015 14:41:04] Early stopping : None
[12/11/2015 14:41:04] Improv. thresh.: D 0.995000005f
[12/11/2015 14:41:04] Return best : true
[12/11/2015 14:41:04] 1/1000 | Batch 1/1 | D 5.281104e+002 [- ]
[12/11/2015 14:41:04] 2/1000 | Batch 1/1 | D 5.252324e+002 [↓▼]
[12/11/2015 14:41:04] 3/1000 | Batch 1/1 | D 5.231447e+002 [↓ ]
[12/11/2015 14:41:04] 4/1000 | Batch 1/1 | D 5.213967e+002 [↓▼]
[12/11/2015 14:41:04] 5/1000 | Batch 1/1 | D 5.198447e+002 [↓ ]
[12/11/2015 14:41:04] 6/1000 | Batch 1/1 | D 5.184225e+002 [↓▼]
[12/11/2015 14:41:04] 7/1000 | Batch 1/1 | D 5.170928e+002 [↓ ]
...
[12/11/2015 14:41:27] 994/1000 | Batch 1/1 | D 6.404338e+000 [↓ ]
[12/11/2015 14:41:27] 995/1000 | Batch 1/1 | D 6.392090e+000 [↓ ]
[12/11/2015 14:41:27] 996/1000 | Batch 1/1 | D 6.377205e+000 [↓▼]
[12/11/2015 14:41:28] 997/1000 | Batch 1/1 | D 6.363370e+000 [↓ ]
[12/11/2015 14:41:28] 998/1000 | Batch 1/1 | D 6.351198e+000 [↓ ]
[12/11/2015 14:41:28] 999/1000 | Batch 1/1 | D 6.344284e+000 [↓▼]
[12/11/2015 14:41:28] 1000/1000 | Batch 1/1 | D 6.334455e+000 [↓ ]
[12/11/2015 14:41:28] Duration : 00:00:23.3076639
[12/11/2015 14:41:28] Loss initial : D 5.281104e+002
[12/11/2015 14:41:28] Loss final : D 6.344284e+000 (Best)
[12/11/2015 14:41:28] Loss change : D -5.217661e+002 (-98.80 %)
[12/11/2015 14:41:28] Loss chg. / s : D -2.238603e+001
[12/11/2015 14:41:28] Epochs / s : 42.90434272
[12/11/2015 14:41:28] Epochs / min : 2574.260563
[12/11/2015 14:41:28] --- Training finished
val trainedmodel : (DV -> D)
The following is a plot of the prices in the dataset where the blue points represent the real price, and the red points are the values predicted by the trained linear model.
### Loss function
*)
type Loss =
| L1Loss // L1 norm, least absolute deviations
| L2Loss // L2 norm
| Quadratic // L2 norm squared, least squares
| CrossEntropyOnLinear // Cross entropy after linear layer
| CrossEntropyOnSoftmax // Cross entropy after softmax layer
(**
### Regularization
*)
type Regularization =
| L1Reg of D // L1 regularization
| L2Reg of D // L2 regularization
| NoReg
static member DefaultL1Reg = L1Reg (D 0.0001f)
static member DefaultL2Reg = L2Reg (D 0.0001f)
(**
### Batch
*)
type Batch =
| Full
| Minibatch of int // Minibatch of given size
| Stochastic // Minibatch with size 1, SGD
(**
### Validation and early stopping
*)
type EarlyStopping =
| Early of int * int // Stagnation patience, overfitting patience
| NoEarly
static member DefaultEarly = Early (750, 10)
(**
Training proceeds by minimizing the loss function by adjusting model parameters. Continuing this optimization for longer than necessary causes overfitting, where the model strives to precisely approximate the training data. Overfitting reduces the model's generalization ability and it's performance with new data in the field.
To prevent overfitting, data is divided into training and validation sets, and while the model is being optimized by computing the loss function using the training data, the model's performance with the validation data is also monitored. Generally, at the initial stages of training the loss for both the training and validation data will decrease. Eventually, the validation loss will asymptotically approach a minimum, and beyond a certain stage, it will start to increase even when the training loss keeps decreasing. This signifies a good time to stop the training, for preventing overfitting the model to the training data.
Hype does this via the **EarlyStopping** parameter, where you can specify a stagnation "patience" for the number of acceptable iterations for non-decreasing training loss and an overfitting patience for the number of acceptable iterations where the training loss decreases without an accompanying decrease in the validation loss.
Let's divide the housing dataset into training and validation sets and train the model using early stopping.
*)
let housingtrain = housing.[..399] // The first 400 data points
let housingvalid = housing.[400..] // The remaining 106 data points
(**
val housingtrain : Dataset = Hype.Dataset
X: 14 x 400
Y: 1 x 400
val housingvalid : Dataset = Hype.Dataset
X: 14 x 106
Y: 1 x 106
[12/11/2015 15:09:15] --- Training started
[12/11/2015 15:09:15] Parameters : 14
[12/11/2015 15:09:15] Iterations : 1000
[12/11/2015 15:09:15] Epochs : 1000
[12/11/2015 15:09:15] Batches : Full (1 per epoch)
[12/11/2015 15:09:15] Training data : 400
[12/11/2015 15:09:15] Validation data: 106
[12/11/2015 15:09:15] Valid. interval: 10
[12/11/2015 15:09:15] Method : Gradient descent
[12/11/2015 15:09:15] Learning rate : RMSProp a0 = D 0.00100000005f, k = D 0.899999976f
[12/11/2015 15:09:15] Momentum : None
[12/11/2015 15:09:15] Loss : L2 norm
[12/11/2015 15:09:15] Regularizer : L2 lambda = D 9.99999975e-05f
[12/11/2015 15:09:15] Gradient clip. : None
[12/11/2015 15:09:15] Early stopping : Stagnation thresh. = 750, overfit. thresh. = 10
[12/11/2015 15:09:15] Improv. thresh.: D 0.995000005f
[12/11/2015 15:09:15] Return best : true
[12/11/2015 15:09:15] 1/1000 | Batch 1/1 | D 3.221269e+002 [- ] | Valid D 3.322605e+002 [- ] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 2/1000 | Batch 1/1 | D 3.193867e+002 [↓▼] | Valid D 3.288632e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 3/1000 | Batch 1/1 | D 3.173987e+002 [↓▼] | Valid D 3.263986e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 4/1000 | Batch 1/1 | D 3.157341e+002 [↓▼] | Valid D 3.243348e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 5/1000 | Batch 1/1 | D 3.142565e+002 [↓ ] | Valid D 3.225029e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 6/1000 | Batch 1/1 | D 3.129025e+002 [↓▼] | Valid D 3.208241e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 7/1000 | Batch 1/1 | D 3.116365e+002 [↓ ] | Valid D 3.192545e+002 [↓ ] | Stag: 10 Ovfit: 0
[12/11/2015 15:09:15] 8/1000 | Batch 1/1 | D 3.104370e+002 [↓▼] | Valid D 3.177671e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 9/1000 | Batch 1/1 | D 3.092885e+002 [↓ ] | Valid D 3.163436e+002 [↓ ] | Stag: 10 Ovfit: 0
[12/11/2015 15:09:15] 10/1000 | Batch 1/1 | D 3.081814e+002 [↓▼] | Valid D 3.149709e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 11/1000 | Batch 1/1 | D 3.071076e+002 [↓ ] | Valid D 3.136398e+002 [↓ ] | Stag: 10 Ovfit: 0
[12/11/2015 15:09:15] 12/1000 | Batch 1/1 | D 3.060618e+002 [↓▼] | Valid D 3.123428e+002 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:15] 13/1000 | Batch 1/1 | D 3.050388e+002 [↓ ] | Valid D 3.110746e+002 [↓ ] | Stag: 10 Ovfit: 0
...
[12/11/2015 15:09:21] 318/1000 | Batch 1/1 | D 4.250416e+001 [↓▼] | Valid D 3.382476e+001 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:21] 319/1000 | Batch 1/1 | D 4.178834e+001 [↓▼] | Valid D 3.371201e+001 [↓ ] | Stag: 10 Ovfit: 0
[12/11/2015 15:09:21] 320/1000 | Batch 1/1 | D 4.109373e+001 [↓▼] | Valid D 3.361367e+001 [↓▼] | Stag: 0 Ovfit: 0
[12/11/2015 15:09:21] 321/1000 | Batch 1/1 | D 4.040976e+001 [↓▼] | Valid D 3.362166e+001 [↑ ] | Stag: 10 Ovfit: 0
[12/11/2015 15:09:21] 322/1000 | Batch 1/1 | D 3.973472e+001 [↓▼] | Valid D 3.368684e+001 [↑ ] | Stag: 20 Ovfit: 1
[12/11/2015 15:09:21] 323/1000 | Batch 1/1 | D 3.907929e+001 [↓▼] | Valid D 3.382304e+001 [↑ ] | Stag: 30 Ovfit: 2
[12/11/2015 15:09:21] 324/1000 | Batch 1/1 | D 3.845267e+001 [↓▼] | Valid D 3.398524e+001 [↑ ] | Stag: 40 Ovfit: 3
[12/11/2015 15:09:21] 325/1000 | Batch 1/1 | D 3.783842e+001 [↓▼] | Valid D 3.418199e+001 [↑ ] | Stag: 50 Ovfit: 4
[12/11/2015 15:09:21] 326/1000 | Batch 1/1 | D 3.721857e+001 [↓▼] | Valid D 3.450164e+001 [↑ ] | Stag: 60 Ovfit: 5
[12/11/2015 15:09:21] 327/1000 | Batch 1/1 | D 3.659464e+001 [↓▼] | Valid D 3.499456e+001 [↑ ] | Stag: 70 Ovfit: 6
[12/11/2015 15:09:21] 328/1000 | Batch 1/1 | D 3.598552e+001 [↓▼] | Valid D 3.556280e+001 [↑ ] | Stag: 80 Ovfit: 7
[12/11/2015 15:09:21] 329/1000 | Batch 1/1 | D 3.538885e+001 [↓▼] | Valid D 3.616002e+001 [↑ ] | Stag: 90 Ovfit: 8
[12/11/2015 15:09:21] 330/1000 | Batch 1/1 | D 3.481464e+001 [↓▼] | Valid D 3.678414e+001 [↑ ] | Stag:100 Ovfit: 9
[12/11/2015 15:09:21] *** EARLY STOPPING TRIGGERED: Overfitting ***
[12/11/2015 15:09:21] 331/1000 | Batch 1/1 | D 3.426452e+001 [↓▼] | Valid D 3.741238e+001 [↑ ] | Stag:110 Ovfit:10
[12/11/2015 15:09:21] Duration : 00:00:05.9617220
[12/11/2015 15:09:21] Loss initial : D 3.221269e+002
[12/11/2015 15:09:21] Loss final : D 3.373809e+001 (Best)
[12/11/2015 15:09:21] Loss change : D -2.883888e+002 (-89.53 %)
[12/11/2015 15:09:21] Loss chg. / s : D -4.837340e+001
[12/11/2015 15:09:21] Epochs / s : 55.52087132
[12/11/2015 15:09:21] Epochs / min : 3331.252279
[12/11/2015 15:09:21] --- Training finished
*)
================================================
FILE: docs/input/download.fsx
================================================
(**
Download
========
Hype is tested on Linux and Windows.
You can download the source code or the binaries of the [latest release on GitHub](https://github.com/hypelib/Hype/releases).
You can also install the library as a package through [NuGet](https://www.nuget.org/packages/Hype), by running
In the current release
* OpenBLAS backend by default
* Regression, feedforward neural networks
* Recurrent neural networks, LSTMs, GRUs
* Hamiltonian Monte Carlo
About
-----
Hype is developed by [Atılım Güneş Baydin](http://www.cs.nuim.ie/~gunes/) and [Barak A. Pearlmutter](http://bcl.hamilton.ie/~barak/) at the [Brain and Computation Lab](http://www.bcl.hamilton.ie/), Hamilton Institute, National University of Ireland Maynooth.
License
-------
Hype is released under the MIT license.
*)
================================================
FILE: docs/input/templates/docpage.cshtml
================================================
@{
Layout = "template";
Title = Properties["page-title"];
Description = Properties["project-summary"];
}
@Properties["document"]
@Properties["tooltips"]
================================================
FILE: docs/input/templates/reference/module.cshtml
================================================
@using FSharp.MetadataFormat
@{
Layout = "template";
Title = Model.Module.Name + " - " + Properties["project-name"];
}
@{
// Get all the members & comment for the type
var members = (IEnumerable)Model.Module.AllMembers;
var comment = (Comment)Model.Module.Comment;
// Group all members by their category which is an inline annotation
// that can be added to members using special XML comment:
//
// /// [category:Something]
//
// ...and can be used to categorize members in large modules or types
// (but if this is not used, then all members end up in just one category)
var byCategory = members
.GroupBy(m => m.Category)
.OrderBy(g => String.IsNullOrEmpty(g.Key) ? "ZZZ" : g.Key)
.Select((g, n) => new {
Index = n,
GroupKey = g.Key,
Members = g.OrderBy(m => m.Name),
Name = String.IsNullOrEmpty(g.Key) ? "Other module members" : g.Key
});
// Get nested modules and nested types as statically typed collections
var nestModules = (IEnumerable)Model.Module.NestedModules;
var nestTypes = (IEnumerable)Model.Module.NestedTypes;
}
@Model.Module.Name
@foreach (var sec in comment.Sections) {
// XML comment for the type has multiple sections that can be labelled
// with categories (to give comment for an individual category). Here,
// we print only those that belong to the section.
if (!byCategory.Any(g => g.GroupKey == sec.Key))
{
if (sec.Key != "") {
@RenderPart("part-nested", new {
Types = nestTypes,
Modules = nestModules
})
}
@foreach (var g in byCategory)
{
// Iterate over all the categories and print members. If there are more than one
// categories, print the category heading (as
) and add XML comment from the type
// that is related to this specific category.
if (byCategory.Count() > 1)
{
@g.Name
var info = comment.Sections.FirstOrDefault(kvp => kvp.Key == g.GroupKey);
if (info.Key != null)
{
@info.Value
}
}
@RenderPart("part-members", new {
Header = "Functions and values",
TableHeader = "Function or value",
Members = g.Members.Where(m => m.Kind == MemberKind.ValueOrFunction)
})
@RenderPart("part-members", new {
Header = "Type extensions",
TableHeader = "Type extension",
Members = g.Members.Where(m => m.Kind == MemberKind.TypeExtension)
})
@RenderPart("part-members", new {
Header = "Active patterns",
TableHeader = "Active pattern",
Members = g.Members.Where(m => m.Kind == MemberKind.ActivePattern)
})
}
================================================
FILE: docs/input/templates/reference/namespaces.cshtml
================================================
@using FSharp.MetadataFormat
@{
Layout = "template";
Title = "Namespaces - " + Properties["project-name"];
}
@Model.Name
@{ var nsIndex = 0; }
@foreach (var ns in Model.Namespaces)
{
nsIndex++;
var typedNs = (Namespace)ns;
var allCategories =
typedNs.Types.Select(t => t.Category)
.Concat(typedNs.Modules.Select(m => m.Category))
.Distinct()
.OrderBy(s => String.IsNullOrEmpty(s) ? "ZZZ" : s);
var allByCategory =
allCategories
.Select((c, i) => new {
Name = String.IsNullOrEmpty(c) ? "Other namespace members" : c,
Index = String.Format("{0}_{1}", nsIndex, i),
Types = typedNs.Types.Where(t => t.Category == c).ToArray(),
Modules = typedNs.Modules.Where(m => m.Category == c).ToArray() })
.Where(c => c.Types.Length + c.Modules.Length > 0).ToArray();
}
================================================
FILE: docs/input/templates/reference/type.cshtml
================================================
@using FSharp.MetadataFormat
@{
Layout = "template";
Title = Model.Type.Name + " - " + Properties["project-name"];
}
@{
// Get all the members & comment for the type
var members = (IEnumerable)Model.Type.AllMembers;
var comment = (Comment)Model.Type.Comment;
// Group all members by their category which is an inline annotation
// that can be added to members using special XML comment:
//
// /// [category:Something]
//
// ...and can be used to categorize members in large modules or types
// (but if this is not used, then all members end up in just one category)
var byCategory = members
.GroupBy(m => m.Category)
.OrderBy(g => String.IsNullOrEmpty(g.Key) ? "ZZZ" : g.Key)
.Select((g, n) => new {
Index = n,
GroupKey = g.Key,
Members = g.OrderBy(m => m.Kind == MemberKind.StaticParameter ? "" : m.Name),
Name = String.IsNullOrEmpty(g.Key) ? "Other type members" : g.Key
});
}
@Model.Type.Name
@foreach (var sec in comment.Sections) {
// XML comment for the type has multiple sections that can be labelled
// with categories (to give comment for an individual category). Here,
// we print only those that belong to the section.
if (!byCategory.Any(g => g.GroupKey == sec.Key)) {
if (sec.Key != "") {
}
@foreach (var g in byCategory) {
// Iterate over all the categories and print members. If there are more than one
// categories, print the category heading (as
) and add XML comment from the type
// that is related to this specific category.
if (byCategory.Count() > 1) {
@g.Name
var info = comment.Sections.FirstOrDefault(kvp => kvp.Key == g.GroupKey);
if (info.Key != null) {
================================================
FILE: paket.dependencies
================================================
source https://api.nuget.org/v3/index.json
framework: netstandard2.0
redirects: on
storage: none
nuget System.Drawing.Common >= 4.5.1
nuget DiffSharp >= 0.8.4-beta
nuget FSharp.Core
#These packages are used in .fsx examples which are currently difficult to make cross-platform unless they're local
nuget FSharp.Formatting storage: packages
//nuget R.NET storage: packages
nuget RProvider storage: packages
nuget XPlot.GoogleCharts.WPF storage: packages
================================================
FILE: src/Hype/AssemblyInfo.fs
================================================
namespace Hype.AssemblyInfo
open System.Reflection
open System.Runtime.CompilerServices
open System.Runtime.InteropServices
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[]
[]
[]
[]
[]
[]
[]
[]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// []
[]
[]
[]
do
()
================================================
FILE: src/Hype/Classifier.fs
================================================
//
// This file is part of
// Hype: Compositional Machine Learning and Hyperparameter Optimization
//
// Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
//
// Hype is released under the MIT license.
// (See accompanying LICENSE file.)
//
// Written by:
//
// Atilim Gunes Baydin
// atilimgunes.baydin@nuim.ie
//
// Barak A. Pearlmutter
// barak@cs.nuim.ie
//
// Brain and Computation Lab
// Hamilton Institute & Department of Computer Science
// National University of Ireland Maynooth
// Maynooth, Co. Kildare
// Ireland
//
// www.bcl.hamilton.ie
//
namespace Hype
open Hype
open Hype.Neural
open DiffSharp.AD.Float32
open DiffSharp.Util
/// Base type for classifiers
[]
type Classifier(f:DM->DM) =
let f = f
member c.Run(x:DM) = f x
member c.Run(x:DV) = x |> DM.ofDV x.Length |> f |> DM.toDV
abstract member Classify : DM -> int[]
abstract member Classify : DV -> int
member c.ClassificationError(x:DM, y:int[]) =
let cc = c.Classify(x)
let incorrect = Array.map2 (fun c y -> if c = y then 0 else 1) cc y
(float32 (incorrect |> Array.sum)) / (float32 incorrect.Length)
member c.ClassificationError(d:Dataset) =
c.ClassificationError(d.X, d.Yi)
/// Classifier for binary classification
type LogisticClassifier(f) =
inherit Classifier(f)
new(l:Layer) = LogisticClassifier(l.Run)
override c.Classify(x:DM) =
let cc = Array.zeroCreate x.Cols
x |> f |> DM.iteriCols (fun i v -> if v.[0] > D 0.5f then cc.[i] <- 1)
cc
override c.Classify(x:DV) =
if c.Run(x).[0] > D 0.5f then 1 else 0
member c.ClassificationError(d:Dataset) =
let yi = d.Y |> DM.toDV |> DV.toArray |> Array.map (float32>>int)
c.ClassificationError(d.X, yi)
/// Classifier for softmax classification
type SoftmaxClassifier(f) =
inherit Classifier(f)
new(l:Layer) = SoftmaxClassifier(l.Run)
override c.Classify(x:DM) =
let cc = Array.zeroCreate x.Cols
x |> f |> DM.iteriCols (fun i v -> cc.[i] <- DV.MaxIndex(v))
cc
override c.Classify(x:DV) =
DV.MaxIndex(c.Run(x))
================================================
FILE: src/Hype/Hype.fs
================================================
//
// This file is part of
// Hype: Compositional Machine Learning and Hyperparameter Optimization
//
// Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
//
// Hype is released under the MIT license.
// (See accompanying LICENSE file.)
//
// Written by:
//
// Atilim Gunes Baydin
// atilimgunes.baydin@nuim.ie
//
// Barak A. Pearlmutter
// barak@cs.nuim.ie
//
// Brain and Computation Lab
// Hamilton Institute & Department of Computer Science
// National University of Ireland Maynooth
// Maynooth, Co. Kildare
// Ireland
//
// www.bcl.hamilton.ie
//
/// Main namespace
namespace Hype
open System.IO
open DiffSharp.AD.Float32
open DiffSharp.Util
/// Random number generator
type Rnd() =
static let mutable R = new System.Random()
/// Seed the random number generator with integer `seed`
static member Seed(seed) = R <- new System.Random(seed)
/// Generate a random permutation of a set of length `n`
static member Permutation(n:int) =
let swap i j (a:_[]) =
let tmp = a.[i]
a.[i] <- a.[j]
a.[j] <- tmp
let a = Array.init n (fun i -> i)
a |> Array.iteri (fun i _ -> swap i (R.Next(i, n)) a)
a
/// Sample a non-negative random integer
static member UniformInt() = R.Next()
/// Sample a non-negative random integer less than `max`
static member UniformInt(max) = R.Next(max)
/// Sample a random integer between `min` and `max`
static member UniformInt(min, max) = R.Next(min, max)
/// Sample a `float32` from the standard uniform distribution. X ~ U(0,1)
static member Uniform() = float32 (R.NextDouble())
/// Sample a `D` from the standard uniform distribution. X ~ U(0,1)
static member UniformD() = D (float32 (R.NextDouble()))
/// Sample a `float32` from the uniform distribution between zero and `max`. X ~ U(0,max)
static member Uniform(max) = max * (float32 (R.NextDouble()))
/// Sample a `D` from the unifrom distribution between zero and `max`. X ~ U(0,max)
static member UniformD(max) = max * D (float32 (R.NextDouble()))
/// Sample a `float32` from the uniform distribution between `min` and `max`. X ~ U(min,max)
static member Uniform(min, max) = min + (float32 (R.NextDouble())) * (max - min)
/// Sample a `D` from the uniform distribution between `min` and `max`. X ~ U(min,max)
static member UniformD(min, max) = min + D (float32 (R.NextDouble())) * (max - min)
/// Sample a `float32` from the standard normal distribution. X ~ N(0,1)
static member Normal() =
let rec n() =
let x, y = (float32 (R.NextDouble())) * 2.0f - 1.0f, (float32 (R.NextDouble())) * 2.0f - 1.0f
let s = x * x + y * y
if s > 1.0f then n() else x * sqrt (-2.0f * (log s) / s)
n()
/// Sample a `D` from the standard normal distribution. X ~ N(0,1)
static member NormalD() = D (Rnd.Normal())
/// Sample a `float32` from the normal distribution with given mean `mu` and standard deviation `sigma`. X ~ N(mu,sigma)
static member Normal(mu, sigma) = Rnd.Normal() * sigma + mu
/// Sample a `D` from the normal distribution with given mean `mu` and standard deviation `sigma`. X ~ N(mu,sigma)
static member NormalD(mu, sigma) = Rnd.NormalD() * sigma + mu
/// Sample a `DV` of length `n` from the standard uniform distribution. Elements of vector X ~ U(0,1)
static member UniformDV(n) = DV (Array.Parallel.init n (fun _ -> Rnd.Uniform()))
/// Sample a `DV` of length `n` from the uniform distribution between zero and `max`. Elements of vector X ~ U(0,max)
static member UniformDV(n, max) = DV.init n (fun _ -> Rnd.UniformD(max))
/// Sample a `DV` of length `n` from the uniform distribution between `min` and `max`. Elements of vector X ~ U(min,max)
static member UniformDV(n, min, max) = DV.init n (fun _ -> Rnd.UniformD(min, max))
/// Sample a `DV` of length `n` from the standard normal distribution. Elements of vector X ~ N(0,1)
static member NormalDV(n) = DV (Array.Parallel.init n (fun _ -> Rnd.Normal()))
/// Sample a `DV` of length `n` from the normal distribution with given mean `mu` and standard deviation `sigma`. Elements of vector X ~ N(mu,sigma)
static member NormalDV(n, mu, sigma) = DV.init n (fun _ -> Rnd.NormalD(mu, sigma))
/// Sample a `DM` of `m` rows and `n` columns from the standard uniform distribution. Elements of matrix X ~ U(0,1)
static member UniformDM(m, n) = DM (Array2D.Parallel.init m n (fun _ _ -> Rnd.Uniform()))
/// Sample a `DM` of `m` rows and `n` columns from the uniform distribution between zero and `max`. Elements of matrix X ~ U(0,max)
static member UniformDM(m, n, max) = DM.init m n (fun _ _ -> Rnd.UniformD(max))
/// Sample a `DM` of `m` rows and `n` columns from the uniform distribution between `min` and `max`. Elements of matrix X ~ U(min,max)
static member UniformDM(m, n, min, max) = DM.init m n (fun _ _ -> Rnd.UniformD(min, max))
/// Sample a `DM` of `m` rows and `n` columns from the standard normal distribution. Elements of matrix X ~ N(0,1)
static member NormalDM(m, n) = DM (Array2D.Parallel.init m n (fun _ _ -> Rnd.Normal()))
/// Sample a `DM` of `m` rows and `n` columns from the normal distribution with given mean `mu` and standard deviation `sigma`. Elements of matrix X ~ N(mu,sigma)
static member NormalDM(m, n, mu, sigma) = DM.init m n (fun _ _ -> Rnd.NormalD(mu, sigma))
/// Select a random element of array `a`
static member Choice(a:_[]) = a.[R.Next(a.Length)]
/// Select a random element of array `a`, given selection probabilities in array `probs`
static member Choice(a:_[], probs:float32[]) = Rnd.Choice(a, toDV probs)
/// Select a random element of array `a`, given selection probabilities in vector `probs`
static member Choice(a:_[], probs:DV) =
let probs' = probs / (DV.sum(probs))
let p = float32 (R.NextDouble())
let mutable r = 0.f
let mutable i = 0
let mutable hit = false
while not hit do
r <- r + (float32 probs'.[i])
if r >= p then
hit <- true
else
i <- i + 1
a.[i]
/// Dataset for holding training data
type Dataset private (x:DM, y:DM, xi:seq, yi:seq) =
/// The matrix X of input values, where columns are the individual inputs Xi
member val X = x with get
/// The matrix Y of output values, where columns are the individual outputs Yi
member val Y = y with get
/// The index of the maximum elements of individual inputs Xi, used for one-hot representations
member val Xi = xi |> Array.ofSeq with get
/// The index of the maximum elements of individual outputs Yi, used for one-hot reprsentations
member val Yi = yi |> Array.ofSeq with get
/// Construct a dataset with given input matrix `x` and output matrix `y`. Columns of `x` and `y` are the individual inputs and corresponding outputs.
new(x:DM, y:DM) =
let xi = x |> DM.toCols |> Seq.toArray |> Array.map DV.maxIndex
let yi = y |> DM.toCols |> Seq.toArray |> Array.map DV.maxIndex
Dataset(x, y, xi, yi)
/// Construct a dataset of one-hot input and output elements. `xi` are the input indices, `onehotdimsx` is the input dimensions, `yi` are the output indices, `onehotdimsy` is the output dimensions.
new(xi:seq, onehotdimsx:int, yi:seq, onehotdimsy:int) =
let x = xi |> Seq.map (fun i -> DV.standardBasis onehotdimsx i) |> DM.ofCols
let y = yi |> Seq.map (fun i -> DV.standardBasis onehotdimsy i) |> DM.ofCols
Dataset(x, y, xi, yi)
/// Construct a dataset of one-hot input and output elements. `xi` are the input indices, input dimensions is max(xi) + 1, `yi` are the output indices, output dimensions is max(yi) + 1.
new(xi:seq, yi:seq) =
let onehotdimsx = 1 + Seq.max xi
let onehotdimsy = 1 + Seq.max yi
Dataset(xi, onehotdimsx, yi, onehotdimsy)
/// Construct a dataset with given input matrix `x` and one-hot output elements. `yi` are the output indices, `onehotdimsy` is the output dimensions.
new(x:DM, yi:seq, onehotdimsy:int) =
let xi = x |> DM.toCols |> Seq.toArray |> Array.map DV.maxIndex
let y = yi |> Seq.map (fun i -> DV.standardBasis onehotdimsy i) |> DM.ofCols
Dataset(x, y, xi, yi)
/// Construct a dataset with one-hot input elements and given output matrix `y`. `xi` are the input indices, `onehotdimsx` is the input dimensions.
new(xi:seq, onehotdimsx:int, y:DM) =
let x = xi |> Seq.map (fun i -> DV.standardBasis onehotdimsx i) |> DM.ofCols
let yi = y |> DM.toCols |> Seq.toArray |> Array.map DV.maxIndex
Dataset(x, y, xi, yi)
/// Construct a dataset with given input matrix `x` and one-hot output elements. `yi` are the output indices, output dimensions is max(yi) + 1.
new(x:DM, yi:seq) =
let onehotdimsy = 1 + Seq.max yi
Dataset(x, yi, onehotdimsy)
/// Construct a dataset with one-hot input elements and given output matrix `y`. `xi` are the input indices, input dimensions is max(xi) + 1.
new(xi:seq, y:DM) =
let onehotdimsx = 1 + Seq.max xi
Dataset(xi, onehotdimsx, y)
/// Construct a dataset from the given sequence of input-output vector pairs
new(s:seq) =
let x, y = s |> Seq.toArray |> Array.unzip
Dataset(x |> DM.ofCols, y |> DM.ofCols)
/// The empty dataset
static member empty = Dataset(DM.empty, DM.empty)
/// Check whether dataset `d` is empty
static member isEmpty (d:Dataset) = DM.isEmpty d.X && DM.isEmpty d.Y
/// Normalize the values in the input matrix X and output matrix Y of dataset `d` to be in the range [0,1]
static member normalize (d:Dataset) = d.Normalize()
/// Normalize the values in the input matrix X of dataset `d` to be in the range [0,1]
static member normalizeX (d:Dataset) = d.NormalizeX()
/// Normalize the values in the output matrix Y of dataset `d` to be in the range [0,1]
static member normalizeY (d:Dataset) = d.NormalizeY()
/// Standardize the values in the input matrix X and output matrix Y of dataset `d` to have zero mean and unit variance
static member standardize (d:Dataset) = d.Standardize()
/// Standardize the values in the input matrix X of dataset `d` to have zero mean and unit variance
static member standardizeX (d:Dataset) = d.StandardizeX()
/// Standardize the values in the output matrix Y of dataset `d` to have zero mean and unit variance
static member standardizeY (d:Dataset) = d.StandardizeY()
/// Append a new row `v` to the input matrix X of dataset `d`
static member appendRowX (v:DV) (d:Dataset) = d.AppendRowX(v)
/// Append a new tow `v` to the output matrix Y of dataset `d`
static member appendRowY (v:DV) (d:Dataset) = d.AppendRowY(v)
/// Append a row of ones to the input matrix X of dataset `d`
static member appendBiasRowX (d:Dataset) = d.AppendBiasRowX()
/// Get a summary string of dataset `d`
static member toString (d:Dataset) = d.ToString()
/// Get a string representation of dataset `d` showing all values
static member toStringFull (d:Dataset) = d.ToStringFull()
/// Get the input-output pairs of dataset `d` as a sequence
static member toSeq (d:Dataset) = d.ToSeq()
/// The length of dataset `d`, i.e., the number of columns in input matrix X and output matrix Y
static member length (d:Dataset) = d.Length
/// Sample a random subset of length `n` from dataset `d`
static member randomSubset (n:int) (d:Dataset) = d.RandomSubset(n)
/// Shuffle the order of elements in dataset `d`
static member shuffle (d:Dataset) = d.Shuffle()
/// Get the input-output pair with index `i` from dataset `d`
static member item (i:int) (d:Dataset) = d.[i]
/// Get element `i`
member d.Item
with get i = d.X.[*,i], d.Y.[*,i]
/// The length of the dataset, i.e., the number of columns in input matrix X and output matrix Y
member d.Length = d.X.Cols
/// Get the input-output pairs as a sequence
member d.ToSeq() =
Seq.init d.Length (fun i -> d.[i])
/// Sample a random subset of length `n` from this dataset
member d.RandomSubset(n) =
let bi = Rnd.Permutation(d.Length)
let x = Seq.init n (fun i -> d.X.[*, bi.[i]])
let y = Seq.init n (fun i -> d.Y.[*, bi.[i]])
Dataset(DM.ofCols x, DM.ofCols y)
/// Normalize the values in the input matrix X and output matrix Y to be in the range [0,1]
member d.Normalize() = Dataset(DM.normalize d.X, DM.normalize d.Y)
/// Normalize the values in the input matrix X to be in the range [0,1]
member d.NormalizeX() = Dataset(DM.normalize d.X, d.Y)
/// Normalize the values in the output matrix Y to be in the range [0,1]
member d.NormalizeY() = Dataset(d.X, DM.normalize d.Y)
/// Standardize the values in the input matrix X and output matrix Y to have zero mean and unit variance
member d.Standardize() = Dataset(DM.standardize d.X, DM.standardize d.Y)
/// Standardize the values in the input matrix X to have zero mean and unit variance
member d.StandardizeX() = Dataset(DM.standardize d.X, d.Y)
/// Standardize the values in the output matrix Y to have zero mean and unit variance
member d.StandardizeY() = Dataset(d.X, DM.standardize d.Y)
/// Shuffle the order of elements in the dataset
member d.Shuffle() = d.RandomSubset d.Length
/// Get a slice of the dataset between `lower` and `upper` indices
member d.GetSlice(lower, upper) =
let l = max 0 (defaultArg lower 0)
let u = min (d.X.Cols - 1) (defaultArg upper (d.Length - 1))
Dataset(d.X.[*,l..u], d.Y.[*,l..u])
/// Get a new dataset of the entries for which the `predicate` is true
member d.Filter (predicate:(DV*DV)->bool) =
d.ToSeq() |> Seq.filter predicate |> Dataset
/// Append a new row `v` to the input matrix X
member d.AppendRowX(v:DV) = Dataset(d.X |> DM.appendRow v, d.Y)
/// Append a new row `v` to the output matrix Y
member d.AppendRowY(v:DV) = Dataset(d.X, d.Y |> DM.appendRow v)
/// Append a row of all ones to the input matrix X
member d.AppendBiasRowX() = d.AppendRowX(DV.create d.Length 1.f)
/// Get a summary string of this dataset
override d.ToString() =
"Hype.Dataset\n"
+ sprintf " X: %i x %i\n" d.X.Rows d.X.Cols
+ sprintf " Y: %i x %i" d.Y.Rows d.Y.Cols
/// Get a string representation of this dataset showing all values
member d.ToStringFull() =
"Hype.Dataset\n"
+ sprintf " X:\n%O\n\n" d.X
+ sprintf " Y:\n%O" d.Y
/// Get a string visualization of this dataset
member d.Visualize() =
"Hype.Dataset\n"
+ sprintf " X:\n%s\n\n" (d.X.Visualize())
+ sprintf " Y:\n%s" (d.Y.Visualize())
/// Visualize the values of the input matrix X where each column will be reshaped to an image with `imagerows` rows
member d.VisualizeXColsAsImageGrid(imagerows:int) =
d.ToString() + "\n"
+ "X's columns " + Util.VisualizeDMRowsAsImageGrid(d.X |> DM.transpose, imagerows)
/// Visualize the values of the output matrix Y where each column will be reshaped to an image with `imagerows` rows
member d.VisualizeYColsAsImageGrid(imagerows:int) =
d.ToString() + "\n"
+ "Y's columns " + Util.VisualizeDMRowsAsImageGrid(d.Y |> DM.transpose, imagerows)
/// Various utility functions
and Util =
static member printLog (s:string) = printfn "[%A] %s" System.DateTime.Now s
static member printModel (f:DV->DV) (d:Dataset) =
d.ToSeq()
|> Seq.map (fun (x, y) -> f x, y)
|> Seq.iter (fun (x, y) -> printfn "f x: %A, y: %A" x y)
/// Load bitmap image with given `filename` to `DM`
static member LoadImage(filename:string) =
let bmp = new System.Drawing.Bitmap(filename)
let m = DM.init bmp.Height bmp.Width (fun i j -> float32 (bmp.GetPixel(i, j).GetBrightness()))
bmp.Dispose()
m
/// Load values from delimited text file with given `filename` and separator characters `separators`
static member LoadDelimited(filename:string, separators:char[]) =
System.IO.File.ReadLines(filename)
|> Seq.map (fun x -> x.Split(separators) |> Array.map float32)
|> Seq.map toDV
|> DM.ofRows
/// Load values from delimited text file with given `filename` and a default set of separator characters: space, comma, or tab
static member LoadDelimited(filename:string) =
Util.LoadDelimited(filename, [|' '; ','; '\t'|])
/// Load values from the MNIST database images, from given `filename`, reading `n` number of elements
static member LoadMNISTPixels(filename, n) =
let d = new BinaryReader(File.Open(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
let magicnumber = d.ReadInt32() |> System.Net.IPAddress.NetworkToHostOrder
match magicnumber with
| 2051 -> // Images
let maxitems = d.ReadInt32() |> System.Net.IPAddress.NetworkToHostOrder
let rows = d.ReadInt32() |> System.Net.IPAddress.NetworkToHostOrder
let cols = d.ReadInt32() |> System.Net.IPAddress.NetworkToHostOrder
let n = min n maxitems
d.ReadBytes(n * rows * cols)
|> Array.map float32
|> DV
|> DM.ofDV n
|> DM.transpose
| _ -> failwith "Given file is not in the MNIST format."
/// Load values from the MNIST database labels, from given `filename`, reading `n` number of elements
static member LoadMNISTLabels(filename, n) =
let d = new BinaryReader(File.Open(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
let magicnumber = d.ReadInt32() |> System.Net.IPAddress.NetworkToHostOrder
match magicnumber with
| 2049 -> // Labels
let maxitems = d.ReadInt32() |> System.Net.IPAddress.NetworkToHostOrder
d.ReadBytes(min n maxitems)
|> Array.map int
| _ -> failwith "Given file is not in the MNIST format."
/// Load values from the MNIST database images, from given `filename`, reading all elements
static member LoadMNISTPixels(filename) = Util.LoadMNISTPixels(filename, System.Int32.MaxValue)
/// Load values from the MNIST database labels, from given `filename`, reading all elements
static member LoadMNISTLabels(filename) = Util.LoadMNISTLabels(filename, System.Int32.MaxValue)
/// Generate a string representation of matrix `w`, reshaping each row into an image with `imagerows` rows, and presenting resulting images together in an optimal grid layout.
static member VisualizeDMRowsAsImageGrid(w:DM, imagerows:int) =
let rows = w.Rows
let mm = int (floor (sqrt (float rows)))
let nn = int (ceil (float rows / float mm))
let m = imagerows
let n = (w.[0, *] |> DV.toDM m).Cols
let mutable mat = DM.create (mm * m) (nn * n) (DM.mean w)
for i = 0 to mm - 1 do
for j = 0 to nn - 1 do
let row = i * nn + j
if row < w.Rows then
mat <- DM.AddSubMatrix(mat, i * m, j * n, w.[row, *] |> DV.toDM m)
sprintf "reshaped to (%i x %i), presented in a (%i x %i) grid:\n%s\n" m n mm nn (mat.Visualize())
================================================
FILE: src/Hype/Hype.fsproj
================================================
netstandard2.0x64BSD-2-ClauseHypetruetrue
================================================
FILE: src/Hype/Inference.fs
================================================
//
// This file is part of
// Hype: Compositional Machine Learning and Hyperparameter Optimization
//
// Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
//
// Hype is released under the MIT license.
// (See accompanying LICENSE file.)
//
// Written by:
//
// Atilim Gunes Baydin
// atilimgunes.baydin@nuim.ie
//
// Barak A. Pearlmutter
// barak@cs.nuim.ie
//
// Brain and Computation Lab
// Hamilton Institute & Department of Computer Science
// National University of Ireland Maynooth
// Maynooth, Co. Kildare
// Ireland
//
// www.bcl.hamilton.ie
//
/// Inference namespace
namespace Hype.Inference
open Hype
open DiffSharp.AD.Float32
open DiffSharp.Util
/// Hamiltonian MCMC sampler
type HMCSampler() =
static member Sample(n, hdelta, hsteps, x0:DV, f:DV->D) =
let leapFrog (u:DV->D) (k:DV->D) (d:D) steps (x0, p0) =
let hd = d / 2.f
[1..steps]
|> List.fold (fun (x, p) _ ->
let p' = p - hd * grad u x
let x' = x + d * grad k p'
x', p' - hd * grad u x') (x0, p0)
let u x = -log (f x) // potential energy
let k p = (p * p) / D 2.f // kinetic energy
let hamilton x p = u x + k p
let x = ref x0
[|for i in 1..n do
let p = DV.init x0.Length (fun _ -> Rnd.Normal())
let x', p' = leapFrog u k hdelta hsteps (!x, p)
if Rnd.Uniform() < float32 (exp ((hamilton !x p) - (hamilton x' p'))) then x := x'
yield !x|]
================================================
FILE: src/Hype/NLP.fs
================================================
//
// This file is part of
// Hype: Compositional Machine Learning and Hyperparameter Optimization
//
// Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
//
// Hype is released under the MIT license.
// (See accompanying LICENSE file.)
//
// Written by:
//
// Atilim Gunes Baydin
// atilimgunes.baydin@nuim.ie
//
// Barak A. Pearlmutter
// barak@cs.nuim.ie
//
// Brain and Computation Lab
// Hamilton Institute & Department of Computer Science
// National University of Ireland Maynooth
// Maynooth, Co. Kildare
// Ireland
//
// www.bcl.hamilton.ie
//
/// Natural language processing namespace
namespace Hype.NLP
open Hype
open DiffSharp.AD.Float32
open DiffSharp.Util
/// Language model
type Language(tokens:string[], punctuation:string[]) =
member val Tokens = tokens
static member TokenizeWords(text:string, punctuation) =
//let mutable t' = text.ToLowerInvariant()
let mutable t' = text
punctuation |> Array.iter (fun p -> t' <- t'.Replace(p, " " + p + " "))
t'.Split([|" "|], System.StringSplitOptions.RemoveEmptyEntries)
new(text:string, punctuation:string[]) = Language(Language.TokenizeWords(text, punctuation) |> Set.ofArray |> Set.toArray, punctuation)
new(text:string) = Language(text, [|"."; ","; ":"; ";"; "("; ")"; "!"; "?"|])
member l.Length = l.Tokens.Length
member l.EncodeOneHot(x:string) =
Language.TokenizeWords(x, punctuation) |> l.EncodeOneHot
member l.EncodeOneHot(x:string[]) =
try
//x |> Array.map (fun v -> v.ToLowerInvariant())
x
|> Array.map (fun v -> Array.findIndex (fun t -> t = v) l.Tokens)
|> Array.map (DV.standardBasis l.Length) |> DM.ofCols
with
| _ -> failwith "Given token is not found in the language."
member l.DecodeOneHot(x:DM) =
try
x |> DM.toCols |> Seq.map DV.maxIndex
|> Seq.map (fun i -> l.Tokens.[i]) |> Seq.toArray
with
| _ -> [||]
member l.Sample(probs:DM) = probs |> DM.toCols |> Seq.map (fun v -> Rnd.Choice(l.Tokens, v)) |> Seq.toArray
member l.Sample(probs:DV) = Rnd.Choice(l.Tokens, probs)
member l.Sample(model:DM->DM, start:string, stop:string[], maxlen) =
let mutable x = start
let mutable i = 0
let mutable t = ([while i < maxlen do
yield x
let p = x |> l.EncodeOneHot |> model
let d = l.Sample(p).[0]
match stop |> Array.tryFind (fun p -> p = d) with
| Some(_) ->
yield d
i <- maxlen
| _ ->
x <- d
i <- i + 1]
|> List.map ((+) " ")
|> List.fold (+) "").Trim()
punctuation |> Array.iter (fun p -> t <- t.Replace(" " + p, p))
t
================================================
FILE: src/Hype/Neural.fs
================================================
//
// This file is part of
// Hype: Compositional Machine Learning and Hyperparameter Optimization
//
// Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
//
// Hype is released under the MIT license.
// (See accompanying LICENSE file.)
//
// Written by:
//
// Atilim Gunes Baydin
// atilimgunes.baydin@nuim.ie
//
// Barak A. Pearlmutter
// barak@cs.nuim.ie
//
// Brain and Computation Lab
// Hamilton Institute & Department of Computer Science
// National University of Ireland Maynooth
// Maynooth, Co. Kildare
// Ireland
//
// www.bcl.hamilton.ie
//
/// Neural networks namespace
namespace Hype.Neural
open Hype
open DiffSharp.AD.Float32
open DiffSharp.Util
/// Base type for neural layers
[]
type Layer() =
abstract member Init : unit -> unit
abstract member Reset : unit -> unit
abstract member Run : DM -> DM
abstract member Encode : unit -> DV
abstract member EncodeLength : int
abstract member Decode : DV -> unit
abstract member ToStringFull : unit -> string
abstract member Visualize : unit -> string
member l.Train(d:Dataset) = Layer.Train(l, d)
member l.Train(d:Dataset, v:Dataset) = Layer.Train(l, d, v)
member l.Train(d:Dataset, par:Params) = Layer.Train(l, d, par)
member l.Train(d:Dataset, v:Dataset, par:Params) = Layer.Train(l, d, v, par)
static member init (l:Layer) = l.Init()
static member reset (l:Layer) = l.Reset()
static member run x (l:Layer) = l.Run(x)
static member encode (l:Layer) = l.Encode()
static member encodeLength (l:Layer) = l.EncodeLength
static member decode (l:Layer) (w:DV) = l.Decode(w)
static member toString (l:Layer) = l.ToString()
static member toStringFull (l:Layer) = l.ToStringFull()
static member visualize (l:Layer) = l.Visualize()
static member Train (l:Layer, d:Dataset) = Layer.Train(l, d, Dataset.empty, Params.Default)
static member Train (l:Layer, d:Dataset, par:Params) = Layer.Train(l, d, Dataset.empty, par)
static member Train (l:Layer, d:Dataset, v:Dataset) = Layer.Train(l, d, v, Params.Default)
static member Train (l:Layer, d:Dataset, v:Dataset, par:Params) =
let f =
fun w x ->
l.Decode w
l.Run x
let w0 = l.Encode()
// try
// grad (fun w -> Loss.L1Loss.FuncDM(d) (f w)) w0 |> ignore
// with
// | _ -> failwith "Input/output dimensions mismatch between dataset and the layer."
let w, loss, _, lhist = Optimize.Train(f, w0, d, v, par)
w |> l.Decode
loss, lhist
/// Initialization schemes for neural layer weights
type Initializer =
| InitUniform of D * D
| InitNormal of D * D
| InitRBM of D
| InitReLU
| InitSigmoid
| InitTanh
| InitStandard
| InitCustom of (int->int->D)
override i.ToString() =
match i with
| InitUniform(min, max) -> sprintf "Uniform min=%A max=%A" min max
| InitNormal(mu, sigma) -> sprintf "Normal mu=%A sigma=%A" mu sigma
| InitRBM sigma -> sprintf "RBM sigma=%A" sigma
| InitReLU -> "ReLU"
| InitSigmoid -> "Sigmoid"
| InitTanh -> "Tanh"
| InitStandard -> "Standard"
| InitCustom f -> "Custom"
member i.InitDM(m, n) =
let fanOut, fanIn = m, n
match i with
| InitUniform(min, max) -> Rnd.UniformDM(m, n, min, max)
| InitNormal(mu, sigma) -> Rnd.NormalDM(m, n, mu, sigma)
| InitRBM sigma -> Rnd.NormalDM(m, n, D 0.f, sigma)
| InitReLU -> Rnd.NormalDM(m, n, D 0.f, sqrt (D 2.f / (float32 fanIn)))
| InitSigmoid -> let r = D 4.f * sqrt (D 6.f / (fanIn + fanOut)) in Rnd.UniformDM(m, n, -r, r)
| InitTanh -> let r = sqrt (D 6.f / (fanIn + fanOut)) in Rnd.UniformDM(m, n, -r, r)
| InitStandard -> let r = (D 1.f) / sqrt (float32 fanIn) in Rnd.UniformDM(m, n, -r, r)
| InitCustom f -> DM.init m n (fun _ _ -> f fanIn fanOut)
member i.InitDM(m:DM) = i.InitDM(m.Rows, m.Cols)
/// Linear layer
type Linear(inputs:int, outputs:int, initializer:Initializer) =
inherit Layer()
new(inputs, outputs) = Linear(inputs, outputs, Initializer.InitStandard)
member val W = initializer.InitDM(outputs, inputs) with get, set
member val b = DV.zeroCreate outputs with get, set
override l.Init() =
l.W <- initializer.InitDM(l.W)
l.b <- DV.zeroCreate l.b.Length
override l.Reset() = ()
override l.Run (x:DM) = (l.W * x) + l.b
override l.Encode () = DV.append (DM.toDV l.W) l.b
override l.EncodeLength = l.W.Length + l.b.Length
override l.Decode w =
let ww = w |> DV.split [l.W.Length; l.b.Length] |> Array.ofSeq
l.W <- ww.[0] |> DM.ofDV l.W.Rows
l.b <- ww.[1]
override l.ToString() =
"Hype.Neural.Linear\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W : %i x %i\n" l.W.Rows l.W.Cols
+ sprintf " b : %i" l.b.Length
override l.ToStringFull() =
"Hype.Neural.Linear\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W:\n%O\n" l.W
+ sprintf " b:\n%O" l.b
override l.Visualize() =
"Hype.Neural.Linear\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W:\n%s\n" (l.W.Visualize())
+ sprintf " b:\n%s" (l.b.Visualize())
member l.VisualizeWRowsAsImageGrid(imagerows:int) =
"Hype.Neural.Linear\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W's rows %s\n" (Util.VisualizeDMRowsAsImageGrid(l.W, imagerows))
+ sprintf " b:\n%s" (l.b.Visualize())
/// Linear layer with no bias
type LinearNoBias(inputs:int, outputs:int, initializer:Initializer) =
inherit Layer()
new(inputs, outputs) = LinearNoBias(inputs, outputs, Initializer.InitStandard)
member val W = initializer.InitDM(outputs, inputs) with get, set
override l.Init() = l.W <- initializer.InitDM(l.W)
override l.Reset() = ()
override l.Run (x:DM) = l.W * x
override l.Encode () = l.W |> DM.toDV
override l.EncodeLength = l.W.Length
override l.Decode w = l.W <- w |> DM.ofDV l.W.Rows
override l.ToString() =
"Hype.Neural.LinearNoBias\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W : %i x %i" l.W.Rows l.W.Cols
override l.ToStringFull() =
"Hype.Neural.LinearNoBias\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W:\n%O" l.W
override l.Visualize() =
"Hype.Neural.LinearNoBias\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W:\n%s" (l.W.Visualize())
member l.VisualizeWRowsAsImageGrid(imagerows:int) =
"Hype.Neural.LinearNoBias\n"
+ " " + l.W.Cols.ToString() + " -> " + l.W.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " W's rows %s" (Util.VisualizeDMRowsAsImageGrid(l.W, imagerows))
/// Activation layer with custom functions
type Activation(f:DM->DM) =
inherit Layer()
let f = f
override l.Init () = ()
override l.Reset () = ()
override l.Run (x:DM) = f x
override l.Encode () = DV.empty
override l.EncodeLength = 0
override l.Decode w = ()
override l.ToString() =
sprintf "Hype.Neural.Activation"
override l.ToStringFull() = l.ToString()
override l.Visualize() = l.ToString()
/// Feedforward sequence of layers
type FeedForward() =
inherit Layer()
let mutable (layers:Layer[]) = Array.empty
let mutable encodelength = 0
let update() =
encodelength <- layers |> Array.map Layer.encodeLength |> Array.sum
member n.Add(l) =
layers <- Array.append layers [|l|]
update()
member n.Insert(i, l) =
let a = ResizeArray(layers)
a.Insert(i, l)
layers <- a.ToArray()
update()
member n.Remove(i) =
let a = ResizeArray(layers)
a.RemoveAt(i)
layers <- a.ToArray()
update()
member n.Add(f:DM->DM) = n.Add(Activation(f))
member n.Insert(i, f:DM->DM) = n.Insert(i, Activation(f))
member n.Length = layers.Length
member n.Item
with get i = layers.[i]
override n.Init() = layers |> Array.iter Layer.init
override n.Reset() = layers |> Array.iter Layer.reset
override n.Run(x:DM) = Array.fold Layer.run x layers
override n.Encode() = layers |> Array.map Layer.encode |> Array.reduce DV.append
override n.EncodeLength = encodelength
override n.Decode(w) =
w |> DV.split (layers |> Array.map Layer.encodeLength)
|> Seq.iter2 Layer.decode layers
override n.ToString() =
let s = System.Text.StringBuilder()
if n.Length > 0 then
s.Append(" ") |> ignore
for i = 0 to layers.Length - 1 do
s.Append("(" + i.ToString() + ") -> ") |> ignore
s.Remove(s.Length - 4, 4) |> ignore
s.Append("\n\n") |> ignore
for i = 0 to layers.Length - 1 do
s.Append(" (" + i.ToString() + "): " + layers.[i].ToString() + "\n\n") |> ignore
"Hype.Neural.FeedForward\n"
+ sprintf " Learnable parameters: %i\n" encodelength
+ s.ToString()
override n.ToStringFull() =
let s = System.Text.StringBuilder()
if n.Length > 0 then
s.Append(" ") |> ignore
for i = 0 to layers.Length - 1 do
s.Append("(" + i.ToString() + ") -> ") |> ignore
s.Remove(s.Length - 4, 4) |> ignore
s.Append("\n\n") |> ignore
for i = 0 to layers.Length - 1 do
s.Append(" (" + i.ToString() + "): " + layers.[i].ToStringFull() + "\n\n") |> ignore
"Hype.Neural.FeedForward\n"
+ sprintf " Learnable parameters: %i\n" encodelength
+ s.ToString()
override n.Visualize() =
let s = System.Text.StringBuilder()
if n.Length > 0 then
s.Append(" ") |> ignore
for i = 0 to layers.Length - 1 do
s.Append("(" + i.ToString() + ") -> ") |> ignore
s.Remove(s.Length - 4, 4) |> ignore
s.Append("\n\n") |> ignore
for i = 0 to layers.Length - 1 do
s.Append(" (" + i.ToString() + "): " + layers.[i].Visualize() + "\n\n") |> ignore
"Hype.Neural.FeedForward\n"
+ sprintf " Learnable parameters: %i\n" encodelength
+ s.ToString()
/// Vanilla RNN layer
type Recurrent(inputs:int, hiddenunits:int, outputs:int, activation:DV->DV, initializer:Initializer) =
inherit Layer()
new(inputs, hiddenunits, outputs) = Recurrent(inputs, hiddenunits, outputs, tanh, Initializer.InitTanh)
new(inputs, hiddenunits, outputs, activation) = Recurrent(inputs, hiddenunits, outputs, activation, Initializer.InitTanh)
member val Act = activation with get
member val Whh = initializer.InitDM(hiddenunits, hiddenunits) with get, set
member val Wxh = initializer.InitDM(hiddenunits, inputs) with get, set
member val Why = initializer.InitDM(outputs, hiddenunits) with get, set
member val bh = DV.zeroCreate hiddenunits with get, set
member val by = DV.zeroCreate outputs with get, set
member val h = DV.zeroCreate hiddenunits with get, set
override l.Init() =
l.Whh <- initializer.InitDM(l.Whh)
l.Wxh <- initializer.InitDM(l.Wxh)
l.Why <- initializer.InitDM(l.Why)
l.bh <- DV.zeroCreate hiddenunits
l.by <- DV.zeroCreate outputs
l.h <- DV.zeroCreate hiddenunits
override l.Reset() = l.h <- DV.zeroCreate hiddenunits
override l.Run (x:DM) =
let y = x |> DM.mapCols (fun x ->
l.h <- l.Act ((l.Whh * l.h) + (l.Wxh * x) + l.bh)
(l.Why * l.h) + l.by)
l.h <- primalDeep l.h
y
override l.Encode () = [l.Whh; l.Wxh; l.Why] |> List.map DM.toDV |> List.append [l.bh; l.by] |> Seq.fold DV.append DV.Zero
override l.EncodeLength = l.Whh.Length + l.Wxh.Length + l.Why.Length + l.bh.Length + l.by.Length
override l.Decode w =
let ww = w |> DV.split [l.bh.Length; l.by.Length; l.Whh.Length; l.Wxh.Length; l.Why.Length] |> Array.ofSeq
l.bh <- ww.[0]
l.by <- ww.[1]
l.Whh <- ww.[2] |> DM.ofDV l.Whh.Rows
l.Wxh <- ww.[3] |> DM.ofDV l.Wxh.Rows
l.Why <- ww.[4] |> DM.ofDV l.Why.Rows
l.h <- DV.zeroCreate hiddenunits
override l.ToString() =
"Hype.Neural.Recurrent\n"
+ " " + l.Wxh.Cols.ToString() + " -> " + l.Whh.Rows.ToString() + " -> " + l.Why.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Whh : %i x %i\n" l.Whh.Rows l.Whh.Cols
+ sprintf " Wxh : %i x %i\n" l.Wxh.Rows l.Wxh.Cols
+ sprintf " Why : %i x %i\n" l.Why.Rows l.Why.Cols
+ sprintf " bh : %i\n" l.bh.Length
+ sprintf " by : %i" l.by.Length
override l.ToStringFull() =
"Hype.Neural.Recurrent\n"
+ " " + l.Wxh.Cols.ToString() + " -> " + l.Whh.Rows.ToString() + " -> " + l.Why.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Whh:\n%O\n" l.Whh
+ sprintf " Wxh:\n%O\n" l.Wxh
+ sprintf " Why:\n%O\n" l.Why
+ sprintf " bh:\n%O\n" l.bh
+ sprintf " by:\n%O" l.by
override l.Visualize() =
"Hype.Neural.Recurrent\n"
+ " " + l.Wxh.Cols.ToString() + " -> " + l.Whh.Rows.ToString() + " -> " + l.Why.Rows.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Whh:\n%s\n" (l.Whh.Visualize())
+ sprintf " Wxh:\n%s\n" (l.Wxh.Visualize())
+ sprintf " Why:\n%s\n" (l.Why.Visualize())
+ sprintf " bh:\n%s\n" (l.bh.Visualize())
+ sprintf " by:\n%s" (l.by.Visualize())
/// Long short-term memory layer
type LSTM(inputs:int, memcells:int) =
inherit Layer()
let initializer = Initializer.InitTanh
member val Wxi = initializer.InitDM(memcells, inputs) with get, set
member val Whi = initializer.InitDM(memcells, memcells) with get, set
member val Wxc = initializer.InitDM(memcells, inputs) with get, set
member val Whc = initializer.InitDM(memcells, memcells) with get, set
member val Wxf = initializer.InitDM(memcells, inputs) with get, set
member val Whf = initializer.InitDM(memcells, memcells) with get, set
member val Wxo = initializer.InitDM(memcells, inputs) with get, set
member val Who = initializer.InitDM(memcells, memcells) with get, set
member val bi = DV.zeroCreate memcells with get, set
member val bc = DV.zeroCreate memcells with get, set
member val bf = DV.zeroCreate memcells with get, set
member val bo = DV.zeroCreate memcells with get, set
member val c = DV.zeroCreate memcells with get, set
member val h = DV.zeroCreate memcells with get, set
override l.Init() =
l.Wxi <- initializer.InitDM(l.Wxi)
l.Whi <- initializer.InitDM(l.Whi)
l.Wxc <- initializer.InitDM(l.Wxc)
l.Whc <- initializer.InitDM(l.Whc)
l.Wxf <- initializer.InitDM(l.Wxf)
l.Whf <- initializer.InitDM(l.Whf)
l.Wxo <- initializer.InitDM(l.Wxo)
l.Who <- initializer.InitDM(l.Who)
l.bi <- DV.zeroCreate memcells
l.bc <- DV.zeroCreate memcells
l.bf <- DV.zeroCreate memcells
l.bo <- DV.zeroCreate memcells
l.c <- DV.zeroCreate memcells
l.h <- DV.zeroCreate memcells
override l.Reset() =
l.c <- DV.zeroCreate memcells
l.h <- DV.zeroCreate memcells
override l.Run (x:DM) =
let y = x |> DM.mapCols (fun x ->
let i = sigmoid((l.Wxi * x) + (l.Whi * l.h) + l.bi)
let c' = tanh((l.Wxc * x) + (l.Whc * l.h) + l.bc)
let f = sigmoid((l.Wxf * x) + (l.Whf * l.h) + l.bf)
l.c <- (i .* c') + (f .* l.c)
let o = sigmoid((l.Wxo * x) + (l.Who * l.h) + l.bo)
l.h <- o .* tanh l.c
l.h)
l.h <- primalDeep l.h
l.c <- primalDeep l.c
y
override l.Encode() = [l.Wxi; l.Whi; l.Wxc; l.Whc; l.Wxf; l.Whf; l.Wxo; l.Who] |> List.map DM.toDV |> List.append [l.bi; l.bc; l.bf; l.bo] |> Seq.fold DV.append DV.Zero
override l.EncodeLength = l.Wxi.Length + l.Whi.Length + l.Wxc.Length + l.Whc.Length + l.Wxf.Length + l.Whf.Length + l.Wxo.Length + l.Who.Length + l.bi.Length + l.bc.Length + l.bf.Length + l.bo.Length
override l.Decode w =
let ww = w |> DV.split [l.bi.Length; l.bc.Length; l.bf.Length; l.bo.Length; l.Wxi.Length; l.Whi.Length; l.Wxc.Length; l.Whc.Length; l.Wxf.Length; l.Whf.Length; l.Wxo.Length; l.Who.Length] |> Array.ofSeq
l.bi <- ww.[0]
l.bc <- ww.[1]
l.bf <- ww.[2]
l.bo <- ww.[3]
l.Wxi <- ww.[4] |> DM.ofDV l.Wxi.Rows
l.Whi <- ww.[5] |> DM.ofDV l.Whi.Rows
l.Wxc <- ww.[6] |> DM.ofDV l.Wxc.Rows
l.Whc <- ww.[7] |> DM.ofDV l.Whc.Rows
l.Wxf <- ww.[8] |> DM.ofDV l.Wxf.Rows
l.Whf <- ww.[9] |> DM.ofDV l.Whf.Rows
l.Wxo <- ww.[10] |> DM.ofDV l.Wxo.Rows
l.Who <- ww.[11] |> DM.ofDV l.Who.Rows
l.c <- DV.zeroCreate memcells
l.h <- DV.zeroCreate memcells
override l.ToString() =
"Hype.Neural.LSTM\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxi : %i x %i\n" l.Wxi.Rows l.Wxi.Cols
+ sprintf " Whi : %i x %i\n" l.Whi.Rows l.Whi.Cols
+ sprintf " Wxc : %i x %i\n" l.Wxc.Rows l.Wxc.Cols
+ sprintf " Whc : %i x %i\n" l.Whc.Rows l.Whc.Cols
+ sprintf " Wxf : %i x %i\n" l.Wxf.Rows l.Wxf.Cols
+ sprintf " Whf : %i x %i\n" l.Whf.Rows l.Whf.Cols
+ sprintf " Wxo : %i x %i\n" l.Wxo.Rows l.Wxo.Cols
+ sprintf " Who : %i x %i\n" l.Who.Rows l.Who.Cols
+ sprintf " bi : %i\n" l.bi.Length
+ sprintf " bc : %i\n" l.bc.Length
+ sprintf " bf : %i\n" l.bf.Length
+ sprintf " bo : %i" l.bo.Length
override l.ToStringFull() =
"Hype.Neural.LSTM\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxi:\n%O\n" l.Wxi
+ sprintf " Whi:\n%O\n" l.Whi
+ sprintf " Wxc:\n%O\n" l.Wxc
+ sprintf " Whc:\n%O\n" l.Whc
+ sprintf " Wxf:\n%O\n" l.Wxf
+ sprintf " Whf:\n%O\n" l.Whf
+ sprintf " Wxo:\n%O\n" l.Wxo
+ sprintf " Who:\n%O\n" l.Who
+ sprintf " bi:\n%O\n" l.bi
+ sprintf " bc:\n%O\n" l.bc
+ sprintf " bf:\n%O\n" l.bf
+ sprintf " bo:\n%O" l.bo
override l.Visualize() =
"Hype.Neural.LSTM\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxi:\n%s\n" (l.Wxi.Visualize())
+ sprintf " Whi:\n%s\n" (l.Whi.Visualize())
+ sprintf " Wxc:\n%s\n" (l.Wxc.Visualize())
+ sprintf " Whc:\n%s\n" (l.Whc.Visualize())
+ sprintf " Wxf:\n%s\n" (l.Wxf.Visualize())
+ sprintf " Whf:\n%s\n" (l.Whf.Visualize())
+ sprintf " Wxo:\n%s\n" (l.Wxo.Visualize())
+ sprintf " Who:\n%s\n" (l.Who.Visualize())
+ sprintf " bi:\n%s\n" (l.bi.Visualize())
+ sprintf " bc:\n%s\n" (l.bc.Visualize())
+ sprintf " bf:\n%s\n" (l.bf.Visualize())
+ sprintf " bo:\n%s" (l.bo.Visualize())
/// Gated recurrent unit layer
type GRU(inputs:int, memcells:int) =
inherit Layer()
let initializer = Initializer.InitStandard
member val Wxz = initializer.InitDM(memcells, inputs) with get, set
member val Whz = initializer.InitDM(memcells, memcells) with get, set
member val Wxr = initializer.InitDM(memcells, inputs) with get, set
member val Whr = initializer.InitDM(memcells, memcells) with get, set
member val Wxh = initializer.InitDM(memcells, inputs) with get, set
member val Whh = initializer.InitDM(memcells, memcells) with get, set
member val bz = DV.zeroCreate memcells with get, set
member val br = DV.zeroCreate memcells with get, set
member val bh = DV.zeroCreate memcells with get, set
member val h = DV.zeroCreate memcells with get, set
override l.Init() =
l.Wxz <- initializer.InitDM(l.Wxz)
l.Whz <- initializer.InitDM(l.Whz)
l.Wxr <- initializer.InitDM(l.Wxr)
l.Whr <- initializer.InitDM(l.Whr)
l.Wxh <- initializer.InitDM(l.Wxh)
l.Whh <- initializer.InitDM(l.Whh)
l.bz <- DV.zeroCreate memcells
l.br <- DV.zeroCreate memcells
l.bh <- DV.zeroCreate memcells
l.h <- DV.zeroCreate memcells
override l.Reset() =
l.h <- DV.zeroCreate memcells
override l.Run(x:DM) =
let y = x |> DM.mapCols (fun x ->
let z = sigmoid(l.Wxz * x + l.Whz * l.h + l.bz)
let r = sigmoid(l.Wxr * x + l.Whr * l.h + l.br)
let h' = tanh(l.Wxh * x + l.Whh * (l.h .* r))
l.h <- (1.f - z) .* h' + z .* l.h
l.h)
l.h <- primalDeep l.h
y
override l.Encode() = [l.Wxz; l.Whz; l.Wxr; l.Whr; l.Wxh; l.Whh] |> List.map DM.toDV |> List.append [l.bz; l.br; l.bh] |> Seq.fold DV.append DV.Zero
override l.EncodeLength = l.Wxz.Length + l.Whz.Length + l.Wxr.Length + l.Whr.Length + l.Wxh.Length + l.Whh.Length + l.bz.Length + l.br.Length + l.bh.Length
override l.Decode w =
let ww = w |> DV.split [l.bz.Length; l.br.Length; l.bh.Length; l.Wxz.Length; l.Whz.Length; l.Wxr.Length; l.Whr.Length; l.Wxh.Length; l.Whh.Length] |> Array.ofSeq
l.bz <- ww.[0]
l.br <- ww.[1]
l.bh <- ww.[2]
l.Wxz <- ww.[3] |> DM.ofDV l.Wxh.Rows
l.Whz <- ww.[4] |> DM.ofDV l.Whz.Rows
l.Wxr <- ww.[5] |> DM.ofDV l.Wxr.Rows
l.Whr <- ww.[6] |> DM.ofDV l.Whr.Rows
l.Wxh <- ww.[7] |> DM.ofDV l.Wxh.Rows
l.Whh <- ww.[8] |> DM.ofDV l.Whh.Rows
override l.ToString() =
"Hype.Neural.GRU\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxz : %i x %i\n" l.Wxz.Rows l.Wxz.Cols
+ sprintf " Whz : %i x %i\n" l.Whz.Rows l.Whz.Cols
+ sprintf " Wxr : %i x %i\n" l.Wxr.Rows l.Wxr.Cols
+ sprintf " Whr : %i x %i\n" l.Whr.Rows l.Whr.Cols
+ sprintf " Wxh : %i x %i\n" l.Wxh.Rows l.Wxh.Cols
+ sprintf " Whh : %i x %i\n" l.Whh.Rows l.Whh.Cols
+ sprintf " bz : %i\n" l.bz.Length
+ sprintf " br : %i\n" l.br.Length
+ sprintf " bh : %i\n" l.bh.Length
override l.ToStringFull() =
"Hype.Neural.GRU\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxz:\n%O\n" l.Wxz
+ sprintf " Whz:\n%O\n" l.Whz
+ sprintf " Wxr:\n%O\n" l.Wxr
+ sprintf " Whr:\n%O\n" l.Whr
+ sprintf " Wxh:\n%O\n" l.Wxh
+ sprintf " Whh:\n%O\n" l.Whh
+ sprintf " bz:\n%O\n" l.bz
+ sprintf " br:\n%O\n" l.br
+ sprintf " bh:\n%O\n" l.bh
override l.Visualize() =
"Hype.Neural.GRU\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxz:\n%s\n" (l.Wxz.Visualize())
+ sprintf " Whz:\n%s\n" (l.Whz.Visualize())
+ sprintf " Wxr:\n%s\n" (l.Wxr.Visualize())
+ sprintf " Whr:\n%s\n" (l.Whr.Visualize())
+ sprintf " Wxh:\n%s\n" (l.Wxh.Visualize())
+ sprintf " Whh:\n%s\n" (l.Whh.Visualize())
+ sprintf " bz:\n%s\n" (l.bz.Visualize())
+ sprintf " br:\n%s\n" (l.br.Visualize())
+ sprintf " bh:\n%s\n" (l.bh.Visualize())
/// Long short-term memory layer (alternative implementation)
type LSTMAlt(inputs:int, memcells:int) =
inherit Layer()
let initializer = Initializer.InitTanh
member val Wxh = initializer.InitDM(4 * memcells, inputs) with get, set
member val Whh = initializer.InitDM(4 * memcells, memcells) with get, set
member val b = DV.zeroCreate (4 * memcells) with get, set
member val c = DV.zeroCreate memcells with get, set
member val h = DV.zeroCreate memcells with get, set
override l.Init() =
l.Wxh <- initializer.InitDM(l.Wxh)
l.Whh <- initializer.InitDM(l.Whh)
l.b <- DV.zeroCreate (4 * memcells)
l.c <- DV.zeroCreate memcells
l.h <- DV.zeroCreate memcells
override l.Reset() =
l.c <- DV.zeroCreate memcells
l.h <- DV.zeroCreate memcells
override l.Run(x:DM) =
let y = x |> DM.mapCols (fun x ->
let x2h = l.Wxh * x
let h2h = l.Whh * l.h
let pre = x2h + h2h + l.b
let pretan = tanh pre.[..memcells - 1]
let presig = sigmoid pre.[memcells..]
let c' = pretan
let i = presig.[..memcells - 1]
let f = presig.[memcells..(2 * memcells) - 1]
let o = presig.[(2 * memcells)..]
l.c <- (i .* c') + (f .* l.c)
l.h <- o .* tanh l.c
l.h)
l.h <- primalDeep l.h
l.c <- primalDeep l.c
y
override l.Encode() = [l.Wxh |> DM.toDV; l.Whh |> DM.toDV; l.b] |> Seq.fold DV.append DV.Zero
override l.EncodeLength = l.Wxh.Length + l.Whh.Length + l.b.Length
override l.Decode w =
let ww = w |> DV.split [l.Wxh.Length; l.Whh.Length; l.b.Length] |> Array.ofSeq
l.Wxh <- ww.[0] |> DM.ofDV l.Wxh.Rows
l.Whh <- ww.[1] |> DM.ofDV l.Whh.Rows
l.b <- ww.[2]
override l.ToString() =
"Hype.Neural.LSTMAlt\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxh : %i x %i\n" l.Wxh.Rows l.Wxh.Cols
+ sprintf " Whh : %i x %i\n" l.Whh.Rows l.Whh.Cols
+ sprintf " b : %i\n" l.b.Length
override l.ToStringFull() =
"Hype.Neural.LSTMAlt\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxh:\n%O\n" l.Wxh
+ sprintf " Whh:\n%O\n" l.Whh
+ sprintf " b:\n%O\n" l.b
override l.Visualize() =
"Hype.Neural.LSTMAlt\n"
+ " " + inputs.ToString() + " -> " + memcells.ToString() + " -> " + memcells.ToString() + "\n"
+ sprintf " Learnable parameters: %i\n" l.EncodeLength
+ sprintf " Init: %O\n" initializer
+ sprintf " Wxh:\n%s\n" (l.Wxh.Visualize())
+ sprintf " Whh:\n%s\n" (l.Whh.Visualize())
+ sprintf " b:\n%s\n" (l.b.Visualize())
================================================
FILE: src/Hype/Optimize.fs
================================================
//
// This file is part of
// Hype: Compositional Machine Learning and Hyperparameter Optimization
//
// Copyright (c) 2015, National University of Ireland Maynooth (Atilim Gunes Baydin, Barak A. Pearlmutter)
//
// Hype is released under the MIT license.
// (See accompanying LICENSE file.)
//
// Written by:
//
// Atilim Gunes Baydin
// atilimgunes.baydin@nuim.ie
//
// Barak A. Pearlmutter
// barak@cs.nuim.ie
//
// Brain and Computation Lab
// Hamilton Institute & Department of Computer Science
// National University of Ireland Maynooth
// Maynooth, Co. Kildare
// Ireland
//
// www.bcl.hamilton.ie
//
/// Optimization namespace
namespace Hype
open Hype
open DiffSharp.AD.Float32
open DiffSharp.Util
/// Learning rate schemes
type LearningRate =
/// Constant
| Constant of D
/// 1 / t decay, a = a0 / (1 + kt). Initial value, decay rate
| Decay of D * D
/// Exponential decay, a = a0 * Exp(-kt). Initial value, decay rate
| ExpDecay of D * D
/// Scheduled learning rate vector, its length overrides Params.Epochs
| Schedule of DV
/// Backtracking line search. Initial value, c, rho
| Backtrack of D * D * D
/// Strong Wolfe line search. lmax, c1, c2
| StrongWolfe of D * D * D
/// Adagrad. Initial value
| AdaGrad of D
/// RMSProp. Initial value, decay rate
| RMSProp of D * D
static member DefaultConstant = Constant (D 0.001f)
static member DefaultDecay = Decay (D 1.f, D 0.1f)
static member DefaultExpDecay = ExpDecay (D 1.f, D 0.1f)
static member DefaultBacktrack = Backtrack (D 1.f, D 0.0001f, D 0.5f)
static member DefaultStrongWolfe = StrongWolfe (D 1.f, D 0.0001f, D 0.5f)
static member DefaultAdaGrad = AdaGrad (D 0.001f)
static member DefaultRMSProp = RMSProp (D 0.001f, D 0.9f)
override l.ToString() =
match l with
| Constant a -> sprintf "Constant a = %A" a
| Decay (a0, k) -> sprintf "1/t decay a0 = %A, k = %A" a0 k
| ExpDecay (a0, k) -> sprintf "Exponential decay a = %A, k = %A" a0 k
| Schedule a -> sprintf "Scheduled of length %A" a.Length
| Backtrack (a0, c, r) -> sprintf "Backtracking a0 = %A, c = %A, r = %A" a0 c r
| StrongWolfe (amax, c1, c2) -> sprintf "Strong Wolfe amax = %A, c1 = %A, c2 = %A" amax c1 c2
| AdaGrad (a0) -> sprintf "AdaGrad a0 = %A" a0
| RMSProp (a0, k) -> sprintf "RMSProp a0 = %A, k = %A" a0 k
member l.Func =
let loopLimit = 500
match l with
| Constant a -> fun _ _ _ _ _ _ _ -> box a
| Decay (a0, k) -> fun i _ _ _ _ _ _ -> box (a0 / (1.f + k * i))
| ExpDecay (a0, k) -> fun i _ _ _ _ _ _ -> box (a0 * exp (-k * i))
| Schedule a -> fun i _ _ _ _ _ _ -> box a.[i % a.Length]
| Backtrack (a0, c, r) ->
fun _ w f v g _ p ->
let mutable a = a0
let mutable i = 0
let mutable found = false
while not found do
if f (w + a * p) < v + c * a * (p * g) then
found <- true
else
a <- r * a
i <- i + 1
if i > loopLimit then
found <- true
Util.printLog "*** BACKTRACKING DID NOT CONVERGE ***"
box a
| StrongWolfe (amax, c1, c2) ->
fun _ w f v g _ p ->
let v0 = v
let gp0 = g * p
let inline zoom a1 a2 =
let mutable al = a1
let mutable ah = a2
let mutable a' = a1
let mutable v'al = f (w + al * p)
let mutable i = 0
let mutable found = false
while not found do
a' <- (al + ah) / D 2.f
let v', gg = grad' f (w + a' * p)
if (v' > v0 + c1 * a' * gp0) || (v' >= v'al) then
ah <- a'
else
let gp' = gg * p
if abs gp' <= -c2 * gp0 then
found <- true
elif gp' * (ah - al) >= D 0.f then
ah <- al
al <- a'
v'al <- v'
i <- i + 1
if i > loopLimit then
found <- true
Util.printLog "*** STRONG WOLFE (ZOOM) DID NOT CONVERGE ***"
a'
let mutable v = v0
let mutable v' = v0
let mutable gp' = gp0
let mutable a = D 0.f
let mutable a' = Rnd.UniformD(amax)
let mutable a'' = a'
let mutable i = 1
let mutable found = false
while not found do
let vv, gg = grad' f (w + a' * p)
v' <- vv
gp' <- gg * p
if (v' > v0 + c1 * a' * gp0) || ((i > 1) && (v' >= v)) then
a'' <- zoom a a'
found <- true
elif (abs gp') <= (-c2 * gp0) then
a'' <- a'
found <- true
elif gp' >= D 0.f then
a'' <- zoom a' a
found <- true
else
a <- a'
v <- v'
a' <- Rnd.UniformD(a', amax)
i <- i + 1
if i > loopLimit then
found <- true
Util.printLog "*** STRONG WOLFE DID NOT CONVERGE ***"
box a''
| AdaGrad (a0) ->
fun _ _ _ _ g (gcache:DV ref) _ ->
gcache := !gcache + (g .* g)
box (a0 / sqrt (!gcache + 1e-8f))
| RMSProp (a0, k) ->
fun _ _ _ _ g (gcache:DV ref) _ ->
gcache := (k * !gcache) + (1.f - k) * (g .* g)
box (a0 / sqrt (!gcache + 1e-6f))
/// Training batch configuration
type Batch =
| Full
/// Minibatch of given size
| Minibatch of int
/// Minibatch with size 1, SGD
| Stochastic
override b.ToString() =
match b with
| Full -> "Full"
| Minibatch n -> sprintf "Minibatches of %A" n
| Stochastic -> "Stochastic (minibatch of 1)"
member b.Func =
match b with
| Full -> fun (d:Dataset) _ -> d
| Minibatch n -> fun d i -> d.[(n * i)..((n * i) + n - 1)]
| Stochastic -> fun d i -> d.[i..i]
/// Gradient-based optimization methods
type Method =
/// Gradient descent
| GD
/// Conjugate gradient
| CG
/// Conjugate descent
| CD
/// Nonlinear conjugate gradient
| NonlinearCG
/// Dai & Yuan conjugate gradient
| DaiYuanCG
/// Newton conjugate gradient
| NewtonCG
/// Exact Newton
| Newton
override o.ToString() =
match o with
| GD -> "Gradient descent"
| CG -> "Conjugate gradient"
| CD -> "Conjugate descent"
| DaiYuanCG -> "Dai & Yuan conjugate gradient"
| NonlinearCG -> "Nonlinear conjugate gradient"
| NewtonCG -> "Newton conjugate gradient"
| Newton -> "Exact Newton"
member o.Func =
match o with
| GD ->
fun w (f:DV->D) _ _ gradclip ->
let v', g' = grad' f w
let g' = gradclip g'
let p' = -g'
v', g', p'
/// Hestenes and Stiefel 1952
| CG ->
fun w f g p gradclip ->
let v', g' = grad' f w
let g' = gradclip g'
let y = g' - g
let b = (g' * y) / (p * y)
let p' = -g' + b * p
v', g', p'
/// Fletcher 1987
| CD ->
fun w f g p gradclip ->
let v', g' = grad' f w
let g' = gradclip g'
let b = (DV.normSq g') / (-p * g)
let p' = -g' + b * p
v', g', p'
/// Dai and Yuan 1999
| DaiYuanCG ->
fun w f g p gradclip ->
let v', g' = grad' f w
let g' = gradclip g'
let y = g' - g
let b = (DV.normSq g') / (p * y)
let p' = -g' + b * p
v', g', p'
/// Fletcher and Reeves 1964
| NonlinearCG ->
fun w f g p gradclip ->
let v', g' = grad' f w
let g' = gradclip g'
let b = (DV.normSq g') / (DV.normSq g)
let p' = -g' + b * p
v', g', p'
| NewtonCG ->
fun w f _ p gradclip ->
let v', g' = grad' f w
let g' = gradclip g'
let hv = hessianv f w p
let b = (g' * hv) / (p * hv)
let p' = -g' + b * p
v', g', p'
| Newton ->
fun w f _ _ gradclip ->
let v', g', h' = gradhessian' f w
let g' = gradclip g'
let p' = -DM.solveSymmetric h' g'
v', g', p'
/// Momentum configuration
type Momentum =
/// Default momentum
| Momentum of D
/// Nesterov momentum
| Nesterov of D
| NoMomentum
static member DefaultMomentum = Momentum (D 0.9f)
static member DefaultNesterov = Nesterov (D 0.9f)
override m.ToString() =
match m with
| Momentum m -> sprintf "Standard %A" m
| Nesterov m -> sprintf "Nesterov %A" m
| NoMomentum -> "None"
member m.Func =
match m with
| Momentum m -> fun (u:DV) (u':DV) -> (m * u) + u'
| Nesterov m -> fun u u' -> (m * m * u) + (m + D 1.f) * u'
| NoMomentum -> fun _ u' -> u'
/// Loss function configuration
type Loss =
/// L1 norm, least absolute deviations
| L1Loss
/// L2 norm
| L2Loss
/// L2 norm squared, least squares
| Quadratic
/// Cross entropy after linear layer
| CrossEntropyOnLinear
/// Cross entropy after softmax layer
| CrossEntropyOnSoftmax
override l.ToString() =
match l with
| L1Loss -> "L1 norm, least absolute deviations"
| L2Loss -> "L2 norm"
| Quadratic -> "L2 norm squared, least squares"
| CrossEntropyOnLinear -> "Cross entropy after linear layer"
| CrossEntropyOnSoftmax -> "Cross entropy after softmax layer"
member l.Func =
match l with
| L1Loss -> fun (d:Dataset) (f:DM->DM) -> ((d.Y - (f d.X)) |> DM.toCols |> Seq.sumBy DV.l1norm) / d.Length
| L2Loss -> fun d f -> ((d.Y - (f d.X)) |> DM.toCols |> Seq.sumBy DV.l2norm) / d.Length
| Quadratic -> fun d f -> ((d.Y - (f d.X)) |> DM.toCols |> Seq.sumBy DV.l2normSq) / d.Length
| CrossEntropyOnLinear -> fun d f -> ((f d.X) |> DM.mapiCols (fun i v -> toDV [(logsumexp v) - v.[d.Yi.[i]]]) |> DM.sum) / d.Length
| CrossEntropyOnSoftmax -> fun d f -> -((f d.X) |> DM.mapiCols (fun i v -> toDV [(DV.standardBasis v.Length d.Yi.[i]) * log v]) |> DM.sum) / d.Length
/// Regularization configuration
type Regularization =
/// L1 regularization
| L1Reg of D
/// L2 regularization
| L2Reg of D
| NoReg
static member DefaultL1Reg = L1Reg (D 0.0001f)
static member DefaultL2Reg = L2Reg (D 0.0001f)
override r.ToString() =
match r with
| L1Reg l -> sprintf "L1 lambda = %A" l
| L2Reg l -> sprintf "L2 lambda = %A" l
| NoReg -> "None"
member r.Func =
match r with
| L1Reg l -> fun (w:DV) -> l * (DV.l1norm w)
| L2Reg l -> fun w -> l * (DV.l2normSq w)
| NoReg -> fun w -> D 0.f
/// Gradient clipping configuration
type GradientClipping =
/// Norm clipping
| NormClip of D
| NoClip
static member DefaultNormClip = NormClip (D 1.f)
override g.ToString() =
match g with
| NormClip threshold -> sprintf "Norm clipping threshold = %A" threshold
| NoClip -> "None"
member g.Func =
match g with
| NormClip threshold -> fun (g:DV) -> let ng = DV.norm g in if ng > threshold then (threshold / ng) * g else g
| NoClip -> id
/// Early stopping configuration
type EarlyStopping =
/// Stagnation patience, overfitting patience
| Early of int * int
| NoEarly
static member DefaultEarly = Early (750, 10)
override e.ToString() =
match e with
| Early(s, o) -> sprintf "Stagnation thresh. = %A, overfit. thresh. = %A" s o
| NoEarly -> "None"
/// Record type holding optimization or training parameters
type Params =
{Epochs : int
Method: Method
LearningRate : LearningRate
Momentum : Momentum
Loss : Loss
Regularization : Regularization
GradientClipping : GradientClipping
Batch : Batch
EarlyStopping : EarlyStopping
ImprovementThreshold : D
Silent : bool
ReturnBest : bool
ValidationInterval : int
LoggingFunction : int->DV->D->unit}
[]
module Params =
let Default = {Epochs = 100
LearningRate = LearningRate.DefaultRMSProp
Momentum = NoMomentum
Loss = L2Loss
Regularization = Regularization.DefaultL2Reg
GradientClipping = NoClip
Method = GD
Batch = Full
EarlyStopping = NoEarly
ImprovementThreshold = D 0.995f
Silent = false
ReturnBest = true
ValidationInterval = 10
LoggingFunction = fun _ _ _ -> ()}
/// Main optimization module
type Optimize =
/// Minimize vector-to-scalar function `f`, starting from initial parameter vector `w0`. Uses the default optimization configuration in `Params.Default`.
static member Minimize (f:DV->D, w0:DV) = Optimize.Minimize(f, w0, Params.Default)
/// Minimize vector-to-scalar function `f`, starting from initial parameter vector `w0`. Uses the optimization configuration given in `par`.
static member Minimize (f:DV->D, w0:DV, par:Params) =
let dir = par.Method.Func
let lr = par.LearningRate.Func
let gradclip = par.GradientClipping.Func
let mom = par.Momentum.Func
let iters =
match par.LearningRate with
| Schedule a -> a.Length
| _ -> par.Epochs
if not par.Silent then
Util.printLog "--- Minimization started"
Util.printLog (sprintf "Parameters : %A" w0.Length)
Util.printLog (sprintf "Iterations : %A" iters)
Util.printLog (sprintf "Valid. interval: %i" par.ValidationInterval)
Util.printLog (sprintf "Method : %O" par.Method)
Util.printLog (sprintf "Learning rate : %O" par.LearningRate)
Util.printLog (sprintf "Momentum : %O" par.Momentum)
Util.printLog (sprintf "Gradient clip. : %O" par.GradientClipping)
Util.printLog (sprintf "Early stopping : %O" par.EarlyStopping)
Util.printLog (sprintf "Improv. thresh.: %A" par.ImprovementThreshold)
Util.printLog (sprintf "Return best : %A" par.ReturnBest)
let mutable i = 0
let mutable w = w0
let l, g = grad' f w0
let mutable l = l
let mutable l' = l
let mutable g = g
let mutable p = -g
let mutable u = DV.ZeroN g.Length
let gcache = ref DV.Zero
let l0 = l
let mutable wbest = w0
let mutable lbest = l0
let mutable repllast = l0
let mutable replbest = l0
let mutable replbestchar = " "
let mutable whist = []
let mutable lhist = []
let ldiffchar l = if l < D 0.f then "↓" elif l > D 0.f then "↑" else "-"
let ichars = iters.ToString().Length
let mutable stagnation = -par.ValidationInterval
let mutable earlystop = false
let isNice (v:D) =
let vf = float32 v
if System.Single.IsNaN(vf) then false
elif System.Single.IsInfinity(vf) then false
elif System.Single.IsNegativeInfinity(vf) then false
elif System.Single.IsPositiveInfinity(vf) then false
else true
let mutable diverged = false
let start = System.DateTime.Now
while (i < iters) && (not earlystop) do
let l'', g', p' = dir w f g p gradclip
l' <- l''
if (not (isNice l')) then
if not par.Silent then Util.printLog "*** MINIMIZATION DIVERGED: Function value is out of bounds ***"
earlystop <- true
diverged <- true
if (l' < par.ImprovementThreshold * lbest) && (not diverged) then
wbest <- w
lbest <- l'
whist <- [w] @ whist
lhist <- [l] @ lhist
if i % par.ValidationInterval = 0 then
let repldiff = l' - repllast
if l' < par.ImprovementThreshold * replbest then
replbest <- l'
replbestchar <- "▼"
stagnation <- 0
else
replbestchar <- " "
stagnation <- stagnation + par.ValidationInterval
match par.EarlyStopping with
| Early(s, _) ->
if stagnation >= s then
if not par.Silent then Util.printLog "*** EARLY STOPPING TRIGGERED: Stagnation ***"
earlystop <- true
| _ -> ()
if not par.Silent then
match par.EarlyStopping with
| Early(s, _) ->
Util.printLog (sprintf "%*i/%i | %O [%s%s] | Stag:%*i" ichars (i + 1) iters l' (ldiffchar repldiff) replbestchar (s.ToString().Length) stagnation)
| _ ->
Util.printLog (sprintf "%*i/%i | %O [%s%s]" ichars (i + 1) iters l' (ldiffchar repldiff) replbestchar)
repllast <- l'
par.LoggingFunction i w l'
let mutable u' = DV.Zero
match lr i w f l' g' gcache p' with
| :? D as a -> u' <- a * p'; // A scalar learning rate
| :? DV as a -> u' <- a .* p'; // Vector of independent learning rates
u' <- mom u u'
w <- w + u'
l <- l'
g <- g'
p <- p' // Or, p <- u'
u <- u'
i <- i + 1
if not diverged then
let l'', _, _ = dir w f g p gradclip
l' <- l''
if l' < par.ImprovementThreshold * lbest then
wbest <- w
lbest <- l'
let duration = System.DateTime.Now.Subtract(start)
let wfinal = if par.ReturnBest || diverged then wbest else w
let lfinal = if par.ReturnBest || diverged then lbest else l'
let lchg = (lfinal - l0)
let lchgs = lchg / (float32 duration.TotalSeconds)
let es = (float i) / (duration.TotalSeconds)
let em = (float i) / (duration.TotalMinutes)
if not par.Silent then
Util.printLog (sprintf "Duration : %A" duration)
Util.printLog (sprintf "Value initial : %O" (primal l0))
Util.printLog (sprintf "Value final : %O %s" (primal lfinal) (if par.ReturnBest then "(Best)" else "(Last)"))
Util.printLog (sprintf "Value change : %O (%.2f %%)" (primal lchg) (float32 (100 * lchg /l0)))
Util.printLog (sprintf "Value chg. / s : %O" (primal lchgs))
Util.printLog (sprintf "Iter. / s : %A" es)
Util.printLog (sprintf "Iter. / min : %A" em)
Util.printLog "--- Minimization finished"
wfinal, lfinal, (whist |> List.rev |> List.toArray), (lhist |> List.rev |> List.toArray)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`. Uses the default optimization configuration in `Params.Default`
static member Train (f:DV->DV->D, w0:DV, d:Dataset) = Optimize.Train((fun w v -> toDV [f w v]), w0, d)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`. Uses the optimization configuration given in `par`.
static member Train (f:DV->DV->D, w0:DV, d:Dataset, par:Params) = Optimize.Train((fun w v -> toDV [f w v]), w0, d, par)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d` and also monitoring the loss for the validation data given in dataset `v`. Uses the default optimization configuration in `Params.Default`
static member Train (f:DV->DV->D, w0:DV, d:Dataset, v:Dataset) = Optimize.Train((fun w v -> toDV [f w v]), w0, d, v)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d` and also monitoring the loss for the validation data given in dataset `v`. Uses the optimization configuration given in `par`.
static member Train (f:DV->DV->D, w0:DV, d:Dataset, v:Dataset, par:Params) = Optimize.Train((fun w v -> toDV [f w v]), w0, d, v, par)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`. Uses the default optimization configuration in `Params.Default`.
static member Train (f:DV->DV->DV, w0:DV, d:Dataset) = Optimize.Train(f, w0, d, Dataset.empty, Params.Default)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`. Uses the optimization configuration given in `par`.
static member Train (f:DV->DV->DV, w0:DV, d:Dataset, par:Params) = Optimize.Train(f, w0, d, Dataset.empty, par)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`, and also monitoring the loss for the validation data given in dataset `v`. Uses the default optimization configuration in `Params.Default`.
static member Train (f:DV->DV->DV, w0:DV, d:Dataset, v:Dataset) = Optimize.Train(f, w0, d, v, Params.Default)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`, and also monitoring the loss for the validation data given in dataset `v`. Uses the optimization configuration given in `par`.
static member Train (f:DV->DV->DV, w0:DV, d:Dataset, v:Dataset, par:Params) = Optimize.Train (f >> DM.mapCols, w0, d, v, par)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`. Uses the default optimization configuration in `Params.Default`.
static member Train (f:DV->DM->DM, w0:DV, d:Dataset) = Optimize.Train(f, w0, d, Dataset.empty, Params.Default)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`. Uses the optimization configuration given in `par`.
static member Train (f:DV->DM->DM, w0:DV, d:Dataset, par:Params) = Optimize.Train(f, w0, d, Dataset.empty, par)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`, and also monitoring the loss for the validation data given in dataset `v`. Uses the default optimization configuration in `Params.Default`.
static member Train (f:DV->DM->DM, w0:DV, d:Dataset, v:Dataset) = Optimize.Train(f, w0, d, v, Params.Default)
/// Train model function `f`, starting from initial parameter vector `w0`, by computing the loss for the training data given in dataset `d`, and also monitoring the loss for the validation data given in dataset `v`. Uses the optimization configuration given in `par`.
static member Train (f:DV->DM->DM, w0:DV, d:Dataset, v:Dataset, par:Params) =
let b = par.Batch.Func
let dir = par.Method.Func
let lr = par.LearningRate.Func
let gradclip = par.GradientClipping.Func
let mom = par.Momentum.Func
let reg = par.Regularization.Func
let epochs =
match par.LearningRate with
| Schedule l -> l.Length
| _ -> par.Epochs
let loss = par.Loss.Func
let batches, batchsize =
match par.Batch with
| Full -> 1, d.Length
| Minibatch n -> d.Length / n, n
| Stochastic -> d.Length, 1
let iters = epochs * batches
if not par.Silent then
Util.printLog "--- Training started"
Util.printLog (sprintf "Parameters : %A" w0.Length)
Util.printLog (sprintf "Iterations : %A" iters)
Util.printLog (sprintf "Epochs : %A" epochs)
Util.printLog (sprintf "Batches : %O (%A per epoch)" par.Batch batches)
Util.printLog (sprintf "Training data : %i" d.Length)
if Dataset.isEmpty v then
Util.printLog (sprintf "Validation data: None")
else
Util.printLog (sprintf "Validation data: %i" v.Length)
Util.printLog (sprintf "Valid. interval: %i" par.ValidationInterval)
Util.printLog (sprintf "Method : %O" par.Method)
Util.printLog (sprintf "Learning rate : %O" par.LearningRate)
Util.printLog (sprintf "Momentum : %O" par.Momentum)
Util.printLog (sprintf "Loss : %O" par.Loss)
Util.printLog (sprintf "Regularizer : %O" par.Regularization)
Util.printLog (sprintf "Gradient clip. : %O" par.GradientClipping)
Util.printLog (sprintf "Early stopping : %O" par.EarlyStopping)
Util.printLog (sprintf "Improv. thresh.: %A" par.ImprovementThreshold)
Util.printLog (sprintf "Return best : %A" par.ReturnBest)
let q i w = (loss (b d i) (f w)) + reg w
let qvalid =
if Dataset.isEmpty v then
fun _ -> D 0.f
else
fun w -> (loss v (f w)) + reg w
// i : epoch
// w : previous weights
// w' : new weights
// l : previous loss
// l' : new loss
// g : previous gradient
// g' : next gradient
// p : previous direction
// p' : next direction
// u : previous velocity
// u' : next velocity
let mutable epoch = 0
let mutable batch = 0
let mutable w = w0
let l, g = grad' (q 0) w0
let mutable l = l
let mutable l' = l
let mutable g = g
let mutable p = -g
let mutable u = DV.ZeroN g.Length
let gcache = ref DV.Zero
let l0 = l
let mutable wbest = w0
let mutable lbest = l0
let mutable repllast= l0
let mutable replbest = l0
let mutable replbestchar = " "
let mutable repvllast = if Dataset.isEmpty v then D 0.f else qvalid w0
let mutable repvlbest = repvllast
let mutable repvlbestchar = " "
let ldiffchar l = if l < D 0.f then "↓" elif l > D 0.f then "↑" else "-"
let mutable whist = []
let mutable lhist = []
let mutable stagnation = -par.ValidationInterval
let mutable overfitting = 0
let mutable validlimproved = false
let mutable earlystop = false
let echars = epochs.ToString().Length
let bchars = batches.ToString().Length
let ichars = (epochs * d.Length).ToString().Length
let isNice (v:D) =
let vf = float32 v
if System.Single.IsNaN(vf) then false
elif System.Single.IsInfinity(vf) then false
elif System.Single.IsNegativeInfinity(vf) then false
elif System.Single.IsPositiveInfinity(vf) then false
else true
let mutable diverged = false
let start = System.DateTime.Now
while (epoch < epochs) && (not earlystop) do
batch <- 0
while (batch < batches) && (not earlystop) do
let l'', g', p' = dir w (q batch) g p gradclip
l' <- l''
if (not (isNice l')) then
if not par.Silent then Util.printLog "*** TRAINING DIVERGED: Loss is out of bounds ***"
earlystop <- true
diverged <- true
whist <- w :: whist
lhist <- l :: lhist
if (l' < par.ImprovementThreshold * lbest) && (not diverged) then
wbest <- w
lbest <- l'
if not (Dataset.isEmpty v) then
if not validlimproved then
overfitting <- overfitting + 1
match par.EarlyStopping with
| Early(_, o) ->
if overfitting >= o then
if not par.Silent then Util.printLog "*** EARLY STOPPING TRIGGERED: Overfitting ***"
earlystop <- true
| _ -> ()
if batch % par.ValidationInterval = 0 then
let repldiff = l' - repllast
if l' < par.ImprovementThreshold * replbest then
replbest <- l'
replbestchar <- "▼"
else
replbestchar <- " "
if Dataset.isEmpty v then
stagnation <- stagnation + par.ValidationInterval
match par.EarlyStopping with
| Early(s, _) ->
if stagnation >= s then
if not par.Silent then Util.printLog "*** EARLY STOPPING TRIGGERED: Stagnation of training loss ***"
earlystop <- true
| _ -> ()
repllast <- l'
if Dataset.isEmpty v then
if not par.Silent then
match par.EarlyStopping with
| Early(s, _) ->
Util.printLog (sprintf "%*i/%i | Batch %*i/%i | %O [%s%s] | Stag:%*i" echars (epoch + 1) epochs bchars (batch + 1) batches l' (ldiffchar repldiff) replbestchar (s.ToString().Length) stagnation)
| _ ->
Util.printLog (sprintf "%*i/%i | Batch %*i/%i | %O [%s%s]" echars (epoch + 1) epochs bchars (batch + 1) batches l' (ldiffchar repldiff) replbestchar)
else
let vl' = qvalid w
let repvldiff = vl' - repvllast
if vl' < par.ImprovementThreshold * repvlbest then
repvlbest <- vl'
repvlbestchar <- "▼"
validlimproved <- true
stagnation <- 0
overfitting <- 0
else
repvlbestchar <- " "
validlimproved <- false
stagnation <- stagnation + par.ValidationInterval
match par.EarlyStopping with
| Early(s, _) ->
if stagnation >= s then
if not par.Silent then Util.printLog "*** EARLY STOPPING TRIGGERED: Stagnation of validation loss ***"
earlystop <- true
| _ -> ()
if not par.Silent then
match par.EarlyStopping with
| Early(s, o) ->
Util.printLog (sprintf "%*i/%i | Batch %*i/%i | %O [%s%s] | Valid %O [%s%s] | Stag:%*i Ovfit:%*i" echars (epoch + 1) epochs bchars (batch + 1) batches l' (ldiffchar repldiff) replbestchar vl' (ldiffchar repvldiff) repvlbestchar (s.ToString().Length) stagnation (o.ToString().Length) overfitting)
| _ ->
Util.printLog (sprintf "%*i/%i | Batch %*i/%i | %O [%s%s] | Valid %O [%s%s]" echars (epoch + 1) epochs bchars (batch + 1) batches l' (ldiffchar repldiff) replbestchar vl' (ldiffchar repvldiff) repvlbestchar)
repvllast <- vl'
par.LoggingFunction epoch w l'
let mutable u' = DV.Zero
match lr epoch w (q batch) l' g' gcache p' with
| :? D as a -> u' <- a * p' // A scalar learning rate
| :? DV as a -> u' <- a .* p' // Vector of independent learning rates
u' <- mom u u'
w <- w + u'
l <- l'
g <- g'
p <- p' // Or, p <- u'
u <- u'
batch <- batch + 1
let iter = batches * epoch + batch
if iter >= iters then earlystop <- true
epoch <- epoch + 1
if not diverged then
let l'', _, _ = dir w (q 0) g p gradclip
l' <- l''
if l' < par.ImprovementThreshold * lbest then
wbest <- w
lbest <- l'
let duration = System.DateTime.Now.Subtract(start)
let wfinal = if par.ReturnBest || diverged then wbest else w
let lfinal = if par.ReturnBest || diverged then lbest else l'
let lchg = (lfinal - l0)
let lchgs = lchg / (float32 duration.TotalSeconds)
let es = (float epoch) / (duration.TotalSeconds)
let em = (float epoch) / (duration.TotalMinutes)
if not par.Silent then
Util.printLog (sprintf "Duration : %A" duration)
Util.printLog (sprintf "Loss initial : %O" (primal l0))
Util.printLog (sprintf "Loss final : %O %s" (primal lfinal) (if par.ReturnBest then "(Best)" else "(Last)"))
Util.printLog (sprintf "Loss change : %O (%.2f %%)" (primal lchg) (float32 (100 * (lchg) / l0)))
Util.printLog (sprintf "Loss chg. / s : %O" (primal lchgs))
Util.printLog (sprintf "Epochs / s : %A" es)
Util.printLog (sprintf "Epochs / min : %A" em)
Util.printLog "--- Training finished"
wfinal, lfinal, (whist |> List.rev |> List.toArray), (lhist |> List.rev |> List.toArray)
================================================
FILE: src/Hype/app.config
================================================
TrueTrueTrueTrueTrueTrueTrueTrue
================================================
FILE: src/Hype/paket.references
================================================
DiffSharp
FSharp.Core
System.Drawing.Common