Repository: sammy-suyama/BayesBook
Branch: master
Commit: 61cb7ee0f1df
Files: 24
Total size: 66.9 KB

Directory structure:
gitextract_c_lvl76e/
├── LICENSE
├── README.md
├── data/
│   └── timeseries.jld
├── docker/
│   ├── Dockerfile
│   ├── README.md
│   └── add_packages.jl
└── src/
    ├── BayesNeuralNet.jl
    ├── DimensionalityReduction.jl
    ├── GaussianMixtureModel.jl
    ├── LogisticRegression.jl
    ├── NMF.jl
    ├── PoissonHMM.jl
    ├── PoissonMixtureModel.jl
    ├── demo_BayesNeuralNet.jl
    ├── demo_DimensionalityReduction.jl
    ├── demo_GaussianMixtureModel.jl
    ├── demo_LogisticRegression.jl
    ├── demo_NMF.jl
    ├── demo_PoissonHMM.jl
    ├── demo_PoissonMixtureModel.jl
    ├── demo_PolynomialRegression.jl
    ├── demo_Simple2DGauss.jl
    ├── demo_SimpleFitting.jl
    └── demo_nonconjugate.jl

================================================
FILE CONTENTS
================================================

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2020 Sammy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: README.md
================================================
# BayesBook

This repository hosts the source code for the book 「機械学習スタートアップシリーズ ベイズ推論による機械学習入門」 (Machine Learning Startup Series: An Introduction to Machine Learning by Bayesian Inference).

* http://www.kspub.co.jp/book/detail/1538320.html
* Errata (printings 1-3): https://github.com/sammy-suyama/BayesBook/blob/master/pdf/seigo.pdf
* Errata (up to printing 4): https://github.com/sammy-suyama/BayesBook/blob/master/pdf/seigo_v4.pdf

The source code is written in Julia (recommended version: 0.6.0).

* The Julia Language: http://julialang.org/
* Julia Documentation: http://docs.julialang.org/

Some Python libraries are used for plotting and for downloading test data.

* Python: https://www.python.org/
* Matplotlib: https://matplotlib.org/
* scikit-learn: http://scikit-learn.org/

If setting up the environment above is a hassle, a Dockerfile is provided as well.

* Docker: https://docs.docker.com/

================================================
FILE: docker/Dockerfile
================================================
FROM python:latest

# Update
RUN apt-get update

# Install Python libraries
RUN pip3 install matplotlib scipy scikit-learn notebook

# Install system libraries
RUN apt-get install -y sudo hdf5-tools libzmq3

# Install julia 0.6.0
RUN wget https://julialang-s3.julialang.org/bin/linux/x64/0.6/julia-0.6.0-linux-x86_64.tar.gz && \
    tar -xzf julia-0.6.0-linux-x86_64.tar.gz && \
    ln -s /julia-903644385b/bin/julia /usr/local/bin/julia

# Set the working directory to /work
WORKDIR /work

# Add julia packages
ADD add_packages.jl /work
RUN julia add_packages.jl

# Download source codes
RUN git clone https://github.com/sammy-suyama/BayesBook.git

# Make port 8888 available to the world outside this container
EXPOSE 8888

# Start jupyter notebook
CMD jupyter notebook --allow-root --port=8888 --ip=0.0.0.0

================================================
FILE: docker/README.md
================================================
# Running Jupyter notebook from Docker

If setting up a Julia/Python environment is a hassle, you can use Docker to run the demo scripts in a Jupyter notebook.

For installing Docker itself, see the official site:

* https://docs.docker.com/engine/installation/

Build and run the image in the directory containing the `Dockerfile`:

    $ docker build -t bayesbook .
    $ docker run -p 8888:8888 bayesbook

================================================
FILE: docker/add_packages.jl
================================================
Pkg.update()
Pkg.add("PyPlot")
Pkg.add("StatsFuns")
Pkg.add("SpecialFunctions")
Pkg.add("Distributions")
Pkg.add("PDMats")
Pkg.add("ProgressMeter")
Pkg.add("DataFrames")
Pkg.add("HDF5")
Pkg.add("JLD")
Pkg.add("IJulia")

================================================
FILE: src/BayesNeuralNet.jl
================================================
"""
Variational inference for Bayesian neural network
"""
module BayesNeuralNet
using Distributions
export sample_data_from_prior, sample_data_from_posterior
export VI

function sigmoid(x)
    return 1.0 / (1.0 + exp.(-x[1]))
end

function rho2sig(rho)
    return log.(1 + exp.(rho))
end

function compute_df_dmu(mu, rho, W)
    return (W - mu) ./ rho2sig(rho).^2
end

function compute_df_drho(Y, X, mu, rho, W)
    return -0.5*((W - mu).^2 - rho2sig(rho).^2) .* compute_dprec_drho(rho)
end

function compute_dprec_drho(rho)
    return 2 * rho2sig(rho) .^ (-3) .* (1 ./ (1+exp.(rho))).^2 .* (1 ./ (1+exp.(-rho)))
end

function compute_df_dw(Y, X, sigma2_y, sigma2_w, mu1, rho1, W1, mu2, rho2, W2)
    M, N = size(X)
    Y_err1 = zeros(size(W1)) # MxK
    Y_err2 = zeros(size(W2)) # KxD
    for n in 1 : N
        Z = tanh.(W1'*X[:,n])    # Kx1
        Y_est = W2'*Z            # 2nd unit, Dx1
        delta2 = Y_est - Y[n]
        # 1st unit, KxD
        delta1 = diagm(1 - Z.^2) * W2 * delta2
        Y_err1 += X[:,n] * delta1'
        Y_err2 += Z * delta2'
    end
    df_dw1 = W1/sigma2_w + (mu1 - W1) ./ rho2sig(rho1).^2 + Y_err1 / sigma2_y
    df_dw2 = W2/sigma2_w + (mu2 - W2) ./ rho2sig(rho2).^2 + Y_err2 / sigma2_y
    return df_dw1, df_dw2
end

"""
Sample data given prior and inputs.
"""
function sample_data_from_prior(X, sigma2_w, sigma2_y, D, K)
    M, N = size(X)
    W1 = sqrt(sigma2_w) * randn(M, K)
    W2 = sqrt(sigma2_w) * randn(K, D)

    # sample function
    Y = [W2'* tanh.(W1'X[:,n]) for n in 1 : N]

    # sample data
    Y_obs = [W2'* tanh.(W1'X[:,n]) + sqrt(sigma2_y)*randn(D) for n in 1 : N]
    return Y_obs, Y, W1, W2
end

"""
Sample data given posterior and inputs.
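
The weights are drawn with the reparameterization `W = mu + log.(1 + exp.(rho)) .* eps`,
`eps ~ N(0, 1)`.

# Example
A minimal sketch; here the variational parameters are random placeholders, whereas
normally they come from `VI`, and all concrete sizes are illustrative:

```julia
M, K, D, N = 2, 5, 1, 100                # input / hidden / output / data size
X = ones(M, N)
X[1,:] = linspace(-2, 4, N)              # inputs; the second row is a bias
mu1, rho1 = randn(M, K), randn(M, K)     # e.g. mu1, rho1, mu2, rho2 = VI(...)
mu2, rho2 = randn(K, D), randn(K, D)
Y_est, Y_obs = sample_data_from_posterior(X, mu1, rho1, mu2, rho2, 0.01, D)
```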
""" function sample_data_from_posterior(X, mu1, rho1, mu2, rho2, sigma2_y, D) N = size(X, 2) ep1 = randn(size(mu1)) W1_tmp = mu1 + log.(1 + exp.(rho1)) .* ep1 ep2 = randn(size(mu2)) W2_tmp = mu2 + log.(1 + exp.(rho2)) .* ep2 Y_est = [W2_tmp'* tanh.(W1_tmp'X[:,n]) for n in 1 : N] Y_obs = [W2_tmp'* tanh.(W1_tmp'X[:,n]) + sqrt(sigma2_y)*randn(D) for n in 1 : N] return Y_est, Y_obs end """ Compute variational parameters. """ function VI(Y, X, sigma2_w, sigma2_y, K, alpha, max_iter) M, N = size(X) D = length(Y[1]) # initialize mu1 = randn(M, K) rho1 = randn(M, K) mu2 = randn(K, D) rho2 = randn(K, D) for i in 1 : max_iter # sample ep1 = randn(size(mu1)) W1_tmp = mu1 + log.(1 + exp.(rho1)) .* ep1 ep2 = randn(size(mu2)) W2_tmp = mu2 + log.(1 + exp.(rho2)) .* ep2 # calc error df_dw1, df_dw2 = compute_df_dw(Y, X, sigma2_y, sigma2_w, mu1, rho1, W1_tmp, mu2, rho2, W2_tmp) # 1st unit df_dmu1 = compute_df_dmu(mu1, rho1, W1_tmp) df_drho1 = compute_df_drho(Y, X, mu1, rho1, W1_tmp) d_mu1 = df_dw1 + df_dmu1 d_rho1 = df_dw1 .* (ep1 ./ (1+exp.(-rho1))) + df_drho1 mu1 = mu1 - alpha * d_mu1 rho1 = rho1 - alpha * d_rho1 # 2nd unit df_dmu2 = compute_df_dmu(mu2, rho2, W2_tmp) df_drho2 = compute_df_drho(Y, X, mu2, rho2, W2_tmp) d_mu2 = df_dw2 + df_dmu2 d_rho2 = df_dw2 .* (ep2 ./ (1+exp.(-rho2))) + df_drho2 mu2 = mu2 - alpha * d_mu2 rho2 = rho2 - alpha * d_rho2 end return mu1, rho1, mu2, rho2 end end ================================================ FILE: src/DimensionalityReduction.jl ================================================ """ Variational inference for Bayesian DimensionalityReduction """ module DimensionalityReduction using Distributions #using ProgressMeter export DRModel export sample_data, VI #################### ## Types struct DRModel D::Int M::Int sigma2_y::Float64 m_W::Array{Float64, 2} # MxD Sigma_W::Array{Float64, 3} # MxMxD m_mu::Array{Float64, 1} # D Sigma_mu::Array{Float64, 2} # DxD end #################### ## functions function sqsum(mat::Array{Float64}, idx::Int) return squeeze(sum(mat, idx), idx) end """ Sample data given hyperparameters. 
""" function sample_data(N::Int, model::DRModel) D = model.D M = model.M W = zeros(M, D) mu = zeros(D) for d in 1 : D W[:,d] = rand(MvNormal(model.m_W[:,d], model.Sigma_W[:,:,d])) end mu = rand(MvNormal(model.m_mu, model.Sigma_mu)) Y = zeros(D, N) X = randn(M, N) for n in 1 : N Y[:,n] = rand(MvNormal(W'*X[:,n] + mu, model.sigma2_y*eye(D))) end return Y, X, W, mu end function init(Y::Array{Float64, 2}, prior::DRModel) M = prior.M D, N = size(Y) X = randn(M, N) XX = zeros(M, M, N) for n in 1 : N XX[:,:,n] = X[:,n]*X[:,n]' + eye(M) end return X, XX end function update_W(Y::Array{Float64, 2}, prior::DRModel, posterior::DRModel, X::Array{Float64, 2}, XX::Array{Float64, 3}) D = prior.D M = prior.M N = size(Y, 2) m_W = zeros(M, D) Sigma_W = zeros(M, M, D) mu = posterior.m_mu for d in 1 : D Sigma_W[:,:,d] = inv(inv(prior.sigma2_y)*sqsum(XX, 3) + inv(prior.Sigma_W[:,:,d])) m_W[:,d] = Sigma_W[:,:,d]*(inv(prior.sigma2_y)*X*(Y[[d],:] - mu[d]*ones(1, N))' + inv(prior.Sigma_W[:,:,d])*prior.m_W[:,d]) end return DRModel(D, M, prior.sigma2_y, m_W, Sigma_W, posterior.m_mu, posterior.Sigma_mu) end function update_mu(Y::Array{Float64, 2}, prior::DRModel, posterior::DRModel, X::Array{Float64, 2}, XX::Array{Float64, 3}) N = size(Y, 2) D = prior.D M = prior.M W = posterior.m_W Sigma_mu = inv(N*inv(prior.sigma2_y)*eye(D) + inv(prior.Sigma_mu)) m_mu = Sigma_mu*(inv(prior.sigma2_y)*sqsum(Y - W'*X, 2) + inv(prior.Sigma_mu)*prior.m_mu) return DRModel(D, M, prior.sigma2_y, posterior.m_W, posterior.Sigma_W, m_mu, Sigma_mu) end function update_X(Y::Array{Float64, 2}, posterior::DRModel) D, N = size(Y) M = posterior.M W = posterior.m_W WW = zeros(M, M, D) for d in 1 : D WW[:,:,d] = W[:,d]*W[:,d]' + posterior.Sigma_W[:,:,d] end mu = posterior.m_mu X = zeros(M, N) XX = zeros(M, M, N) for n in 1 : N Sigma = inv(inv(posterior.sigma2_y)*sqsum(WW, 3) + eye(M)) X[:,n] = inv(posterior.sigma2_y)*Sigma*W*(Y[:,n] - mu) XX[:,:,n] = X[:,n] * X[:,n]' + Sigma end return X, XX end function interpolate(mask::BitArray{2}, X::Array{Float64, 2}, posterior::DRModel) Y_est = posterior.m_W'*X + repmat(posterior.m_mu, 1, size(X, 2)) return return Y_est[mask] end """ Compute variational posterior distributions. 
""" function VI(Y::Array{Float64, 2}, prior::DRModel, max_iter::Int) X, XX = init(Y, prior) mask = isnan.(Y) sum_nan = sum(mask) posterior = deepcopy(prior) #progress = Progress(max_iter) for iter in 1 : max_iter # progress #next!(progress) # Interpolate if sum_nan > 0 Y[mask] = interpolate(mask, X, posterior) end # M-step posterior = update_W(Y, prior, posterior, X, XX) posterior = update_mu(Y, prior, posterior, X, XX) # E-step X, XX = update_X(Y, posterior) end return posterior, X end end ================================================ FILE: src/GaussianMixtureModel.jl ================================================ """ Bayesian Gaussian Mixture Model """ module GaussianMixtureModel using StatsFuns.logsumexp, SpecialFunctions.digamma using Distributions using PDMats export GW, BGMM, Gauss, GMM export sample_GMM, sample_data, winner_takes_all export learn_GS, learn_CGS, learn_VI #################### ## Types struct GW # Parameters of Gauss Wisahrt distribution beta::Float64 m::Vector{Float64} nu::Float64 W::Matrix{Float64} end struct BGMM # Parameters of Bayesian Gaussian Mixture Model D::Int K::Int alpha::Vector{Float64} cmp::Vector{GW} end struct Gauss # Parameters of Gauss Distribution mu::Vector{Float64} Lambda::Matrix{Float64} end struct GMM # Parameters of Gauss Mixture Model D::Int K::Int phi::Vector{Float64} cmp::Vector{Gauss} end #################### ## Common functions """ Sample a GMM given hyperparameters. """ function sample_GMM(bgmm::BGMM) cmp = Vector{Gauss}() for c in bgmm.cmp Lambda = rand(Wishart(c.nu, PDMats.PDMat(Symmetric(c.W)))) mu = rand(MvNormal(c.m, PDMats.PDMat(Symmetric(inv(c.beta*Lambda))))) push!(cmp, Gauss(mu, Lambda)) end phi = rand(Dirichlet(bgmm.alpha)) return GMM(bgmm.D, bgmm.K, phi, cmp) end """ Sample data from a specific GMM model. 
""" function sample_data(gmm::GMM, N::Int) X = zeros(gmm.D, N) S = categorical_sample(gmm.phi, N) for n in 1 : N k = indmax(S[:, n]) X[:,n] = rand(MvNormal(gmm.cmp[k].mu, PDMats.PDMat(Symmetric(inv(gmm.cmp[k].Lambda))))) end return X, S end categorical_sample(p::Vector{Float64}) = categorical_sample(p, 1)[:,1] function categorical_sample(p::Vector{Float64}, N::Int) K = length(p) S = zeros(K, N) S_tmp = rand(Categorical(p), N) for k in 1 : K S[k,find(S_tmp.==k)] = 1 end return S end function sumdigamma(nu, D) ret = 0.0 for d in 1 : D ret += digamma.(0.5*(nu + 1 - d)) end return ret end function init_S(X::Matrix{Float64}, bgmm::BGMM) N = size(X, 2) K = bgmm.K S = categorical_sample(ones(K)/K, N) return S end function calc_ELBO(X::Array{Float64, 2}, pri::BGMM, pos::BGMM) function logCw(nu, W) D = size(W, 1) return -0.5*nu*logdet(W) - 0.5*nu*D*log.(2) - 0.25*D*(D-1)*log.(pi) - sum([lgamma.(0.5*(nu+1-d)) for d in 1 : D]) end ln_expt_S = update_S(pos, X) expt_S = exp.(ln_expt_S) K, N = size(expt_S) D = size(X, 1) expt_ln_lkh = 0 for k in 1 : K expt_Lambda = pos.cmp[k].nu * pos.cmp[k].W expt_Lambda_mu = pos.cmp[k].nu * pos.cmp[k].W * pos.cmp[k].m expt_mu_Lambda_mu = (pos.cmp[k].nu * pos.cmp[k].m' * pos.cmp[k].W * pos.cmp[k].m)[1] + D/pos.cmp[k].beta expt_ln_Lambda = sumdigamma(pos.cmp[k].nu, D) + D*log.(2) + logdet(pos.cmp[k].W) expt_ln_pi = digamma.(pos.alpha) - digamma.(sum(pos.alpha)) for n in 1 : N # expt_ln_lkh += -0.5 * expt_S[k,n]*(trace(X[:,n]*X[:,n]'*expt_Lambda) - 2*(X[:,n]'*expt_Lambda_mu)[1] + expt_mu_Lambda_mu - expt_ln_Lambda + D * log.(2*pi) ) # expt_ln_lkh += expt_S[k,n]*expt_ln_pi[k] end end # - expt_ln_lkh -= sum(expt_S.*ln_expt_S) KL_mu_Lambda = [(0.5*D*(log.(pos.cmp[k].beta) - log.(pri.cmp[k].beta) + pri.cmp[k].beta/pos.cmp[k].beta - pos.cmp[k].nu - 1) + 0.5*(pos.cmp[k].nu-pri.cmp[k].nu)*(sumdigamma(pos.cmp[k].nu, D) + D*log.(2) + logdet(pos.cmp[k].W)) + logCw(pos.cmp[k].nu, pos.cmp[k].W) - logCw(pri.cmp[k].nu, pri.cmp[k].W) + 0.5*pos.cmp[k].nu*trace((pri.cmp[k].beta*(pos.cmp[k].m-pri.cmp[k].m)*(pos.cmp[k].m-pri.cmp[k].m)' +inv(pri.cmp[k].W))*pos.cmp[k].W)) for k in 1 : K] KL_pi = (lgamma.(sum(pos.alpha)) - lgamma.(sum(pri.alpha)) - sum(lgamma.(pos.alpha)) + sum(lgamma.(pri.alpha)) + (pos.alpha - pri.alpha)' * (digamma.(pos.alpha) - digamma.(sum(pos.alpha))) )[1] VB = expt_ln_lkh - (sum(KL_mu_Lambda) + KL_pi) return VB end function add_stats(bgmm::BGMM, X::Matrix{Float64}, S::Matrix{Float64}) D = bgmm.D K = bgmm.K sum_S = sum(S, 2) alpha = [bgmm.alpha[k] + sum_S[k] for k in 1 : K] cmp = Vector{GW}() XS = X*S'; for k in 1 : K beta = bgmm.cmp[k].beta + sum_S[k] m = (1.0/beta)*(vec(X*S[[k],:]') + bgmm.cmp[k].beta*bgmm.cmp[k].m) nu = bgmm.cmp[k].nu + sum_S[k] W = inv(X*diagm(S[k,:])*X' - beta*m*m' + bgmm.cmp[k].beta*bgmm.cmp[k].m*bgmm.cmp[k].m' + inv(bgmm.cmp[k].W)) push!(cmp, GW(beta, m, nu, W)) end return BGMM(D, K, alpha, cmp) end remove_stats(bgmm::BGMM, X::Matrix{Float64}, S::Matrix{Float64}) = add_stats(bgmm, X, -S) #################### ## used for Variational Inference function update_S(bgmm::BGMM, X::Matrix{Float64}) D, N = size(X) K = bgmm.K ln_S = zeros(K, N) tmp = zeros(K) tmp = NaN * zeros(K) sum_digamma_tmp = digamma.(sum(bgmm.alpha)) for k in 1 : K tmp[k] = -0.5*(bgmm.cmp[k].nu*trace(bgmm.cmp[k].m*bgmm.cmp[k].m'*bgmm.cmp[k].W) + D*(1.0/bgmm.cmp[k].beta) - (sumdigamma(bgmm.cmp[k].nu, D) + logdet(bgmm.cmp[k].W))) tmp[k] += digamma.(bgmm.alpha[k]) - sum_digamma_tmp end for n in 1 : N tmp_ln_pi = NaN * zeros(K) for k in 1 : K tmp_ln_pi[k] = tmp[k] 
-0.5*bgmm.cmp[k].nu*trace((X[:,n]*X[:,n]' - 2*bgmm.cmp[k].m*X[:,n]')*bgmm.cmp[k].W) end ln_S[:,n] = tmp_ln_pi - logsumexp(tmp_ln_pi) end return ln_S end """ Pick single states having a max probability. """ function winner_takes_all(S::Matrix{Float64}) S_ret = zeros(size(S)) for n in 1 : size(S_ret, 2) idx = indmax(S[:,n]) S_ret[idx,n] = 1 end return S_ret end #################### ## used for Gibbs Sampling function sample_S_GS(gmm::GMM, X::Matrix{Float64}) D, N = size(X) K = gmm.K S = zeros(K, N) tmp = [0.5*logdet(gmm.cmp[k].Lambda) + log.(gmm.phi[k]) for k in 1 : K] for n in 1 : N tmp_ln_phi = [-0.5*trace(gmm.cmp[k].Lambda*(X[:,n] - gmm.cmp[k].mu)*(X[:,n] - gmm.cmp[k].mu)') + tmp[k] for k in 1 : K] tmp_ln_phi = tmp_ln_phi - logsumexp(tmp_ln_phi) S[:,n] = categorical_sample(exp.(tmp_ln_phi)) end return S end #################### ## used for Collapsed Gibbs Sampling function calc_ln_ST(Xn::Vector{Float64}, gw::GW) # TODO; need to check value? D = size(Xn, 1) W = ((1 - D + gw.nu)*gw.beta / (1 + gw.beta)) * gw.W #ln_lkh = logpdf(MvTDist(1 - D + gw.nu, gw.m, (gw.nu/(gw.nu - 2))*inv(W)), Xn) ln_lkh = logpdf(MvTDist(1 - D + gw.nu, gw.m, PDMats.PDMat(Symmetric(inv(W)))), Xn) return sum(ln_lkh) end function sample_Sn(Xn::Vector{Float64}, bgmm::BGMM) ln_tmp = [(calc_ln_ST(Xn, bgmm.cmp[k]) + log.(bgmm.alpha[k])) for k in 1 : bgmm.K] ln_tmp = ln_tmp - logsumexp(ln_tmp) Sn = categorical_sample(exp.(ln_tmp)) return Sn end function sample_S_CGS(S::Matrix{Float64}, X::Matrix{Float64}, bgmm::BGMM) D, N = size(X) K = size(S, 1) for n in randperm(N) # remove bgmm = remove_stats(bgmm, X[:,[n]], S[:,[n]]) # sample S[:,n] = sample_Sn(X[:,n], bgmm) # insert bgmm = add_stats(bgmm, X[:,[n]], S[:,[n]]) end return S, bgmm end #################### ## Algorithm main """ Compute posterior distributions via variational inference. """ function learn_VI(X::Matrix{Float64}, prior_bgmm::BGMM, max_iter::Int) # initialisation expt_S = init_S(X, prior_bgmm) bgmm = add_stats(prior_bgmm, X, expt_S) VB = NaN * zeros(max_iter) # inference for i in 1 : max_iter # E-step expt_S = exp.(update_S(bgmm, X)) # M-step bgmm = add_stats(prior_bgmm, X, expt_S) # calc VB VB[i] = calc_ELBO(X, prior_bgmm, bgmm) end # assign binary values S = winner_takes_all(expt_S) return S, bgmm, VB end """ Compute posterior distributions via Gibbs sampling. """ function learn_GS(X::Matrix{Float64}, prior_bgmm::BGMM, max_iter::Int) # initialisation S = init_S(X, prior_bgmm) bgmm = add_stats(prior_bgmm, X, S) VB = NaN * zeros(max_iter) # inference for i in 1 : max_iter # sample parameters gmm = sample_GMM(bgmm) # sample latent variables S = sample_S_GS(gmm, X) # update current model bgmm = add_stats(prior_bgmm, X, S) # calc VB VB[i] = calc_ELBO(X, prior_bgmm, bgmm) end return S, bgmm, VB end """ Compute posterior distributions via collapsed Gibbs sampling. """ function learn_CGS(X::Matrix{Float64}, prior_bgmm::BGMM, max_iter::Int) # initialisation S = init_S(X, prior_bgmm) bgmm = add_stats(prior_bgmm, X, S) VB = NaN * zeros(max_iter) # inference for i in 1 : max_iter # directly sample S S, bgmm = sample_S_CGS(S, X, bgmm) # calc VB VB[i] = calc_ELBO(X, prior_bgmm, bgmm) end return S, bgmm, VB end end ================================================ FILE: src/LogisticRegression.jl ================================================ """ Variational inference for Bayesian logistic regression. 
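
# Example
A usage sketch mirroring `demo_LogisticRegression.jl` (the learning rate and
iteration count are the ones used there):

```julia
M, N = 2, 50
Sigma_w = 100.0 * eye(M)                 # prior covariance of the weights
X = 2 * rand(M, N) - 1.0                 # random 2D inputs
Y, W_true = sample_data(X, Sigma_w)      # labels from a sampled weight vector
mu, rho = VI(Y, X, M, Sigma_w, 1.0e-4, 100000)
sigma = log.(1 + exp.(rho))              # posterior std dev of each weight
```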
""" module LogisticRegression using Distributions export sigmoid, sample_data, VI function sigmoid(x) return 1.0 / (1.0 + exp.(-x[1])) end function bern_sample(mu) i = rand(Bernoulli(mu)) val = zeros(2) val[i+1] = 1 return val end """ Sample data & parameter given covariance Sigma_w and inputs X. """ function sample_data(X, Sigma_w) N = size(X, 2) M = size(Sigma_w, 1) # sample parameters W = rand(MvNormal(zeros(M), Sigma_w)) # sample data Y = [rand(Bernoulli(sigmoid(W'*X[:, n]))) for n in 1 : N] return Y, W end """ Compute variational parameters. """ function VI(Y, X, M, Sigma_w, alpha, max_iter) function rho2sig(rho) return log.(1 + exp.(rho)) end function compute_df_dw(Y, X, Sigma_w, mu, rho, W) M, N = size(X) term1 = (mu - W) ./ rho2sig(rho).^2 term2 = inv(Sigma_w)*W term3 = 0 for n in 1 : N term3 += -(Y[n] - sigmoid(W'*X[:,n])) * X[:,n] end return term1 + term2 + term3 end function compute_df_dmu(mu, rho, W) return (W - mu) ./ rho2sig(rho).^2 end function compute_df_drho(Y, X, Sigma_w, mu, rho, W) return -0.5*((W - mu).^2 - rho2sig(rho).^2) .* compute_dprec_drho(rho) end function compute_dprec_drho(rho) return 2 * rho2sig(rho) .^ (-3) .* (1 ./ (1+exp.(rho))).^2 .* (1 ./ (1+exp.(-rho))) end # diag gaussian for approximate posterior mu = randn(M) rho = randn(M) # sigma = log.(1 + exp.(rho)) for i in 1 : max_iter # sample epsilon ep = rand(M) W_tmp = mu + log.(1 + exp.(rho)) .* ep # calculate gradient df_dw = compute_df_dw(Y, X, Sigma_w, mu, rho, W_tmp) df_dmu = compute_df_dmu(mu, rho, W_tmp) df_drho = compute_df_drho(Y, X, Sigma_w, mu, rho, W_tmp) d_mu = df_dw + df_dmu d_rho = df_dw .* (ep ./ (1+exp.(-rho))) + df_drho # update variational parameters mu = mu - alpha * d_mu rho = rho - alpha * d_rho end return mu, rho end end ================================================ FILE: src/NMF.jl ================================================ """ Variational inference for Bayesian NMF """ module NMF using Distributions using StatsFuns.logsumexp, SpecialFunctions.digamma export NMFModel export sample_data, VI #################### ## Types struct NMFModel a_t::Array{Float64, 2} # D x K b_t::Array{Float64, 2} # D x L a_v::Float64 # 1 dim b_v::Float64 # 1 dim end function sqsum(mat::Array, idx) return squeeze(sum(mat, idx), idx) end #################### ## functions function init(X::Array{Int64, 2}, model::NMFModel) D, N = size(X) K = size(model.a_t, 2) S = zeros(D, K, N) A_t = rand(D, K) B_t = rand(D, K) A_v = rand(K, N) B_v = rand(K, N) for d in 1 : D for k in 1 : K for n in 1 : N S[d,k,n] = X[d,n] * A_t[d,k] * B_t[d,k] * A_v[k,n] * B_v[k,n] end end end return S, A_t, B_t, A_v, B_v end function update_S(X::Array{Int64, 2}, A_t::Array{Float64, 2}, B_t::Array{Float64, 2}, A_v::Array{Float64, 2}, B_v::Array{Float64, 2}) D, K = size(A_t) N = size(A_v, 2) S = zeros(D, K, N) for d in 1 : D for n in 1 : N # K dim ln_P = (digamma.(A_t[d,:]) + log.(B_t[d,:]) + digamma.(A_v[:,n]) + log.(B_v[:,n]) ) ln_P = ln_P - logsumexp(ln_P) S[d,:,n] = X[d,n] * exp.(ln_P) end end return S end function update_T(S::Array{Float64, 3}, A_v::Array{Float64, 2}, B_v::Array{Float64, 2}, model::NMFModel) D, K, N = size(S) a_t = model.a_t # DxK b_t = model.b_t # DxK A_t = a_t + sqsum(S, 3) B_t = (a_t ./ b_t + repmat(sqsum(A_v.*B_v, 2)', D, 1)).^(-1) return A_t, B_t end function update_V(S::Array{Float64, 3}, A_t::Array{Float64, 2}, B_t::Array{Float64, 2}, model::NMFModel) a_v = model.a_v b_v = model.b_v D, K, N = size(S) A_v = a_v + sqsum(S, 1) B_v = (a_v / b_v + repmat(sqsum(A_t.*B_t, 1), 1, N)).^(-1) return A_v, B_v end """ 
Sample data given hyperparameters.
"""
function sample_data(N::Int, model::NMFModel)
    # TODO: check b or 1/b ?
    D, K = size(model.a_t)
    T = zeros(D, K)
    for d in 1 : D
        for k in 1 : K
            T[d,k] = rand(Gamma(model.a_t[d,k], 1.0/model.b_t[d,k])) # TODO: check
        end
    end
    V = reshape(rand(Gamma(model.a_v, 1.0/model.b_v), K*N), K, N) # TODO: check
    S = zeros(D, K, N)
    for d in 1 : D
        for k in 1 : K
            for n in 1 : N
                S[d,k,n] = T[d,k] * V[k,n]
            end
        end
    end
    #X = sqsum(S, 2) + 0.0 # zero noise
    X = sqsum(S, 2)
    return X, T, S, V
end

function update_model(A_t::Array{Float64, 2}, B_t::Array{Float64, 2}, model::NMFModel)
    return NMFModel(A_t, B_t, model.a_v, model.b_v)
end

"""
Compute variational posterior distributions.
"""
function VI(X::Array{Int64, 2}, model::NMFModel, max_iter::Int)
    K = size(model.a_t, 2)
    D, N = size(X)
    S, A_t, B_t, A_v, B_v = init(X, model)
    for iter in 1 : max_iter
        # latent
        S = update_S(X, A_t, B_t, A_v, B_v)
        A_v, B_v = update_V(S, A_t, B_t, model)
        # param
        A_t, B_t = update_T(S, A_v, B_v, model)
    end
    return update_model(A_t, B_t, model), S, A_t.*B_t, A_v.*B_v
end
end


================================================
FILE: src/PoissonHMM.jl
================================================
"""
Bayesian 1dim Poisson Hidden Markov Model
"""
module PoissonHMM
using StatsFuns.logsumexp, SpecialFunctions.digamma
using Distributions
export Gam, BHMM, Poi, HMM
export sample_HMM, sample_data, winner_takes_all
export learn_VI

####################
## Types
struct Gam
    # Parameters of Gamma distribution
    # 1dim
    a::Float64
    b::Float64
end

struct BHMM
    # Parameters of Bayesian Poisson HMM
    K::Int
    alpha_phi::Vector{Float64}
    alpha_A::Matrix{Float64}
    cmp::Vector{Gam}
end

struct Poi
    # Parameters of Poisson Distribution
    # 1 dim
    lambda::Float64
end

struct HMM
    # Parameters of Poisson HMM
    K::Int
    phi::Vector{Float64}
    A::Matrix{Float64}
    cmp::Vector{Poi}
end

####################
## Common functions
"""
Sample an HMM from prior.
"""
function sample_HMM(bhmm::BHMM)
    cmp = Vector{Poi}()
    for c in bhmm.cmp
        lambda = rand(Gamma(c.a, 1.0/c.b))
        push!(cmp, Poi(lambda))
    end
    phi = rand(Dirichlet(bhmm.alpha_phi))
    A = zeros(size(bhmm.alpha_A))
    for k in 1 : bhmm.K
        A[:,k] = rand(Dirichlet(bhmm.alpha_A[:,k]))
    end
    return HMM(bhmm.K, phi, A, cmp)
end

"""
Sample data from a specific Poisson HMM.
"""
function sample_data(hmm::HMM, N::Int)
    X = zeros(N)
    Z = zeros(hmm.K, N)
    # sample (n=1)
    Z[:,1] = categorical_sample(hmm.phi)
    k = indmax(Z[:, 1])
    X[1] = rand(Poisson(hmm.cmp[k].lambda))
    # sample (n>1)
    for n in 2 : N
        Z[:,n] = categorical_sample(hmm.A[:,k])
        k = indmax(Z[:, n])
        X[n] = rand(Poisson(hmm.cmp[k].lambda))
    end
    return X, Z
end

categorical_sample(p::Vector{Float64}) = categorical_sample(p, 1)[:,1]
function categorical_sample(p::Vector{Float64}, N::Int)
    K = length(p)
    S = zeros(K, N)
    S_tmp = rand(Categorical(p), N)
    for k in 1 : K
        S[k,find(S_tmp.==k)] = 1
    end
    return S
end

function init_Z(X::Vector{Float64}, bhmm::BHMM)
    N = size(X, 1)
    K = bhmm.K
    Z = rand(Dirichlet(ones(K)/K), N)
    ZZ = [zeros(K,K) for _ in 1 : N - 1]
    for n in 1 : N - 1
        ZZ[n] = Z[:,n+1] * Z[:,n]'
    end
    return Z, ZZ
end

"""
Not implemented yet.
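Accordingly, `learn_VI` below keeps no ELBO trace and returns only the expected
states and the updated model.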
""" function calc_ELBO(X::Matrix{Float64}, pri::BHMM, pos::BHMM) end function add_stats(bhmm::BHMM, X::Vector{Float64}, Z::Matrix{Float64}, ZZ::Vector{Matrix{Float64}}) K = bhmm.K sum_Z = sum(Z, 2) alpha_phi = [bhmm.alpha_phi[k] + Z[k,1] for k in 1 : K] alpha_A = bhmm.alpha_A + sum(ZZ) cmp = Vector{Gam}() ZX = Z*X # (KxN) x (Nx1) = Kx1 for k in 1 : K a = bhmm.cmp[k].a + ZX[k] b = bhmm.cmp[k].b + sum_Z[k] push!(cmp, Gam(a, b)) end return BHMM(K, alpha_phi, alpha_A, cmp) end remove_stats(bhmm::BHMM, X::Vector{Float64}, Z::Matrix{Float64}) = add_stats(bhmm, X, -Z) #################### ## used for Variational Inference function update_Z(bhmm::BHMM, X::Vector{Float64}, Z::Matrix{Float64}) N = size(X, 1) K = bhmm.K ln_expt_Z = zeros(K, N) ln_lkh = zeros(K, N) for k in 1 : K ln_lambda = digamma.(bhmm.cmp[k].a) - log.(bhmm.cmp[k].b) lambda = bhmm.cmp[k].a / bhmm.cmp[k].b for n in 1 : N ln_lkh[k,n] = X[n]'*(ln_lambda) - lambda end end expt_ln_A = zeros(size(bhmm.alpha_A)) for k in 1 : K expt_ln_A[:,k] = digamma.(bhmm.alpha_A[:,k]) - digamma.(sum(bhmm.alpha_A[:,k])) end # copy ln_expt_Z = log.(Z) # n = 1 ln_expt_Z[:,1] = (digamma.(bhmm.alpha_phi) - digamma.(sum(bhmm.alpha_phi)) + expt_ln_A' * exp.(ln_expt_Z[:,2]) + ln_lkh[:,1] ) ln_expt_Z[:,1] = ln_expt_Z[:,1] - logsumexp(ln_expt_Z[:,1]) # 2 <= n <= N - 1 for n in 2 : N - 1 ln_expt_Z[:,n] =( expt_ln_A * exp.(ln_expt_Z[:,n-1]) + expt_ln_A' * exp.(ln_expt_Z[:,n+1]) + ln_lkh[:,n] ) ln_expt_Z[:,n] = ln_expt_Z[:,n] - logsumexp(ln_expt_Z[:,n]) end # n = N ln_expt_Z[:,N] =( expt_ln_A * exp.(ln_expt_Z[:,N-1]) + ln_lkh[:,N] ) ln_expt_Z[:,N] = ln_expt_Z[:,N] - logsumexp(ln_expt_Z[:,N]) # calc output Z_ret = exp.(ln_expt_Z) ZZ_ret = [zeros(K,K) for _ in 1 : N - 1] for n in 1 : N - 1 ZZ_ret[n] = Z_ret[:,n+1] * Z_ret[:,n]' end return Z_ret, ZZ_ret end """ Pick single states having a max probability. 
""" function winner_takes_all(Z::Matrix{Float64}) Z_ret = zeros(size(Z)) for n in 1 : size(Z_ret, 2) idx = indmax(Z[:,n]) Z_ret[idx,n] = 1 end return Z_ret end function logmatprod(ln_A::Array{Float64}, ln_B::Array{Float64}) I = size(ln_A, 1) J = size(ln_B, 2) ln_C = zeros(I, J) for i in 1 : I for j in 1 : J ln_C[i, j] = logsumexp(ln_A[i, :] + ln_B[:, j]) end end return ln_C end function update_Z_fb(bhmm::BHMM, X::Vector{Float64}) K = bhmm.K N = length(X) # calc likelihood ln_lik = zeros(K, N) for k in 1 : K ln_lambda = digamma.(bhmm.cmp[k].a) - log.(bhmm.cmp[k].b) lambda = bhmm.cmp[k].a / bhmm.cmp[k].b for n in 1 : N ln_lik[k,n] =X[n]'*(ln_lambda) - lambda end end expt_ln_phi = digamma.(bhmm.alpha_phi) - digamma.(sum(bhmm.alpha_phi)) expt_ln_A = zeros(K,K) for k in 1 : K expt_ln_A[:,k] = digamma.(bhmm.alpha_A[:,k]) - digamma.(sum(bhmm.alpha_A[:,k])) end Z, ZZ = fb_alg(ln_lik, expt_ln_phi, expt_ln_A) # different notation ZZ_ret = [ZZ[:,:,n] for n in 1:size(ZZ, 3)] return Z, ZZ_ret end function fb_alg(ln_lik::Matrix{Float64}, ln_phi::Vector{Float64}, ln_A::Matrix{Float64}) K, T = size(ln_lik) ln_Z = zeros(K, T) ln_ZZ = zeros(K, K, T) ln_alpha = zeros(K, T) ln_beta = zeros(K, T) ln_st = zeros(T) for t in 1 : T if t == 1 ln_alpha[:, 1] = ln_phi + ln_lik[:, 1] else ln_alpha[:, t] = logmatprod(ln_A, ln_alpha[:, t-1]) + ln_lik[:, t] end ln_st[t] = logsumexp(ln_alpha[:, t]) ln_alpha[:,t] = ln_alpha[:,t] - ln_st[t] end for t in T-1 : -1 : 1 ln_beta[:, t] = logmatprod(ln_A', ln_beta[:, t+1] + ln_lik[:,t+1]) ln_beta[:, t] = ln_beta[:, t] - ln_st[t+1] end ln_Z = ln_alpha + ln_beta for t in 1 : T if t < T ln_ZZ[:,:,t] = (repmat(ln_alpha[:, t]', K, 1) + ln_A + repmat(ln_lik[:, t+1] + ln_beta[:,t+1], 1, K)) ln_ZZ[:,:,t] = ln_ZZ[:,:,t] - ln_st[t+1] end end return exp.(ln_Z), exp.(ln_ZZ) end """ Compute approximate posterior distributions via variational inference. """ function learn_VI(X::Vector{Float64}, prior_bhmm::BHMM, max_iter::Int) # initialisation expt_Z, expt_ZZ = init_Z(X, prior_bhmm) bhmm = add_stats(prior_bhmm, X, expt_Z, expt_ZZ) VB = NaN * zeros(max_iter) # inference for i in 1 : max_iter # E-step #expt_Z, expt_ZZ = update_Z(bhmm, X, expt_Z) expt_Z, expt_ZZ = update_Z_fb(bhmm, X) # M-step bhmm = add_stats(prior_bhmm, X, expt_Z, expt_ZZ) end return expt_Z, bhmm end end ================================================ FILE: src/PoissonMixtureModel.jl ================================================ """ Bayesian Poisson Mixture Model """ module PoissonMixtureModel using StatsFuns.logsumexp, SpecialFunctions.digamma using Distributions export Gam, BPMM, Poi, PMM export sample_PMM, sample_data, winner_takes_all export learn_GS, learn_CGS, learn_VI #################### ## Types struct Gam # Parameters of Gamma distribution a::Vector{Float64} b::Float64 end struct BPMM # Parameters of Bayesian Poisson Mixture Model D::Int K::Int alpha::Vector{Float64} cmp::Vector{Gam} end struct Poi # Parameters of Poisson Distribution lambda::Vector{Float64} end struct PMM # Parameters of Poisson Mixture Model D::Int K::Int phi::Vector{Float64} cmp::Vector{Poi} end #################### ## Common functions """ Sample a PMM given hyperparameters. """ function sample_PMM(bpmm::BPMM) cmp = Vector{Poi}() for c in bpmm.cmp lambda = Vector{Float64}() for d in 1 : bpmm.D push!(lambda, rand(Gamma(c.a[d], 1.0/c.b))) end push!(cmp, Poi(lambda)) end phi = rand(Dirichlet(bpmm.alpha)) return PMM(bpmm.D, bpmm.K, phi, cmp) end """ Sample data from a specific PMM model. 
""" function sample_data(pmm::PMM, N::Int) X = zeros(pmm.D, N) S = categorical_sample(pmm.phi, N) for n in 1 : N k = indmax(S[:, n]) for d in 1 : pmm.D X[d,n] = rand(Poisson(pmm.cmp[k].lambda[d])) end end return X, S end categorical_sample(p::Vector{Float64}) = categorical_sample(p, 1)[:,1] function categorical_sample(p::Vector{Float64}, N::Int) K = length(p) S = zeros(K, N) S_tmp = rand(Categorical(p), N) for k in 1 : K S[k,find(S_tmp.==k)] = 1 end return S end function init_S(X::Matrix{Float64}, bpmm::BPMM) N = size(X, 2) K = bpmm.K S = categorical_sample(ones(K)/K, N) return S end function calc_ELBO(X::Matrix{Float64}, pri::BPMM, pos::BPMM) ln_expt_S = update_S(pos, X) expt_S = exp.(ln_expt_S) K, N = size(expt_S) D = size(X, 1) expt_ln_lambda = zeros(D, K) expt_lambda = zeros(D, K) expt_ln_lkh = 0 for k in 1 : K expt_ln_lambda[:,k] = digamma.(pos.cmp[k].a) - log.(pos.cmp[k].b) expt_lambda[:,k] = pos.cmp[k].a / pos.cmp[k].b for n in 1 : N expt_ln_lkh += expt_S[k,n] * (X[:, n]' * expt_ln_lambda[:,k] - sum(expt_lambda[:,k]) - sum(lgamma.(X[:,n]+1)))[1] end end expt_ln_pS = sum(expt_S' * (digamma.(pos.alpha) - digamma.(sum(pos.alpha)))) expt_ln_qS = sum(expt_S .* ln_expt_S) KL_lambda = 0 for k in 1 : K KL_lambda += (sum(pos.cmp[k].a)*log.(pos.cmp[k].b) - sum(pri.cmp[k].a)*log.(pri.cmp[k].b) - sum(lgamma.(pos.cmp[k].a)) + sum(lgamma.(pri.cmp[k].a)) + (pos.cmp[k].a - pri.cmp[k].a)' * expt_ln_lambda[:,k] + (pri.cmp[k].b - pos.cmp[k].b) * sum(expt_lambda[:,k]) )[1] end KL_pi = (lgamma.(sum(pos.alpha)) - lgamma.(sum(pri.alpha)) - sum(lgamma.(pos.alpha)) + sum(lgamma.(pri.alpha)) + (pos.alpha - pri.alpha)' * (digamma.(pos.alpha) - digamma.(sum(pos.alpha))) )[1] VB = expt_ln_lkh + expt_ln_pS - expt_ln_qS - (KL_lambda + KL_pi) return VB end function add_stats(bpmm::BPMM, X::Matrix{Float64}, S::Matrix{Float64}) D = bpmm.D K = bpmm.K sum_S = sum(S, 2) alpha = [bpmm.alpha[k] + sum_S[k] for k in 1 : K] cmp = Vector{Gam}() XS = X*S'; for k in 1 : K a = [(bpmm.cmp[k].a[d] + XS[d,k])::Float64 for d in 1 : D] b = bpmm.cmp[k].b + sum_S[k] push!(cmp, Gam(a, b)) end return BPMM(D, K, alpha, cmp) end remove_stats(bpmm::BPMM, X::Matrix{Float64}, S::Matrix{Float64}) = add_stats(bpmm, X, -S) #################### ## used for Variational Inference function update_S(bpmm::BPMM, X::Matrix{Float64}) D, N = size(X) K = bpmm.K ln_expt_S = zeros(K, N) tmp = zeros(K) sum_digamma_tmp = digamma.(sum(bpmm.alpha)) for k in 1 : K tmp[k] = - sum(bpmm.cmp[k].a) / bpmm.cmp[k].b tmp[k] += digamma.(bpmm.alpha[k]) - sum_digamma_tmp end ln_lambda_X = [X'*(digamma.(bpmm.cmp[k].a) - log.(bpmm.cmp[k].b)) for k in 1 : K] for n in 1 : N tmp_ln_pi = [tmp[k] + ln_lambda_X[k][n] for k in 1 : K] ln_expt_S[:,n] = tmp_ln_pi - logsumexp(tmp_ln_pi) end return ln_expt_S end """ Pick single states having a max probability. 
""" function winner_takes_all(S::Matrix{Float64}) S_ret = zeros(size(S)) for n in 1 : size(S_ret, 2) idx = indmax(S[:,n]) S_ret[idx,n] = 1 end return S_ret end #################### ## used for Gibbs Sampling function sample_S_GS(pmm::PMM, X::Matrix{Float64}) D, N = size(X) K = pmm.K S = zeros(K, N) tmp = [-sum(pmm.cmp[k].lambda) + log.(pmm.phi[k]) for k in 1 : K] ln_lambda_X = [X'*log.(pmm.cmp[k].lambda) for k in 1 : K] for n in 1 : N tmp_ln_phi = [(tmp[k] + ln_lambda_X[k][n])::Float64 for k in 1 : K] tmp_ln_phi = tmp_ln_phi - logsumexp(tmp_ln_phi) S[:,n] = categorical_sample(exp.(tmp_ln_phi)) end return S end #################### ## used for Collapsed Gibbs Sampling function calc_ln_NB(Xn::Vector{Float64}, gam::Gam) ln_lkh = [(gam.a[d]*log.(gam.b) - lgamma.(gam.a[d]) + lgamma.(Xn[d] + gam.a[d]) - (Xn[d] + gam.a[d])*log.(gam.b + 1) )::Float64 for d in 1 : size(Xn, 1)] return sum(ln_lkh) end function sample_Sn(Xn::Vector{Float64}, bpmm::BPMM) ln_tmp = [(calc_ln_NB(Xn, bpmm.cmp[k]) + log.(bpmm.alpha[k])) for k in 1 : bpmm.K] ln_tmp = ln_tmp - logsumexp(ln_tmp) Sn = categorical_sample(exp.(ln_tmp)) return Sn end function sample_S_CGS(S::Matrix{Float64}, X::Matrix{Float64}, bpmm::BPMM) D, N = size(X) K = size(S, 1) for n in randperm(N) # remove bpmm = remove_stats(bpmm, X[:,[n]], S[:,[n]]) # sample S[:,n] = sample_Sn(X[:,n], bpmm) # insert bpmm = add_stats(bpmm, X[:,[n]], S[:,[n]]) end return S, bpmm end #################### ## Algorithm main """ Compute posterior distribution via variational inference. """ function learn_VI(X::Matrix{Float64}, prior_bpmm::BPMM, max_iter::Int) # initialisation expt_S = init_S(X, prior_bpmm) bpmm = add_stats(prior_bpmm, X, expt_S) VB = NaN * zeros(max_iter) # inference for i in 1 : max_iter # E-step expt_S = exp.(update_S(bpmm, X)) # M-step bpmm = add_stats(prior_bpmm, X, expt_S) # calc VB VB[i] = calc_ELBO(X, prior_bpmm, bpmm) end return expt_S, bpmm, VB end """ Compute posterior distribution via Gibbs sampling. """ function learn_GS(X::Matrix{Float64}, prior_bpmm::BPMM, max_iter::Int) # initialisation S = init_S(X, prior_bpmm) bpmm = add_stats(prior_bpmm, X, S) VB = NaN * zeros(max_iter) # inference for i in 1 : max_iter # sample parameters pmm = sample_PMM(bpmm) # sample latent variables S = sample_S_GS(pmm, X) # update current model bpmm = add_stats(prior_bpmm, X, S) # calc VB VB[i] = calc_ELBO(X, prior_bpmm, bpmm) end return S, bpmm, VB end """ Compute posterior distribution via collapsed Gibbs sampling. """ function learn_CGS(X::Matrix{Float64}, prior_bpmm::BPMM, max_iter::Int) # initialisation S = init_S(X, prior_bpmm) bpmm = add_stats(prior_bpmm, X, S) VB = NaN * zeros(max_iter) # inference for i in 1 : max_iter # directly sample S S, bpmm = sample_S_CGS(S, X, bpmm) # calc VB VB[i] = calc_ELBO(X, prior_bpmm, bpmm) end return S, bpmm, VB end end ================================================ FILE: src/demo_BayesNeuralNet.jl ================================================ #################################### ## Demo script for Bayesian neural network. using PyPlot, PyCall push!(LOAD_PATH, ".") import BayesNeuralNet """ Sample neural nets from prior. 
""" function sample_test() # model parameters D = 1 # output K = 3 # hidden M = 2 # input sigma2_w = 10.0 sigma2_y = 0.1 xmin = -5 xmax = 5 N_lin = 1000 X_lin = ones(M, N_lin) X_lin[1,:] = linspace(xmin, xmax, N_lin) X_lin[2,:] = 1 # bias # visualize num_samples = 5 figure("Function samples") clf() for i in 1 : num_samples _, Y_true, _, _ = BayesNeuralNet.sample_data_from_prior(X_lin, sigma2_w, sigma2_y, D, K) plot(X_lin[1,:], Y_true) xlim([xmin, xmax]) end ratey = (ylim()[2] - ylim()[1]) * 0.1 ratex = (xlim()[2] - xlim()[1]) * 0.1 text(xlim()[1] + ratex, ylim()[2] - ratey, @sprintf("K=%d", K), fontsize=18) show() end """ Run a test script of variational inference for Bayesian neural net. """ function test() ################# # prepara data # data size D = 1 # output M = 2 # input # function setting xmin = -2 xmax = 4 N_lin = 1000 X_lin = ones(M, N_lin) X_lin[1,:] = linspace(xmin, xmax, N_lin) X_lin[2,:] = 1 # bias # training data N = 50 # data size X = 2*rand(M, N) - 0.0 # input X[2,:] = 1.0 # bias Y = 0.5*sin.(2*pi * X[1,:]/3) + 0.05 * randn(N) # model parameters K = 5 sigma2_w = 10.0 sigma2_y = 0.01 ################ # inference alpha = 1.0e-5 max_iter = 100000 mu1, rho1, mu2, rho2 = BayesNeuralNet.VI(Y, X, sigma2_w, sigma2_y, K, alpha, max_iter) Y_mean = [mu2'* tanh.(mu1'X_lin[:,n]) for n in 1 : N_lin] ################ # visualize figure("result") clf() Y_list = [] num_samples = 100 for i in 1 : num_samples Y_est, _ = BayesNeuralNet.sample_data_from_posterior(X_lin, mu1, rho1, mu2, rho2, sigma2_y, D) push!(Y_list, Y_est) plot(X_lin[1,:], Y_est, "-c", alpha=0.25) end plot(X[1,:], Y, "ok") plot(X_lin[1,:], Y_mean, "b-") xlim([xmin, xmax]) xlabel("x") ylabel("y") show() end #sample_test() test() ================================================ FILE: src/demo_DimensionalityReduction.jl ================================================ ################################### ## Demo script for Bayesian Dimensionality Reduction using PyPlot, PyCall @pyimport sklearn.datasets as datasets push!(LOAD_PATH,".") import DimensionalityReduction function load_facedata(skip::Int) face = datasets.fetch_olivetti_faces() Y_raw = face["images"] N, S_raw, _ = size(Y_raw) L = round(Int, S_raw / skip) Y_tmp = Y_raw[:,1:skip:end, 1:skip:end] Y = convert(Array{Float64, 2}, reshape(Y_tmp, N, size(Y_tmp,2)*size(Y_tmp,3))') D = size(Y, 1) return Y, D, L end function visualize(Y::Array{Float64,2}, L::Int) D, N = size(Y) base = round(Int, sqrt(N)) v = round(Int, (L*ceil(N / base))) h = L * base pic = zeros(v, h) for n in 1 : N i = round(Int, (L*ceil(n / base))) idx1 = i - L + 1 : i idx2 = L*mod(n-1, base)+1 : L*(mod(n-1, base) + 1) pic[idx1,idx2] = reshape(Y[:,n], L, L) end imshow(pic, cmap=ColorMap("gray")) end function visualize(Y::Array{Float64,2}, L::Int, mask::BitArray{2}) # for missing D, N = size(Y) base = round(Int, sqrt(N)) v = round(Int, (L*ceil(N / base))) h = L * base pic = zeros(v, h, 3) Y_3dim = zeros(D, N, 3) for i in 1 : 3 if i == 2 Y_tmp = deepcopy(Y) Y_tmp[mask] = 1 Y_3dim[:,:,i] = Y_tmp else Y_tmp = deepcopy(Y) Y_tmp[mask] = 0 Y_3dim[:,:,i] = Y_tmp end end for n in 1 : N i = round(Int, (L*ceil(n / base))) idx1 = i - L + 1 : i idx2 = L*mod(n-1, base)+1 : L*(mod(n-1, base) + 1) for i in 1 : 3 pic[idx1,idx2,i] = reshape(Y_3dim[:,n,i], L, L) end end imshow(pic, cmap=ColorMap("gray")) end """ Run a demo script of missing data interpolation for face dataset. 
""" function test_face_missing() # load data skip = 2 Y, D, L = load_facedata(skip) # mask missing_rate = 0.50 mask = rand(size(Y)) .< missing_rate Y_obs = deepcopy(Y) Y_obs[mask] = NaN # known parames M = 16 sigma2_y = 0.001 Sigma_W = zeros(M,M,D) Sigma_mu = 1.0 * eye(D) for d in 1 : D Sigma_W[:,:,d] = 0.1 * eye(M) end prior = DimensionalityReduction.DRModel(D, M, sigma2_y, zeros(M, D), Sigma_W, zeros(D), Sigma_mu) # learn & generate max_iter = 100 posterior, X_est = DimensionalityReduction.VI(deepcopy(Y_obs), prior, max_iter) Y_est = posterior.m_W'*X_est + repmat(posterior.m_mu, 1, size(X_est, 2)) Y_itp = deepcopy(Y_obs) Y_itp[mask] = Y_est[mask] #visualize N_show = 4^2 figure("Observation") clf() visualize(Y_obs[:,1:N_show], L, mask[:,1:N_show]) title("Observation") #figure("Estimation") #clf() #visualize(Y_est[:,1:N_show], L) #title("Estimation") figure("Interpolation") clf() visualize(Y_itp[:,1:N_show], L) title("Interpolation") figure("Truth") clf() visualize(Y[:,1:N_show], L) title("Truth") show() end """ Run a dimensionality reduction demo using Iris dataset. """ function test_iris() ################## # load data iris = datasets.load_iris() Y_obs = iris["data"]' label_list = [iris["target_names"][elem+1] for elem in iris["target"]] D, N = size(Y_obs) ################## # 2D compression # model M = 2 sigma2_y = 0.001 Sigma_W = zeros(M,M,D) Sigma_mu = 1.0 * eye(D) for d in 1 : D Sigma_W[:,:,d] = 0.1 * eye(M) end prior = DimensionalityReduction.DRModel(D, M, sigma2_y, zeros(M, D), Sigma_W, zeros(D), Sigma_mu) # learn & generate max_iter = 100 posterior, X_est = DimensionalityReduction.VI(deepcopy(Y_obs), prior, max_iter) # visualize figure("2D plot") clf() scatter(X_est[1,1:50], X_est[2,1:50], color="r") scatter(X_est[1,51:100], X_est[2,51:100], color="g") scatter(X_est[1,101:end], X_est[2,101:end], color="b") xlabel("\$x_1\$", fontsize=20) ylabel("\$x_2\$", fontsize=20) legend([label_list[1], label_list[51], label_list[101]], fontsize=16) ################## # 3D compression # model M = 3 sigma2_y = 0.001 Sigma_W = zeros(M,M,D) Sigma_mu = 1.0 * eye(D) for d in 1 : D Sigma_W[:,:,d] = 0.1 * eye(M) end prior = DimensionalityReduction.DRModel(D, M, sigma2_y, zeros(M, D), Sigma_W, zeros(D), Sigma_mu) # learn & generate max_iter = 100 posterior, X_est = DimensionalityReduction.VI(deepcopy(Y_obs), prior, max_iter) # visualize figure("3D plot") clf() scatter3D(X_est[1,1:50], X_est[2,1:50], X_est[3,1:50], c="r") scatter3D(X_est[1,51:100], X_est[2,51:100], X_est[3,51:100], c="g") scatter3D(X_est[1,101:end], X_est[2,101:end], X_est[3,101:end], c="b") legend([label_list[1], label_list[51], label_list[101]], fontsize=16) xlabel("\$x_1\$", fontsize=20) ylabel("\$x_2\$", fontsize=20) zlabel("\$x_3\$", fontsize=20) show() end #test_face_missing() test_iris() ================================================ FILE: src/demo_GaussianMixtureModel.jl ================================================ ################################### ## Example code ## for Bayesian Gaussin Mixture Model using PyPlot, PyCall push!(LOAD_PATH,".") import GaussianMixtureModel """ Visualize data & estimation in 2D space. 
""" function visualize_2D(X::Matrix{Float64}, S::Matrix{Float64}, S_est::Matrix{Float64}, text) cmp = get_cmap("jet") K1 = size(S, 1) K2 = size(S_est, 1) col1 = [pycall(cmp.o, PyAny, Int(round(val)))[1:3] for val in linspace(0,255,K1)] col2 = [pycall(cmp.o, PyAny, Int(round(val)))[1:3] for val in linspace(0,255,K2)] f, (ax1, ax2) = subplots(1,2,num=text) f[:clf]() f, (ax1, ax2) = subplots(1,2,num=text) for k in 1 : K1 ax1[:scatter](X[1, S[k,:].==1], X[2, S[k,:].==1], color=col1[k]) end ax1[:set_title]("truth") for k in 1 : K2 ax2[:scatter](X[1, S_est[k,:].==1], X[2, S_est[k,:].==1], color=col2[k]) end ax2[:set_title]("estimation") end """ Run a test script for 2D data clustering. """ function test_2D() ## set model D = 2 # data dimension K = 4 # number of mixture components alpha = 100.0 * ones(K) beta = 0.1 m = zeros(D) nu = D + 1.0 W = eye(D) cmp = [GaussianMixtureModel.GW(beta, m, nu, W) for _ in 1 : K] bgmm = GaussianMixtureModel.BGMM(D, K, alpha, cmp) ## generate data N = 300 gmm = GaussianMixtureModel.sample_GMM(bgmm) X, S = GaussianMixtureModel.sample_data(gmm, N) ## inference max_iter = 100 tic() S_est, post_bgmm, VB = GaussianMixtureModel.learn_VI(X, bgmm, max_iter) #S_est, post_bgmm, VB = GaussianMixtureModel.learn_GS(X, bgmm, max_iter) #S_est, post_bgmm, VB = GaussianMixtureModel.learn_CGS(X, bgmm, max_iter) toc() ## plot visualize_2D(X, S, GaussianMixtureModel.winner_takes_all(S_est), "2D plot") # VB check figure("ELBO") clf() plot(VB) ylabel("ELBO") xlabel("iterations") show() end test_2D() ================================================ FILE: src/demo_LogisticRegression.jl ================================================ ##################################### ## Bayesian logistic regression demo using PyPlot, PyCall using Distributions push!(LOAD_PATH, ".") import LogisticRegression """ Visualize prediction via surface (only for 2D inputs.) """ function visualize_surface(mu, rho, X, Y, text) N = 100 R = 100 xmin = minimum(X[1,:]) xmax = maximum(X[1,:]) ymin = minimum(X[2,:]) ymax = maximum(X[2,:]) lx = xmax - xmin ly = ymax - ymin xmin = xmin - 0.25 * lx xmax = xmax + 0.25 * lx ymin = ymin - 0.25 * ly ymax = ymax + 0.25 * ly x1 = linspace(xmin,xmax,R) x2 = linspace(ymin,ymax,R) x1grid = repmat(x1, 1, R) x2grid = repmat(x2', R, 1) val = [x1grid[:] x2grid[:]]' z_list = [] sigma = log.(1 + exp.(rho)) for n in 1 : N W = rand(MvNormal(mu, diagm(sigma.^2))) z_tmp = [LogisticRegression.sigmoid(W'*val[:,i]) for i in 1 : size(val, 2)] push!(z_list, z_tmp) end z = mean(z_list) zgrid = reshape(z, R, R) # 3D plot figure("surface") clf() plot_surface(x1grid, x2grid, zgrid, alpha=0.5) scatter3D(X[1,Y.==1], X[2,Y.==1], Y[Y.==1]+0.01, c="r", depthshade=true) scatter3D(X[1,Y.==0], X[2,Y.==0], Y[Y.==0], c="b", depthshade=true) xlim([xmin, xmax]) ylim([ymin, ymax]) zlim([0, 1]) title(text) end """ Visualize prediction via contour (only for 2D inputs.) 
""" function visualize_contour(mu, rho, X, Y) N = 100 R = 100 xmin = 2*minimum(X[1,:]) xmax = 2*maximum(X[1,:]) ymin = minimum(X[2,:]) ymax = maximum(X[2,:]) x1 = linspace(xmin,xmax,R) x2 = linspace(ymin,ymax,R) x1grid = repmat(x1, 1, R) x2grid = repmat(x2', R, 1) val = [x1grid[:] x2grid[:]]' z_list = [] W_list = [] sigma = log.(1 + exp.(rho)) for n in 1 : N W = rand(MvNormal(mu, diagm(sigma.^2))) z_tmp = [LogisticRegression.sigmoid(W'*val[:,i]) for i in 1 : size(val, 2)] push!(W_list, W) push!(z_list, z_tmp) end z = mean(z_list) zgrid = reshape(z, R, R) # precition figure("contour") clf() contour(x1grid, x2grid, zgrid, alpha=0.5, cmap=get_cmap("bwr")) scatter(X[1,Y.==1], X[2,Y.==1], c="r") scatter(X[1,Y.==0], X[2,Y.==0], c="b") xlim([xmin, xmax]) ylim([ymin, ymax]) title("prediction") # parameter samples figure("samples") clf() for n in 1 : 10 draw_line(W_list[n], xmin, xmax) end scatter(X[1,Y.==1]', X[2,Y.==1]', c="r") scatter(X[1,Y.==0]', X[2,Y.==0]', c="b") xlim([xmin, xmax]) ylim([ymin, ymax]) title("parameter samples") end function draw_line(W, xmin, xmax) y1 = - xmin*W[1]/W[2] y2 = - xmax*W[1]/W[2] plot([xmin, xmax], [y1, y2], c="k") end ######################## # create model M = 2 # input dimension Sigma_w = 100.0 * eye(M) # prior on W ######################## # create toy-data using prior model N = 50 # num of data points X = 2 * rand(M, N) - 1.0 # input values # sample observation Y Y, _ = LogisticRegression.sample_data(X, Sigma_w) ######################## # inference alpha = 1.0e-4 # learning rate max_iter = 100000 # VI maximum iterations # learn variational parameters (mu & rho) mu, rho = LogisticRegression.VI(Y, X, M, Sigma_w, alpha, max_iter) ######################## # visualize (only for M=2) visualize_surface(mu, rho, X, Y, "prediction") visualize_contour(mu, rho, X, Y) show() ================================================ FILE: src/demo_NMF.jl ================================================ ############################## ## Audio decomposition demo using NMF using PyPlot, PyCall using DataFrames using Distributions push!(LOAD_PATH, ".") import NMF @pyimport scipy.io.wavfile as wf # load data wavfile = "../data/organ.wav" fs, data = wf.read(wavfile) figure("data") clf() Pxx, freqs, t, pl = specgram(data[10000:318000,2], Fs=fs, NFFT=256, noverlap=0) xlabel("time [sec]") ylabel("frequency [Hz]") ylim([0,22000]) # model D, N = size(Pxx) K = 2 a_t = 1.0 b_t = 1.0 a_v = 1.0 b_v = 100.0 prior = NMF.NMFModel(a_t*ones(D,K), b_t*ones(D, K), a_v, b_v) # inference max_iter = 100 posterior, S_est, T_est, V_est = NMF.VI(Int64.(round.(Pxx)), prior, max_iter) X = T_est * V_est # visualize figure("T") clf() for k in 1 : K subplot(K,1,k) plot(T_est[:,k], linewidth=1.0) xlim([0, D]) ylim([0, ylim()[2]]) end figure("V") clf() for k in 1 : K subplot(K,1,k) plot(V_est[k,:], linewidth=1.0) xlim([0,N]) ylim([0, ylim()[2]]) end show() ================================================ FILE: src/demo_PoissonHMM.jl ================================================ ################################### ## Example code ## for Bayesian Poisson HMM using PyPlot, PyCall using HDF5, JLD @pyimport matplotlib.gridspec as gspec push!(LOAD_PATH,".") import PoissonHMM import PoissonMixtureModel """ Simple comparison between HMM and mixture model. 
""" function test_comparison() ######################### ## load data file_name = "../data/timeseries.jld" X = load(file_name)["obs"] N = length(X) ######################### ## Poison HMM ## set model K = 2 # number of mixture components alpha_phi = 10.0 * ones(K) alpha_A = 100.0 * eye(K) + 1.0*ones(K, K) cmp = [PoissonHMM.Gam(1.0, 0.01), PoissonHMM.Gam(1.0, 0.01)] bhmm = PoissonHMM.BHMM(K, alpha_phi, alpha_A, cmp) ## inference max_iter = 100 tic() Z_est_hmm, post_bhmm = PoissonHMM.learn_VI(X, bhmm, max_iter) toc() ######################### ## Poison Mixture Model ## set model K = 2 # number of mixture components alpha_phi = 10.0 * ones(K) cmp = [PoissonMixtureModel.Gam([1.0], 0.01), PoissonMixtureModel.Gam([1.0], 0.01)] bpmm = PoissonMixtureModel.BPMM(1, K, alpha_phi, cmp) ## inference max_iter = 100 tic() Z_est_pmm, post_bpmm = PoissonMixtureModel.learn_VI(reshape(X, 1, N), bpmm, max_iter) toc() ######################### ## Compare results figure("Hidden Markov Model vs Mixture Model") subplot(3,1,1);plot(X);ylabel("data") subplot(3,1,2);fill_between(1:N, reshape(Z_est_hmm[1,:]', N), zeros(N));ylim([0.0, 1.0]);ylabel("S (PHMM)") subplot(3,1,3);fill_between(1:N, reshape(Z_est_pmm[1,:]', N), zeros(N));ylim([0.0, 1.0]);ylabel("S (PMM)") show() end test_comparison() ================================================ FILE: src/demo_PoissonMixtureModel.jl ================================================ ################################### ## Example code ## for Bayesian Poisson Mixture Model push!(LOAD_PATH,".") using PyPlot, PyCall import PoissonMixtureModel """ Visualize data & estimation in 2D space. """ function visualize_2D(X::Matrix{Float64}, S::Matrix{Float64}, S_est::Matrix{Float64}, text) cmp = get_cmap("jet") K1 = size(S, 1) K2 = size(S_est, 1) col1 = [pycall(cmp.o, PyAny, Int(round(val)))[1:3] for val in linspace(0,255,K1)] col2 = [pycall(cmp.o, PyAny, Int(round(val)))[1:3] for val in linspace(0,255,K2)] f, (ax1, ax2) = subplots(1,2,num=text) f[:clf]() f, (ax1, ax2) = subplots(1,2,num=text) for k in 1 : K1 ax1[:scatter](X[1, S[k,:].==1], X[2, S[k,:].==1], color=col1[k]) end ax1[:set_title]("truth") for k in 1 : K2 ax2[:scatter](X[1, S_est[k,:].==1], X[2, S_est[k,:].==1], color=col2[k]) end ax2[:set_title]("estimation") end function draw_hist(ax, X, S, label) counts, bins, patches = ax[:hist](X', 20) for i in 1 : length(patches) if counts[i] > 0 S_tmp = S[:,bins[i] .<= X[1,:] .<= bins[i+1]] S_sum = sum(S_tmp, 2) / sum(S_tmp) patches[i][:set_facecolor]((S_sum[1], 0, S_sum[2])) end end ax[:set_title](label) end """ Visualize data & estimation using 1D histogram. """ function visualize_1D(X::Matrix{Float64}, S::Matrix{Float64}, S_est::Matrix{Float64}) # separated figures f1, ax1 = subplots(1,1,num="observation") f2, ax2 = subplots(1,1,num="estimation") f1[:clf]() f2[:clf]() _, ax1 = subplots(1,1,num="observation") _, ax2 = subplots(1,1,num="estimation") ax1[:hist](X', 20) ax1[:set_title]("observation") draw_hist(ax2, X, S_est, "estimation") end """ Run a test script for 1D data clustering. """ function test_1D() ## set model D = 1 # data dimension, must be 1. K = 2 # number of mixture components, must be 2. 
alpha = 100.0 * ones(K) cmp = [PoissonMixtureModel.Gam(1.0*ones(D), 0.01) for i in 1 : K] bpmm = PoissonMixtureModel.BPMM(D, K, alpha, cmp) ## generate data N = 1000 pmm = PoissonMixtureModel.sample_PMM(bpmm) X, S = PoissonMixtureModel.sample_data(pmm, N) ## inference max_iter = 100 tic() S_est, post_bpmm, VB = PoissonMixtureModel.learn_VI(X, bpmm, max_iter) #S_est, post_bpmm, VB = PoissonMixtureModel.learn_GS(X, bpmm, max_iter) #S_est, post_bpmm, VB = PoissonMixtureModel.learn_CGS(X, bpmm, max_iter) toc() ## plot visualize_1D(X, S, S_est) figure("ELBO") clf() plot(VB) ylabel("ELBO") xlabel("iterations") show() end """ Run a test script for 2D data clustering. """ function test_2D() ## set model D = 2 # data dimension, must be 2. K = 8 # number of mixture components #K = 5 alpha = 100.0 * ones(K) cmp = [PoissonMixtureModel.Gam(1.0*ones(D), 0.01) for i in 1 : K] bpmm = PoissonMixtureModel.BPMM(D, K, alpha, cmp) ## generate data N = 300 pmm = PoissonMixtureModel.sample_PMM(bpmm) X, S = PoissonMixtureModel.sample_data(pmm, N) ## inference max_iter = 100 tic() S_est, post_bpmm, VB = PoissonMixtureModel.learn_VI(X, bpmm, max_iter) #S_est, post_bpmm, VB = PoissonMixtureModel.learn_GS(X, bpmm, max_iter) #S_est, post_bpmm, VB = PoissonMixtureModel.learn_CGS(X, bpmm, max_iter) toc() ## plot visualize_2D(X, S, PoissonMixtureModel.winner_takes_all(S_est), "2D plot") # VB check figure("ELBO") clf() plot(VB) ylabel("ELBO") xlabel("iterations") show() end test_1D() #test_2D() ================================================ FILE: src/demo_PolynomialRegression.jl ================================================ ################################# ## Bayesian model selection demo ## for polynomial regression using PyPlot, PyCall using Distributions function poly(X_raw, M) N = size(X_raw, 1) X = zeros(M, N) for m in 0 : M - 1 X[m+1,:] = X_raw.^m end return X end function learn_bayes(X_raw, Y, M, sig2_y, Sig_w, X_lin) X = poly(X_raw, M) N = size(X_raw, 1) # calc posterior Sig_w_h = inv(X*inv(sig2_y*eye(N))*X' + inv(Sig_w)) mu_w_h = Sig_w_h * (X * inv(sig2_y * eye(N)) * Y) # calc predictive X_test = poly(X_lin, M) Y_est = (mu_w_h'*X_test)' sig2_y_prd = sig2_y + diag(X_test'Sig_w_h*X_test) # calc evidence evidence = -0.5*(sum(Y)*inv(sig2_y) +N*log.(sig2_y) + N*log.(2*pi) + logdet(Sig_w) - (mu_w_h'*inv(Sig_w_h)*mu_w_h)[1] - logdet(Sig_w_h) ) return Y_est, sqrt.(sig2_y_prd), evidence end function test() # linspace X_lin = linspace(-1, 7, 200) # generate data N = 10 sig2_y = 0.1 X = 2*pi*rand(N) Y_true = [sin.(x) for x in X_lin] Y_obs = [sin.(x) + sig2_y * randn() for x in X] dims = [1, 2, 3, 4, 5, 10] # learning via Bayes sig2_w = 1.0 Y_bayes = [learn_bayes(X, Y_obs, m, sig2_y, sig2_w*eye(m), X_lin) for m in dims] ############# # compute evidences evidence = [learn_bayes(X, Y_obs, m, sig2_y, sig2_w*eye(m), X_lin)[3] for m in dims] figure("evidence") clf() plot(1:length(dims), evidence) xticks(1:length(dims),dims) ylabel(("\$\\ln p(\\bf{Y}|\\bf{X})\$"), fontsize=20) xlabel(("\$M\$"), fontsize=20) ############# # visualize x_min = X_lin[1] x_max = X_lin[end] y_min = -4 y_max = 4 figure("prediction") clf() for k in 1 : 6 subplot(230 + k) plot(X_lin, Y_bayes[k][1]) plot(X_lin, Y_bayes[k][1] + Y_bayes[k][2], "c--") plot(X_lin, Y_bayes[k][1] - Y_bayes[k][2], "c--") plot(X, Y_obs, "ko") xlim([x_min, x_max]) ylim([y_min, y_max]) text(x_max - 2.5, y_max - 1, @sprintf("M=%d", dims[k])) end show() end test() ================================================ FILE: src/demo_Simple2DGauss.jl 
================================================ ################################### ## Simple VI & GS for 2D Gaussian using PyPlot using Distributions function calc_KL(mu1, lambda1, mu2, lambda2) D = size(mu1, 1) px_lnqx = 0.5 * logdet(lambda2) - 0.5 * ((mu1 - mu2)' * lambda2 * (mu1 - mu2) + trace(lambda2 * inv(lambda1))) px_lnpx = 0.5 * logdet(lambda1) - 0.5 * D KL = - (px_lnqx - px_lnpx) return KL[1] end function plot_results(result, truth) N = size(result, 1) H = Int(ceil(sqrt(N))) W = Int(ceil(N / H)) for i in 1 : H for j in 1 : W n = (i - 1) * W + j if n <= N subplot(H, W, n) title("$n of $N") plot_gaussian(truth[1], truth[2], "b", "\$p(z)\$") plot_gaussian(result[n][1], result[n][2], "r", "\$p(z)\$") end end end end function plot_lines(X) D, N = size(X) X_d = zeros(D, 2*N + 1) X_d[:,1] = X[:,1] for i in 1 : N X_d[1, 2*i - 1] = X[1, i] X_d[1, 2*i] = X[1, i] X_d[2, 2*i] = X[2, i] X_d[2, 2*i + 1] = X[2, i] end plot(X[1,:], X[2,:], "oy") plot(X_d[1,1:2*N], X_d[2,1:2*N], "--y") end function plot_gaussian(Mu, Sigma, col, label) res = 100 plot(Mu[1], Mu[2], "x", color=col) F = eigfact(Sigma) vec = F.vectors val = F.values dw = 2*pi/res w = dw * (0 : res) c = 1.0 a = sqrt(c*val[1]) b = sqrt(c*val[2]) P1 = a*cos.(w) P2 = b*sin.(w) P = Mu .+ vec'*vcat(P1', P2') plot(P[1, :], P[2, :], "-", color=col, label=label) end """ Variational inference for 2D Gauss. """ function main_VI() ## creat truth distribution D = 2 # dimension theta = 2.0*pi/12 # tilt A = reshape([cos.(theta), -sin.(theta), sin.(theta), cos.(theta)], 2, 2) mu = [0.0, 0.0] lambda = inv(A * inv(reshape([1,0,0,10], 2, 2)) * A') ## initialize #mu_h = randn(D) mu_h = [-0.5, 0.3] lambda_h = zeros(D,D) ## main iteration max_iter = 10 KL = NaN * Array{Float64, 1}(max_iter) result = Array{Any, 1}(max_iter) for i in 1 : max_iter ## update mu_h[1] = mu[1] - inv(lambda[1,1])*lambda[1,2] * (mu_h[2] - mu[2]) lambda_h[1,1] = lambda[1,1] mu_h[2] = mu[2] - inv(lambda[2,2])*lambda[2,1] * (mu_h[1] - mu[1]) lambda_h[2,2] = lambda[2,2] ## calculate KL divergeince KL[i] = calc_KL(mu_h, lambda_h, mu, lambda) ## store the results result[i] = [deepcopy(mu_h), deepcopy(inv(lambda_h))] end ## visualize results figure("result per iteration (VI)") clf() plot_results(result, (mu, inv(lambda))) figure("result (VI)") clf() plot_gaussian(mu, inv(lambda), "b", "\$p(\\bf{z})\$") plot_gaussian(result[end][1], result[end][2], "r", "\$q(\\bf{z})\$") xlabel("\$z_1\$", fontsize=20) ylabel("\$z_2\$", fontsize=20) legend(fontsize=16) figure("KL divergence (VI)") clf() plot(1:max_iter, KL) ylabel("KL divergence", fontsize=16) xlabel("iteration", fontsize=16) show() end """ Gibbs sampling for 2D Gauss. 
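Each coordinate is drawn in turn from its full conditional; for a bivariate
Gaussian with mean `mu` and precision `lambda` this is
x1 | x2 ~ N( mu[1] - lambda[1,1]^(-1) * lambda[1,2] * (x2 - mu[2]), lambda[1,1]^(-1) ),
and symmetrically for x2. The KL divergence between the target and a Gaussian
fitted to the samples drawn so far is recorded at every step.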
""" function main_GS() ## creat truth distribution D = 2 # dimension theta = 2.0*pi/12 # tilt A = reshape([cos.(theta), -sin.(theta), sin.(theta), cos.(theta)], 2, 2) mu = [0.0, 0.0] #lambda = inv(A * inv(reshape([1,0,0,10], 2, 2)) * A') lambda = inv(A * inv(reshape([1,0,0,100], 2, 2)) * A') ## initialize #max_iter = 1000 max_iter = 50 X = randn(D, max_iter) mu_h = randn(D) ## main iteration KL = NaN * Array{Float64, 1}(max_iter) for i in 2 : max_iter ## update mu_h[1] = mu[1] - inv(lambda[1,1])*lambda[1,2] * (X[2,i-1] - mu[2]) X[1, i] = rand(Normal(mu_h[1], sqrt(inv(lambda[1,1])))) mu_h[2] = mu[2] - inv(lambda[2,2])*lambda[2,1] * (X[1,i] - mu[1]) X[2, i] = rand(Normal(mu_h[2], sqrt(inv(lambda[2,2])))) if i > D KL[i] = calc_KL(mean(X[:,1:i], 2), inv(cov(X[:,1:i], 2)), mu, lambda) end end ## visualize results expt_mu = mean(X, 2) expt_Sigma = cov(X, 2) figure("samples (GS)") clf() plot_lines(X) plot_gaussian(mu, inv(lambda), "b", "\$p(\\bf{z})\$") plot_gaussian(expt_mu, expt_Sigma, "r", "\$q(\\bf{z})\$") xlabel("\$z_1\$", fontsize=20) ylabel("\$z_2\$", fontsize=20) legend(fontsize=16) figure("KL divergence (GS)") clf() plot(1:max_iter, KL) ylabel("KL divergence", fontsize=16) xlabel("sample size", fontsize=16) show() end main_VI() main_GS() ================================================ FILE: src/demo_SimpleFitting.jl ================================================ ##################################### ## Simple function fitting demo using PyPlot, PyCall using Distributions # true param W = Array([1.0, 0.0, 1.0]) # generate data sigma = 0.5 N = 20 X = linspace(-0.4,2.4,N) Y = [W[1] + W[2]*x + W[3]*x^2 + sigma*randn() for x in X] X_min = minimum(X) X_max = maximum(X) # regression1 X_all = linspace(X_min, X_max, 100) W1 = sum(Y.*X) / sum(X.^2) Y1 = [W1*x for x in X_all] # regression2 X2 = zeros(3, N) X2[1,:] = 1 X2[2,:] = X X2[3,:] = X.^2 W2 = inv(X2*X2') * X2*Y Y2 = [W2[1] + W2[2]*x + W2[3]*x^2 for x in X_all] # show data figure() plot(X_all, Y1, "b-") plot(X_all, Y2, "g-") plot(X, Y, "ko") legend(["model1","model2","data"], loc="upper left", fontsize=16) xlabel("\$x\$", fontsize=20) ylabel("\$y\$", fontsize=20) show() ================================================ FILE: src/demo_nonconjugate.jl ================================================ using PyPlot, PyCall using Distributions import StatsFuns.logsumexp PyDict(matplotlib["rcParams"])["mathtext.fontset"] = "cm" PyDict(matplotlib["rcParams"])["mathtext.rm"] = "serif" PyDict(matplotlib["rcParams"])["lines.linewidth"] = 1.5 PyDict(matplotlib["rcParams"])["font.family"] = "TakaoPGothic" function expt(a, b, sigma, Y, X, N_s) S = rand(Gamma(a, 1.0/b), N_s) C = mean([exp(sum(logpdf.(Normal(s, sigma), Y))) for s in S]) curve = [exp(sum(logpdf.(Normal(mu, sigma), Y))) * pdf(Gamma(a, 1.0/b), mu) for mu in X] m = mean([s*exp(sum(logpdf.(Normal(s, sigma), Y)))/C for s in S]) v = mean([(s-m)^2 * exp(sum(logpdf.(Normal(s, sigma), Y)))/C for s in S]) return curve/C, m, v end X = linspace(-5, 10, 1000) a = 2.0 b = 2.0 mu = 1.0 sigma=1.0 # data N = 10 Y = rand(Normal(mu, sigma), N) # calc posterior N_s = 100000 posterior, m, v = expt(a, b, sigma, Y, X, N_s) a_h = m^2 / v b_h = m / v figure() plot(X, pdf(Normal(mu,sigma), X)) plot(X, pdf(Gamma(a,1.0/b), X)) plot(X, posterior) plot(X, pdf(Gamma(a_h,1.0/b_h), X)) plot(Y, 0.02*ones(N), "o") legend(["generator", "prior", "posterior", "approx", "samples"]) #legend(["データ生成分布", "事前分布", "事後分布", "近似分布", "データ"], fontsize=12) xlim([-3, 6]) ylim([0, 1.8])