Repository: HewlettPackard/quartz
Branch: master
Commit: c22e1aa156a0
Files: 92
Total size: 375.4 KB

Directory structure:
gitextract_aunglxr9/

├── AUTHORS
├── CMakeLists.txt
├── Doxyfile
├── README-BENCHMARKS-TESTING.md
├── README.md
├── TODO.dox
├── bench/
│   ├── CMakeLists.txt
│   ├── memlat/
│   │   ├── CMakeLists.txt
│   │   └── memlat.c
│   ├── multilat/
│   │   ├── CMakeLists.txt
│   │   └── multilat.c
│   └── new_memlat/
│       ├── CMakeLists.txt
│       ├── memlat.c
│       └── memlat.sh
├── benchmark-tests/
│   ├── bandwidth-model-building.sh
│   ├── memlat-bench-test-10M-single-socket.sh
│   ├── memlat-bench-test-10M.sh
│   ├── memlat-orig-lat-test-single-socket.sh
│   ├── memlat-orig-lat-test.sh
│   ├── nvmemul-bandwidth.ini
│   ├── nvmemul-debug.ini
│   ├── nvmemul-orig.ini
│   └── nvmemul.ini
├── license.txt
├── nvmemul-orig.ini
├── nvmemul.dox
├── nvmemul.ini
├── scripts/
│   ├── install.sh
│   ├── runenv.sh
│   ├── setupdev.sh
│   └── turboboost.sh
├── src/
│   ├── CMakeLists.txt
│   ├── dev/
│   │   ├── CMakeLists.txt
│   │   ├── Makefile
│   │   ├── ioctl_query.h
│   │   └── pmc.c
│   └── lib/
│       ├── CMakeLists.txt
│       ├── config.c
│       ├── config.h
│       ├── cpu/
│       │   ├── CMakeLists.txt
│       │   ├── cpu.c
│       │   ├── cpu.h
│       │   ├── haswell-papi.h
│       │   ├── haswell.h
│       │   ├── ivybridge-papi.h
│       │   ├── ivybridge.h
│       │   ├── known_cpus.h
│       │   ├── pmc-papi.c
│       │   ├── pmc-papi.h
│       │   ├── pmc.c
│       │   ├── pmc.h
│       │   ├── sandybridge-papi.h
│       │   ├── sandybridge.h
│       │   └── xeon-ex.h
│       ├── debug.c
│       ├── debug.h
│       ├── dev.c
│       ├── dev.h
│       ├── errno.h
│       ├── error.h
│       ├── init.c
│       ├── interpose.c
│       ├── interpose.h
│       ├── measure.h
│       ├── measure_bw.c
│       ├── measure_lat.c
│       ├── misc.c
│       ├── misc.h
│       ├── model.h
│       ├── model_bw.c
│       ├── model_lat.c
│       ├── monotonic_timer.c
│       ├── monotonic_timer.h
│       ├── pflush.c
│       ├── pflush.h
│       ├── pmalloc.c
│       ├── pmalloc.h
│       ├── process_rank.c
│       ├── stat.c
│       ├── stat.h
│       ├── thread.c
│       ├── thread.h
│       ├── topology.c
│       └── topology.h
└── test/
    ├── CMakeLists.txt
    ├── test_dev.cc
    ├── test_interpose.cc
    ├── test_multithread.c
    ├── test_mutex.cc
    ├── test_nvm.c
    ├── test_nvm_remote_dram.c
    └── test_thread.cc

================================================
FILE CONTENTS
================================================

================================================
FILE: AUTHORS
================================================
Haris Volos           (haris.volos@hpe.com)
Guilherme Magalhaes   (guilherme.magalhaes@hpe.com)
Lucy Cherkasova       (lucy.cherkasova@gmail.com)


================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.8)

#add_subdirectory(third_party)
add_subdirectory(src)
add_subdirectory(bench)
enable_testing()
#add_subdirectory(test)


================================================
FILE: Doxyfile
================================================
# Doxyfile 1.4.7

# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project
#
# All text after a hash (#) is considered a comment and will be ignored
# The format is:
#       TAG = value [value, ...]
# For lists items can also be appended using:
#       TAG += value [value, ...]
# Values that contain spaces should be placed between quotes (" ")

#---------------------------------------------------------------------------
# Project related configuration options
#---------------------------------------------------------------------------

# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
# by quotes) that should identify the project.

PROJECT_NAME           = "Quartz"

# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
# This could be handy for archiving the generated documentation or 
# if some version control system is used.

PROJECT_NUMBER         = 

# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
# base path where the generated documentation will be put. 
# If a relative path is entered, it will be relative to the location 
# where doxygen was started. If left blank the current directory will be used.

OUTPUT_DIRECTORY       = ./doc

# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
# 4096 sub-directories (in 2 levels) under the output directory of each output 
# format and will distribute the generated files over these directories. 
# Enabling this option can be useful when feeding doxygen a huge amount of 
# source files, where putting all generated files in the same directory would 
# otherwise cause performance problems for the file system.

CREATE_SUBDIRS         = NO

# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
# documentation generated by doxygen is written. Doxygen will use this 
# information to generate all constant output in the proper language. 
# The default language is English, other supported languages are: 
# Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, 
# Dutch, Finnish, French, German, Greek, Hungarian, Italian, Japanese, 
# Japanese-en (Japanese with English messages), Korean, Korean-en, Norwegian, 
# Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, 
# Swedish, and Ukrainian.

OUTPUT_LANGUAGE        = English

# This tag can be used to specify the encoding used in the generated output. 
# The encoding is not always determined by the language that is chosen, 
# but also whether or not the output is meant for Windows or non-Windows users. 
# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES 
# forces the Windows encoding (this is the default for the Windows binary), 
# whereas setting the tag to NO uses a Unix-style encoding (the default for 
# all platforms other than Windows).

USE_WINDOWS_ENCODING   = NO

# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
# include brief member descriptions after the members that are listed in 
# the file and class documentation (similar to JavaDoc). 
# Set to NO to disable this.

BRIEF_MEMBER_DESC      = YES

# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
# the brief description of a member or function before the detailed description. 
# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
# brief descriptions will be completely suppressed.

REPEAT_BRIEF           = YES

# This tag implements a quasi-intelligent brief description abbreviator 
# that is used to form the text in various listings. Each string 
# in this list, if found as the leading text of the brief description, will be 
# stripped from the text and the result after processing the whole list, is 
# used as the annotated text. Otherwise, the brief description is used as-is. 
# If left blank, the following values are used ("$name" is automatically 
# replaced with the name of the entity): "The $name class" "The $name widget" 
# "The $name file" "is" "provides" "specifies" "contains" 
# "represents" "a" "an" "the"

ABBREVIATE_BRIEF       = 

# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
# Doxygen will generate a detailed section even if there is only a brief 
# description.

ALWAYS_DETAILED_SEC    = NO

# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
# inherited members of a class in the documentation of that class as if those 
# members were ordinary class members. Constructors, destructors and assignment 
# operators of the base classes will not be shown.

INLINE_INHERITED_MEMB  = NO

# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
# path before files name in the file list and in the header files. If set 
# to NO the shortest path that makes the file name unique will be used.

FULL_PATH_NAMES        = YES

# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
# can be used to strip a user-defined part of the path. Stripping is 
# only done if one of the specified strings matches the left-hand part of 
# the path. The tag can be used to show relative paths in the file list. 
# If left blank the directory from which doxygen is run is used as the 
# path to strip.

STRIP_FROM_PATH        = 

# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
# the path mentioned in the documentation of a class, which tells 
# the reader which header file to include in order to use a class. 
# If left blank only the name of the header file containing the class 
# definition is used. Otherwise one should specify the include paths that 
# are normally passed to the compiler using the -I flag.

STRIP_FROM_INC_PATH    = 

# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
# (but less readable) file names. This can be useful if your file system 
# doesn't support long names like on DOS, Mac, or CD-ROM.

SHORT_NAMES            = NO

# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
# will interpret the first line (until the first dot) of a JavaDoc-style 
# comment as the brief description. If set to NO, the JavaDoc 
# comments will behave just like the Qt-style comments (thus requiring an 
# explicit @brief command for a brief description).

JAVADOC_AUTOBRIEF      = NO

# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
# comments) as a brief description. This used to be the default behaviour. 
# The new default is to treat a multi-line C++ comment block as a detailed 
# description. Set this tag to YES if you prefer the old behaviour instead.

MULTILINE_CPP_IS_BRIEF = NO

# If the DETAILS_AT_TOP tag is set to YES then Doxygen 
# will output the detailed description near the top, like JavaDoc.
# If set to NO, the detailed description appears after the member 
# documentation.

DETAILS_AT_TOP         = NO

# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
# member inherits the documentation from any documented member that it 
# re-implements.

INHERIT_DOCS           = YES

# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
# a new page for each member. If set to NO, the documentation of a member will 
# be part of the file/class/namespace that contains it.

SEPARATE_MEMBER_PAGES  = NO

# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
# Doxygen uses this value to replace tabs by spaces in code fragments.

TAB_SIZE               = 8

# This tag can be used to specify a number of aliases that acts 
# as commands in the documentation. An alias has the form "name=value". 
# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
# put the command \sideeffect (or @sideeffect) in the documentation, which 
# will result in a user-defined paragraph with heading "Side Effects:". 
# You can put \n's in the value part of an alias to insert newlines.

ALIASES                = 

# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
# sources only. Doxygen will then generate output that is more tailored for C. 
# For instance, some of the names that are used will be different. The list 
# of all members will be omitted, etc.

OPTIMIZE_OUTPUT_FOR_C  = NO

# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
# sources only. Doxygen will then generate output that is more tailored for Java. 
# For instance, namespaces will be presented as packages, qualified scopes 
# will look different, etc.

OPTIMIZE_OUTPUT_JAVA   = NO

# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to 
# include (a tag file for) the STL sources as input, then you should 
# set this tag to YES in order to let doxygen match functions declarations and 
# definitions whose arguments contain STL classes (e.g. func(std::string); vs. 
# func(std::string) {}). This also makes the inheritance and collaboration 
# diagrams that involve STL classes more complete and accurate.

BUILTIN_STL_SUPPORT    = NO

# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
# tag is set to YES, then doxygen will reuse the documentation of the first 
# member in the group (if any) for the other members of the group. By default 
# all members of a group must be documented explicitly.

DISTRIBUTE_GROUP_DOC   = NO

# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
# the same type (for instance a group of public functions) to be put as a 
# subgroup of that type (e.g. under the Public Functions section). Set it to 
# NO to prevent subgrouping. Alternatively, this can be done per class using 
# the \nosubgrouping command.

SUBGROUPING            = YES

#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------

# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
# documentation are documented, even if no documentation was available. 
# Private class members and static file members will be hidden unless 
# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES

EXTRACT_ALL            = NO

# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
# will be included in the documentation.

EXTRACT_PRIVATE        = NO

# If the EXTRACT_STATIC tag is set to YES all static members of a file 
# will be included in the documentation.

EXTRACT_STATIC         = NO

# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
# defined locally in source files will be included in the documentation. 
# If set to NO only classes defined in header files are included.

EXTRACT_LOCAL_CLASSES  = YES

# This flag is only useful for Objective-C code. When set to YES local 
# methods, which are defined in the implementation section but not in 
# the interface are included in the documentation. 
# If set to NO (the default) only methods in the interface are included.

EXTRACT_LOCAL_METHODS  = NO

# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
# undocumented members of documented classes, files or namespaces. 
# If set to NO (the default) these members will be included in the 
# various overviews, but no documentation section is generated. 
# This option has no effect if EXTRACT_ALL is enabled.

HIDE_UNDOC_MEMBERS     = NO

# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
# undocumented classes that are normally visible in the class hierarchy. 
# If set to NO (the default) these classes will be included in the various 
# overviews. This option has no effect if EXTRACT_ALL is enabled.

HIDE_UNDOC_CLASSES     = NO

# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
# friend (class|struct|union) declarations. 
# If set to NO (the default) these declarations will be included in the 
# documentation.

HIDE_FRIEND_COMPOUNDS  = NO

# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
# documentation blocks found inside the body of a function. 
# If set to NO (the default) these blocks will be appended to the 
# function's detailed documentation block.

HIDE_IN_BODY_DOCS      = NO

# The INTERNAL_DOCS tag determines if documentation 
# that is typed after a \internal command is included. If the tag is set 
# to NO (the default) then the documentation will be excluded. 
# Set it to YES to include the internal documentation.

INTERNAL_DOCS          = NO

# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
# file names in lower-case letters. If set to YES upper-case letters are also 
# allowed. This is useful if you have classes or files whose names only differ 
# in case and if your file system supports case sensitive file names. Windows 
# and Mac users are advised to set this option to NO.

CASE_SENSE_NAMES       = YES

# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
# will show members with their full class and namespace scopes in the 
# documentation. If set to YES the scope will be hidden.

HIDE_SCOPE_NAMES       = NO

# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
# will put a list of the files that are included by a file in the documentation 
# of that file.

SHOW_INCLUDE_FILES     = YES

# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
# is inserted in the documentation for inline members.

INLINE_INFO            = YES

# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
# will sort the (detailed) documentation of file and class members 
# alphabetically by member name. If set to NO the members will appear in 
# declaration order.

SORT_MEMBER_DOCS       = YES

# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
# brief documentation of file, namespace and class members alphabetically 
# by member name. If set to NO (the default) the members will appear in 
# declaration order.

SORT_BRIEF_DOCS        = NO

# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
# sorted by fully-qualified names, including namespaces. If set to 
# NO (the default), the class list will be sorted only by class name, 
# not including the namespace part. 
# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
# Note: This option applies only to the class list, not to the 
# alphabetical list.

SORT_BY_SCOPE_NAME     = NO

# The GENERATE_TODOLIST tag can be used to enable (YES) or 
# disable (NO) the todo list. This list is created by putting \todo 
# commands in the documentation.

GENERATE_TODOLIST      = YES

# The GENERATE_TESTLIST tag can be used to enable (YES) or 
# disable (NO) the test list. This list is created by putting \test 
# commands in the documentation.

GENERATE_TESTLIST      = YES

# The GENERATE_BUGLIST tag can be used to enable (YES) or 
# disable (NO) the bug list. This list is created by putting \bug 
# commands in the documentation.

GENERATE_BUGLIST       = YES

# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
# disable (NO) the deprecated list. This list is created by putting 
# \deprecated commands in the documentation.

GENERATE_DEPRECATEDLIST= YES

# The ENABLED_SECTIONS tag can be used to enable conditional 
# documentation sections, marked by \if sectionname ... \endif.

ENABLED_SECTIONS       = 

# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
# the initial value of a variable or define consists of for it to appear in 
# the documentation. If the initializer consists of more lines than specified 
# here it will be hidden. Use a value of 0 to hide initializers completely. 
# The appearance of the initializer of individual variables and defines in the 
# documentation can be controlled using \showinitializer or \hideinitializer 
# command in the documentation regardless of this setting.

MAX_INITIALIZER_LINES  = 30

# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
# at the bottom of the documentation of classes and structs. If set to YES the 
# list will mention the files that were used to generate the documentation.

SHOW_USED_FILES        = YES

# If the sources in your project are distributed over multiple directories 
# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 
# in the documentation. The default is NO.

SHOW_DIRECTORIES       = NO

# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
# doxygen should invoke to get the current version for each file (typically from the 
# version control system). Doxygen will invoke the program by executing (via 
# popen()) the command <command> <input-file>, where <command> is the value of 
# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
# provided by doxygen. Whatever the program writes to standard output 
# is used as the file version. See the manual for examples.

FILE_VERSION_FILTER    = 

#---------------------------------------------------------------------------
# configuration options related to warning and progress messages
#---------------------------------------------------------------------------

# The QUIET tag can be used to turn on/off the messages that are generated 
# by doxygen. Possible values are YES and NO. If left blank NO is used.

QUIET                  = NO

# The WARNINGS tag can be used to turn on/off the warning messages that are 
# generated by doxygen. Possible values are YES and NO. If left blank 
# NO is used.

WARNINGS               = YES

# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
# automatically be disabled.

WARN_IF_UNDOCUMENTED   = YES

# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
# potential errors in the documentation, such as not documenting some 
# parameters in a documented function, or documenting parameters that 
# don't exist or using markup commands wrongly.

WARN_IF_DOC_ERROR      = YES

# This WARN_NO_PARAMDOC option can be enabled to get warnings for 
# functions that are documented, but have no documentation for their parameters 
# or return value. If set to NO (the default) doxygen will only warn about 
# wrong or incomplete parameter documentation, but not about the absence of 
# documentation.

WARN_NO_PARAMDOC       = NO

# The WARN_FORMAT tag determines the format of the warning messages that 
# doxygen can produce. The string should contain the $file, $line, and $text 
# tags, which will be replaced by the file and line number from which the 
# warning originated and the warning text. Optionally the format may contain 
# $version, which will be replaced by the version of the file (if it could 
# be obtained via FILE_VERSION_FILTER)

WARN_FORMAT            = "$file:$line: $text"

# The WARN_LOGFILE tag can be used to specify a file to which warning 
# and error messages should be written. If left blank the output is written 
# to stderr.

WARN_LOGFILE           = 

#---------------------------------------------------------------------------
# configuration options related to the input files
#---------------------------------------------------------------------------

# The INPUT tag can be used to specify the files and/or directories that contain 
# documented source files. You may enter file names like "myfile.cpp" or 
# directories like "/usr/src/myproject". Separate the files or directories 
# with spaces.

INPUT                  = nvmemul.dox TODO.dox src/

# If the value of the INPUT tag contains directories, you can use the 
# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp 
# and *.h) to filter out the source-files in the directories. If left 
# blank the following patterns are tested: 
# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 
# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py

FILE_PATTERNS          = 

# The RECURSIVE tag can be used to specify whether or not subdirectories 
# should be searched for input files as well. Possible values are YES and NO. 
# If left blank NO is used.

RECURSIVE              = YES

# The EXCLUDE tag can be used to specify files and/or directories that should be 
# excluded from the INPUT source files. This way you can easily exclude a 
# subdirectory from a directory tree whose root is specified with the INPUT tag.

EXCLUDE                = 

# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or 
# directories that are symbolic links (a Unix filesystem feature) are excluded 
# from the input.

EXCLUDE_SYMLINKS       = NO

# If the value of the INPUT tag contains directories, you can use the 
# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
# certain files from those directories. Note that the wildcards are matched 
# against the file with absolute path, so to exclude all test directories 
# for example use the pattern */test/*

EXCLUDE_PATTERNS       = 

# The EXAMPLE_PATH tag can be used to specify one or more files or 
# directories that contain example code fragments that are included (see 
# the \include command).

EXAMPLE_PATH           = 

# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp 
# and *.h) to filter out the source-files in the directories. If left 
# blank all files are included.

EXAMPLE_PATTERNS       = 

# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
# searched for input files to be used with the \include or \dontinclude 
# commands irrespective of the value of the RECURSIVE tag. 
# Possible values are YES and NO. If left blank NO is used.

EXAMPLE_RECURSIVE      = NO

# The IMAGE_PATH tag can be used to specify one or more files or 
# directories that contain images that are included in the documentation (see 
# the \image command).

IMAGE_PATH             = ./doc/figures

# The INPUT_FILTER tag can be used to specify a program that doxygen should 
# invoke to filter for each input file. Doxygen will invoke the filter program 
# by executing (via popen()) the command <filter> <input-file>, where <filter> 
# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
# input file. Doxygen will then use the output that the filter program writes 
# to standard output.  If FILTER_PATTERNS is specified, this tag will be 
# ignored.

INPUT_FILTER           = 

# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
# basis.  Doxygen will compare the file name with each pattern and apply the 
# filter if there is a match.  The filters are a list of the form: 
# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 
# is applied to all files.

FILTER_PATTERNS        = 

# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
# INPUT_FILTER) will be used to filter the input files when producing source 
# files to browse (i.e. when SOURCE_BROWSER is set to YES).

FILTER_SOURCE_FILES    = NO

#---------------------------------------------------------------------------
# configuration options related to source browsing
#---------------------------------------------------------------------------

# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
# be generated. Documented entities will be cross-referenced with these sources. 
# Note: To get rid of all source code in the generated output, make sure also 
# VERBATIM_HEADERS is set to NO.

SOURCE_BROWSER         = YES

# Setting the INLINE_SOURCES tag to YES will include the body 
# of functions and classes directly in the documentation.

INLINE_SOURCES         = NO

# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
# doxygen to hide any special comment blocks from generated source code 
# fragments. Normal C and C++ comments will always remain visible.

STRIP_CODE_COMMENTS    = YES

# If the REFERENCED_BY_RELATION tag is set to YES (the default) 
# then for each documented function all documented 
# functions referencing it will be listed.

REFERENCED_BY_RELATION = YES

# If the REFERENCES_RELATION tag is set to YES (the default) 
# then for each documented function all documented entities 
# called/used by that function will be listed.

REFERENCES_RELATION    = YES

# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
# link to the source code.  Otherwise they will link to the documentation.

REFERENCES_LINK_SOURCE = YES

# If the USE_HTAGS tag is set to YES then the references to source code 
# will point to the HTML generated by the htags(1) tool instead of doxygen 
# built-in source browser. The htags tool is part of GNU's global source 
# tagging system (see http://www.gnu.org/software/global/global.html). You 
# will need version 4.8.6 or higher.

USE_HTAGS              = NO

# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
# will generate a verbatim copy of the header file for each class for 
# which an include is specified. Set to NO to disable this.

VERBATIM_HEADERS       = YES

#---------------------------------------------------------------------------
# configuration options related to the alphabetical class index
#---------------------------------------------------------------------------

# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
# of all compounds will be generated. Enable this if the project 
# contains a lot of classes, structs, unions or interfaces.

ALPHABETICAL_INDEX     = YES

# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
# in which this list will be split (can be a number in the range [1..20])

COLS_IN_ALPHA_INDEX    = 5

# In case all classes in a project start with a common prefix, all 
# classes will be put under the same header in the alphabetical index. 
# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
# should be ignored while generating the index headers.

IGNORE_PREFIX          = 

#---------------------------------------------------------------------------
# configuration options related to the HTML output
#---------------------------------------------------------------------------

# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
# generate HTML output.

GENERATE_HTML          = YES

# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
# put in front of it. If left blank `html' will be used as the default path.

HTML_OUTPUT            = html

# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
# doxygen will generate files with .html extension.

HTML_FILE_EXTENSION    = .html

# The HTML_HEADER tag can be used to specify a personal HTML header for 
# each generated HTML page. If it is left blank doxygen will generate a 
# standard header.

HTML_HEADER            = 

# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
# each generated HTML page. If it is left blank doxygen will generate a 
# standard footer.

HTML_FOOTER            = 

# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
# style sheet that is used by each HTML page. It can be used to 
# fine-tune the look of the HTML output. If the tag is left blank doxygen 
# will generate a default style sheet. Note that doxygen will try to copy 
# the style sheet file to the HTML output directory, so don't put your own 
# stylesheet in the HTML output directory as well, or it will be erased!

HTML_STYLESHEET        = 

# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
# files or namespaces will be aligned in HTML using tables. If set to 
# NO a bullet list will be used.

HTML_ALIGN_MEMBERS     = YES

# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
# will be generated that can be used as input for tools like the 
# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) 
# of the generated HTML documentation.

GENERATE_HTMLHELP      = YES

# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
# be used to specify the file name of the resulting .chm file. You 
# can add a path in front of the file if the result should not be 
# written to the html output directory.

CHM_FILE               = 

# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
# be used to specify the location (absolute path including file name) of 
# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
# the HTML help compiler on the generated index.hhp.

HHC_LOCATION           = 

# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
# controls if a separate .chi index file is generated (YES) or that 
# it should be included in the master .chm file (NO).

GENERATE_CHI           = NO

# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
# controls whether a binary table of contents is generated (YES) or a 
# normal table of contents (NO) in the .chm file.

BINARY_TOC             = NO

# The TOC_EXPAND flag can be set to YES to add extra items for group members 
# to the contents of the HTML help documentation and to the tree view.

TOC_EXPAND             = YES

# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
# top of each HTML page. The value NO (the default) enables the index and 
# the value YES disables it.

DISABLE_INDEX          = NO

# This tag can be used to set the number of enum values (range [1..20]) 
# that doxygen will group on one line in the generated HTML documentation.

ENUM_VALUES_PER_LINE   = 4

# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
# generated containing a tree-like index structure (just like the one that 
# is generated for HTML Help). For this to work a browser that supports 
# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, 
# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are 
# probably better off using the HTML help feature.

GENERATE_TREEVIEW      = YES

# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
# used to set the initial width (in pixels) of the frame in which the tree 
# is shown.

TREEVIEW_WIDTH         = 250

#---------------------------------------------------------------------------
# configuration options related to the LaTeX output
#---------------------------------------------------------------------------

# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
# generate Latex output.

GENERATE_LATEX         = NO

# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
# put in front of it. If left blank `latex' will be used as the default path.

LATEX_OUTPUT           = latex

# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
# invoked. If left blank `latex' will be used as the default command name.

LATEX_CMD_NAME         = latex

# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
# generate index for LaTeX. If left blank `makeindex' will be used as the 
# default command name.

MAKEINDEX_CMD_NAME     = makeindex

# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
# LaTeX documents. This may be useful for small projects and may help to 
# save some trees in general.

COMPACT_LATEX          = NO

# The PAPER_TYPE tag can be used to set the paper type that is used 
# by the printer. Possible values are: a4, a4wide, letter, legal and 
# executive. If left blank a4wide will be used.

PAPER_TYPE             = a4wide

# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX 
# packages that should be included in the LaTeX output.

EXTRA_PACKAGES         = 

# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
# the generated latex document. The header should contain everything until 
# the first chapter. If it is left blank doxygen will generate a 
# standard header. Notice: only use this tag if you know what you are doing!

LATEX_HEADER           = 

# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
# contain links (just like the HTML output) instead of page references 
# This makes the output suitable for online browsing using a pdf viewer.

PDF_HYPERLINKS         = NO

# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
# plain latex in the generated Makefile. Set this option to YES to get a 
# higher quality PDF documentation.

USE_PDFLATEX           = NO

# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode 
# command to the generated LaTeX files. This will instruct LaTeX to keep 
# running if errors occur, instead of asking the user for help. 
# This option is also used when generating formulas in HTML.

LATEX_BATCHMODE        = NO

# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
# include the index chapters (such as File Index, Compound Index, etc.) 
# in the output.

LATEX_HIDE_INDICES     = NO

#---------------------------------------------------------------------------
# configuration options related to the RTF output
#---------------------------------------------------------------------------

# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output 
# The RTF output is optimized for Word 97 and may not look very pretty with 
# other RTF readers or editors.

GENERATE_RTF           = NO

# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
# put in front of it. If left blank `rtf' will be used as the default path.

RTF_OUTPUT             = rtf

# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
# RTF documents. This may be useful for small projects and may help to 
# save some trees in general.

COMPACT_RTF            = NO

# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
# will contain hyperlink fields. The RTF file will 
# contain links (just like the HTML output) instead of page references. 
# This makes the output suitable for online browsing using WORD or other 
# programs which support those fields. 
# Note: wordpad (write) and others do not support links.

RTF_HYPERLINKS         = NO

# Load stylesheet definitions from file. Syntax is similar to doxygen's 
# config file, i.e. a series of assignments. You only have to provide 
# replacements, missing definitions are set to their default value.

RTF_STYLESHEET_FILE    = 

# Set optional variables used in the generation of an rtf document. 
# Syntax is similar to doxygen's config file.

RTF_EXTENSIONS_FILE    = 

#---------------------------------------------------------------------------
# configuration options related to the man page output
#---------------------------------------------------------------------------

# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
# generate man pages

GENERATE_MAN           = NO

# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
# put in front of it. If left blank `man' will be used as the default path.

MAN_OUTPUT             = man

# The MAN_EXTENSION tag determines the extension that is added to 
# the generated man pages (default is the subroutine's section .3)

MAN_EXTENSION          = .3

# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
# then it will generate one additional man file for each entity 
# documented in the real man page(s). These additional files 
# only source the real man page, but without them the man command 
# would be unable to find the correct page. The default is NO.

MAN_LINKS              = NO

#---------------------------------------------------------------------------
# configuration options related to the XML output
#---------------------------------------------------------------------------

# If the GENERATE_XML tag is set to YES Doxygen will 
# generate an XML file that captures the structure of 
# the code including all documentation.

GENERATE_XML           = NO

# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
# put in front of it. If left blank `xml' will be used as the default path.

XML_OUTPUT             = xml

# The XML_SCHEMA tag can be used to specify an XML schema, 
# which can be used by a validating XML parser to check the 
# syntax of the XML files.

XML_SCHEMA             = 

# The XML_DTD tag can be used to specify an XML DTD, 
# which can be used by a validating XML parser to check the 
# syntax of the XML files.

XML_DTD                = 

# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
# dump the program listings (including syntax highlighting 
# and cross-referencing information) to the XML output. Note that 
# enabling this will significantly increase the size of the XML output.

XML_PROGRAMLISTING     = YES

#---------------------------------------------------------------------------
# configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------

# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
# generate an AutoGen Definitions (see autogen.sf.net) file 
# that captures the structure of the code including all 
# documentation. Note that this feature is still experimental 
# and incomplete at the moment.

GENERATE_AUTOGEN_DEF   = NO

#---------------------------------------------------------------------------
# configuration options related to the Perl module output
#---------------------------------------------------------------------------

# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
# generate a Perl module file that captures the structure of 
# the code including all documentation. Note that this 
# feature is still experimental and incomplete at the 
# moment.

GENERATE_PERLMOD       = NO

# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
# to generate PDF and DVI output from the Perl module output.

PERLMOD_LATEX          = NO

# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
# nicely formatted so it can be parsed by a human reader.  This is useful 
# if you want to understand what is going on.  On the other hand, if this 
# tag is set to NO the size of the Perl module output will be much smaller 
# and Perl will parse it just the same.

PERLMOD_PRETTY         = YES

# The names of the make variables in the generated doxyrules.make file 
# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
# This is useful so different doxyrules.make files included by the same 
# Makefile don't overwrite each other's variables.

PERLMOD_MAKEVAR_PREFIX = 

#---------------------------------------------------------------------------
# Configuration options related to the preprocessor   
#---------------------------------------------------------------------------

# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
# evaluate all C-preprocessor directives found in the sources and include 
# files.

ENABLE_PREPROCESSING   = YES

# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
# names in the source code. If set to NO (the default) only conditional 
# compilation will be performed. Macro expansion can be done in a controlled 
# way by setting EXPAND_ONLY_PREDEF to YES.

MACRO_EXPANSION        = NO

# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
# then the macro expansion is limited to the macros specified with the 
# PREDEFINED and EXPAND_AS_DEFINED tags.

EXPAND_ONLY_PREDEF     = NO

# If the SEARCH_INCLUDES tag is set to YES (the default) the include files 
# in the INCLUDE_PATH (see below) will be searched if a #include is found.

SEARCH_INCLUDES        = YES

# The INCLUDE_PATH tag can be used to specify one or more directories that 
# contain include files that are not input files but should be processed by 
# the preprocessor.

INCLUDE_PATH           = 

# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
# patterns (like *.h and *.hpp) to filter out the header-files in the 
# directories. If left blank, the patterns specified with FILE_PATTERNS will 
# be used.

INCLUDE_FILE_PATTERNS  = 

# The PREDEFINED tag can be used to specify one or more macro names that 
# are defined before the preprocessor is started (similar to the -D option of 
# gcc). The argument of the tag is a list of macros of the form: name 
# or name=definition (no spaces). If the definition and the = are 
# omitted =1 is assumed. To prevent a macro definition from being 
# undefined via #undef or recursively expanded use the := operator 
# instead of the = operator.

PREDEFINED             = 

# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
# this tag can be used to specify a list of macro names that should be expanded. 
# The macro definition that is found in the sources will be used. 
# Use the PREDEFINED tag if you want to use a different macro definition.

EXPAND_AS_DEFINED      = 

# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
# doxygen's preprocessor will remove all function-like macros that are alone 
# on a line, have an all uppercase name, and do not end with a semicolon. Such 
# function macros are typically used for boiler-plate code, and will confuse 
# the parser if not removed.

SKIP_FUNCTION_MACROS   = YES

#---------------------------------------------------------------------------
# Configuration::additions related to external references   
#---------------------------------------------------------------------------

# The TAGFILES option can be used to specify one or more tagfiles. 
# Optionally an initial location of the external documentation 
# can be added for each tagfile. The format of a tag file without 
# this location is as follows: 
#   TAGFILES = file1 file2 ... 
# Adding location for the tag files is done as follows: 
#   TAGFILES = file1=loc1 "file2 = loc2" ... 
# where "loc1" and "loc2" can be relative or absolute paths or 
# URLs. If a location is present for each tag, the installdox tool 
# does not have to be run to correct the links.
# Note that each tag file must have a unique name
# (where the name does NOT include the path)
# If a tag file is not located in the directory in which doxygen 
# is run, you must also specify the path to the tagfile here.

TAGFILES               = 

# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
# a tag file that is based on the input files it reads.

GENERATE_TAGFILE       = 

# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
# in the class index. If set to NO only the inherited external classes 
# will be listed.

ALLEXTERNALS           = NO

# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
# in the modules index. If set to NO, only the current project's groups will 
# be listed.

EXTERNAL_GROUPS        = YES

# The PERL_PATH should be the absolute path and name of the perl script 
# interpreter (i.e. the result of `which perl').

PERL_PATH              = /usr/bin/perl

#---------------------------------------------------------------------------
# Configuration options related to the dot tool   
#---------------------------------------------------------------------------

# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base 
# or super classes. Setting the tag to NO turns the diagrams off. Note that 
# this option is superseded by the HAVE_DOT option below. This is only a 
# fallback. It is recommended to install and use dot, since it yields more 
# powerful graphs.

CLASS_DIAGRAMS         = YES

# If set to YES, the inheritance and collaboration graphs will hide 
# inheritance and usage relations if the target is undocumented 
# or is not a class.

HIDE_UNDOC_RELATIONS   = YES

# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
# available from the path. This tool is part of Graphviz, a graph visualization 
# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
# have no effect if this option is set to NO (the default)

HAVE_DOT               = NO

# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
# will generate a graph for each documented class showing the direct and 
# indirect inheritance relations. Setting this tag to YES will force the 
# CLASS_DIAGRAMS tag to NO.

CLASS_GRAPH            = YES

# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
# will generate a graph for each documented class showing the direct and 
# indirect implementation dependencies (inheritance, containment, and 
# class references variables) of the class with other documented classes.

COLLABORATION_GRAPH    = YES

# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
# will generate a graph for groups, showing the direct groups dependencies

GROUP_GRAPHS           = YES

# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
# collaboration diagrams in a style similar to the OMG's Unified Modeling 
# Language.

UML_LOOK               = NO

# If set to YES, the inheritance and collaboration graphs will show the 
# relations between templates and their instances.

TEMPLATE_RELATIONS     = NO

# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
# tags are set to YES then doxygen will generate a graph for each documented 
# file showing the direct and indirect include dependencies of the file with 
# other documented files.

INCLUDE_GRAPH          = YES

# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
# documented header file showing the documented files that directly or 
# indirectly include this file.

INCLUDED_BY_GRAPH      = YES

# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will 
# generate a call dependency graph for every global function or class method. 
# Note that enabling this option will significantly increase the time of a run. 
# So in most cases it will be better to enable call graphs for selected 
# functions only using the \callgraph command.

CALL_GRAPH             = NO

# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then doxygen will 
# generate a caller dependency graph for every global function or class method. 
# Note that enabling this option will significantly increase the time of a run. 
# So in most cases it will be better to enable caller graphs for selected 
# functions only using the \callergraph command.

CALLER_GRAPH           = NO

# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
# will generate a graphical hierarchy of all classes instead of a textual one.

GRAPHICAL_HIERARCHY    = YES

# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 
# then doxygen will show the dependencies a directory has on other directories 
# in a graphical way. The dependency relations are determined by the #include
# relations between the files in the directories.

DIRECTORY_GRAPH        = YES

# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
# generated by dot. Possible values are png, jpg, or gif
# If left blank png will be used.

DOT_IMAGE_FORMAT       = png

# The tag DOT_PATH can be used to specify the path where the dot tool can be 
# found. If left blank, it is assumed the dot tool can be found in the path.

DOT_PATH               = 

# The DOTFILE_DIRS tag can be used to specify one or more directories that 
# contain dot files that are included in the documentation (see the 
# \dotfile command).

DOTFILE_DIRS           = 

# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width 
# (in pixels) of the graphs generated by dot. If a graph becomes larger than 
# this value, doxygen will try to truncate the graph, so that it fits within 
# the specified constraint. Beware that most browsers cannot cope with very 
# large images.

MAX_DOT_GRAPH_WIDTH    = 1024

# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allowed height 
# (in pixels) of the graphs generated by dot. If a graph becomes larger than 
# this value, doxygen will try to truncate the graph, so that it fits within 
# the specified constraint. Beware that most browsers cannot cope with very 
# large images.

MAX_DOT_GRAPH_HEIGHT   = 1024

# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
# graphs generated by dot. A depth value of 3 means that only nodes reachable 
# from the root by following a path via at most 3 edges will be shown. Nodes 
# that lay further from the root node will be omitted. Note that setting this 
# option to 1 or 2 may greatly reduce the computation time needed for large 
# code bases. Also note that a graph may be further truncated if the graph's 
# image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH 
# and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default), 
# the graph is not depth-constrained.

MAX_DOT_GRAPH_DEPTH    = 0

# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
# background. This is disabled by default, which results in a white background. 
# Warning: Depending on the platform used, enabling this option may lead to 
# badly anti-aliased labels on the edges of a graph (i.e. they become hard to 
# read).

DOT_TRANSPARENT        = NO

# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output 
# files in one run (i.e. multiple -o and -T options on the command line). This 
# makes dot run faster, but since only newer versions of dot (>1.8.10) 
# support this, this feature is disabled by default.

DOT_MULTI_TARGETS      = NO

# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
# generate a legend page explaining the meaning of the various boxes and 
# arrows in the dot generated graphs.

GENERATE_LEGEND        = YES

# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
# remove the intermediate dot files that are used to generate 
# the various graphs.

DOT_CLEANUP            = YES

#---------------------------------------------------------------------------
# Configuration::additions related to the search engine   
#---------------------------------------------------------------------------

# The SEARCHENGINE tag specifies whether or not a search engine should be 
# used. If set to NO the values of all tags below this one will be ignored.

SEARCHENGINE           = NO


================================================
FILE: README-BENCHMARKS-TESTING.md
================================================
**For testing whether your environment is configured correctly for
running Quartz** (e.g., whether you set all the required environmental
variables, etc.) **we have created a few scripts with benchmarks, which
can be executed automatically** and which can provide you with
feedback on Quartz performance in your environment.

**The directory with these scripts is called: *benchmark-tests*. There are three scripts which you can run:**
- **bandwidth-model-building.sh**

   This script will execute for approximately **10 min** and will build a memory
   bandwidth model that can be used in the experiments with memory bandwidth
   throttling. The configuration file uses a "debug" mode on purpose, so that
   you can see messages on the screen about the progress of the memory
   bandwidth model building. The model can be found at */tmp/bandwidth_model*.

- **memlat-orig-lat-test.sh**

    This script will measure your server hardware *memory access latency* in nanoseconds: local
    and remote (for two-socket servers). It will execute the test 20 times, and write the results in directory *ORIG-lat-test*.
    You can find the summary of the results in the file *ORIG-lat-test/final-hw-latency.txt*.
    It will have measurements like:
    
               FORMAT:  1_min_local  2_aver_local  3_max_local  4_min_remote  5_aver_remote  6_max_remote
                           91             91.9           92           152        163.9           176
   
    The first three numbers show the minimal, average and maximum measured local
    memory access latency (in ns, over 20 measurements). The last three numbers
    show similar measurements for the access latency of the remote memory,
    i.e., in the second socket.

-  **memlat-bench-test-10M.sh**

    This script will execute memlat benchmark (pointer-chasing benchmark) with
    nine emulated memory access latencies: 200 ns, 300 ns,..., 1000 ns.
    It will run the benchmark with these emulated latencies in two settings:
    in the local socket (i.e., emulating a higher memory access latency in the
    local socket) and similarly, in the remote socket.
    Each test is repeated 10 times: this is used for assessing the variability
    of your environment. In some cases, we had issues with TurboBoost mode,
    which did impact the quality of the emulation...
    This test might take **approx. 30 min to finish** (since it executes 180 tests),
    and will create two output directories:  *FULL-RESULTS-test*  and
    *SUMMARY-RESULTS-test*.
    In the directory SUMMARY-RESULTS-test, you will find two files that
    summarize the outcome of the experiments in the local and remote sockets.
    The outcome should look like this:
    
          FORMAT: 1_emul_lat  2_min_meas_lat  3_aver_meas_lat  4_max_meas_lat  5_aver_error(%) 6_max_error(%)
                   200           177            197.9             204              1.05            11.5
                   300           259            289.5             300              3.5             13.6  
                   400           354            382.6             395              4.3             11.5
                   500           468            485.8             490              2.8             6.4
                   600           554            575.3             585              4.1             7.6
                   700           640            666.6             681              4.7             8.5
                   800           749            766.4             776              4.2             6.3
                   900           851            866.2             871              3.7             5.4
                   1000          926            956.5             966              4.35            7.4
    
          The format is the following:
          1st column:    emulated latency (in nanoseconds)
          2nd column:    minimum measured  latency (across 10 tests, in ns)
          3rd column:    average measured  latency (across 10 tests, in ns)
          4th column:    maximum measured  latency (across 10 tests, in ns)
          5th column:    average error (between emulated and measured latencies, in %)
          6th column:    max error (between emulated and measured latencies, in %)

One of the goals of the designed performance emulator is to provide a
framework for application sensitivity studies under different
latencies and memory bandwidths. Even if you have a 15% deviation (error)
from the targeted emulated latencies, as long as the benchmark measurements
are consistent, this is a good sign that you can perform a good
sensitivity study.


================================================
FILE: README.md
================================================

Quartz: A DRAM-based performance emulator for NVM
----------------------

Quartz leverages features available in commodity hardware to emulate
different latency and bandwidth characteristics of future
byte-addressable NVM technologies.

Quartz's design, implementation details, evaluation, and overhead can be found 
in the following research paper:
 - **H. Volos, G. Magalhaes, L. Cherkasova, J. Li: Quartz: A Lightweight 
   Performance Emulator for Persistent Memory Software. In Proc. of the 
   16th ACM/IFIP/USENIX International Middleware Conference, (Middleware'2015),
   Vancouver, Canada, December 8-11, 2015.** The paper can be downloaded from:
   http://www.jahrhundert.net/papers/middleware2015.pdf

While the emulator is designed to cover three processor families:
*Sandy Bridge, Ivy Bridge*, and *Haswell* -- we have had the best results
on the *Ivy Bridge* platform. The Haswell processor has a TurboBoost feature
that causes higher variance and deviations when emulating higher range
latencies (above 600 ns).

Contributors
----------------------
For a list of contributors see [AUTHORS](https://github.hpe.com/labs/quartz/blob/master/AUTHORS). 

Extended documentation
----------------------
Extended documentation is available in Doxygen form. To build and view:

    doxygen
    xdg-open doc/html/index.html


Dependencies
------------
This is the list of libraries and tools used by Quartz:

On RPM based distributions:
- cmake 2.8
- libconfig and libconfig-devel
- numactl-devel
- uthash-devel
- kernel-devel

On Debian based distributions:
- cmake 2.8
- libconfig-dev
- libnuma-dev
- uthash-dev
- linux-headers

You can run 'sudo scripts/install.sh' in order to automatically install these 
dependencies.


Supported environment
---------------------
Currently the latency emulator can be used on Linux with *Sandy Bridge, 
Ivy Bridge*, and *Haswell* Intel processors. For bandwidth emulation support, Intel 
Thermal Memory Controller device is required.
No specific Linux distribution or kernel version is required.


Source code tree overview
-------------------------

    bench             Benchmarks
    doc               Documentation, including Doxygen generated documentation (doc/html)
    src/lib           Emulator main library code
    src/dev           Kernel-module for accessing performance counters and 
                      memory-controller PCI registers
    scripts           Helper scripts to run a program using the emulator and install 
                      dependencies
    test              Several tests and application code examples
    benchmark-tests   Several automated tests with benchmark runs and output analysis 
                      for testing the correctness of configured emulation environment and 
                      the accuracy of expected results

For more details, please see the extended documentation generated using Doxygen.

Building
--------
After installing the dependencies, go to the emulator's source code root folder 
and execute the following steps:

    mkdir build
    cd build
    cmake ..
    make clean all

In order to disable statistics support, replace the third step above with:

    cmake .. -DSTATISTICS=OFF

See more details about statistics in the respective section below.
The emulator library, benchmark and test binaries resulting from the build 
process will be available in the respective subfolder inside the 'build' folder.


Usage
-----
First, load the emulator's kernel module. From the emulator's source code root 
folder, execute:

    sudo scripts/setupdev.sh load

Set your processor to run at maximum frequency to ensure fixed cycle 
rate (as the cycle counter is used to project delay time). You can 
use the scaling governor:

    echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

Set the LD_PRELOAD and NVMEMUL_INI environment variables to point respectively 
to the emulator's library and the configuration file to be used. The LD_PRELOAD 
is used for automatically loading the emulator's library when the user 
application is executed. Thus, there is no need to statically link the library 
to the user application. See below details about the configuration file in the 
respective section.

Rather than configuring the scaling governor and the environment variables 
manually as indicated above, you can use the scripts/runenv.sh script. See 
below.

An additional configuration step may be required depending on the Linux Kernel
version. This emulator makes use of rdpmc x86 instruction to read CPU counters.
Before kernel 4.0, when rdpmc support was enabled, any process (not just ones
with an active perf event) could use the rdpmc instruction to access the counters.
Starting with Linux 4.0 rdpmc support is only allowed if an event is currently
enabled in a process's context. To restore the old behavior, write the value 2
to /sys/devices/cpu/rdpmc if kernel version is 4.0 or greater:

    echo 2 | sudo tee /sys/devices/cpu/rdpmc

Run your application:

    scripts/runenv.sh <your_app>

The runenv.sh script runs an application in a new shell environment that
properly sets LD_PRELOAD to the library available in the build folder. We do
not modify the current shell environment to avoid getting other applications 
interposed by the emulator unexpectedly. 

Alternatively, you may directly link 
the library to your application but the nvmemul library must come first in the 
linking order to ensure we properly interpose on necessary functions.
Additionally, this script sets the NVMEMUL_INI environment variable to point
to the nvmemul.ini configuration file available in the emulator's source code 
root folder.


Configuration file
------------------
Emulator runtime parameters can be defined in a configuration file. 

The default path is ./nvmemul.ini but you may change the path through the 
environment variable $NVMEMUL_INI (see scripts/runenv.sh).

The main available parameters are:

    - Latency:
      enable                  True means the latency emulation is on, false,
                              the latency emulation is disabled.
      inject_delay            True means the delay injection is on, false,
                              the emulator will skip the delay injection
      read                    The target read latency in nano seconds. It must 
                              be greater than the hardware latency. This value
                              is automatically validated by the emulator.
      write                   The target write latency in nano seconds. It must 
                              be greater than the hardware latency. This value
                              is automatically validated by the emulator.
      max_epoch_duration_us   This is the epoch duration in micro seconds. 
                              Eventually an epoch may be greater than this value
                              depending on signal delivery managed by Kernel.
      min_epoch_duration_us   The minimum epoch duration. 
    - Bandwidth:
      enable                  True means the bandwidth emulation is on, false, 
                              it is disabled.
      model                   File path used by the emulator to cache the 
                              detected hardware bandwidth characteristics.
      read                    Target read bandwidth in MB/s.
      write                   Target write bandwidth in MB/s.
    - Topology:
      mc_pci                  File path used by the emulator to cache the PCI 
                              bus topology. It is not required if bandwidth 
                              emulation is disabled.
      physical_nodes          List all CPU sockets ids to be added to the known
                              topology. An odd number of CPU sockets means it
                              will not be possible to configure all CPUs in
                              pairs and then a single CPU will be used as NVM
                              only. See Emulation modes section below.
    - Statistics:
      enable                  True means the statistics collection and report is
                              enabled, false, it is disabled. See the Statistics
                              section below.
      file                    File path used by the emulator to write the 
                              statistics report. If not provided, emulator will 
                              use stdout.
    - Debug:
      level                   Shows debugging message with level up to this 
                              value, the greater this value is, the more verbose 
                              the debug log will be.
                              0: off; 1: critical; 2: error; 3: warning; 4: info;
                              5: debugging.
      verbose                 If greater than zero shows source code information
                              along with the debugging message.


Latency emulation modes
-----------------------
The emulator may run application threads in *NVM only* mode or *DRAM+NVM* mode.
The mode depends on whether the system has more than one CPU socket and whether 
the topology configuration enables multiple CPU sockets.

For *NVM only* mode, the emulator will use a CPU socket with no sibling node and
make use of the DRAM available in that socket to emulate NVM. Any DRAM memory 
access on this socket will trigger delay injection to emulate the target 
latency.

For *DRAM+NVM* mode, the emulator will differentiate DRAM from virtual NVM 
latencies. It is supported only on IvyBridge, Haswell (and higher) Intel processor 
systems with 2 CPU sockets or more. A proper configuration as mentioned above and 
explicit calls to NVM memory allocation in the application's source code are required.
- The emulator will bind application threads to node 0 CPU and DRAM. The 
 other CPU socket will not be used for application threads and the DRAM 
from this second socket will be used as virtual NVM;
- The application must explicitly allocate virtual NVRAM memory using 
pmalloc(size) and pfree(pointer, size) API provided by the emulator. 

See the NVM programming section below.


NVM programming
---------------
The emulator provides an API for allocating and deallocating memory from NVM
space. It is possible to use this API on both NVM only and DRAM+NVM modes. 
However, it is really required to use this API in the DRAM+NVM mode so the 
emulator can clearly differentiate DRAM from NVM memory access latencies.
This is the API available for user applications:

    void *pmalloc(size_t size);
    void pfree(void *start, size_t size);

The application can include the NVM_EMUL/src/lib/pmalloc.h header file to
obtain the declarations of these functions.
See test/test_nvm.c and test/test_nvm_remote_dram.c for examples of how to
allocate memory on, respectively, local DRAM or virtual NVM in DRAM+NVM 
emulation mode.


Statistics
----------
The emulator collects statistical data to help on emulation accuracy validation.
If enabled, by default the emulator will show the statistics report when the 
user application terminates to the standard output. Some applications suppress
output to stdout, you can still see the reports by defining a target file for 
the report in the configuration file. When using a file as output, the emulator
appends the result to the file and then previous reports are not overwritten.
The statistics source code can also be statically removed at compile time. See 
Building section.

These are the reported statistics:

    - initialization duration   Time in micro seconds taken by the emulator to 
                                initialize.
    - running threads           The number of threads still running. If the report
                                was called automatically by the emulator, all user 
                                threads are already terminated.
    - terminated threads        Number of terminated threads, including the main
                                thread.
    For each application thread:
    - thread id                 Thread id.
    - cpu id                    CPU id the user thread was bound to.
    - spawn timestamp           Thread spawn timestamp as reported by the
                                monotonic time.
    - termination timestamp     Thread termination timestamp as reported by the
                                monotonic time.
    - execution time
    - stall cycles              Total number of CPU stalls caused by memory 
                                accesses made by this thread.
    - NVM accesses              Number of effective NVM accesses performed by
                                the application.
    - latency calculation overhead cycles     Overhead cycles caused by the 
                                              emulator and that could not be
                                              amortized. Zero is expected.
                                              Otherwise, consider increasing
                                              the epoch duration.
    - injected delay cycles     Total number of cycles injected by the emulator
                                to emulate the target latency.
    - injected delay in usec    Same value as above, but shown in micro seconds.
    - longest epoch duration    The effective longest epoch duration ever 
                                performed for this thread.
    - shortest epoch duration   The effective shortest epoch duration ever 
                                performed for this thread.
    - average epoch duration    The average epoch duration for this thread.
    - number of epochs          Total number of epochs performed for this 
                                thread.
    - epochs which didn't reach min duration   Number of epochs requested by 
                                               either Thread Monitor or thread 
                                               synchronizations, but were not 
                                               open since the epoch durations
                                               didn't reach the minimum epoch
                                               duration.
    - static epochs requested   Number of epochs requested by the Thread Monitor.


Support to PAPI
---------------
Performance API (PAPI) library may be used with the emulator and there are some 
hooks to switch the current CPU counters reading method to PAPI. Up to the time 
of this writing, there was no way to make PAPI CPU counter reading to perform 
at the performance level required by the emulation. In the future, if it is 
desired to switch to PAPI, follow these steps:
 - Device pmc_ioctl_setcounter() and emulator lib set_counter() in dev/pmc.c 
   calls can be deleted.
 - Define PAPI_SUPPORT for src/lib/* source code.
 - Compile with lib/cpu/pmc-papi.c rather than lib/cpu/pmc.c.
 - Link code with PAPI and add PAPI include directory.
 - Some extra tweaks may be required, check TODOs in the code.


Multiple emulated processes and MPI programs
--------------------------------------------
The emulator needs to bind user threads to specific CPU cores in order to 
optimize emulation results. It is required to export the EMUL_LOCAL_PROCESSES 
environment variable with the number of emulated processes on the host. The 
emulator will manage the emulated processes to partition the available CPUs in 
a coordinated way. It is recommended to set EMUL_LOCAL_PROCESSES to at most half 
the number of available CPU cores (note that DRAM+NVM mode already reserves half 
of the available CPU cores).

If EMUL_LOCAL_PROCESSES is not set or set with a value lower than 2, the 
emulator will not partition CPU cores per process.

If some process crashes, the emulator might not have cleaned up the environment
and the process rank ids will not be correctly managed. In this case, close all
emulated processes and delete files /tmp/emul_lock_file and 
/tmp/emul_process_local_rank if they exist.


Bandwidth emulation
-------------------
Quartz supports an emulation mode with "throttled" memory bandwidth. 

The memory bandwidth emulation makes use of the copy kernel from the Stream benchmark, 
OpenMP version. When the bandwidth emulation is enabled for the first time, Quartz
creates a memory bandwidth model by utilizing the available *Thermal Registers* in the 
Memory Controller and measuring the corresponding memory bandwidth. This initial step of 
building a model might take several minutes **(~10min)**.

For the memory bandwidth emulation, *turn off the latency modeling*
in the configuration file and select all available NUMA nodes in the 
configuration file in order to prepare the model for any combination of NUMA
node selections.

Modeling data will be cached to these files:

    /tmp/bandwidth_model
    /tmp/mc_pci_bus
As a first step, the emulator will detect the Memory Controller Thermal Registers
Control PCI addresses and cache them to /tmp/mc_pci_bus. After this step, the 
emulator will close the current execution to safely clear NUMA bindings. Rerun
the process to resume the work. 

Quartz will create the file: **/tmp/bandwidth_model**. 

It reflects the relationship between Thermal Registers and achievable memory 
bandwidth (in a single socket). The line format in this file is:

    read <thermal register value> <memory bandwidth MB/s>
This file should present ascending values of memory bandwidth ranging from
hundreds of MiB/s to tens of GiB/s. These values (or their approximations) 
can be used for the experiments with memory bandwidth throttling. Note that 
the model is built once: it is cached and then used for all later experiments.
You can also run a specially prepared automated script, *bandwidth-model-building.sh*, 
in directory *benchmark-tests*. For details see 
[README-BENCHMARKS-TESTING.md](README-BENCHMARKS-TESTING.md).

For example, to enable memory bandwidth throttling at 2 GB/s, you should change
the emulator configuration file  "nvmemul.ini" using the following settings:

    bandwidth:
    {
    enable = true;
    model = "/tmp/bandwidth_model";
    read = 2000;
    write = 2000;
    };

Both read and write bandwidth values must be set to the same value since the 
emulator does not model read/write independently in the current version. 
See the Limitations section.

The pmalloc() family is not intended to be used with the bandwidth modeling. Use
numactl, for instance, to bind the CPU and memory of the application to the 
intended NUMA node. The bandwidth emulator considers the virtual NVRAM 
node only (in the configuration with two sockets), so the application is required 
to keep processes/threads and data on the same NUMA node for bandwidth experiments.

Automated Benchmark Runs
-------------------------
We have created several automated tests with benchmark runs and output analysis 
for testing the correctness of the configured emulation environment and the accuracy 
of expected results. For details see 
[README-BENCHMARKS-TESTING.md](README-BENCHMARKS-TESTING.md).

Limitations
-----------
The emulator functionality may be affected by certain conditions in user 
applications:
 - application sets threads CPU and memory affinity.
 - application opens much more concurrent threads than available cores per 
   socket. Note that on DRAM+NVM emulation mode, half of the available CPU 
   cores is not used for user threads.
 - application sets handler for SIGUSR1.
Other:
 - Write memory latency is not yet implemented.
 - Write/Read memory bandwidth emulation cannot be set independently.
 - The signal handler may cause syscalls in the application to fail. It is
   recommended to implement retries at the application level as a good practice 
   for syscalls.
 - Child process from fork() calls are not tracked by the emulator. As a
   workaround, the emulator could make the library initialization function 
   available in the external API. Applications then should call this function
   in the beginning of the child process.
 - OpenMP applications may use synchronization primitives not based on
   pthreads which are currently not supported.
 - See the Todo list section for details.


Todo list
---------
Please see the accompanying TODO.dox or the extended documentation for an 
extensive list.

License
-------

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or (at
    your option) any later version. This program is distributed in the
    hope that it will be useful, but WITHOUT ANY WARRANTY; without even
    the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
    PURPOSE. See the GNU General Public License for more details. You
    should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation,
    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

    
Copyright
---------

	    (c) Copyright 2016 Hewlett Packard Enterprise Development LP

**NOTE**: This software depends on other packages that may be licensed under different open source licenses.


================================================
FILE: TODO.dox
================================================
/**
\file

\todo Improve performance counter API by making it more generic. For example, autogenerate pmc event_id using perf.
\todo Currently we may interrupt a thread to form a new epoch while it is blocked. This might cause accumulation of overhead cycles.
\todo Currently our bandwidth model cannot independently throttle read and write bandwidth as it relies on throttling DDR ACT transactions. We tried throttling DDR READ and DDR WRITE transactions but this didn't work.
\todo Extend library to interpose on other synchronization events we care about: semaphores, barriers, context switches, OpenMP sync primitives, etc.
\todo Currently our library does not support context switching. Extend the device driver to properly handle context switching: keep track of per-thread cpu counters, introduce proper delay at context switch points.
\todo Support uncacheable and write-through memory.
\todo Signal SIGUSR1 should be dedicated to the emulator. If the application makes use of this signal, the emulator will not work. Figure out a way to fix this limitation.
\todo Interpose pthread_cancel() and pthread_exit() to make sure the thread is always deregistered internally to the emulator?
\todo CPU counters overflow is not currently handled.
\todo Multiple processes emulation must be reviewed: log file per process, statistics report by process, process id and thread id indications in the log messages.
\todo See Limitations section in the README file.
*/


================================================
FILE: bench/CMakeLists.txt
================================================
# Add each benchmark's subdirectory to the build.
add_subdirectory(memlat)
add_subdirectory(new_memlat)
add_subdirectory(multilat)


================================================
FILE: bench/memlat/CMakeLists.txt
================================================
# Make the headers under src/lib visible to the benchmark sources.
include_directories(${CMAKE_SOURCE_DIR}/src/lib)
# Build the memlat executable from memlat.c and link it against the nvmemul
# and pthread libraries.
add_executable(memlat memlat.c)
target_link_libraries(memlat nvmemul pthread)


================================================
FILE: bench/memlat/memlat.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Capacity of the pthread_t array that main() keeps on its stack. */
#define MAX_NUM_THREADS 512

/* Benchmark parameters: main() fills them in from the command line and every
 * worker() passes them unchanged to measure_latency2(). */
uint64_t g_seed, g_nchains, g_nelems, g_from_node_id, g_to_node_id, g_element_size, g_access_size;

/* Not defined in this file. worker() prints the returned int as a latency in
 * nanoseconds. */
extern int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id);

/*
 * Parse the whole string `s` as an unsigned base-10 integer.
 *
 * The string must start with a decimal digit and consist of digits only, so
 * empty strings, leading whitespace, signs (strtoull() would silently wrap a
 * negative value into a huge positive one) and trailing garbage are rejected.
 * Values that do not fit the result type are rejected as well.
 *
 * On invalid input a message is written to stderr and the process exits with
 * EXIT_FAILURE. The checks are explicit rather than assert()-based so that
 * they are still performed when the file is compiled with NDEBUG.
 */
static uint64_t safe_strtoull(const char *s) {
    char *ep = NULL;
    unsigned long long r;

    if (NULL == s || *s < '0' || *s > '9') {
        fprintf(stderr, "invalid numeric argument: '%s'\n", (NULL != s) ? s : "(null)");
        exit(EXIT_FAILURE);
    }

    /* strtoull() reports overflow only through errno, so clear it first. */
    errno = 0;
    r = strtoull(s, &ep, 10);
    if ('\0' != *ep || ERANGE == errno || r > UINT64_MAX) {
        fprintf(stderr, "invalid numeric argument: '%s'\n", s);
        exit(EXIT_FAILURE);
    }

    return (uint64_t) r;
}


/*
 * Thread entry point: runs one latency measurement with the global benchmark
 * parameters and prints the result. The thread argument is not used.
 */
void* worker(void* arg) 
{
    const int measured_ns = measure_latency2(g_seed, g_nchains, g_nelems,
                                             g_element_size, g_access_size,
                                             g_from_node_id, g_to_node_id);

    (void) arg;
    printf("latency_ns: %d\n", measured_ns);

    return NULL;
}
/*
 * Parse the eight benchmark parameters from the command line, start Nthreads
 * worker() threads and wait for all of them to finish.
 *
 * Returns 0 on success and 1 on a usage error, when Nthreads exceeds
 * MAX_NUM_THREADS, or when a thread could not be created.
 */
int main(int argc, char *argv[]) {
    uint64_t i;
    uint64_t nthreads;
    uint64_t nstarted = 0;
    pthread_t thread[MAX_NUM_THREADS];
    int rc = 0;

    if (9 != argc) {
        fprintf(stderr, "usage: %s PRNGseed Nthreads Nchains Nelems SZelem SZaccess from_node to_node\n", argv[0]);
        return 1;
    }
    g_seed  = safe_strtoull(argv[1]);
    nthreads = safe_strtoull(argv[2]);
    g_nchains = safe_strtoull(argv[3]);
    g_nelems = safe_strtoull(argv[4]);
    g_element_size = safe_strtoull(argv[5]);
    g_access_size = safe_strtoull(argv[6]);
    g_from_node_id = safe_strtoull(argv[7]);
    g_to_node_id = safe_strtoull(argv[8]);

    /* thread[] has a fixed capacity; a larger request would overflow it. */
    if (nthreads > MAX_NUM_THREADS) {
        fprintf(stderr, "Nthreads must not exceed %d\n", MAX_NUM_THREADS);
        return 1;
    }

    for (i = 0; i < nthreads; i++) {
        int err = pthread_create(&thread[i], NULL, worker, NULL);
        if (0 != err) {
            fprintf(stderr, "cannot create thread %llu (error %d)\n", (unsigned long long) i, err);
            rc = 1;
            break;
        }
        nstarted++;
    }
    /* Join only the threads that were actually created. */
    for (i = 0; i < nstarted; i++) {
        pthread_join(thread[i], NULL);
    }
    return rc;
}


================================================
FILE: bench/multilat/CMakeLists.txt
================================================
# Make the headers under src/lib visible to the benchmark sources.
include_directories(${CMAKE_SOURCE_DIR}/src/lib)

# Build the multilat executable from multilat.c and link it against the
# nvmemul and pthread libraries.
add_executable(multilat multilat.c)
target_link_libraries(multilat nvmemul pthread)


================================================
FILE: bench/multilat/multilat.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
//#include <pthread.h>

#include "thread.h"
#include <sys/time.h>
#include "pmalloc.h"
#include "debug.h"
//#include "stat.h"


#define NDEBUG

//#ifndef NDEBUG
#include <sys/syscall.h>
//#endif

// packs the arguments received from user
// thread_fn() unpacks these fields and forwards them to thread_iter()
typedef struct {
	int mem_refs_dram;      // total number of DRAM accesses a thread performs
	int mem_refs_nvm;       // total number of NVM accesses a thread performs
	int interleave_dram;    // max DRAM accesses done per iteration of the measuring loop
	int interleave_nvm;     // max NVM accesses done per iteration of the measuring loop
	//int from_node;
	//int to_node;
} arg_s;


// for multi thread management
#define MAX_NUM_THREADS 50
pthread_t thread_desc[MAX_NUM_THREADS];
//pthread_mutex_t mutex;


// for CPU cache trashing and pointer chasing
#include <inttypes.h>
// One link of a pointer-chasing chain.
typedef struct {
	// force_ldm_stalls() uses this value as the index of the next element to visit
	uint64_t val;
	// zero-length array (GNU extension), adds no storage to the struct
	// NOTE(review): presumably marks the remainder of an element that is
	// chain_t.element_size bytes wide -- confirm against alloc_chain()/element()
	char padding[0];
} element_t;

// Descriptor of one pointer-chasing chain, as returned by alloc_chain().
// NOTE(review): field meanings below are inferred from the alloc_chain()
// parameter names; the code that fills them in is not in this file -- confirm.
typedef struct {
    uint64_t   N;              // presumably the number of elements in the chain
    uint64_t   element_size;   // presumably the size in bytes of one element
    element_t* head;           // presumably the first element of the chain's buffer
} chain_t;
// Helpers defined outside this file.
// thread_iter() calls trash_cache() once, after binding the thread to its CPU.
uint64_t trash_cache(uint64_t N);
// thread_iter() releases the returned chain with free().
chain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j);
// force_ldm_stalls() reads ->val of the returned element to get the next index.
element_t* element(chain_t* chain, uint64_t index);
// force_ldm_stalls() calls this only when access_size is greater than element_size.
void inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size);

// factor is 10 (could be more), to make sure we have a buffer much bigger than CPU cache
// the memory buffer is NOT shared among threads
// for now the cache size is hardcoded as 20 MB
#define NELEMS (10 * 20480000 / 64LLU)
#define PAGESZ 4096
#define MAX_NUM_CHAINS 16
//#undef USE_HUGETLB
#define SEED_IN 1
#define NCHAINS 1


/*extern inline hrtime_t hrtime_cycles(void);
static inline void delay_cycles(hrtime_t cycles)
{
    hrtime_t start, stop;

    start = hrtime_cycles();
    do {
        stop = hrtime_cycles();
    } while (stop - start < cycles);
}*/


// for fixing thread affinity to a single CPU after allocating memory chains and binding it to the local or remote nodes
/*
 * Determine how many CPUs the kernel's CPU mask can describe.
 *
 * The raw sched_getaffinity syscall is used because, unlike the library
 * wrapper, it returns the number of bytes the kernel wrote into the mask;
 * multiplying that by 8 gives the number of CPU bits. The call fails when the
 * supplied buffer is too small, so the buffer is doubled (starting at 2048
 * CPUs, up to 1024 * 1024) and the call retried.
 *
 * Returns the number of CPUs, or 0 (after printing a message) when it cannot
 * be determined.
 */
static int max_number_of_cpus(void)
{
    int cpus = 2048;

    for (;;) {
        /* The buffer size must follow `cpus`, otherwise a retry is pointless. */
        size_t setsize = CPU_ALLOC_SIZE(cpus);
        cpu_set_t *set = CPU_ALLOC(cpus);
        int n;

        if (!set)
            break;

        CPU_ZERO_S(setsize, set);
        /* the library version does not return size of cpumask_t */
        n = syscall(SYS_sched_getaffinity, 0, setsize, set);
        CPU_FREE(set);

        if (n >= 0)
            return n * 8;
        if (cpus >= 1024 * 1024)
            break;
        cpus *= 2;
    }

    printf("cannot determine NR_CPUS");
    return 0;
}

/*
 * Pin `thread` to the single CPU given by thread->cpu_id.
 *
 * Nothing is done when `thread` is NULL (the emulator is disabled) or when
 * the thread's affinity mask already consists of exactly that CPU.
 *
 * Returns 0 on success or when there is nothing to do, and 1 when the number
 * of CPUs cannot be determined, the CPU sets cannot be allocated, or the
 * affinity cannot be read or changed.
 */
static int bind_cpu(thread_t *thread) {
    size_t setsize;
    cpu_set_t *cur_cpuset = NULL;
    cpu_set_t *new_cpuset = NULL;
    int ncpus;
    int rc = 1;

    if (thread == NULL) {
        // if thread is NULL it means the emulator is disabled, return without setting CPU affinity
        return 0;
    }

    ncpus = max_number_of_cpus();
    if (ncpus <= 0) {
        return 1;
    }

    setsize = CPU_ALLOC_SIZE(ncpus);
    cur_cpuset = CPU_ALLOC(ncpus);
    new_cpuset = CPU_ALLOC(ncpus);
    if (cur_cpuset == NULL || new_cpuset == NULL) {
        DBG_LOG(ERROR, "Cannot allocate CPU sets for thread tid [%d]\n", thread->tid);
        goto out;
    }

    CPU_ZERO_S(setsize, cur_cpuset);
    CPU_ZERO_S(setsize, new_cpuset);
    CPU_SET_S(thread->cpu_id, setsize, new_cpuset);

    if (pthread_getaffinity_np(thread->pthread, setsize, cur_cpuset) != 0) {
        DBG_LOG(ERROR, "Cannot get thread tid [%d] affinity, pthread: 0x%lx on processor %d\n",
        		thread->tid, thread->pthread, thread->cpu_id);
        goto out;
    }

    // the sets are dynamically sized, so the sized comparison must be used:
    // CPU_EQUAL() would compare sizeof(cpu_set_t) bytes regardless of setsize
    if (CPU_EQUAL_S(setsize, cur_cpuset, new_cpuset)) {
        rc = 0;
        goto out;
    }

    DBG_LOG(INFO, "Binding thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id);

    if (pthread_setaffinity_np(thread->pthread, setsize, new_cpuset) != 0) {
        DBG_LOG(ERROR, "Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id);
        goto out;
    }

    rc = 0;

out:
    // both sets are released on every path
    CPU_FREE(cur_cpuset);
    CPU_FREE(new_cpuset);
    return rc;
}

/*
 * Chase `mem_refs` pointers through chain C[0] and time the walk.
 *
 * The walk starts at element index (it_n % it_limit) * mem_refs, where
 * it_limit is max_nelems / mem_refs (or 1 when mem_refs >= max_nelems), so
 * that consecutive calls with increasing it_n start at different elements.
 * If that window would run past max_nelems the walk starts at index 0.
 * From there each element's `val` gives the index of the next element.
 *
 * Only the single-chain case (NCHAINS == 1) is implemented; for any other
 * value no access is made and both the sum and the elapsed time are 0.
 *
 * C             array of chains; only C[0] is used
 * element_size  size of one element, compared against access_size
 * access_size   when greater than element_size, read_element() is also
 *               called for every visited element
 * mem_refs      number of pointers/elements to chase; when <= 0 the function
 *               returns 0 immediately and leaves *time_diff_ns untouched
 * max_nelems    max number of available elements/pointers
 * it_n          seed to calculate the first pointer to chase, used to avoid
 *               repeating pointers during consecutive calls
 * time_diff_ns  out: CLOCK_MONOTONIC time spent in the walk, in nanoseconds
 *
 * Returns the sum of the `val` fields of all visited elements.
 */
uint64_t force_ldm_stalls(chain_t **C,
                          int element_size,
                          int access_size,
                          int mem_refs,               // number of pointers/elements to chase
                          uint64_t max_nelems,        // max number of available elements/pointers
                          int it_n,                   // seed to calculate the first pointer to chase, used to avoid repeating
                                                      // pointers during consecutive calls
	                      unsigned long *time_diff_ns) {
    const int nchains = NCHAINS;
    const uint64_t buf_size = 16384;
    struct timespec time_start = {0, 0};
    struct timespec time_end = {0, 0};
    uint64_t sum = 0;
    uint64_t it_limit;
    uint64_t start;
    uint64_t i;
    char *buf;
    int count;

    assert(nchains < MAX_NUM_CHAINS);

    if (mem_refs <= 0) return 0;

    buf = (char*) malloc(buf_size);
    assert(buf != NULL);

    if (max_nelems > (uint64_t) mem_refs) {
        it_limit = max_nelems / (uint64_t) mem_refs;
    } else {
        it_limit = 1;
    }

    /* computed in 64 bits so that the product cannot overflow an int */
    start = ((uint64_t) it_n % it_limit) * (uint64_t) mem_refs;
    if ((start + (uint64_t) mem_refs) > max_nelems) {
        start = 0;
    }

    /* chase the pointers */
    if (nchains == 1) {
        clock_gettime(CLOCK_MONOTONIC, &time_start);
        // chase pointers until the 'mem_refs' count is reached
        for (count = 0, i = start; count < mem_refs; i = element(C[0], i)->val, ++count) {
            __asm__("");
            sum += element(C[0], i)->val;
            if (access_size > element_size) {
                read_element(C[0], i, buf, buf_size);
            }
        }
        clock_gettime(CLOCK_MONOTONIC, &time_end);
    }

    /* unsigned arithmetic: a negative nanosecond difference wraps back correctly */
    *time_diff_ns = (unsigned long) (time_end.tv_sec - time_start.tv_sec) * 1000000000UL +
                    (unsigned long) (time_end.tv_nsec - time_start.tv_nsec);

    free(buf);
    return sum;
}

/*
 * Take the next batch of accesses out of `*remaining`.
 *
 * Returns `batch_size` while more than that is still pending, otherwise
 * whatever is left. A non-positive `batch_size` means "no interleaving": the
 * whole remainder is taken at once, so the caller's loop always makes progress.
 */
static int take_refs_batch(int *remaining, int batch_size) {
    int taken = *remaining;

    if (batch_size > 0 && *remaining > batch_size) {
        taken = batch_size;
    }
    *remaining -= taken;
    return taken;
}

/*
 * Body of one benchmark thread.
 *
 * Allocates NCHAINS pairs of pointer-chasing chains with alloc_chain(),
 * binds the calling thread to its CPU, trashes the cache and then performs
 * `dram_refs` accesses on the first chain of each pair and `nvm_refs` on the
 * second one, interleaved in batches of at most `interleave_dram` /
 * `interleave_nvm` accesses. Finally it prints the average time per access
 * for each chain, the measured wall time and the time expected from those
 * averages.
 *
 * NOTE(review): the chains are allocated with node arguments (0, 0) and
 * (0, 1); presumably that makes the first one local DRAM and the second one
 * the remote node used as virtual NVM -- confirm against alloc_chain().
 */
void thread_iter(int dram_refs, int nvm_refs, int interleave_dram, int interleave_nvm) {
    long it_n;
    unsigned long time_dram, time_nvm, total_time_dram_ns, total_time_nvm_ns;
    uint64_t seed;
    uint64_t j;
    chain_t *C_dram[MAX_NUM_CHAINS];
    chain_t *C_nvm[MAX_NUM_CHAINS];
    int missing_dram_refs, missing_nvm_refs;
    int dram_stalls, nvm_stalls;
    struct timespec task_time_start, task_time_end;
    unsigned long task_time_diff_ns;

    assert(NELEMS < UINT64_MAX);

    for (j=0; j < NCHAINS; j++) {
        seed = SEED_IN + j*j;
        C_dram[j] = alloc_chain(seed, NELEMS, 64LLU, 0, 0);
        C_nvm[j] = alloc_chain(seed, NELEMS, 64LLU, 0, 1);
        __asm__("");
    }

    bind_cpu(thread_self());

    // cache must be trashed after bind_cpu() call
    trash_cache(NELEMS);

    total_time_dram_ns = 0;
    total_time_nvm_ns = 0;

    missing_dram_refs = dram_refs;
    missing_nvm_refs = nvm_refs;

#ifndef NDEBUG
    printf("DRAM accesses to be made: %d\n", dram_refs);
    printf("NVM accesses to be made: %d\n", nvm_refs);
#endif

    clock_gettime(CLOCK_MONOTONIC, &task_time_start);

    for (it_n = 0; (missing_dram_refs > 0) || (missing_nvm_refs > 0); ++it_n) {
        __asm__("");

        // calculate the number of memory accesses to be made on each memory type
        dram_stalls = take_refs_batch(&missing_dram_refs, interleave_dram);
        nvm_stalls = take_refs_batch(&missing_nvm_refs, interleave_nvm);

        time_dram = 0;
        time_nvm = 0;

        // do memory accesses interleaved by dividing the number of accesses in smaller amount
        // as configured by user
        force_ldm_stalls(C_dram, 64LLU, 8, dram_stalls, NELEMS, it_n, &time_dram);
        force_ldm_stalls(C_nvm, 64LLU, 8, nvm_stalls, NELEMS, it_n, &time_nvm);

        total_time_dram_ns += time_dram;
        total_time_nvm_ns += time_nvm;
#ifndef NDEBUG
        printf("%d DRAM accesses took: %lu ns\n", dram_stalls, time_dram);
        printf("%d NVM accesses took: %lu ns\n", nvm_stalls, time_nvm);
#endif
    }

    clock_gettime(CLOCK_MONOTONIC, &task_time_end);
    // unsigned arithmetic: a negative nanosecond difference wraps back correctly
    task_time_diff_ns = (unsigned long) (task_time_end.tv_sec - task_time_start.tv_sec) * 1000000000UL +
                        (unsigned long) (task_time_end.tv_nsec - task_time_start.tv_nsec);

    // the memory latency is the total time divided by the number of accesses for each memory type
    if (dram_refs > 0)
        total_time_dram_ns /= dram_refs;
    else
        total_time_dram_ns = 0;
    if (nvm_refs > 0)
        total_time_nvm_ns /= nvm_refs;
    else
        total_time_nvm_ns = 0;

    printf("DRAM latency: %lu ns\n", total_time_dram_ns);
    printf("NVM latency: %lu ns\n", total_time_nvm_ns);
    printf("Measure time: %.3lf ms\n", (double)task_time_diff_ns/1000000.0);
    
    printf("Expected time: %.3lu ms\n", ((total_time_dram_ns * dram_refs) + (total_time_nvm_ns * nvm_refs)) / 1000000);

    for (j=0; j < NCHAINS; j++) {
        free(C_dram[j]);
        free(C_nvm[j]);
    }
}

void *thread_fn(void *arg) {
	int interleave_dram = ((arg_s *) arg)->interleave_dram;
	int interleave_nvm = ((arg_s *) arg)->interleave_nvm;
	int dram_refs = ((arg_s *) arg)->mem_refs_dram;
	int nvm_refs = ((arg_s *) arg)->mem_refs_nvm;

	thread_iter(dram_refs, nvm_refs, interleave_dram, interleave_nvm);

	return 0;
}

/*
 * Validates the benchmark parameters, starts n_threads threads running
 * thread_fn() and waits for all of them to finish.
 *
 * n_threads:        number of threads, must be in 1..MAX_NUM_THREADS
 * dram_refs:        total DRAM accesses per thread (>= 0)
 * nvm_refs:         total NVM accesses per thread (>= 0)
 * interleaved_dram: DRAM accesses performed in sequence (>= 0, <= dram_refs,
 *                   non-zero when dram_refs > 0)
 * interleaved_nvm:  NVM accesses performed in sequence (>= 0, <= nvm_refs,
 *                   non-zero when nvm_refs > 0)
 *
 * Any invalid argument, a pthread_attr_init() failure or a pthread_create()
 * failure terminates the process with exit(-1). When thread creation fails,
 * the threads that were already started are joined first.
 */
void run_threads(int n_threads, int dram_refs, int nvm_refs, int interleaved_dram, int interleaved_nvm)
{
    pthread_attr_t attr;
    int i;
    int rc;
    int n_created = 0;
    arg_s args;

    if ((n_threads > MAX_NUM_THREADS) || (n_threads <= 0)) {
        printf("INVALID RANGE:\n");
        printf("\tMax number of threads is %d\n", MAX_NUM_THREADS);
        exit(-1);
    }

    if (dram_refs < 0 || nvm_refs < 0 || interleaved_dram < 0 || interleaved_nvm < 0) {
        printf("INVALID RANGE:\n");
        printf("\tdram refs: %d, nvm refs: %d, interleaved dram refs: %d, interleaved nvm refs: %d\n",
                dram_refs, nvm_refs, interleaved_dram, interleaved_nvm);
        exit(-1);
    }

    if ((dram_refs > 0 && interleaved_dram == 0) || (nvm_refs > 0 && interleaved_nvm == 0)) {
        printf("INVALID ARGUMENTS:\n");
        printf("\tnumber of accesses in sequence cannot be zero if the number of accesses for the same memory type is greater than zero.\n");
        exit(-1);
    }

    if (dram_refs < interleaved_dram) {
        printf("INVALID ARGUMENTS:\n");
        printf("\tnumber of DRAM accesses cannot be lower than the number of DRAM accesses in sequence\n");
        exit(-1);
    }
    if (nvm_refs < interleaved_nvm) {
        printf("INVALID ARGUMENTS:\n");
        printf("\tnumber of NVM accesses cannot be lower than the number of NVM accesses in sequence\n");
        exit(-1);
    }

    if (pthread_attr_init(&attr) != 0) {
        printf("pthread_attr_init failed\n");
        exit(-1);
    }

    //srand(time(NULL));

    // every thread receives the address of the same args structure; it stays
    // alive on this stack frame until all started threads have been joined
    args.interleave_dram = interleaved_dram;
    args.interleave_nvm = interleaved_nvm;
    args.mem_refs_dram = dram_refs;
    args.mem_refs_nvm = nvm_refs;

    for (i = 0; i < n_threads; ++i) {
        rc = pthread_create(&thread_desc[i], &attr, thread_fn, (void *)&args);
        if (rc != 0) {
            // thread_desc[i] is not a valid handle after a failed create, so
            // stop here and remember how many threads really exist
            printf("pthread_create failed for thread %d (error %d)\n", i, rc);
            break;
        }
        ++n_created;
    }

    pthread_attr_destroy(&attr);

    // join only the threads that were actually started
    for (i = 0; i < n_created; ++i) {
        pthread_join(thread_desc[i], NULL);
    }

    if (n_created != n_threads) {
        exit(-1);
    }
}

/*
 * Program entry point. Expects exactly five positional arguments (number of
 * threads, total DRAM accesses, total NVM accesses, DRAM accesses in
 * sequence, NVM accesses in sequence), converts each with atoi() and hands
 * them to run_threads().
 *
 * Returns 0 after run_threads() returns, or -1 (after printing the usage
 * text) when the argument count is wrong.
 */
int main(int argn, char **argv)
{
    int n_threads, dram_refs, nvm_refs, interleaved_dram, interleaved_nvm;

    if (argn == 6) {
        n_threads = atoi(argv[1]);
        dram_refs = atoi(argv[2]);
        nvm_refs = atoi(argv[3]);
        interleaved_dram = atoi(argv[4]);
        interleaved_nvm = atoi(argv[5]);

        run_threads(n_threads, dram_refs, nvm_refs, interleaved_dram, interleaved_nvm);
        return 0;
    }

    printf("INVALID ARGUMENTS:\n");
    printf("\t%s [# threads] [# total dram accesses] [# total nvm accesses] [# dram accesses in sequence] [# nvm accesses in sequence]\n", argv[0]);
    return -1;
}


================================================
FILE: bench/new_memlat/CMakeLists.txt
================================================
# Make the headers under src/lib visible to this benchmark's sources.
include_directories(${CMAKE_SOURCE_DIR}/src/lib)
# Build the new_memlat executable from memlat.c.
add_executable(new_memlat memlat.c)
# Link new_memlat against nvmemul and pthread.
target_link_libraries(new_memlat nvmemul pthread)


================================================
FILE: bench/new_memlat/memlat.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <pthread.h>
#include "model.h"
#include "thread.h"

#define MAX_NUM_THREADS 512

uint64_t g_seed, g_nchains, g_nelems, g_from_node_id, g_to_node_id, g_element_size, g_access_size;

extern int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id);

/*
 * Parses s as an unsigned decimal integer and returns its value.
 *
 * The whole string must consist of decimal digits: a null pointer, an empty
 * string, a leading sign or whitespace, or any trailing character is treated
 * as a usage error, reported on stderr, and terminates the process with
 * exit(1). The checks are performed at run time, so they remain active when
 * the program is built with NDEBUG.
 *
 * Values too large for unsigned long long are not detected here; strtoull()
 * saturates them.
 */
static uint64_t safe_strtoull(const char *s) {
    char *ep;
    uint64_t r;

    // requiring a leading digit rejects "", "-1", "+1" and " 1" up front;
    // strtoull() itself would accept a sign and silently wrap negatives
    if (NULL == s || *s < '0' || *s > '9') {
        fprintf(stderr, "invalid numeric argument: '%s'\n", NULL != s ? s : "(null)");
        exit(1);
    }

    r = strtoull(s, &ep, 10);
    if ('\0' != *ep) {
        fprintf(stderr, "invalid numeric argument: '%s'\n", s);
        exit(1);
    }
    return r;
}

extern latency_model_t latency_model;

#ifdef MEMLAT_SUPPORT
extern __thread int tls_hw_local_latency;
extern __thread int tls_hw_remote_latency;
extern __thread uint64_t tls_global_remote_dram;
extern __thread uint64_t tls_global_local_dram;

/*
 * Converts a duration in nanoseconds to CPU cycles for a clock running at
 * cpu_speed_mhz MHz: cycles = MHz * ns / 1000 (integer division).
 *
 * The product is computed in 64 bits so that it cannot overflow the int
 * range before being widened to the return type.
 */
static inline uint64_t ns_to_cycles(int cpu_speed_mhz, int ns)
{
    return ((uint64_t) cpu_speed_mhz * (uint64_t) ns) / 1000;
}
#endif

#ifdef MEMLAT_SUPPORT
/*
 * Absolute difference between target_ns and measured_ns expressed as a
 * percentage of target_ns. The difference is taken on signed 64-bit values
 * so it is not truncated or wrapped before the absolute value is computed.
 */
static double latency_error_pct(long long target_ns, long long measured_ns)
{
    long long delta = target_ns - measured_ns;

    if (delta < 0) {
        delta = -delta;
    }
    return ((double)delta * 100.0) / (double)target_ns;
}
#endif

/*
 * Thread entry point: runs measure_latency2() with the globally configured
 * benchmark parameters and prints the resulting latency.
 *
 * When built with MEMLAT_SUPPORT it additionally prints a comparison between
 * the measurement, the configured latency_model.read_latency and the stall
 * cycle / access counters of the calling thread. The remote counters are used
 * when the thread's virtual node has different dram and nvram nodes, the
 * local ones otherwise.
 *
 * arg is unused. Returns NULL.
 */
void* worker(void* arg)
{
    int latency_ns;
#ifdef MEMLAT_SUPPORT
    uint64_t exp_stalls;
    uint64_t calc_nvm_accesses = 0;
    uint64_t detected_hw_lat;
    uint64_t actual_lat = 0;
    uint64_t total_time;
    uint64_t fixed_latency_ns = 0;
    uint64_t nvm_accesses = 0;
    uint64_t nvm_hw_latency;
    uint64_t stall_cycles;
    uint64_t node_accesses;
    int hw_latency_ns;
#endif

    (void)arg;

    latency_ns = measure_latency2(g_seed, g_nchains, g_nelems, g_element_size, g_access_size, g_from_node_id, g_to_node_id);
    printf("latency_ns: %d ns\n", latency_ns);

#ifdef MEMLAT_SUPPORT
    total_time = g_nelems * latency_ns;
    // take one snapshot so every derived figure below uses the same value
    stall_cycles = thread_self()->stall_cycles;

    if (thread_self()->virtual_node->dram_node != thread_self()->virtual_node->nvram_node) {
        hw_latency_ns = tls_hw_remote_latency;
        node_accesses = tls_global_remote_dram;
    } else {
        hw_latency_ns = tls_hw_local_latency;
        node_accesses = tls_global_local_dram;
    }
    detected_hw_lat = ns_to_cycles(thread_self()->cpu_speed_mhz, hw_latency_ns);
    nvm_hw_latency = hw_latency_ns;

    if (node_accesses > 0) {
        actual_lat = stall_cycles / node_accesses;
        fixed_latency_ns = total_time / node_accesses;
        nvm_accesses = node_accesses;
    }

    exp_stalls = g_nelems * detected_hw_lat;
    // a zero hardware latency would otherwise be a division by zero
    if (detected_hw_lat > 0) {
        calc_nvm_accesses = stall_cycles / detected_hw_lat;
    }

    printf("target latency: %d ns\n", latency_model.read_latency);
    printf("Error: %3.1f%%\n", latency_error_pct(latency_model.read_latency, latency_ns));
    printf("target NVM accesses: %llu\n", (unsigned long long)g_nelems);
    printf("detected HW latency: %llu ns\n", (unsigned long long)nvm_hw_latency);
    printf("detected HW latency: %llu cycles (detected_hw_lat making use of cpu_speed_mhz)\n", (unsigned long long)detected_hw_lat);
    printf("expected CPU stalls: %llu cycles (target_nvm_accesses * detected_hw_lat)\n", (unsigned long long)exp_stalls);
    printf("actual CPU stalls: %llu cycles\n", (unsigned long long)stall_cycles);
    printf("calculated NVM accesses: %llu (actual_cpu_stalls / detected_hw_lat)\n", (unsigned long long)calc_nvm_accesses);
    if (nvm_accesses != 0) {
        printf("actual NVM accesses: %llu\n", (unsigned long long)nvm_accesses);
        printf("actual latency: %llu cyles (actual_stalls / actual_nvm_accesses)\n", (unsigned long long)actual_lat);
        printf("fixed measured latency: %llu ns (total_chasing_time / actual_nvm_accesses)\n", (unsigned long long)fixed_latency_ns);
    } else if (calc_nvm_accesses != 0) {
        fixed_latency_ns = total_time / calc_nvm_accesses;
        printf("fixed measured latency: %llu ns (total_chasing_time / calculated_nvm_accesses)\n", (unsigned long long)fixed_latency_ns);
    } else {
        // neither counter saw an access: report 0 instead of dividing by zero
        printf("fixed measured latency: 0 ns (no NVM accesses detected)\n");
    }
    printf("fixed latency error: %3.1f%%\n", latency_error_pct(latency_model.read_latency, (long long)fixed_latency_ns));
#endif
    return NULL;
}
/*
 * Program entry point. Parses the eight positional arguments (PRNG seed,
 * thread count, chain count, element count, element size, access size,
 * source node, target node) into the g_* globals, starts Nthreads threads
 * running worker() and waits for them.
 *
 * Returns 0 on success and 1 on a usage error, when Nthreads exceeds
 * MAX_NUM_THREADS (the capacity of the local thread array), or when a thread
 * could not be created. Threads that were started are always joined.
 */
int main(int argc, char *argv[]) {
    uint64_t i;
    uint64_t nthreads;
    uint64_t nstarted = 0;
    int rc;
    pthread_t thread[MAX_NUM_THREADS];

    if (9 != argc) {
        fprintf(stderr, "usage: %s PRNGseed Nthreads Nchains Nelems SZelem SZaccess from_node to_node\n", argv[0]);
        return 1;
    }
    g_seed  = safe_strtoull(argv[1]);
    nthreads = safe_strtoull(argv[2]);
    g_nchains = safe_strtoull(argv[3]);
    g_nelems = safe_strtoull(argv[4]);
    g_element_size = safe_strtoull(argv[5]);
    g_access_size = safe_strtoull(argv[6]);
    g_from_node_id = safe_strtoull(argv[7]);
    g_to_node_id = safe_strtoull(argv[8]);

    // thread[] holds MAX_NUM_THREADS handles; refuse anything larger
    if (nthreads > MAX_NUM_THREADS) {
        fprintf(stderr, "Nthreads must not exceed %d\n", MAX_NUM_THREADS);
        return 1;
    }

    for (i = 0; i < nthreads; i++) {
        rc = pthread_create(&thread[i], NULL, worker, NULL);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed for thread %llu (error %d)\n", (unsigned long long)i, rc);
            break;
        }
        nstarted++;
    }
    // join only the handles that pthread_create() actually filled in
    for (i = 0; i < nstarted; i++) {
        pthread_join(thread[i], NULL);
    }
    return (nstarted == nthreads) ? 0 : 1;
}


================================================
FILE: bench/new_memlat/memlat.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

# percentage of error as threshold to discard outliers, anything above this percentage will be discarded
MAX_ERROR_PERCENTAGE=10
# max number of tries to execute memlat
MAX_TRIES=10


# file that receives the combined stdout/stderr of each new_memlat run
TEMP_FILE=/tmp/tmp_memlat.out


# directory two levels above the one containing this script
NVM_EMUL_PATH="`dirname $0`/../.."
# positional arguments: number of elements and dram target (0=local, 1=remote)
NELEMS=$1
TARGET_DRAM=$2

# Print the command-line synopsis and terminate the script with status 1.
function usage()
{
    printf '%s\n' "$0 [number of elements] [0=local dram|1=remote dram]"
    exit 1
}

# Return 0 when $1 is a non-empty string made only of decimal digits,
# 1 otherwise.
function validate_decimal()
{
    re='^[0-9]+$'
    [[ $1 =~ $re ]] && return 0
    return 1
}

# Validate the command line: exactly two arguments must have been passed to
# this function, NELEMS must be a decimal number and TARGET_DRAM must be a
# decimal number equal to 0 or 1. On any violation a message is printed and
# usage is called, which terminates the script.
function check_parameters()
{
    if [ $# -ne 2 ]; then
        echo "Incorrect arguments"
        usage
    fi

    if ! validate_decimal "${NELEMS}"; then
        echo "Invalid number of elements"
        usage
    fi

    # check that the target is numeric first: a non-numeric value makes the
    # integer comparison itself fail, which would let the value through
    if ! validate_decimal "${TARGET_DRAM}" || { [ "${TARGET_DRAM}" -ne 0 ] && [ "${TARGET_DRAM}" -ne 1 ]; }; then
        echo "Incorrect dram target"
        usage
    fi
}

# Check the output captured in TEMP_FILE: read the value of the first
# "target latency" line (field 3) and of the first "measured latency" line
# (field 4) and compute the relative error in percent.
# Returns 0 when the error does not exceed MAX_ERROR_PERCENTAGE, 1 when it
# does or when either value is missing or not a decimal number.
function verify_run
{
    target=$(awk '/target latency/ { print $3; exit }' "${TEMP_FILE}")
    measured=$(awk '/measured latency/ { print $4; exit }' "${TEMP_FILE}")

    # a run that did not report both values cannot be judged valid
    if ! [[ ${target} =~ ^[0-9]+$ && ${measured} =~ ^[0-9]+$ ]]; then
        return 1
    fi

    if [ "${measured}" -gt "${target}" ]; then
        delta=$(expr "${measured}" - "${target}");
    else
        delta=$(expr "${target}" - "${measured}");
    fi

    if [ "${target}" -gt 0 ]; then
        error=$(expr "${delta}" \* 100)
        error=$(expr "${error}" \/ "${target}")
    else
        error=0
    fi


    if [ "${error}" -gt "${MAX_ERROR_PERCENTAGE}" ]; then
        return 1
    fi

    return 0
}

############ MAIN ######################

# pass the arguments through unchanged so the count check sees what the
# user really typed
check_parameters "$@"

# stays non-zero if the loop below never produces a valid run
ret=1

# execute memlat in loop until the result is within the threshold or the max tries is reached
for (( c=0; c<${MAX_TRIES}; c++ )); do
    "${NVM_EMUL_PATH}/scripts/runenv.sh" "${NVM_EMUL_PATH}/build/bench/new_memlat/new_memlat" 1 1 1 "${NELEMS}" 64 8 0 "${TARGET_DRAM}" &> "${TEMP_FILE}"

    verify_run

    ret=$?

    if [ ${ret} -eq 0 ]; then
        grep "measured latency" "${TEMP_FILE}"
        break
    fi
done

if [ ${ret} -ne 0 ]; then
    echo "Could not produce a valid run"
fi

rm -f "${TEMP_FILE}"

exit ${ret}


================================================
FILE: benchmark-tests/bandwidth-model-building.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

# Set the cpufreq scaling governor of every CPU to "performance" (uses sudo).
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# Install the bandwidth configuration as the active nvmemul.ini and remove
# /tmp/bandwidth_model, the path named by the "model" setting of
# nvmemul-bandwidth.ini.
cp  nvmemul-bandwidth.ini  nvmemul.ini
rm /tmp/bandwidth_model
# Run memlat twice with identical arguments.
# NOTE(review): presumably the first run builds the bandwidth model and the
# second one uses it -- confirm.
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0


================================================
FILE: benchmark-tests/memlat-bench-test-10M-single-socket.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

# Latency accuracy sweep with physical_nodes forced to "0": for each emulated
# read latency, memlat is run ten times and min/average/max measured latency
# plus the average and maximum error are appended to a summary file.

#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini

# Set the cpufreq scaling governor of every CPU to "performance" (uses sudo).
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# dir_name_res holds raw memlat output, dir_name_sum the per-sweep summaries
dir_name_res=FULL-RESULTS-test
dir_name_sum=SUMMARY-RESULTS-test

# start from empty output directories and remove leftover foo* scratch files
rm -rf $dir_name_sum
mkdir  $dir_name_sum

rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res

# record the governor that is actually in effect
cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor >> $dir_name_res/foo-runs-test

# one initial run with the unmodified configuration; its output goes to foo
cp nvmemul-orig.ini nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0 >foo


    for numchains in 1 
    do
	for epoch in 10000 
	do 
	    # header line of the summary file for this epoch/numchains pair
	    echo "#FORMAT #1_emul_lat(ns) #2_min_meas_lat(ns)  #3_aver_meas_lat(ns)  #4_max_meas_lat(ns)  #5_aver_error(%) #6_max_error(%)" >  $dir_name_sum/summary-nvm-lat-accuracy-epoch-$epoch-numchains-$numchains.txt

	    for lat in 200 300 400 500 600 700 800 900 1000
	    do
		# Generate nvmemul.ini from nvmemul-orig.ini: line 7 (read) gets
		# $lat, lines 9 and 10 (max/min_epoch_duration_us) get $epoch and
		# the physical_nodes line is rewritten to "0". The values reach
		# awk as the trailing v=... arguments and are recovered from ARGV.
		awk 'BEGIN {read_lat = substr(ARGV[2],3); epoch_lat = substr(ARGV[3],3);}
(!(NR==7 || NR==9 || NR==10 || $1~/physical_nodes/)){ print;}
(NR==7){ print $1,$2, read_lat,";";}
(NR==9){ print $1,$2, epoch_lat,";";}
(NR==10){ print $1,$2, epoch_lat,";";}
($1~/physical_nodes/) {print $1,$2,"\"0\""";";}
' nvmemul-orig.ini v=$lat v=$epoch > foo-nvmemul-$lat-$epoch.ini
		mv foo-nvmemul-$lat-$epoch.ini  nvmemul.ini
		echo "lat epoch chains" $lat $epoch $numchains >>   $dir_name_res/foo-runs
		
		# ten repetitions, all appended to the same raw result file
		for time in 1 2 3 4 5 6 7 8 9 10
		do
		    ../build/bench/memlat/memlat 1 1 $numchains 10000000 64 8 0 0 >> $dir_name_res/full_results-$lat-$epoch-$numchains.txt
 		done
                # keep only the lines containing "latency_ns"
                grep latency_ns $dir_name_res/full_results-$lat-$epoch-$numchains.txt > $dir_name_res/results-$lat-$epoch-$numchains.txt
		# Summarise field 2 of those lines: min, average and max, the
		# error of the average and the largest single-run error, both as
		# a percentage of the emulated latency.
		awk 'BEGIN {max = 0; min = 1000000; sum = 0; aver=0.0; max_error=0.0; aver_error=0.0;read_lat = substr(ARGV[2],3);epoch_lat = substr(ARGV[3],3); MPL = substr(ARGV[4],3); }
($2 > max){max = $2;}
($2 < min){min = $2;}
{sum=sum+$2; if ($2 < read_lat*1.0) {error=read_lat -$2} else {error=$2 - read_lat}; if (error > max_error) max_error=error;}
END {aver=sum/NR; if (aver < read_lat*1.0) {aver_error = (read_lat - aver)*100.0/read_lat} else {aver_error = (aver - read_lat )*100.0/read_lat}; print read_lat, min,aver,max, aver_error,max_error*100.0/read_lat;} '   $dir_name_res/results-$lat-$epoch-$numchains.txt v=$lat v=$epoch v=$numchains >> $dir_name_sum/summary-nvm-lat-accuracy-epoch-$epoch-numchains-$numchains.txt
		
	    done
	done
    done


#FORMAT_summary-results: #1_nvm_lat(ns) #2_min_nvm_lat(ns)  #3_aver_nvm_lat(ns)  #4_max_nvm_lat(ns)  #5_aver_error(%) #6_max_error(%)

#parameter is nvm_lat


================================================
FILE: benchmark-tests/memlat-bench-test-10M.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

# Latency accuracy sweep for a "local" (physical_nodes "0") and a "remote"
# (physical_nodes "0,1") configuration: for each emulated read latency,
# memlat is run ten times and min/average/max measured latency plus the
# average and maximum error are appended to a summary file.

#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini

# count the distinct "physical id" values; with a single one, delegate to
# the single-socket variant of this script and stop
num_sockets=$(cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l)
if [ $num_sockets -eq 1 ]; 
then
echo "Single Socket"
./memlat-bench-test-10M-single-socket.sh
exit 0
fi

# Set the cpufreq scaling governor of every CPU to "performance" (uses sudo).
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# dir_name_res holds raw memlat output, dir_name_sum the per-sweep summaries
dir_name_res=FULL-RESULTS-test
dir_name_sum=SUMMARY-RESULTS-test

# start from empty output directories and remove leftover foo* scratch files
rm -rf $dir_name_sum
mkdir  $dir_name_sum

rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res

# record the governor that is actually in effect
cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor >> $dir_name_res/foo-runs-test

# one initial run with the unmodified configuration; its output goes to foo
cp nvmemul-orig.ini nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1 >foo

for conf in local remote
do
    # confpar is passed as the last memlat argument: 0 for local, 1 for remote
    if [ $conf = local ]; then confpar=0 
    else confpar=1
    fi
    for numchains in 1 
    do
	for epoch in 10000 
	do 
	    # header line of the summary file for this conf/epoch/numchains
	    echo "#FORMAT #1_emul_lat(ns) #2_min_meas_lat(ns)  #3_aver_meas_lat(ns)  #4_max_meas_lat(ns)  #5_aver_error(%) #6_max_error(%)" >  $dir_name_sum/summary-nvm-lat-accuracy-$conf-epoch-$epoch-numchains-$numchains.txt

	    for lat in 200 300 400 500 600 700 800 900 1000
	    do
		# Generate nvmemul.ini from nvmemul-orig.ini: line 7 (read) gets
		# $lat, lines 9 and 10 (max/min_epoch_duration_us) get $epoch and
		# the physical_nodes line becomes "0" (local) or "0,1" (remote).
		# The values reach awk as the trailing v=... arguments and are
		# recovered from ARGV.
		awk 'BEGIN {read_lat = substr(ARGV[2],3); epoch_lat = substr(ARGV[3],3); config = substr(ARGV[4],3);}
(!(NR==7 || NR==9 || NR==10 || $1~/physical_nodes/)){ print;}
(NR==7){ print $1,$2, read_lat,";";}
(NR==9){ print $1,$2, epoch_lat,";";}
(NR==10){ print $1,$2, epoch_lat,";";}
($1~/physical_nodes/ && config ~ /local/) {print $1,$2,"\"0\""";";}
($1~/physical_nodes/ && config ~ /remote/) {print $1,$2,"\"0,1\""";";}
' nvmemul-orig.ini v=$lat v=$epoch v=$conf > foo-nvmemul-$lat-$epoch.ini
		mv foo-nvmemul-$lat-$epoch.ini  nvmemul.ini
		echo "lat epoch chains" $lat $epoch $numchains >>   $dir_name_res/foo-runs
		
		# ten repetitions, all appended to the same raw result file
		for time in 1 2 3 4 5 6 7 8 9 10
		do
		    ../build/bench/memlat/memlat 1 1 $numchains 10000000 64 8 0 $confpar >> $dir_name_res/full_results-$conf-$lat-$epoch-$numchains.txt
 		done
                # keep only the lines containing "latency_ns"
                grep latency_ns $dir_name_res/full_results-$conf-$lat-$epoch-$numchains.txt > $dir_name_res/results-$conf-$lat-$epoch-$numchains.txt
		# Summarise field 2 of those lines: min, average and max, the
		# error of the average and the largest single-run error, both as
		# a percentage of the emulated latency.
		awk 'BEGIN {max = 0; min = 1000000; sum = 0; aver=0.0; max_error=0.0; aver_error=0.0;read_lat = substr(ARGV[2],3);epoch_lat = substr(ARGV[3],3); MPL = substr(ARGV[4],3); }
($2 > max){max = $2;}
($2 < min){min = $2;}
{sum=sum+$2; if ($2 < read_lat*1.0) {error=read_lat -$2} else {error=$2 - read_lat}; if (error > max_error) max_error=error;}
END {aver=sum/NR; if (aver < read_lat*1.0) {aver_error = (read_lat - aver)*100.0/read_lat} else {aver_error = (aver - read_lat )*100.0/read_lat}; print read_lat, min,aver,max, aver_error,max_error*100.0/read_lat;} '   $dir_name_res/results-$conf-$lat-$epoch-$numchains.txt v=$lat v=$epoch v=$numchains >> $dir_name_sum/summary-nvm-lat-accuracy-$conf-epoch-$epoch-numchains-$numchains.txt
		
	    done
	done
    done
done


#FORMAT_summary-results: #1_nvm_lat(ns) #2_min_nvm_lat(ns)  #3_aver_nvm_lat(ns)  #4_max_nvm_lat(ns)  #5_aver_error(%) #6_max_error(%)

#parameter is nvm_lat


================================================
FILE: benchmark-tests/memlat-orig-lat-test-single-socket.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

# Runs memlat 20 times with the nvmemul-debug.ini configuration, extracts one
# latency value per run from its output and writes min/average/max of those
# values to ORIG-lat-test/final-hw-latency.txt.

#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini

# Set the cpufreq scaling governor of every CPU to "performance" (uses sudo).
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

dir_name_res=ORIG-lat-test

# start from an empty result directory and remove leftover foo* scratch files
rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res


# one initial run whose output is not captured
cp  nvmemul-debug.ini  nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0

for time in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
do
    # each run overwrites foo-hw-latency.txt; field 7 of the first line
    # containing "measuring latency: latency is" is appended to the list
    ../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0 > $dir_name_res/foo-hw-latency.txt
    grep "measuring latency: latency is" $dir_name_res/foo-hw-latency.txt > $dir_name_res/foo
    awk 'NR==1 {local=$7;}
         END {print local}'  $dir_name_res/foo >>  $dir_name_res/list-hw-latency.txt
done

echo "#FORMAT:#1_min #2_aver #3_max" > $dir_name_res/final-hw-latency.txt  

# min, average and max of column 1 of the collected list
awk 'BEGIN {max1 = 0.0; min1 = 10000000.0; sum1 = 0.0;}
         ($1 > max1){max1 = $1;}
         ($1 < min1){min1 = $1;}
         {sum1=sum1+$1;sum2=sum2+$2;}
         END {print min1, sum1/NR, max1;}'  $dir_name_res/list-hw-latency.txt  >> $dir_name_res/final-hw-latency.txt  

# remove the per-run scratch files, keeping the list and the final summary
rm  $dir_name_res/foo*


================================================
FILE: benchmark-tests/memlat-orig-lat-test.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

# Runs memlat 20 times with the nvmemul-debug.ini configuration, extracts two
# latency values per run (stored as "local" and "remote") from its output and
# writes min/average/max of each to ORIG-lat-test/final-hw-latency.txt.

#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini

# count the distinct "physical id" values; with a single one, delegate to
# the single-socket variant of this script and stop
num_sockets=$(cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l)
if [ $num_sockets -eq 1 ]; 
then
echo "Single Socket"
./memlat-orig-lat-test-single-socket.sh
exit 0
fi

# Set the cpufreq scaling governor of every CPU to "performance" (uses sudo).
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

dir_name_res=ORIG-lat-test

# start from an empty result directory and remove leftover foo* scratch files
rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res


# one initial run whose output is not captured
cp  nvmemul-debug.ini  nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1

#FORMAT: ns
#FORMAT: min_local #2_aver_local max_local min_remote #5_aver_remote max_remote 
#FORMAT: 

for time in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
do
    # each run overwrites foo-hw-latency.txt; field 7 of the first and of the
    # second line containing "measuring latency: latency is" are appended to
    # the list as the local and remote value
    ../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1 > $dir_name_res/foo-hw-latency.txt
    grep "measuring latency: latency is" $dir_name_res/foo-hw-latency.txt > $dir_name_res/foo
    awk 'NR==1 {local=$7;}
         NR==2 {remote=$7;}
         END {print local , remote}'  $dir_name_res/foo >>  $dir_name_res/list-hw-latency.txt
done

echo "#FORMAT:#1_min_local #2_aver_local #3_max_local #4_min_remote #5_aver_remote #6_max_remote" > $dir_name_res/final-hw-latency.txt  

# min, average and max of column 1 (local) and column 2 (remote)
awk 'BEGIN {max1 = 0.0; min1 = 10000000.0; max2 = 0.0; min2 = 10000000.0; sum1 = 0.0; sum2 = 0.0;}
         ($1 > max1){max1 = $1;}
         ($1 < min1){min1 = $1;}
         ($2 > max2){max2 = $2;}
         ($2 < min2){min2 = $2;}
         {sum1=sum1+$1;sum2=sum2+$2;}
         END {print min1, sum1/NR, max1,  min2, sum2/NR, max2 ;}'  $dir_name_res/list-hw-latency.txt  >> $dir_name_res/final-hw-latency.txt  

# remove the per-run scratch files, keeping the list and the final summary
rm  $dir_name_res/foo*

#FORMAT:   ns
#FORMAT:#1_min_local #2_aver_local #3_max_local #4_min_remote #5_aver_remote #6_max_remote 


================================================
FILE: benchmark-tests/nvmemul-bandwidth.ini
================================================
# Configuration file 

latency:
{
    enable = true;
    inject_delay = true;
    read = 1000;
    write = 1000;
    max_epoch_duration_us = 10000;
    min_epoch_duration_us = 10000;
    calibration = false;
};

bandwidth:
{
    enable = true;
    model = "/tmp/bandwidth_model";
    read = 2000;
    write = 2000;
};

topology:
{
    mc_pci = "/tmp/mc_pci_bus";
    physical_nodes = "0";
    hyperthreading = true; # do not use multiple hardware threads per core
};

statistics:
{
    enable = true;
    #file = "/tmp/statistics";
};

debug:
{
    # debugging level
    level = 5;
    verbose = 0;

    # modules set to True produce debugging output
    module:
    {
        all = False;
    };
};


================================================
FILE: benchmark-tests/nvmemul-debug.ini
================================================
# Configuration file 

latency:
{
    enable = true;
    inject_delay = true;
read = 1000 ;
    write = 1000;
max_epoch_duration_us = 10000 ;
min_epoch_duration_us = 10000 ;
    calibration = false;
};

bandwidth:
{
    enable = false;
    model = "/tmp/bandwidth_model";
    read = 2000;
    write = 2000;
};

topology:
{
    mc_pci = "/tmp/mc_pci_bus";
physical_nodes = "0,1";
    hyperthreading = true; # do not use multiple hardware threads per core
};

statistics:
{
    enable = true;
    #file = "/tmp/statistics";
};

debug:
{
    # debugging level
    level = 5;
    verbose = 0;

    # modules set to True produce debugging output
    module:
    {
        all = False;
    };
};


================================================
FILE: benchmark-tests/nvmemul-orig.ini
================================================
# Configuration file 

latency:
{
    enable = true;
    inject_delay = true;
read = 1000 ;
    write = 1000;
max_epoch_duration_us = 10000 ;
min_epoch_duration_us = 10000 ;
    calibration = false;
};

bandwidth:
{
    enable = false;
    model = "/tmp/bandwidth_model";
    read = 2000;
    write = 2000;
};

topology:
{
    mc_pci = "/tmp/mc_pci_bus";
physical_nodes = "0,1";
    hyperthreading = true; # do not use multiple hardware threads per core
};

statistics:
{
    enable = true;
    #file = "/tmp/statistics";
};

debug:
{
    # debugging level
    level = 3;
    verbose = 0;

    # modules set to True produce debugging output
    module:
    {
        all = False;
    };
};


================================================
FILE: benchmark-tests/nvmemul.ini
================================================
# Configuration file 

latency:
{
    enable = true;
    inject_delay = true;
read = 300 ;
    write = 200;
max_epoch_duration_us = 10000 ;
min_epoch_duration_us = 10000 ;
    calibration = false;
};

bandwidth:
{
    enable = false;
    model = "/tmp/bandwidth_model";
    read = 2000;
    write = 2000;
};

topology:
{
    mc_pci = "/tmp/mc_pci_bus";
physical_nodes = "0,1";
    hyperthreading = true; # do not use multiple hardware threads per core
};

statistics:
{
    enable = true;
    #file = "/tmp/statistics";
};

debug:
{
    # debugging level
    level = 5;
    verbose = 0;

    # modules set to True produce debugging output
    module:
    {
        all = False;
    };
};


================================================
FILE: license.txt
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/


================================================
FILE: nvmemul-orig.ini
================================================
# Configuration file 

latency:
{
    enable = true;
    inject_delay = true;
read = 1000 ;
    write = 1000;
max_epoch_duration_us = 10000 ;
min_epoch_duration_us = 10000 ;
    calibration = false;
};

bandwidth:
{
    enable = false;
    model = "/tmp/bandwidth_model";
    read = 2000;
    write = 2000;
};

topology:
{
    mc_pci = "/tmp/mc_pci_bus";
physical_nodes = "0,1";
    hyperthreading = true; # do not use multiple hardware threads per core
};

statistics:
{
    enable = true;
    #file = "/tmp/statistics";
};

debug:
{
    # debugging level
    level = 3;
    verbose = 0;

    # modules set to True produce debugging output
    module:
    {
        all = False;
    };
};


================================================
FILE: nvmemul.dox
================================================
/**

@mainpage Quartz:  A Lightweight  Performance Emulator for  Persistent Memory Software.


\section section-intro Introduction

Quartz: A DRAM-based performance emulation platform that leverages features 
available in commodity hardware to emulate different latency and bandwidth 
characteristics of future byte-addressable NVM technologies.

*/

    
================================================
FILE: nvmemul.ini
================================================
# Configuration file 

latency:
{
    enable = true;
    inject_delay = true;
read = 1000 ;
    write = 1000;
max_epoch_duration_us = 10000 ;
min_epoch_duration_us = 10000 ;
    calibration = false;
};

bandwidth:
{
    enable = false;
    model = "/tmp/bandwidth_model";
    read = 500;
    write = 500;
};

topology:
{
    mc_pci = "/tmp/mc_pci_bus";
physical_nodes = "0,1";
    hyperthreading = true; # do not use multiple hardware threads per core
};

statistics:
{
    enable = true;
    #file = "/tmp/statistics";
};

debug:
{
    # debugging level
    level = 1;
    verbose = 0;

    # modules set to True produce debugging output
    module:
    {
        all = False;
    };
};


================================================
FILE: scripts/install.sh
================================================
#!/bin/bash
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################

# PAPI release compared against by check_supported_papi (exact match).
PAPI_MAJOR=5
PAPI_MINOR=1
PAPI_RELEASE=1

# Minimum CMake release accepted by check_supported_cmake.
CMAKE_MAJOR=2
CMAKE_MINOR=8

# Installs the build dependencies with yum (RPM-based distributions).
# Prints an error and terminates the script when yum fails.
function install_deps_rpm() {
    if ! yum install -q -y numactl-devel libconfig libconfig-devel cmake kernel-devel-$(uname -r) msr-tools uthash-devel; then
        echo "Dependencies installation failed"
        exit -1
    fi
}

# Installs the build dependencies with apt-get (Debian-based distributions).
# Prints an error and terminates the script when apt-get fails.
function install_deps_deb() {
    if ! apt-get install -y libnuma-dev libconfig-dev cmake  msr-tools uthash-dev; then
        echo "Dependencies installation failed"
        exit -1
    fi
}

# Verifies that the installed PAPI release is exactly
# ${PAPI_MAJOR}.${PAPI_MINOR}.${PAPI_RELEASE} and terminates the script
# otherwise.
#
# The version is read from the third space-separated field of the
# `papi_version` output; anything after a '-' is ignored.
function check_supported_papi() {
    local version

    # Query the tool once and split the dotted version afterwards.
    version=$(papi_version | cut -d ' ' -f3 | cut -d '-' -f1)
    major=$(echo "${version}" | cut -d '.' -f1)
    minor=$(echo "${version}" | cut -d '.' -f2)
    release=$(echo "${version}" | cut -d '.' -f3)

    if [ ${major} -ne ${PAPI_MAJOR} ] || [ ${minor} -ne ${PAPI_MINOR} ] || [ ${release} -ne ${PAPI_RELEASE} ]; then
        echo "PAPI version (${major}.${minor}.${release}) not supported (=${PAPI_MAJOR}.${PAPI_MINOR}.${PAPI_RELEASE})"
        exit -1
    fi
}

# Verifies that the installed CMake is at least
# ${CMAKE_MAJOR}.${CMAKE_MINOR}; terminates the script otherwise.
# The version is taken from the third field of the first line printed by
# `cmake -version`.
function check_supported_cmake() {
    major=$(cmake -version | head -1 | cut -d ' ' -f3 | cut -d '-' -f1 | cut -d '.' -f1)
    minor=$(cmake -version | head -1 | cut -d ' ' -f3 | cut -d '-' -f1 | cut -d '.' -f2)

    # Too old when the major is lower, or the major matches and the minor is lower.
    if [ ${major} -lt ${CMAKE_MAJOR} ] || { [ ${major} -eq ${CMAKE_MAJOR} ] && [ ${minor} -lt ${CMAKE_MINOR} ]; }; then
        echo "CMake version (${major}.${minor}) not supported (>=${CMAKE_MAJOR}.${CMAKE_MINOR})"
        exit -1
    fi
}

# Runs the tool version checks. Only the CMake check is active; the PAPI
# check (check_supported_papi) is intentionally not invoked here.
function check_supported_versions() {
    check_supported_cmake
}


#################### MAIN ####################

# Installing packages requires root privileges.
if [ "$(id -u)" -ne 0 ]; then
   echo "You must be root to execute this script"
   exit -1
fi

# Select the package manager from the distribution's release marker file.
if [ -f /etc/redhat-release ] || [ -f /etc/centos-release ]; then
    install_deps_rpm
elif [ -f /etc/debian_version ] || [ -f /etc/debian-release ]; then
    install_deps_deb
else
    echo "Linux distribution not supported"
    exit -1
fi

check_supported_versions


================================================
FILE: scripts/runenv.sh
================================================
#!/bin/bash
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################

# Runs a command under the emulator: sets the CPU frequency governor to
# "performance", disables Turbo Boost, and preloads the emulator library.

# Root of the checkout: the parent of the directory containing this script.
NVM_EMUL_PATH="`dirname $0`/.."


# A command to execute is mandatory.
if [ -z "$1" ]; then
    echo "runenv.sh [cmd to run]"
    exit 1
fi

rootdir="$NVM_EMUL_PATH"
bindir=$rootdir"/build"

# Switch every CPU to the "performance" governor when cpu0 is not already using it.
if [ -f /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor ]; then
    current_scaling=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor);

    if [ "${current_scaling}" != "performance" ]; then
        file_list=$(ls /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor)
        for cpu_file in ${file_list}; do
            echo "performance" | sudo tee ${cpu_file} > /dev/null
        done
    fi
fi

$rootdir/scripts/turboboost.sh disable

# On kernels with major version >= 4, write "2" to the perf rdpmc control file.
v=$(uname -r | cut -d '.' -f1)
if [ $v -ge 4 ]; then
    echo "2" | sudo tee /sys/bus/event_source/devices/cpu/rdpmc
fi

# Exported only at this point, so the helper commands above run without the
# library preloaded.
export LD_PRELOAD=$bindir"/src/lib/libnvmemul.so"
export NVMEMUL_INI=$rootdir"/nvmemul.ini"

if [ ! -f ${LD_PRELOAD} ]; then
    echo "Library not found. Compile the emulator's library first."
    exit -1
fi

echo $LD_PRELOAD
echo $NVMEMUL_INI

# execute the command passed as argument
# NOTE(review): $@ is unquoted, so arguments containing whitespace are
# re-split; confirm whether callers rely on passing the command as one string.
$@


================================================
FILE: scripts/setupdev.sh
================================================
#!/bin/bash
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################

# Root of the checkout: the parent of the directory containing this script.
NVM_EMUL_PATH="`dirname $0`/.."

device_name="nvmemul"
device_module_name=${device_name}".ko"
device_path="/dev/${device_name}"
# Path of the compiled module under build/; empty when it has not been built.
device_module_path=`find ${NVM_EMUL_PATH}/build -name ${device_module_name}`


# Inserts the emulator kernel module and (re)creates its character device
# node with the major number listed in /proc/devices. Terminates the script
# on any failure.
function loaddev {
    if [ -z "${device_module_path}" ]; then
        echo "Module not found. Compile the emulator's source code first."
        exit -1
    fi

    if ! /sbin/insmod ${device_module_path} 2> /dev/null; then
        # Tell an already-loaded module apart from a genuine load failure.
        if lsmod | grep ${device_name} > /dev/null; then
            echo "Kernel module already loaded, please reload it."
        else
            echo "Kernel module loading failed"
        fi
        exit 1
    fi

    if ! device_major=$(grep ${device_name} /proc/devices | awk '{ print $1 }') || [ -z "${device_major}" ]; then
        echo "Failed to detect module major"
        exit 1
    fi

    if ! rm -f ${device_path}; then
        echo "Failed to delete kernel module device file"
        exit 1
    fi

    mknod ${device_path} c ${device_major} 0
    chmod a+wr ${device_path}

    if ! lsmod | grep ${device_name} > /dev/null; then
        echo "kernel module loading failed"
        exit 1
    fi

    echo "Kernel module loaded successfully"
}

# Removes the kernel module (errors from rmmod are discarded) and deletes the
# device node. Success is reported based on the device node removal only.
function unloaddev {
    /sbin/rmmod ${device_name} 2> /dev/null

    if ! rm -f ${device_path}; then
        echo "Failed to delete kernel module device file"
        exit 1
    fi

    echo "Kernel module unloaded successfully"
}

# Prints the command-line synopsis of this script.
function help() {
    printf '%s\n' "$0 <load|unload|reload>"
}

### MAIN ###

# Loading/unloading kernel modules requires root privileges.
if [ "$(id -u)" -ne 0 ]; then
   echo "You must be root to execute this script"
   exit -1
fi

if [ $# -eq 0 ]; then
    help
    exit 1
fi

# Dispatch on the requested action; single-letter aliases are accepted.
case "$1" in
    load|l)
        loaddev
        ;;
    unload|u)
        unloaddev
        ;;
    reload|r)
        unloaddev
        loaddev
        ;;
    *)
        help
        exit 1
        ;;
esac

exit 0


================================================
FILE: scripts/turboboost.sh
================================================
#!/bin/bash
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.  
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################

# Prints the command-line synopsis. The function names listed here match the
# ones accepted by the dispatcher at the bottom of the script.
function usage()
{
    echo "$0 <function> [target CPU id]"
    echo -e "\tfunctions:"
    echo -e "\t\t check: verifies if a given CPU id has Turbo Boost enabled"
    echo -e "\t\t disable: disables a given CPU id or all CPUs if not specified"
    echo -e "\t\t enable: enables a given CPU id or all CPUs if not specified"
}

# Terminates the script with status 1 unless $1 consists solely of decimal digits.
function verify_cpu_id()
{
    re='^[0-9]+$'
    if [[ $1 =~ $re ]]; then
        return
    fi

    echo "CPU id is not a number"
    exit 1
}

# Tries to load the msr kernel module when lsmod does not list it.
# A modprobe failure is deliberately ignored: some systems need the module,
# others don't.
function check_msr_module()
{
    if ! lsmod | grep msr > /dev/null; then
        sudo modprobe msr &> /dev/null
    fi
}

# Reports whether Turbo Boost is enabled on CPU $1, based on bit 38 of
# MSR 0x1a0 as read by rdmsr (1 means disabled). A CPU id is mandatory.
function check()
{
    cpu=$1

    if [ -z "${cpu}" ]; then
        usage
        exit 1
    fi

    # The CPU count is taken from the second field of the 4th line of lscpu.
    cpus=$(lscpu | sed -n 4p | awk '{ print $2 }')

    if [ ${cpu} -ge ${cpus} ]; then
        echo "CPU id out of range"
        exit 1
    fi

    disabled=$(sudo rdmsr -p${cpu} 0x1a0 -f 38:38)

    case "${disabled}" in
        1)
            echo "Turbo Boost for processor ${cpu} is disabled"
            ;;
        *)
            echo "Turbo Boost for processor ${cpu} is enabled"
            ;;
    esac
}

# Enables Turbo Boost by writing 0x850089 to MSR 0x1a0, either on CPU $1 or,
# when no CPU id is given, on every CPU reported by lscpu.
function enable()
{
    cpu=$1

    cpus=$(lscpu | sed -n 4p | awk '{ print $2 }')

    if [ -n "${cpu}" ]; then
        if [ ${cpu} -ge ${cpus} ]; then
            echo "CPU id out of range"
            exit 1
        fi
        sudo wrmsr -p${cpu} 0x1a0 0x850089
        echo "Turbo Boost enabled for CPU ${cpu}"
        return
    fi

    for (( i=0; i<${cpus}; i++ )); do
        sudo wrmsr -p$i 0x1a0 0x850089
    done
    echo "Turbo Boost enabled for all CPUs"
}

# Disables Turbo Boost by writing 0x4000850089 to MSR 0x1a0, either on CPU $1
# or, when no CPU id is given, on every CPU reported by lscpu.
function disable()
{
    cpu=$1

    cpus=$(lscpu | sed -n 4p | awk '{ print $2 }')

    if [ -n "${cpu}" ]; then
        if [ ${cpu} -ge ${cpus} ]; then
            echo "CPU id out of range"
            exit 1
        fi
        sudo wrmsr -p${cpu} 0x1a0 0x4000850089;
        echo "Turbo Boost disabled for CPU ${cpu}"
        return
    fi

    for (( i=0; i<${cpus}; i++ )); do
        sudo wrmsr -p$i 0x1a0 0x4000850089;
    done
    echo "Turbo Boost disabled for all CPUs"
}


### MAIN ###

if [ $# -eq 0 ]; then
    usage
    exit 1
fi

funct=$1
target_cpu=$2

check_msr_module

# A CPU id is optional, but must be numeric when present.
if [ -n "${target_cpu}" ]; then
    verify_cpu_id ${target_cpu}
fi

# The three supported actions are implemented by shell functions of the same
# name, so the action word is invoked directly.
case ${funct} in
    enable|disable|check)
        ${funct} ${target_cpu}
        ;;
    *)
        usage
        exit 1
        ;;
esac

exit 0


================================================
FILE: src/CMakeLists.txt
================================================
# Emulator library (lib) and kernel device driver (dev).
add_subdirectory(lib)
add_subdirectory(dev)


================================================
FILE: src/dev/CMakeLists.txt
================================================
# Build NVM Emulation device driver (using Kbuild Makefile)

set(DEV_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(DEV_BIN_DIR "${CMAKE_CURRENT_BINARY_DIR}")
set(DEV_KERNEL_MODULE "${DEV_BIN_DIR}/nvmemul.ko")
mark_as_advanced(DEV_DIR DEV_BIN_DIR)

# The driver sources are copied into the build folder and make is invoked
# there, which keeps the driver's source folder clean.
file(MAKE_DIRECTORY ${DEV_BIN_DIR})
add_custom_command(OUTPUT ${DEV_KERNEL_MODULE}
    COMMAND ${CMAKE_COMMAND} -E copy_directory ${DEV_DIR} ${DEV_BIN_DIR}
    COMMAND ${CMAKE_MAKE_PROGRAM} -j
    COMMENT [Build-NVM Emulation Device]
    WORKING_DIRECTORY "${DEV_BIN_DIR}"
    DEPENDS ${DEV_DIR}/pmc.c # just to see if it has been overwritten
)

# We use add_custom_command for the build itself because otherwise we would
# have to build it every time. The following add_custom_target gives a name
# to the output.
add_custom_target(dev_build ALL DEPENDS ${DEV_KERNEL_MODULE})


================================================
FILE: src/dev/Makefile
================================================
# Kbuild module description: nvmemul.ko is built from pmc.o
obj-m = nvmemul.o
nvmemul-objs = pmc.o

# use the kernel build system
# (the backquoted commands are expanded by the shell when a recipe runs)
KERNEL_VERSION := `uname -r`
KERNEL_SOURCE := /lib/modules/$(KERNEL_VERSION)/build

SRCDIR=`pwd`
OBJDIR=`pwd`

# Neither target produces a file of the same name.
.PHONY: all clean

# $(MAKE) is used for the recursive invocations so that the flags of the
# parent make are handed down to the kernel build correctly.
all:
	$(MAKE) -C $(KERNEL_SOURCE) M=$(OBJDIR) modules

clean:
	$(MAKE) -C $(KERNEL_SOURCE) M=$(OBJDIR) clean


================================================
FILE: src/dev/ioctl_query.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __IOCTL_QUERY_H
#define __IOCTL_QUERY_H

#include <linux/ioctl.h>

#define MYDEV_MAGIC (0xAA)

/* Argument of IOCTL_SETCOUNTER: the counter to program and the event
 * selection value to program it with. */
typedef struct { 
    unsigned int counter_id;
    unsigned int event_id;
} ioctl_query_setcounter_t;

/* Argument of IOCTL_SETPCI / IOCTL_GETPCI: addresses a PCI configuration
 * register by bus, device, function and offset; val carries the value to
 * write (SETPCI) or receives the value read (GETPCI). */
typedef struct { 
    unsigned int bus_id;
    unsigned int device_id;
    unsigned int function_id;
    unsigned int offset;
    unsigned int val;
} ioctl_query_setgetpci_t;

/*
 * ioctl command numbers understood by the nvmemul device.
 * NOTE(review): the third macro argument is a pointer type, so the size
 * encoded in each command number is that of a pointer rather than of the
 * structure. Changing it would alter the command numbers shared between the
 * driver and its users, so it is left untouched here.
 */
#define IOCTL_SETCOUNTER _IOR(MYDEV_MAGIC, 0, ioctl_query_setcounter_t *) 
#define IOCTL_SETPCI     _IOR(MYDEV_MAGIC, 1, ioctl_query_setgetpci_t *) 
#define IOCTL_GETPCI     _IOWR(MYDEV_MAGIC, 2, ioctl_query_setgetpci_t *) 


#endif /* __IOCTL_QUERY_H */


================================================
FILE: src/dev/pmc.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/proc_fs.h>
#include <linux/fcntl.h>
#include <linux/smp.h>
#include <linux/uaccess.h>

#include <asm/msr.h>
#include <asm/uaccess.h>

#include "ioctl_query.h"

/* Forward declaration: pmc_fops references the handler defined further down. */
static long pmc_ioctl(struct file *f, unsigned int cmd, unsigned long arg);
//unsigned long read_cr4(void);
//void write_cr4(unsigned long);
/* Map read_cr4/write_cr4 to the native_* accessors unless macros with those
 * names already exist. */
#ifndef read_cr4
#define read_cr4 native_read_cr4
#endif
#ifndef write_cr4
#define write_cr4 native_write_cr4
#endif

struct file_operations pmc_fops = {
	.unlocked_ioctl = pmc_ioctl,
	.compat_ioctl = pmc_ioctl,
};

/* Name used for the character device registration and as log prefix. */
static const char* module_name = "nvmemul";
/* Major number obtained from register_chrdev() at module load. */
static int mod_major = 0;
/* Major requested from register_chrdev(); 0 asks for dynamic allocation. */
static const int NVMEMUL_MAJOR = 0;
/* MSR addresses of the first performance counter and of its event select
 * register; counter N uses the address plus N. */
const int PERFCTR0 = 0xc1;
const int PERFEVENTSEL0 = 0x186;


/*
 * Sets bit 0x100 (PCE) in CR4 on the CPU this function runs on.
 * The argument is unused; the signature matches what the cross-CPU call
 * helpers expect.
 */
void pmc_set_pce_bit(void* arg) 
{
	unsigned long cr4_with_pce = read_cr4() | 0x100;

	write_cr4(cr4_with_pce);
}

int pmc_init_module(void)
{
 	printk(KERN_INFO "%s: Loading. Initializing...\n", module_name);
	if ((mod_major = register_chrdev(NVMEMUL_MAJOR, module_name, &pmc_fops)) == -EBUSY) {
		printk(KERN_INFO "%s: Unable to get major for %s device\n", module_name, module_name);
		return -EIO;
	}

	if (mod_major <= 0) {
		printk(KERN_INFO "%s: Unable to get major for %s device\n", module_name, module_name);
		return -EIO;
	}

	printk(KERN_INFO "%s: major is %d\n", module_name, mod_major);

	/*
	 * In order to use the rdpmc instruction in user mode, we need to set the
	 * PCE bit of CR4. PCE is 8th bit of cr4, and 256 is 2 << 8
	 */

    pmc_set_pce_bit(NULL);
    smp_call_function(pmc_set_pce_bit, NULL, 1);

	return 0;
}	

/*
 * Module exit point: releases the character device major obtained at load
 * time. CR4 is not touched here.
 */
void pmc_exit_module(void)
{
	printk(KERN_INFO "%s: Unloading. Cleaning up...\n", module_name);

	/* Give the major number back to the kernel. */
	unregister_chrdev(mod_major, module_name);
}

/* Argument bundle handed to the per-CPU callbacks pmc_clear() and
 * set_counter(): which counter to act on and, for set_counter(), the value
 * to program. */
struct counter_s {
    int counter_id;
    unsigned long val; 
};


/* 
 * __pmc_clear clears the PMC specified by counter_id on the current CPU
 * counter_id = 0 => perfctr0
 * counter_id = 1 => perfctr1
 * it uses WRMSR to write zero to MSR (PERFCTR0 + counter_id)
 */
static void __pmc_clear(int counter_id) {
	int counterRegister = PERFCTR0 + counter_id;
	/* clear the old register */

	/* ecx = MSR address, edx:eax = 0, then wrmsr */
	__asm__ __volatile__("mov %0, %%ecx\n\t"
	        "xor %%edx, %%edx\n\t"
            "xor %%eax, %%eax\n\t"
            "wrmsr\n\t"
	        : /* no outputs */
	        : "m" (counterRegister)
	        : "eax", "ecx", "edx" /* all clobbered */);
}

/*
 * Per-CPU callback: arg points to a struct counter_s naming the counter to
 * clear on the CPU this runs on.
 */
static void pmc_clear(void* arg)
{
	struct counter_s* request = arg;

	__pmc_clear(request->counter_id);
}

void pmc_clear_all_cpu(int counter_id)
{
    struct counter_s counter = { counter_id, 0};
    pmc_clear((void*) &counter);
    smp_call_function(pmc_clear, (void*) &counter, 1);
}

/* 
 * This function clears the counter indicated by counter_id and then writes
 * val to the matching event select register (PERFEVENTSEL0 + counter_id) on
 * the current CPU. Only the low 32 bits of val are written: the value is
 * loaded into eax while edx (the high half) is zeroed.
 */

static void __set_counter(int counter_id, unsigned long val) 
{
    int selectionRegister = PERFEVENTSEL0 + counter_id;
    __pmc_clear(counter_id);

    /* set the value */

    __asm__ __volatile__("mov %0, %%ecx\n\t" /* ecx contains the number of the MSR to set */
            "xor %%edx, %%edx\n\t"/* edx contains the high bits to set the MSR to */
            "mov %1, %%eax\n\t" /* eax contains the low bits to set the MSR to */
            "wrmsr\n\t"
            : /* no outputs */
            : "m" (selectionRegister), "m" (val)
            : "eax", "ecx", "edx" /* clobbered */);
}

/*
 * Per-CPU callback: arg points to a struct counter_s holding the counter to
 * program and the value to program it with on the CPU this runs on.
 */
void set_counter(void* arg)
{
	struct counter_s* request = arg;
	int id = request->counter_id;
	unsigned long value = request->val;

	__set_counter(id, value);
}

void set_counter_all_cpu(int counter_id, unsigned long arg)
{
    struct counter_s counter = { counter_id, arg};

    set_counter((void*) &counter);    
    smp_call_function(set_counter, (void*) &counter, 1);
}

/*
 * IOCTL_SETCOUNTER handler: copies the request from user space, then on all
 * CPUs disables the counter, clears it, and programs the requested event.
 *
 * Returns 0 on success, -EFAULT if the request cannot be copied, or -ENXIO
 * if counter_id is greater than 3.
 */
static long pmc_ioctl_setcounter(struct file* f, unsigned int cmd, unsigned long arg)
{
	ioctl_query_setcounter_t req;

	if (copy_from_user(&req, (ioctl_query_setcounter_t*) arg, sizeof(req))) {
		return -EFAULT;
	}

	/* counter_id is unsigned, so only the upper bound can be violated. */
	if (req.counter_id > 3) {
		printk(KERN_INFO "%s: set_counter illegal value 0x%x for counter\n", module_name, req.counter_id);
		return -ENXIO;
	}

	/* Disable, clear, then program the new event. */
	set_counter_all_cpu(req.counter_id, 0);
	pmc_clear_all_cpu(req.counter_id);
	set_counter_all_cpu(req.counter_id, req.event_id);

	printk(KERN_INFO "%s: setcounter counter_id: 0x%x event_id=0x%x\n", module_name, req.counter_id, req.event_id); 
	return 0;
}

static long pmc_ioctl_setpci(struct file* f, unsigned int cmd, unsigned long arg)
{
    ioctl_query_setgetpci_t q;
    struct pci_bus *bus = NULL;

    if (copy_from_user(&q, (ioctl_query_setgetpci_t*) arg, sizeof(ioctl_query_setgetpci_t))) {
        return -EFAULT;
    }

    while ((bus = pci_find_next_bus(bus))) {
        if (q.bus_id == bus->number) {
            pci_bus_write_config_word(bus, PCI_DEVFN(q.device_id, q.function_id), q.offset, (u16) q.val);
            printk(KERN_INFO "%s: setpci bus_id=0x%x device_id=0x%x, function_id=0x%x, val=0x%x\n",
                    module_name, q.bus_id, q.device_id, q.function_id, q.val);
            return 0;
        }
    }
    return -ENXIO;
}

static long pmc_ioctl_getpci(struct file* f, unsigned int cmd, unsigned long arg)
{
    ioctl_query_setgetpci_t q;
    struct pci_bus *bus = NULL;

    if (copy_from_user(&q, (ioctl_query_setgetpci_t*) arg, sizeof(ioctl_query_setgetpci_t))) {
        return -EFAULT;
    }

    while ((bus = pci_find_next_bus(bus))) {
        if (q.bus_id == bus->number) {
            unsigned int val = 0;
            pci_bus_read_config_word(bus, PCI_DEVFN(q.device_id, q.function_id), q.offset, (u16*) &val);
            printk(KERN_INFO "%s: getpci bus_id 0x%x device_id 0x%x, function_id 0x%x, offset 0x%x, val 0x%x\n",
                    module_name, q.bus_id, q.device_id, q.function_id, q.offset, val);
            q.val = val;
            if (copy_to_user((ioctl_query_setgetpci_t*) arg, &q, sizeof(ioctl_query_setgetpci_t))) {
                return -EFAULT;
            }
            return 0;
        }
    }
    return -ENXIO;
}

static long pmc_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 
{
    int ret = -1;

	printk(KERN_INFO "%s: ioctl command: 0x%x\n", module_name, cmd);
	switch (cmd) {
		case IOCTL_SETCOUNTER:
            ret = pmc_ioctl_setcounter(f, cmd, arg);
            break;
        case IOCTL_SETPCI:
            ret = pmc_ioctl_setpci(f, cmd, arg);
            break;
        case IOCTL_GETPCI:
            ret = pmc_ioctl_getpci(f, cmd, arg);
            break;
		default:
			printk(KERN_INFO "%s: ioctl illegal command: 0x%x\n", module_name, cmd);
			break;
	}
	return ret;
}


/* Register pmc_init_module/pmc_exit_module as the module's load and unload
 * handlers. */
module_init(pmc_init_module);
module_exit(pmc_exit_module);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("HPLabs");


================================================
FILE: src/lib/CMakeLists.txt
================================================
project(nvmemul)

# STATISTICS toggles the USE_STATISTICS compile definition (ON by default).
option(STATISTICS "Enable statistics report" ON)

if(STATISTICS)
  message(STATUS "WITH STATISTICS")
  add_definitions(-DUSE_STATISTICS)
else()
  message(STATUS "WITHOUT STATISTICS")
endif()

# Sources compiled directly into the shared library.
set(nvmemul_src
    config.c
    debug.c
    dev.c
    init.c
    interpose.c
    measure_bw.c
    measure_lat.c
    misc.c
    monotonic_timer.c
    model_bw.c
    model_lat.c
    pflush.c
    pmalloc.c
    stat.c
    thread.c
    topology.c
    process_rank.c
)

include_directories(${CMAKE_SOURCE_DIR}/third_party)
include_directories(${CMAKE_SOURCE_DIR}/src)
include_directories(${CMAKE_SOURCE_DIR}/src/lib)
# Compiler flags, passed through add_definitions.
add_definitions(-g)
add_definitions(-O2)
add_definitions(-fPIC)
add_definitions(-Wall)
add_definitions(-march=native)
add_definitions(-fopenmp)
add_definitions(-std=gnu89)
#add_definitions(-DNDEBUG)
#add_definitions(-std=c99)
add_definitions(-msse4)
# The cpu subdirectory provides the "cpu" target whose objects are linked
# into the library below.
add_subdirectory(cpu)
add_library(nvmemul SHARED ${nvmemul_src} $<TARGET_OBJECTS:cpu>)
target_link_libraries(nvmemul dl)
target_link_libraries(nvmemul config)
target_link_libraries(nvmemul numa)
target_link_libraries(nvmemul rt)
target_link_libraries(nvmemul m)
target_link_libraries(nvmemul gomp)


================================================
FILE: src/lib/config.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include "config.h"
#include <libconfig.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <ctype.h>

#define ENVVAR_MAX_LEN 128

/*
 * Looks up the environment variable named "<prefix>_<name>".
 *
 * Returns the variable's value as returned by getenv(), or NULL when the
 * variable is not set or when the composed name (including its terminating
 * NUL) does not fit into ENVVAR_MAX_LEN bytes.
 */
static char* __getenv(const char* prefix, const char* name)
{
	char normalized_name[ENVVAR_MAX_LEN];
	int  len;

	/* snprintf never writes past the buffer and reports the length the full
	 * name would need, which lets us detect truncation exactly. */
	len = snprintf(normalized_name, sizeof(normalized_name), "%s_%s", prefix, name);
	if (len < 0 || (size_t) len >= sizeof(normalized_name)) {
		return NULL;
	}

	return getenv(normalized_name);
}

/*
 * Looks up a configuration setting in the environment.
 *
 * The setting name is normalized by upper-casing it and replacing every '.'
 * with '_'; the result is looked up with the ENVVAR_PREFIX prefix.
 *
 * On success stores the variable's value in *value_str and returns
 * CONFIG_TRUE. Returns CONFIG_FALSE when the variable is not set or the name
 * is too long to normalize.
 */
static inline int 
env_setting_lookup(const char *name, char **value_str)
{
	char *val;
	char normalized_name[ENVVAR_MAX_LEN];
	int  i;

	/* Leave room for the terminating NUL written after the loop. */
	if (strlen(name) >= ENVVAR_MAX_LEN) {
		return CONFIG_FALSE;
	}
	
	for (i=0; name[i]; i++) {
		if (name[i] == '.') {
			normalized_name[i] = '_';
		} else {
			/* toupper() requires a value representable as unsigned char. */
			normalized_name[i] = (char) toupper((unsigned char) name[i]);
		}
	}
	normalized_name[i] = '\0';
	
	val = __getenv(ENVVAR_PREFIX, normalized_name);
	if (val) {
		*value_str = val;
		return CONFIG_TRUE;
	} else {
		return CONFIG_FALSE;
	}
}


/*
 * Looks up a setting in the environment and converts it with atoi().
 *
 * Stores the converted number in *value and returns CONFIG_TRUE when the
 * variable exists; returns CONFIG_FALSE otherwise.
 */
static inline int
env_setting_lookup_int(const char *name, int *value)
{
	char *text;

	if (env_setting_lookup(name, &text) == CONFIG_FALSE || !text) {
		return CONFIG_FALSE;
	}

	*value = atoi(text);
	return CONFIG_TRUE;
}


/*
 * Looks up a boolean setting in the environment. Booleans are read exactly
 * like integers, so this forwards to env_setting_lookup_int().
 */
static inline int
env_setting_lookup_bool(const char *name, int *value)
{
	int status = env_setting_lookup_int(name, value);

	return status;
}


/*
 * Looks up a string setting in the environment; *value receives the raw
 * environment value. Forwards to env_setting_lookup().
 */
static inline int 
env_setting_lookup_string(const char *name, char **value)
{
	int status = env_setting_lookup(name, value);

	return status;
}


/*
 * Looks up a boolean setting, preferring an environment override over the
 * value in the configuration file.
 *
 * Stores the result in *value and returns CONFIG_TRUE when either source
 * provides the setting; returns CONFIG_FALSE and leaves *value untouched
 * otherwise.
 */
int
__cconfig_lookup_bool(config_t *cfg, const char *name, int *value) 
{
	int result;

	if (env_setting_lookup_bool(name, &result) != CONFIG_TRUE &&
	    config_lookup_bool(cfg, name, &result) != CONFIG_TRUE) {
		return CONFIG_FALSE;
	}

	*value = result;
	return CONFIG_TRUE;
}


/*
 * Boolean counterpart of the *_valid_* lookups. No validation is performed:
 * validity_check and any further arguments are ignored and the call is
 * forwarded to __cconfig_lookup_bool().
 */
int
__cconfig_lookup_valid_bool(config_t *cfg, 
                     const char *name, 
                     int *value, 
                     int validity_check, ...)
{
	int status = __cconfig_lookup_bool(cfg, name, value);

	return status;
}


/*
 * Looks up an integer setting, preferring an environment override over the
 * value in the configuration file.
 *
 * Stores the result in *value and returns CONFIG_TRUE when either source
 * provides the setting; returns CONFIG_FALSE and leaves *value untouched
 * otherwise.
 */
int
__cconfig_lookup_int(config_t *cfg, const char *name, int *value)
{
	int result;

	/* third parameter of config_lookup_int changed from libconfig 1.3 to
	 * 1.4, it was 'long' and now it is 'int' */
	if (env_setting_lookup_int(name, &result) != CONFIG_TRUE &&
	    config_lookup_int(cfg, name, &result) != CONFIG_TRUE) {
		return CONFIG_FALSE;
	}

	*value = result;
	return CONFIG_TRUE;
}


/*
 * Looks up an integer setting and validates it.
 *
 * validity_check selects the validation and the meaning of the variadic
 * arguments:
 *   CONFIG_NO_CHECK    - no further arguments; any value is accepted.
 *   CONFIG_RANGE_CHECK - (int min, int max); accepted if min <= value <= max.
 *   CONFIG_LIST_CHECK  - (int count, int v1, ..., int vN); accepted if the
 *                        value equals one of the listed values.
 *
 * Stores the value in *value and returns CONFIG_TRUE when the setting exists
 * and passes validation; returns CONFIG_FALSE and leaves *value untouched
 * otherwise (including for an unknown validity_check).
 */
int
__cconfig_lookup_valid_int(config_t *cfg, 
                           const char *name, 
                           int *value, 
                           int validity_check, ...)
{
	int              min;
	int              max;
	int              list_length;
	int              i;
	int              val;
	int              valid = 0;
	va_list          ap;

	if (__cconfig_lookup_int(cfg, name, &val) != CONFIG_TRUE) {
		return CONFIG_FALSE;
	}

	switch (validity_check) {
		case CONFIG_NO_CHECK:
			valid = 1;
			break;
		case CONFIG_RANGE_CHECK:
			va_start(ap, validity_check);
			min = va_arg(ap, int);
			max = va_arg(ap, int);
			va_end(ap);
			/* Validate the value just looked up, not the caller's output. */
			valid = (val >= min && val <= max);
			break;
		case CONFIG_LIST_CHECK:
			va_start(ap, validity_check);
			list_length = va_arg(ap, int);
			for (i=0; i<list_length && !valid; i++) {
				valid = (va_arg(ap, int) == val);
			}
			/* Reached on every path, including a match. */
			va_end(ap);
			break;
		default:
			break;
	}

	if (!valid) {
		return CONFIG_FALSE;
	}

	*value = val;
	return CONFIG_TRUE;
}


/*
 * Looks up a string setting, preferring an environment override over the
 * value in the configuration file.
 *
 * Stores the string pointer in *value and returns CONFIG_TRUE when either
 * source provides the setting; returns CONFIG_FALSE and leaves *value
 * untouched otherwise.
 */
int
__cconfig_lookup_string(config_t *cfg, const char *name, char **value)
{
	char *result;

	if (env_setting_lookup_string(name, &result) != CONFIG_TRUE &&
	    config_lookup_string(cfg, name, (const char**) &result) != CONFIG_TRUE) {
		return CONFIG_FALSE;
	}

	*value = result;
	return CONFIG_TRUE;
}


/*
 * Looks up a string setting and validates it.
 *
 * validity_check selects the validation and the meaning of the variadic
 * arguments:
 *   CONFIG_NO_CHECK    - no further arguments; any value is accepted.
 *   CONFIG_RANGE_CHECK - not meaningful for strings; always rejected.
 *   CONFIG_LIST_CHECK  - (int count, char *s1, ..., char *sN); accepted if
 *                        the value compares equal to one of the strings.
 *
 * Stores the string pointer in *value and returns CONFIG_TRUE when the
 * setting exists and passes validation; returns CONFIG_FALSE and leaves
 * *value untouched otherwise.
 */
int
__cconfig_lookup_valid_string(config_t *cfg, 
                              const char *name, 
                              char **value, 
                              int validity_check, ...)
{
	int       list_length;
	int       i;
	int       valid = 0;
	char      *val;
	va_list   ap;

	if (__cconfig_lookup_string(cfg, name, &val) != CONFIG_TRUE) {
		return CONFIG_FALSE;
	}

	switch (validity_check) {
		case CONFIG_NO_CHECK:
			valid = 1;
			break;
		case CONFIG_RANGE_CHECK:
			break;
		case CONFIG_LIST_CHECK:
			va_start(ap, validity_check);
			list_length = va_arg(ap, int);
			for (i=0; i<list_length && !valid; i++) {
				valid = (strcmp(val, va_arg(ap, char *)) == 0);
			}
			/* Reached on every path, including a match. */
			va_end(ap);
			break;
		default:
			break;
	}

	if (!valid) {
		return CONFIG_FALSE;
	}

	*value = val;
	return CONFIG_TRUE;
}


/**
 * Initialise a libconfig object and load the configuration file into it.
 *
 * If __getenv(ENVVAR_PREFIX, "INI") returns a non-NULL string, that string
 * replaces config_file as the path to read.
 *
 * \param cfg          libconfig object to initialise
 * \param config_file  default path, used when no override is present
 * \return the result of config_read_file(); when that is CONFIG_FALSE an
 *         error line is also written to stderr
 */
int 
__cconfig_init(config_t *cfg, const char *config_file)
{
	char *override_path = __getenv(ENVVAR_PREFIX, "INI");
	int  status;

	if (override_path) {
		config_file = override_path;
	}

	config_init(cfg);
	status = config_read_file(cfg, config_file);
	if (status == CONFIG_FALSE) {
		fprintf(stderr, "ERROR: nvmemul: Configuration file %s not found.\n", config_file);
	}
	return status;
}


================================================
FILE: src/lib/config.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CONFIG_H
#define __CONFIG_H

/**
 * \file 
 * 
 * Runtime configuration parameters
 */


#include <stdio.h>
#include <libconfig.h>

#define ENVVAR_PREFIX "NVMEMUL"

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Validity-check selectors, passed as the `validity_check` argument of the
 * __cconfig_lookup_valid_*() functions. Each definition is guarded so that
 * the build stops with an error if the name is already defined (for
 * example by libconfig.h) instead of silently redefining it.
 */

/* No validation: any value that is found is accepted. */
#ifdef CONFIG_NO_CHECK
# error "ERROR: Redefining previously defined CONFIG_NO_CHECK"
#else
# define CONFIG_NO_CHECK    0
#endif

/* Range validation: for integers the variadic arguments are (int min, int max). */
#ifdef CONFIG_RANGE_CHECK
# error "ERROR: Redefining previously defined CONFIG_RANGE_CHECK"
#else
# define CONFIG_RANGE_CHECK 1
#endif

/* List validation: the variadic arguments are an int count followed by that
 * many allowed values. */
#ifdef CONFIG_LIST_CHECK
# error "ERROR: Redefining previously defined CONFIG_LIST_CHECK"
#else
# define CONFIG_LIST_CHECK  2
#endif


/**
 * The lookup functions return the value of a configuration variable,
 * searching in the following order:
 *  1) the value of the corresponding environment variable
 *  2) the value of the variable in the configuration file
 *
 * If the variable is not found, a lookup function leaves *value untouched
 * and returns CONFIG_FALSE; on success it returns CONFIG_TRUE.
 *
 * The *_valid_* variants additionally validate the value according to
 * validity_check (CONFIG_NO_CHECK, CONFIG_RANGE_CHECK or CONFIG_LIST_CHECK)
 * and return CONFIG_FALSE when the value is rejected:
 *  - CONFIG_RANGE_CHECK (int): extra arguments are (int min, int max).
 *    For strings a range check always rejects the value.
 *  - CONFIG_LIST_CHECK: extra arguments are an int count followed by that
 *    many allowed values (int for the int variant, char* for the string
 *    variant).
 * NOTE(review): the bool variant presumably follows the same convention;
 * confirm against config.c.
 *
 * __cconfig_init() initialises cfg and reads config_file into it; the path
 * can be overridden through the environment (ENVVAR_PREFIX / "INI").
 */

int __cconfig_lookup_bool(config_t *cfg, const char *name, int *value);
int __cconfig_lookup_int(config_t *cfg, const char *name, int *value);
int __cconfig_lookup_string(config_t *cfg, const char *name, char **value);
int __cconfig_lookup_valid_bool(config_t *cfg, const char *name, int *value, int validity_check, ...);
int __cconfig_lookup_valid_int(config_t *cfg, const char *name, int *value, int validity_check, ...);
int __cconfig_lookup_valid_string(config_t *cfg, const char *name, char **value, int validity_check, ...);
int __cconfig_init(config_t *cfg, const char *config_file);

#ifdef __cplusplus
}
#endif

#endif /* __CONFIG_H */


================================================
FILE: src/lib/cpu/CMakeLists.txt
================================================
# Source files that make up the "cpu" component.
set(nvmemul_cpu_src
    cpu.c
    pmc.c
)

# Build the sources as an OBJECT library named "cpu".
add_library(cpu OBJECT ${nvmemul_cpu_src})


================================================
FILE: src/lib/cpu/cpu.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
#include <string.h>
#include "cpu.h"
#include "dev.h"
#include "error.h"
#include "misc.h"
#include "known_cpus.h"
#include "xeon-ex.h"
#include <cpuid.h>

// Mainline architectures and processors available here:
// https://software.intel.com/en-us/articles/intel-architecture-and-processor-identification-with-cpuid-model-and-family-numbers
//
// CPUID alone is not an accurate way to identify a processor, because
// different processors may report the same CPUID family/model.
// So, in addition to the CPUID family/model, we use the brand string from
// the "model name" field of /proc/cpuinfo (see is_Intel() and is_Xeon()).

// Bit-field helpers for decoding the value CPUID leaf 1 returns in EAX.
//
// MASK(msb, lsb)         mask with bits msb..lsb (inclusive) set; msb must be < 31
// EXTRACT(val, msb, lsb) value of bits msb..lsb of val, shifted down to bit 0
//
// All shifts are done on unsigned constants: left-shifting the signed value
// ~0 (i.e. -1) is undefined behaviour in C. Every macro argument is
// parenthesized so that expression arguments expand correctly.
#define MASK(msb, lsb) (~((~0u) << ((msb) + 1)) & ((~0u) << (lsb)))
#define EXTRACT(val, msb, lsb) ((MASK(msb, lsb) & (val)) >> (lsb))
// Model: bits 7..4; extended model: bits 19..16.
#define MODEL(eax) EXTRACT(eax, 7, 4)
#define EXTENDED_MODEL(eax) EXTRACT(eax, 19, 16)
// Full model number: extended model in the high nibble, model in the low nibble.
#define MODEL_NUMBER(eax) ((EXTENDED_MODEL(eax) << 4) | MODEL(eax))
// Family: bits 11..8; extended family: bits 27..20.
#define FAMILY(eax) EXTRACT(eax, 11, 8)
#define Extended_Family(eax) EXTRACT(eax, 27, 20)
// Full family number: family plus extended family.
#define Family_Number(eax) (FAMILY(eax) + Extended_Family(eax))

// Executes the CPUID instruction with EAX = info and stores the resulting
// EAX, EBX, ECX and EDX register values through the four output pointers.
// ECX is not loaded before the instruction runs, so it holds whatever value
// the compiler left there; only EAX is set as an input.
// NOTE(review): nothing in this part of the file calls this helper;
// get_family_model() uses __get_cpuid() from <cpuid.h> instead.
void cpuid(unsigned int info, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
{
    __asm__(
        "cpuid;"
        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
        : "a"(info));
}

// Reads CPUID leaf 1 and reports the processor's family and model numbers.
// Either output pointer may be NULL, in which case that value is skipped.
// Both outputs are 0 when __get_cpuid() reports that the leaf is unavailable.
void get_family_model(int *family, int *model)
{
    unsigned int eax, ebx, ecx, edx;
    int family_number = 0;
    int model_number = 0;

    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    {
        family_number = Family_Number(eax);
        model_number = MODEL_NUMBER(eax);
    }

    if (family != NULL)
    {
        *family = family_number;
    }

    if (model != NULL)
    {
        *model = model_number;
    }
}

// Returns the value part of the first "key : value" line in /proc/cpuinfo
// whose text contains valname.
//
// The value starts two characters after the ':' (the colon and the character
// that follows it, normally a space) and runs to the end of the line,
// including the trailing newline kept by getline().
//
// Lines that contain valname but no ':' are skipped.
//
// Returns NULL if valname is NULL, the file cannot be opened, no matching
// line exists, or memory allocation fails.
//
// caller is responsible for freeing memory allocated by this function
char *cpuinfo(char *valname)
{
    FILE *fp;
    char *line = NULL;
    char *result = NULL;
    size_t cap = 0;

    if (valname == NULL)
    {
        return NULL;
    }

    fp = fopen("/proc/cpuinfo", "r");
    if (fp == NULL)
    {
        return NULL;
    }

    while (getline(&line, &cap, fp) != -1)
    {
        char *colon;
        char *value;
        size_t value_size;

        if (strstr(line, valname) == NULL)
        {
            continue;
        }

        colon = strchr(line, ':');
        if (colon == NULL)
        {
            // key text present but no separator: not a "key : value" line
            continue;
        }

        // skip the ':' and the single character after it, without ever
        // stepping past the string terminator
        value = colon + 1;
        if (*value != '\0')
        {
            value++;
        }

        value_size = strlen(value) + 1;
        result = malloc(value_size);
        if (result != NULL)
        {
            memcpy(result, value, value_size);
        }
        break;
    }

    free(line);
    fclose(fp);
    return result;
}

// reads current cpu frequency through the /proc/cpuinfo file
// ("cpu MHz" field, converted with string_to_size())
// returns 0 when the field cannot be read: cpuinfo() returns NULL in that
// case and nothing visible here shows that string_to_size() accepts NULL
// avoid calling this function often
int cpu_speed_mhz()
{
    size_t val;
    char *str = cpuinfo("cpu MHz");
    if (str == NULL)
    {
        return 0;
    }
    val = string_to_size(str);
    free(str);
    return val;
}

// reads cpu LLC cache size through the /proc/cpuinfo file
// ("cache size" field, converted with string_to_size())
// returns 0 when the field cannot be read: cpuinfo() returns NULL in that
// case and nothing visible here shows that string_to_size() accepts NULL
// avoid calling this function often
size_t cpu_llc_size_bytes()
{
    size_t val;
    char *str = cpuinfo("cache size");
    if (str == NULL)
    {
        return 0;
    }
    val = string_to_size(str);
    free(str);
    return val;
}

// Returns the "model name" entry of /proc/cpuinfo as produced by cpuinfo(),
// or NULL when cpuinfo() finds nothing.
// The returned buffer is heap-allocated; the caller must free() it.
char *cpu_model_name()
{
    char *brand = cpuinfo("model name");

    return brand;
}

// Tests whether to_match contains a match for the POSIX extended regular
// expression regex_text (compiled with REG_EXTENDED | REG_NEWLINE).
// Returns E_SUCCESS on a match; E_ERROR when there is no match or when the
// expression fails to compile.
int match(const char *to_match, const char *regex_text)
{
    regex_t compiled;
    regmatch_t span[1];
    int status;

    if (regcomp(&compiled, regex_text, REG_EXTENDED | REG_NEWLINE) != 0)
    {
        return E_ERROR;
    }

    status = regexec(&compiled, to_match, 1, span, 0);
    regfree(&compiled);

    return (status == 0) ? E_SUCCESS : E_ERROR;
}

// Returns 1 when the processor's model name contains "Xeon", 0 otherwise
// (including when the model name cannot be read).
int is_Xeon()
{
    char *brand = cpu_model_name();
    int found;

    if (brand == NULL)
    {
        return 0;
    }

    found = (match(brand, "Xeon") == E_SUCCESS) ? 1 : 0;
    free(brand);
    return found;
}

// Returns 1 when the processor's model name contains "Intel", 0 otherwise
// (including when the model name cannot be read).
int is_Intel()
{
    char *brand = cpu_model_name();
    int found;

    if (brand == NULL)
    {
        return 0;
    }

    found = (match(brand, "Intel") == E_SUCCESS) ? 1 : 0;
    free(brand);
    return found;
}

// Detects the running processor and returns the matching CPU model object,
// or NULL when the processor is not Intel or its CPUID family/model is not
// listed in known_cpus[].
//
// The returned pointer refers to one of the shared cpu_model_intel_xeon_ex*
// objects; it is not allocated here and must not be freed. Before returning,
// the object's llc_size_bytes field is filled in from /proc/cpuinfo.
//
// known_cpus[] lists every processor under its non-Xeon microarchitecture.
// For a non-Xeon part the model's microarch is therefore set directly from
// the table entry. Assigning the value (rather than decrementing the stored
// one) keeps repeated calls from drifting the shared object's microarch one
// step further down the enum each time. For a Xeon part the model's own
// microarch value is left as it is.
cpu_model_t *cpu_model()
{
    int i, family, model;
    cpu_model_t *detected = NULL;

    if (!is_Intel())
        return NULL;

    get_family_model(&family, &model);

    int isXeon = is_Xeon();

    for (i = 0; known_cpus[i].microarch != Invalid; i++)
    {
        microarch_ID_t c = known_cpus[i];

        if (c.family != family || c.model != model)
            continue;

        switch (c.microarch)
        {
        case SandyBridge:
            detected = &cpu_model_intel_xeon_ex;
            break;
        case IvyBridge:
            detected = &cpu_model_intel_xeon_ex_v2;
            break;
        case Haswell:
            detected = &cpu_model_intel_xeon_ex_v3;
            break;
        default:
            return NULL;
        }

        if (!isXeon)
            detected->microarch = c.microarch;

        DBG_LOG(INFO, "Detected CPU model '%s'\n", microarch_strings[detected->microarch]);
        break;
    }

    if (!detected)
    {
        return NULL;
    }

    // complete the model with some runtime information
    detected->llc_size_bytes = cpu_llc_size_bytes();
    //    detected->speed_mhz = cpu_speed_mhz();

    return detected;
}


================================================
FILE: src/lib/cpu/cpu.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_H
#define __CPU_H

#include <stddef.h>
#include <stdint.h>
#include "dev.h"

// NOTE(review): presumably the largest value a throttle register accepts;
// the code that uses it is not in this header, so confirm there.
#define MAX_THROTTLE_VALUE 1023

// Not defined in cpu.c as shown; see its definition for the meaning of
// node and val.
int set_throttle_register(int node, uint64_t val);
// Last-level cache size, read from the "cache size" field of /proc/cpuinfo.
size_t cpu_llc_size_bytes();

// Forward declaration only; the structure is defined elsewhere.
struct pmc_set_s;

// Selects which throttle register the set/get_throttle_register callbacks
// of cpu_model_t operate on.
// NOTE(review): names suggest DDR activate / read / write throttling;
// confirm against the register definitions.
typedef enum {
    THROTTLE_DDR_ACT = 0,
    THROTTLE_DDR_READ,
    THROTTLE_DDR_WRITE
} throttle_type_t;

// Supported microarchitectures.
// order matters. see cpu_model():
//  - each Xeon variant must immediately follow its non-Xeon counterpart,
//    because cpu_model() subtracts 1 to go from the Xeon value to the
//    non-Xeon one;
//  - the values index microarch_strings[] in known_cpus.h;
//  - Invalid terminates the known_cpus[] table.
typedef enum {
    Invalid,
    SandyBridge,
    SandyBridgeXeon,
    IvyBridge,
    IvyBridgeXeon,
    Haswell,
    HaswellXeon
} microarch_t;

// One entry of the known_cpus[] table: maps a CPUID family/model pair to a
// microarchitecture.
typedef struct
{
    int family;            // family number, compared with get_family_model()'s result
    int model;             // model number, compared with get_family_model()'s result
    microarch_t microarch; // microarchitecture assigned to that family/model
} microarch_ID_t;

/**
 *  CPU object that encapsulates processor-specific methods for accessing
 *  performance counters and memory controller PCI registers.
 *
 *  cpu_model() returns a pointer to one of these and fills in
 *  llc_size_bytes at detection time.
 */
typedef struct cpu_model_s {
    microarch_t microarch; // processor description
    size_t llc_size_bytes; // last level cache size
//    int speed_mhz; // cpu clock frequency
    struct pmc_events_s* pmc_events; // performance monitoring events supported by the processor
    // writes val to the throttle register selected by throttle_type
    int (*set_throttle_register)(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t val);
    // reads the throttle register selected by throttle_type into *val
    int (*get_throttle_register)(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t* val);
} cpu_model_t;

// Detects the running processor; returns its model object, or NULL when the
// processor is not a supported Intel part.
cpu_model_t* cpu_model();
// CPU frequency taken from the "cpu MHz" field of /proc/cpuinfo.
int cpu_speed_mhz();

#endif /* __CPU_H */


================================================
FILE: src/lib/cpu/haswell-papi.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_HASWELL_H
#define __CPU_HASWELL_H

#include <papi.h>
#include "debug.h"

// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with
// applications to list all available performance events with their architecture specific
// detailed description and translate them to their respective event code. 'showevtinfo' application can
// be used to list all available performance event names with detailed description and 'check_events' application
// can be used to translate the performance event to the corresponding event code.  

// Native event names used on Haswell.
// Every event reading returns an array with one value per event, in the
// same order as the entries below: index i of this array corresponds to
// values[i] in the haswell_read_stall_events_*() functions.
// NOTE(review): the comment originally here says these events are
// initialized and started; the code doing so is not in this header.
const char *haswell_native_events[MAX_NUM_EVENTS] = {
    "CYCLE_ACTIVITY:STALLS_L2_PENDING",
    "MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE",
    "MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM",
    "MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM"
};

// Estimates the stall cycles attributable to DRAM accesses for the calling
// thread.
//
// Reads the four counters via pmc_events_read_local_thread() and scales the
// L2-pending stall cycles by the fraction of loads served from DRAM:
//   l2_pending * (remote + local) / (remote + local + llc_hit)
// Returns 0 when the read fails or when no load was counted at all.
uint64_t haswell_read_stall_events_local() {
    long long counters[MAX_NUM_EVENTS];

    if (pmc_events_read_local_thread(counters) != PAPI_OK) {
        DBG_LOG(ERROR, "read stall cycles failed\n");
        return 0;
    }

    uint64_t l2_pending = counters[0];
    uint64_t llc_hit = counters[1];
    uint64_t remote_dram = counters[2];
    uint64_t local_dram = counters[3];

    DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n",
            l2_pending, llc_hit, remote_dram, local_dram);

    double dram_loads = remote_dram + local_dram;
    double counted_loads = dram_loads + llc_hit;
    if (counted_loads == 0) {
        return 0;
    }

    return (uint64_t)((double)l2_pending * (dram_loads / counted_loads));
}

// Estimates the stall cycles attributable to remote-DRAM accesses for the
// calling thread.
//
// First computes the DRAM stall estimate
//   l2_pending * (remote + local) / (remote + local + llc_hit)
// and then keeps the share proportional to remote accesses:
//   stalls * remote / (remote + local)
// Returns 0 when the read fails or when either denominator is zero.
uint64_t haswell_read_stall_events_remote() {
    long long counters[MAX_NUM_EVENTS];

    if (pmc_events_read_local_thread(counters) != PAPI_OK) {
        DBG_LOG(ERROR, "read stall cycles failed\n");
        return 0;
    }

    uint64_t l2_pending = counters[0];
    uint64_t llc_hit = counters[1];
    uint64_t remote_dram = counters[2];
    uint64_t local_dram = counters[3];

    DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n",
            l2_pending, llc_hit, remote_dram, local_dram);

    // stalls caused by DRAM accesses, from L2 stalls and the LLC miss/hit mix
    double dram_loads = remote_dram + local_dram;
    double counted_loads = dram_loads + llc_hit;
    if (counted_loads == 0) {
        return 0;
    }
    double stalls = (double)l2_pending * (dram_loads / counted_loads);

    // portion of those stalls that belongs to remote DRAM accesses
    double dram_total = remote_dram + local_dram;
    if (dram_total == 0) {
        return 0;
    }

    return (uint64_t)(stalls * ((double)remote_dram / dram_total));
}

#endif /* __CPU_HASWELL_H */


================================================
FILE: src/lib/cpu/haswell.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_HASWELL_H
#define __CPU_HASWELL_H

#include <math.h>
#include "thread.h"
#include "cpu/pmc.h"
#include "debug.h"

// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with
// applications to list all available performance events with their architecture specific
// detailed description and translate them to their respective event code. 'showevtinfo' application can
// be used to list all available performance event names with detailed description and 'check_events' application
// can be used to translate the performance event to the corresponding event code.  

extern __thread int tls_hw_local_latency;
extern __thread int tls_hw_remote_latency;
#ifdef MEMLAT_SUPPORT
extern __thread uint64_t tls_global_remote_dram;
extern __thread uint64_t tls_global_local_dram;
#endif

// Table of the hardware events used on Haswell. Each ACTION row carries the
// event name, a NULL placeholder and a hexadecimal code.
// NOTE(review): the code is presumably the raw event-select encoding for the
// named event, and the meaning of the NULL field is defined by the ACTION
// macros in cpu/pmc.h; confirm there.
#undef FOREACH_PMC_HW_EVENT
#define FOREACH_PMC_HW_EVENT(ACTION)                                                                       \
  ACTION("CYCLE_ACTIVITY:STALLS_L2_PENDING", NULL, 0x55305a3)                                              \
  ACTION("MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE", NULL, 0x5308d2)                                        \
  ACTION("MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM", NULL, 0x530cd3)                                     \
  ACTION("MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM", NULL, 0x5303d3)

// Table of the derived events defined in this header: ldm_stall_cycles and
// remote_dram, each with enable/clear/read functions below.
#undef FOREACH_PMC_EVENT
#define FOREACH_PMC_EVENT(ACTION, prefix)                                                                  \
  ACTION(ldm_stall_cycles, prefix)                                                                         \
  ACTION(remote_dram, prefix)

// Weight applied to the DRAM load count (relative to LLC hits) when the read
// functions below compute the fraction of L2-pending stalls caused by DRAM.
// NOTE(review): the origin of the value 7.0 is not documented here.
#define L3_FACTOR 7.0

// Enable hook for the ldm_stall_cycles event on Haswell.
// Assigns the four hardware events to slots 0..3; the slot numbers are the
// indices the matching read function passes to READ_MY_HW_EVENT_DIFF().
// Always returns E_SUCCESS.
DECLARE_ENABLE_PMC(haswell, ldm_stall_cycles)
{
    ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE", 1);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM", 2);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM", 3);

    return E_SUCCESS;
}

// Clear hook for the ldm_stall_cycles event on Haswell; intentionally empty.
DECLARE_CLEAR_PMC(haswell, ldm_stall_cycles)
{
}

// Read hook for the ldm_stall_cycles event on Haswell.
// Estimates the stall cycles caused by DRAM accesses since the previous
// reading:
//   l2_pending * (L3_FACTOR * (remote + local))
//              / (L3_FACTOR * (remote + local) + llc_hit)
// Returns 0 when no DRAM load (remote or local) was counted.
// With MEMLAT_SUPPORT, the local DRAM count is also accumulated into
// tls_global_local_dram.
DECLARE_READ_PMC(haswell, ldm_stall_cycles)
{
   // slots 0..3 follow the assignment made in the enable hook
   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);
   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);
   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);
   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);

   DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n",
		   l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);

   // nothing reached DRAM: no stalls to attribute
   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;
#ifdef MEMLAT_SUPPORT
   tls_global_local_dram += local_dram_diff;
#endif

   // calculate stalls based on L2 stalls and LLC miss/hit
   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);
   double den = num + llc_hit_diff;
   // defensive: den includes num, which is non-zero after the check above
   if (den == 0) return 0;
   return (uint64_t) ((double)l2_pending_diff * (num / den));
}


// Enable hook for the remote_dram event on Haswell.
// Assigns the same four hardware events, in the same slots 0..3, as the
// ldm_stall_cycles enable hook. Always returns E_SUCCESS.
DECLARE_ENABLE_PMC(haswell, remote_dram)
{
    ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE", 1);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM", 2);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM", 3);

    return E_SUCCESS;
}

// Clear hook for the remote_dram event on Haswell; intentionally empty.
DECLARE_CLEAR_PMC(haswell, remote_dram)
{
}

// Read hook for the remote_dram event on Haswell.
// Estimates the stall cycles caused by remote DRAM accesses since the
// previous reading. The DRAM stall estimate is computed as in the
// ldm_stall_cycles read hook and then split between remote and local
// accesses, weighting each access count by its hardware latency
// (tls_hw_remote_latency / tls_hw_local_latency).
// Returns 0 when no DRAM load was counted or the weighted total is zero.
// With MEMLAT_SUPPORT, the remote DRAM count is also accumulated into
// tls_global_remote_dram.
DECLARE_READ_PMC(haswell, remote_dram)
{
   // slots 0..3 follow the assignment made in the enable hook
   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);
   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);
   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);
   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);

   DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n",
		   l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);

   // nothing reached DRAM: no stalls to attribute
   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;
#ifdef MEMLAT_SUPPORT
   tls_global_remote_dram += remote_dram_diff;
#endif

   // calculate stalls based on L2 stalls and LLC miss/hit
   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);
   double den = num + llc_hit_diff;
   if (den == 0) return 0;
   double stalls = (double)l2_pending_diff * (num / den);

   // calculate remote dram stalls based on total stalls and local/remote dram accesses
   // also consider the weight of remote memory access against local memory access
   den = (remote_dram_diff * tls_hw_remote_latency) + (local_dram_diff * tls_hw_local_latency);
   if (den == 0) return 0;
   return (uint64_t) (stalls * ((double)(remote_dram_diff * tls_hw_remote_latency) / den));
}


// NOTE(review): presumably instantiates the Haswell event set from the
// tables above, with 4 matching the number of hardware events listed in
// FOREACH_PMC_HW_EVENT; confirm against the PMC_EVENTS definition.
PMC_EVENTS(haswell, 4)
#endif /* __CPU_HASWELL_H */


================================================
FILE: src/lib/cpu/ivybridge-papi.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_IVYBRIDGE_H
#define __CPU_IVYBRIDGE_H

#include <papi.h>
#include "debug.h"

// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with
// applications to list all available performance events with their architecture specific
// detailed description and translate them to their respective event code. 'showevtinfo' application can
// be used to list all available performance event names with detailed description and 'check_events' application
// can be used to translate the performance event to the corresponding event code.  

// Native event names used on Ivy Bridge.
// Every event reading returns an array with one value per event, in the
// same order as the entries below: index i of this array corresponds to
// values[i] in the ivybridge_read_stall_events_*() functions.
// NOTE(review): the comment originally here says these events are
// initialized and started; the code doing so is not in this header.
const char *ivybridge_native_events[MAX_NUM_EVENTS] = {
    "CYCLE_ACTIVITY:STALLS_L2_PENDING",
    "MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE",
    "MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM",
    "MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM"
};

// Estimates the stall cycles attributable to DRAM accesses for the calling
// thread.
//
// Reads the four counters via pmc_events_read_local_thread() and scales the
// L2-pending stall cycles by the fraction of loads served from DRAM:
//   l2_pending * (remote + local) / (remote + local + llc_hit)
// Returns 0 when the read fails or when no load was counted at all.
uint64_t ivybridge_read_stall_events_local() {
    long long counters[MAX_NUM_EVENTS];

    if (pmc_events_read_local_thread(counters) != PAPI_OK) {
        DBG_LOG(ERROR, "read stall cycles failed\n");
        return 0;
    }

    uint64_t l2_pending = counters[0];
    uint64_t llc_hit = counters[1];
    uint64_t remote_dram = counters[2];
    uint64_t local_dram = counters[3];

    DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n",
            l2_pending, llc_hit, remote_dram, local_dram);

    double dram_loads = remote_dram + local_dram;
    double counted_loads = dram_loads + llc_hit;
    if (counted_loads == 0) {
        return 0;
    }

    return (uint64_t)((double)l2_pending * (dram_loads / counted_loads));
}

// Estimates the stall cycles attributable to remote-DRAM accesses for the
// calling thread.
//
// First computes the DRAM stall estimate
//   l2_pending * (remote + local) / (remote + local + llc_hit)
// and then keeps the share proportional to remote accesses:
//   stalls * remote / (remote + local)
// Returns 0 when the read fails or when either denominator is zero.
uint64_t ivybridge_read_stall_events_remote() {
    long long counters[MAX_NUM_EVENTS];

    if (pmc_events_read_local_thread(counters) != PAPI_OK) {
        DBG_LOG(ERROR, "read stall cycles failed\n");
        return 0;
    }

    uint64_t l2_pending = counters[0];
    uint64_t llc_hit = counters[1];
    uint64_t remote_dram = counters[2];
    uint64_t local_dram = counters[3];

    DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n",
            l2_pending, llc_hit, remote_dram, local_dram);

    // stalls caused by DRAM accesses, from L2 stalls and the LLC miss/hit mix
    double dram_loads = remote_dram + local_dram;
    double counted_loads = dram_loads + llc_hit;
    if (counted_loads == 0) {
        return 0;
    }
    double stalls = (double)l2_pending * (dram_loads / counted_loads);

    // portion of those stalls that belongs to remote DRAM accesses
    double dram_total = remote_dram + local_dram;
    if (dram_total == 0) {
        return 0;
    }

    return (uint64_t)(stalls * ((double)remote_dram / dram_total));
}

#endif /* __CPU_IVYBRIDGE_H */


================================================
FILE: src/lib/cpu/ivybridge.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_IVYBRIDGE_H
#define __CPU_IVYBRIDGE_H

#include <math.h>
#include "thread.h"
#include "cpu/pmc.h"
#include "debug.h"

// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with
// applications to list all available performance events with their architecture specific
// detailed description and translate them to their respective event code. 'showevtinfo' application can
// be used to list all available performance event names with detailed description and 'check_events' application
// can be used to translate the performance event to the corresponding event code.  

extern __thread int tls_hw_local_latency;
extern __thread int tls_hw_remote_latency;
#ifdef MEMLAT_SUPPORT
extern __thread uint64_t tls_global_remote_dram;
extern __thread uint64_t tls_global_local_dram;
#endif

// Table of the hardware events used on Ivy Bridge. Each ACTION row carries
// the event name, a NULL placeholder and a hexadecimal code.
// NOTE(review): the code is presumably the raw event-select encoding for the
// named event, and the meaning of the NULL field is defined by the ACTION
// macros in cpu/pmc.h; confirm there.
#undef FOREACH_PMC_HW_EVENT
#define FOREACH_PMC_HW_EVENT(ACTION)                                                                       \
  ACTION("CYCLE_ACTIVITY:STALLS_L2_PENDING", NULL, 0x55305a3)                                              \
  ACTION("MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE", NULL, 0x5308d2)                                        \
  ACTION("MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM", NULL, 0x530cd3)                                     \
  ACTION("MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM", NULL, 0x5303d3)

// Table of the derived events defined in this header: ldm_stall_cycles and
// remote_dram, each with enable/clear/read functions below.
#undef FOREACH_PMC_EVENT
#define FOREACH_PMC_EVENT(ACTION, prefix)                                                                  \
  ACTION(ldm_stall_cycles, prefix)                                                                         \
  ACTION(remote_dram, prefix)


// Weight applied to the DRAM load count (relative to LLC hits) when the read
// functions below compute the fraction of L2-pending stalls caused by DRAM.
// NOTE(review): the origin of the value 7.0 is not documented here.
#define L3_FACTOR 7.0

// Enable hook for the ldm_stall_cycles event on Ivy Bridge.
// Assigns the four hardware events to slots 0..3; the slot numbers are the
// indices the matching read function passes to READ_MY_HW_EVENT_DIFF().
// Always returns E_SUCCESS.
DECLARE_ENABLE_PMC(ivybridge, ldm_stall_cycles)
{
    ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE", 1);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM", 2);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM", 3);

    return E_SUCCESS;
}

// Clear hook for the ldm_stall_cycles event on Ivy Bridge; intentionally empty.
DECLARE_CLEAR_PMC(ivybridge, ldm_stall_cycles)
{
}

// Read hook for the ldm_stall_cycles event on Ivy Bridge.
// Estimates the stall cycles caused by DRAM accesses since the previous
// reading:
//   l2_pending * (L3_FACTOR * (remote + local))
//              / (L3_FACTOR * (remote + local) + llc_hit)
// Returns 0 when no DRAM load (remote or local) was counted.
// With MEMLAT_SUPPORT, the local DRAM count is also accumulated into
// tls_global_local_dram.
DECLARE_READ_PMC(ivybridge, ldm_stall_cycles)
{
   // slots 0..3 follow the assignment made in the enable hook
   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);
   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);
   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);
   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);

   DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n",
		   l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);

   // nothing reached DRAM: no stalls to attribute
   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;
#ifdef MEMLAT_SUPPORT
   tls_global_local_dram += local_dram_diff;
#endif

   // calculate stalls based on L2 stalls and LLC miss/hit
   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);
   double den = num + llc_hit_diff;
   // defensive: den includes num, which is non-zero after the check above
   if (den == 0) return 0;
   return (uint64_t) ((double)l2_pending_diff * (num / den));
}


// Enable hook for the remote_dram event on Ivy Bridge.
// Assigns the same four hardware events, in the same slots 0..3, as the
// ldm_stall_cycles enable hook. Always returns E_SUCCESS.
DECLARE_ENABLE_PMC(ivybridge, remote_dram)
{
    ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE", 1);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM", 2);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM", 3);

    return E_SUCCESS;
}

// Clear hook for the remote_dram event on Ivy Bridge; intentionally empty.
DECLARE_CLEAR_PMC(ivybridge, remote_dram)
{
}

// Read hook for the remote_dram event on Ivy Bridge.
// Estimates the stall cycles caused by remote DRAM accesses since the
// previous reading. The DRAM stall estimate is computed as in the
// ldm_stall_cycles read hook and then split between remote and local
// accesses, weighting each access count by its hardware latency
// (tls_hw_remote_latency / tls_hw_local_latency).
// Returns 0 when no DRAM load was counted or the weighted total is zero.
// With MEMLAT_SUPPORT, the remote DRAM count is also accumulated into
// tls_global_remote_dram.
DECLARE_READ_PMC(ivybridge, remote_dram)
{
   // slots 0..3 follow the assignment made in the enable hook
   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);
   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);
   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);
   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);

   DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n",
		   l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);

   // nothing reached DRAM: no stalls to attribute
   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;
#ifdef MEMLAT_SUPPORT
   tls_global_remote_dram += remote_dram_diff;
#endif

   // calculate stalls based on L2 stalls and LLC miss/hit
   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);
   double den = num + llc_hit_diff;
   if (den == 0) return 0;
   double stalls = (double)l2_pending_diff * (num / den);

   // calculate remote dram stalls based on total stalls and local/remote dram accesses
   // also consider the weight of remote memory access against local memory access
   den = (remote_dram_diff * tls_hw_remote_latency) + (local_dram_diff * tls_hw_local_latency);
   if (den == 0) return 0;
   return (uint64_t) (stalls * ((double)(remote_dram_diff * tls_hw_remote_latency) / den));
}


// NOTE(review): presumably instantiates the Ivy Bridge event set from the
// tables above, with 4 matching the number of hardware events listed in
// FOREACH_PMC_HW_EVENT; confirm against the PMC_EVENTS definition.
PMC_EVENTS(ivybridge, 4)
#endif /* __CPU_IVYBRIDGE_H */


================================================
FILE: src/lib/cpu/known_cpus.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __KNOWN_CPUS_H
#define __KNOWN_CPUS_H

#include "cpu.h"

// later, cpu_model_name() is used to distinguish between
// Xeon and non-Xeon processors. It's much easier here
// to consider all processors non-Xeon.
// references:
// 1- http://a4lg.com/tech/x86/database/x86-families-and-models.en.html
// 2- Intel® Xeon® Processor E7-8800/4800 v3 Product Family Specification
// 3- https://software.intel.com/en-us/articles/intel-architecture-and-processor-identification-with-cpuid-model-and-family-numbers
// Table mapping (family, model) pairs to a microarchitecture identifier.
// The list is terminated by an entry whose microarch is Invalid.
microarch_ID_t known_cpus[] =
    {
        // order does not matter
        // Sandy Bridge models
        {.family = 0x06, .model = 0x2A, .microarch = SandyBridge},
        {.family = 0x06, .model = 0x2D, .microarch = SandyBridge},

        // Ivy Bridge models
        {.family = 0x06, .model = 0x3A, .microarch = IvyBridge},
        {.family = 0x06, .model = 0x3E, .microarch = IvyBridge},

        // Haswell models
        {.family = 0x06, .model = 0x3C, .microarch = Haswell},
        {.family = 0x06, .model = 0x3F, .microarch = Haswell},
        {.family = 0x06, .model = 0x45, .microarch = Haswell},
        {.family = 0x06, .model = 0x46, .microarch = Haswell},

        // must be the last element
        {.family = 0x0, .model = 0x0, .microarch = Invalid}};

// order must correspond to microarch_t
// Human-readable names of the microarchitectures; entry i is the name of the
// microarch_t enumerator with value i (the enum itself is declared elsewhere).
char *microarch_strings[] =
    {
        "Invalid",
        "Sandy Bridge",
        "Sandy Bridge Xeon",
        "Ivy Bridge",
        "Ivy Bridge Xeon",
        "Haswell",
        "Haswell Xeon"};

#endif /* __KNOWN_CPUS_H */


================================================
FILE: src/lib/cpu/pmc-papi.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <papi.h>
#include <pthread.h>
#include <sys/syscall.h>
#include "cpu/pmc-papi.h"
#include "debug.h"

__thread int tls_event_set = PAPI_NULL;

#define STR_MAX_SIZE 256

// Logs `msg` at CRITICAL level, followed in parentheses by the text that
// PAPI_strerror() returns for the PAPI status code `ret_val`.
static void log_papi_critical(int ret_val, const char *msg) {
	// earlier variant based on PAPI_perror(), kept disabled:
	//char papi_str[STR_MAX_SIZE];
	//PAPI_perror(ret_val, (char *)papi_str, sizeof(papi_str));
    DBG_LOG(CRITICAL, "%s (%s)\n", msg, PAPI_strerror(ret_val));
}

int pmc_init() {
	int ret_val;

    if ((ret_val = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
        log_papi_critical(ret_val, "PMC library init error");
        return -1;
    }

    if ((ret_val = PAPI_thread_init(pthread_self)) != PAPI_OK) {
        log_papi_critical(ret_val, "PMC thread support init error");
        return -1;
    }

//    if ((ret_val = PAPI_set_domain(PAPI_DOM_ALL)) != PAPI_OK) {
//        log_papi_critical(ret_val, "PMC set domain error");
//        return -1;
//    }

    return 0;
}

// Shuts the PAPI library down by calling PAPI_shutdown().
void pmc_shutdown() {
    PAPI_shutdown();
}

int pmc_create_event_set_local_thread() {
	int ret_val;

    if ((ret_val = PAPI_create_eventset(&tls_event_set)) != PAPI_OK) {
        log_papi_critical(ret_val, "PMC event set init error");
        return -1;
    }

//    if ((ret_val = PAPI_set_granularity(PAPI_GRN_SYS)) != PAPI_OK) {
//        log_papi_critical(ret_val, "PMC set granularity error");
//        return -1;
//    }

    return 0;
}

// Tears down the calling thread's event set (tls_event_set): first cleans it
// up, then destroys it. The PAPI return codes of both calls are ignored.
void pmc_destroy_event_set_local_thread() {
    PAPI_cleanup_eventset(tls_event_set);
    PAPI_destroy_eventset(&tls_event_set);
}

// Thin wrapper: returns the result of PAPI_register_thread() unchanged
// (a PAPI status code, not the 0 / -1 convention used by the other wrappers).
int pmc_register_thread() {
	return PAPI_register_thread();
}

// Thin wrapper: returns the result of PAPI_unregister_thread() unchanged
// (a PAPI status code, not the 0 / -1 convention used by the other wrappers).
int pmc_unregister_thread() {
	return PAPI_unregister_thread();
}

// Adds the named native event to the calling thread's event set.
//   event_name: PAPI event name; must not be NULL (asserted).
// The thread's event set must already exist (asserted).
// Returns 0 on success; on failure logs the event name together with the PAPI
// error text and returns -1.
int pmc_register_event_local_thread(const char *event_name) {
    // The pthread scope for each thread should be set to PTHREAD_SCOPE_SYSTEM.
    // On linux, pthread supports only PTHREAD_SCOPE_SYSTEM.

    assert(tls_event_set != PAPI_NULL);
    assert(event_name);

    int rc = PAPI_add_named_event(tls_event_set, (char *)event_name);
    if (rc == PAPI_OK) {
        return 0;
    }

    char err_text[STR_MAX_SIZE];
    snprintf(err_text, sizeof(err_text), "PMC event (%s) register error", event_name);
    log_papi_critical(rc, err_text);
    return -1;
}

int pmc_events_start_local_thread() {
    int ret_val;

    assert(tls_event_set != PAPI_NULL);

    if ((ret_val = PAPI_start(tls_event_set)) != PAPI_OK) {
    	log_papi_critical(ret_val, "PMC events start error");
        return -1;
    }

    return 0;
}

// Stops counting on the calling thread's event set, which must already exist
// (asserted). The final counter values handed back by PAPI_stop() and its
// return code are both discarded.
void pmc_events_stop_local_thread() {
    long long final_counts[MAX_NUM_EVENTS];

    assert(tls_event_set != PAPI_NULL);

    PAPI_stop(tls_event_set, final_counts);
}

int pmc_events_read_local_thread(long long *values) {
    int ret_val;
//    int status = 0;

    assert(values);

//    PAPI_state(event_set, &status);
//    if (status != PAPI_RUNNING) {
//        DBG_LOG(CRITICAL, "PMC event set not in running state");
//        return -1;
//    }

    if ((ret_val = PAPI_read(tls_event_set, values)) != PAPI_OK) {
    	log_papi_critical(ret_val, "PMC events read error");
        return -1;
    }

    if ((ret_val = PAPI_reset(tls_event_set)) != PAPI_OK) {
        log_papi_critical(ret_val, "PMC events reset error");
        return -1;
    }

    return 0;
}


================================================
FILE: src/lib/cpu/pmc-papi.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_PMC_H
#define __CPU_PMC_H

#include <stdint.h>


// Usually the architectures support up to 4 counters enabled at the same
// time per core when HT is enabled
#define MAX_NUM_EVENTS 4

// Signature of a per-microarchitecture routine that reads the thread's
// counters and returns a stall count derived from them.
typedef uint64_t (*read_stalls_t)(void);

// PAPI-backed description of a CPU model's performance-monitoring setup.
typedef struct {
	// names of the native PAPI events to register (the per-CPU arrays hold
	// MAX_NUM_EVENTS slots, unused ones set to NULL)
	const char **native_events;
	// stall reader for local memory accesses
	read_stalls_t read_stalls_events_local;
	// stall reader for remote memory accesses; may be NULL
	read_stalls_t read_stalls_events_remote;
} pmc_event_t;

// Library-wide PAPI setup / teardown. pmc_init() returns 0 or -1.
int pmc_init();
void pmc_shutdown();
// Per-thread event set lifecycle. The int-returning functions return 0 on
// success and -1 on failure.
int pmc_create_event_set_local_thread();
void pmc_destroy_event_set_local_thread();
int pmc_register_event_local_thread(const char *event_name);
int pmc_events_start_local_thread();
void pmc_events_stop_local_thread();
// Reads the thread's counters into `values`, then resets them.
int pmc_events_read_local_thread(long long *values);

// Thin wrappers returning the PAPI status code of
// PAPI_register_thread() / PAPI_unregister_thread().
int pmc_register_thread();
int pmc_unregister_thread();

#endif /* __CPU_PMC_H */


================================================
FILE: src/lib/cpu/pmc.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <stdlib.h>
#include "cpu/pmc.h"
#include "dev.h"
#include "error.h"
#include "thread.h"
#include "topology.h"

#pragma GCC push_options
#pragma GCC optimize ("O0")

// The width of general purpose counters are 40bits.
// https://www.felixcloutier.com/x86/RDPMC.html
#define RDPMC_MAX_VALUE 0xFFFFFFFFFF  

/*
 * Reads the performance-monitoring counter selected by `counter` using the
 * RDPMC instruction and returns its value truncated to 40 bits (the width
 * assumed by RDPMC_MAX_VALUE).
 *
 * RDPMC takes the counter index in ECX and returns the value in EDX:EAX.
 * The operands are bound with register constraints so that `counter` is a
 * genuine input; the earlier form declared it as a write-only memory output
 * although the asm read it, which was only correct without optimization.
 */
long long rdpmc(int counter) 
{
	unsigned lo;
	unsigned hi;

	__asm__ __volatile__ ("rdpmc"
	                      : "=a" (lo), "=d" (hi)
	                      : "c" (counter));

	/* keep only the low 8 bits of EDX: 32 + 8 = 40 counter bits */
	return (long long) (((unsigned long long) (hi & 0xFFu) << 32) | lo);
}

/*
 * Reads the performance-monitoring counter selected by `counter` using the
 * RDPMC instruction and returns only its low 32 bits (EAX).
 *
 * `counter` is passed in ECX as a real input operand; EDX is written by the
 * instruction and therefore bound to a scratch output that is discarded.
 */
int rdpmc32(int counter) {
	unsigned lo;
	unsigned hi_unused;

	__asm__ __volatile__ ("rdpmc"
	                      : "=a" (lo), "=d" (hi_unused)
	                      : "c" (counter));
	(void) hi_unused;

	return lo;
}
#pragma GCC pop_options


/*int num_used_hw_cntrs(pmc_events_t* events)
{
    int i;
    int used;
    pmc_hw_event_t* event = 0;

     // check if this a known registered hardware event
    for (i=0, used=0; events->known_hw_events[i].name; i++) {
        event = &events->known_hw_events[i];
        used += event->active ? 0 : 1;
    }
    return used;    
}*/

/*
 * Finds a hardware counter slot that no active hardware event is using.
 *
 *   events: event tables of the CPU; known_hw_events is scanned up to its
 *           NULL-name terminator.
 *
 * Returns the lowest free counter id in [0, num_avail_hw_cntrs), or -1 when
 * every counter is taken, there are no counters, or the scratch allocation
 * fails.
 */
int get_avail_hw_cntr_id(pmc_events_t* events)
{
    int i;
    int status = -1;
    int num_cntrs = events->num_avail_hw_cntrs;
    int* hw_cntr_id_status;

    if (num_cntrs <= 0) {
        return -1;
    }

    // one flag per counter: non-zero means "in use by an active event"
    hw_cntr_id_status = calloc(num_cntrs, sizeof(*hw_cntr_id_status));
    if (!hw_cntr_id_status) {
        return -1;
    }

    for (i=0; events->known_hw_events[i].name; i++) {
        pmc_hw_event_t* event = &events->known_hw_events[i];
        // ignore ids outside the table instead of writing out of bounds
        if (event->active && event->hw_cntr_id >= 0 && event->hw_cntr_id < num_cntrs) {
            hw_cntr_id_status[event->hw_cntr_id] = 1;
        }
    }

    // bounded scan: stop at the end of the table when nothing is free
    for (i=0; i < num_cntrs; i++) {
        if (hw_cntr_id_status[i] == 0) {
            status = i;
            break;
        }
    }

    free(hw_cntr_id_status);
    return status;
}

/*
 * Activates the hardware event called `name` (case-insensitive match against
 * events->known_hw_events).
 *
 * If the event is already active it is returned as is. Otherwise a free
 * hardware counter is picked, the per-processor "last value" array is
 * allocated on first use and zeroed, and the kernel driver is asked through
 * set_counter() to program the counter.
 *
 * Returns the event on success, or NULL when the name is unknown, no counter
 * is free, the per-processor array cannot be allocated, or set_counter()
 * fails.
 */
pmc_hw_event_t* enable_pmc_hw_event(pmc_events_t* events, const char* name)
{
    int i;
    pmc_hw_event_t* event = 0;
    int found = 0;

     // check if this a known registered hardware event
    for (i=0; events->known_hw_events[i].name; i++) {
        event = &events->known_hw_events[i];
        if (strcasecmp(event->name, name) == 0) {
        	found = 1;
            if (event->active) {
                return event;
            }
            break;
        }
    }

    if (!found) {
        DBG_LOG(WARNING, "Unknown hardware performance monitoring event\n");
        return NULL;
    }

    // enable it 
    // need to find an available performance counter to monitor this event
    if ((event->hw_cntr_id = get_avail_hw_cntr_id(events)) < 0) {
        DBG_LOG(ERROR, "No available hardware performance counters\n");
        return NULL;
    }

    // assign an array to keep per processor last read values (useful to calculate the diff since the last read)
    int num_cpus = system_num_cpus();
    if (!event->last_val) {
        event->last_val = calloc(num_cpus, sizeof(*event->last_val));
        if (!event->last_val) {
            // without this array the zeroing loop below would dereference NULL
            DBG_LOG(ERROR, "Can't allocate per processor counter values\n");
            return NULL;
        }
    }
    for (i=0; i<num_cpus; i++) {
        event->last_val[i] = 0;
    }
    // call into the kernel driver to enable the counter on all processors
    if (set_counter(event->hw_cntr_id, event->encoding) != E_SUCCESS) {
    	DBG_LOG(ERROR, "Can't enable counter on all processors\n");
    	return NULL;
    }

    event->active = 1;
    return event;
}

/*
 * Marks the hardware event called `name` (case-insensitive) as inactive.
 * Only the `active` flag is cleared; nothing else in the event is touched.
 * Logs a warning when no known hardware event has that name.
 */
void disable_pmc_hw_event(pmc_events_t* events, const char* name)
{
    pmc_hw_event_t* candidate;

    for (candidate = events->known_hw_events; candidate->name; candidate++) {
        if (strcasecmp(candidate->name, name) != 0) {
            continue;
        }
        // first match decides: clear the flag if it is set, then stop
        if (candidate->active) {
            candidate->active = 0;
        }
        return;
    }

    DBG_LOG(WARNING, "Unknown hardware performance monitoring event\n");
}

// Placeholder: clearing a hardware event is not implemented. The function
// only logs a CRITICAL message and leaves `event` untouched.
void clear_pmc_hw_event(pmc_hw_event_t* event)
{
    DBG_LOG(CRITICAL, "Unimplemented functionality\n");
}

// Returns the current raw value of the hardware counter assigned to `event`
// (event->hw_cntr_id), as read by rdpmc().
uint64_t read_pmc_hw_event_cur(pmc_hw_event_t* event)
{
    return rdpmc(event->hw_cntr_id);
}

/*
 * Returns how far the hardware counter of `event` advanced since the previous
 * call made on the calling thread's CPU (thread_self()->cpu_id), and records
 * the current value as the new reference for that CPU.
 *
 * The counter is treated as wrapping modulo RDPMC_MAX_VALUE + 1, so the delta
 * is the modular difference of the two readings. This also covers the wrap
 * case exactly: e.g. last == RDPMC_MAX_VALUE and cur == 0 yields 1.
 */
uint64_t read_pmc_hw_event_diff(pmc_hw_event_t* event)
{
    int cpu_id = thread_self()->cpu_id;
    uint64_t cur_val = read_pmc_hw_event_cur(event);
    uint64_t last_val = event->last_val[cpu_id];

    event->last_val[cpu_id] = cur_val;
    // both readings fit in the counter width, so masking the unsigned
    // difference gives cur - last when cur >= last and the wrapped distance
    // otherwise
    return (cur_val - last_val) & RDPMC_MAX_VALUE;
}


/*
 * Activates the logical event called `name` (case-insensitive match against
 * cpu->pmc_events->known_events).
 *
 * An event that is already active is returned unchanged. Otherwise its list
 * of hardware events is reset and the event's own enable() callback is run;
 * a failing callback trips an assertion and, if assertions are compiled out,
 * yields NULL.
 *
 * Returns the event, or NULL when the name is unknown or enabling failed.
 */
pmc_event_t* enable_pmc_event(cpu_model_t* cpu, const char* name) 
{
    pmc_event_t* match = NULL;
    pmc_event_t* cursor;

    // look the event up by name; only the first match counts
    for (cursor = cpu->pmc_events->known_events; cursor->name; cursor++) {
        if (strcasecmp(cursor->name, name) == 0) {
            match = cursor;
            break;
        }
    }

    if (match == NULL) {
        return NULL;
    }
    if (match->active) {
        return match;
    }

    // start from an empty hardware event list before enable() fills it
    // NOTE(review): a previous hw_events array is not freed here; confirm
    // whether callers always release it before re-enabling.
    match->hw_events = NULL;
    match->num_hw_events = 0;
    if (match->enable(cpu->pmc_events, match) != E_SUCCESS) {
        assert(0 && "DIE");
        return NULL;
    }

    match->active = 1;
    return match;
}

/*
 * Enables the hardware event called `name` and appends it to the list of
 * hardware events backing the logical `event`.
 *
 *   local_id: index the caller expects the hardware event to get inside
 *             event->hw_events; a mismatch is only logged.
 *
 * Returns E_SUCCESS, or E_ERROR when the hardware event cannot be enabled or
 * the list cannot be grown. On E_ERROR event->hw_events and
 * event->num_hw_events are left exactly as they were, so the caller can still
 * release them.
 */
int assign_pmc_hw_event_to_event(pmc_events_t* events, const char* name, pmc_event_t* event, int local_id)
{
    pmc_hw_event_t* hw_event;
    pmc_hw_event_t** grown;

    if (!(hw_event = enable_pmc_hw_event(events, name))) {
        return E_ERROR;
    }
    if (local_id != event->num_hw_events) {
        DBG_LOG(CRITICAL, "local_id does not match assign id\n");
        // TODO: application should abort here, look for all DBG_LOG(CRITICAL)
    }

    // grow through a temporary so a failed realloc does not lose the old list
    grown = realloc(event->hw_events, (event->num_hw_events+1) * sizeof(*event->hw_events));
    if (!grown) {
        DBG_LOG(ERROR, "Can't grow the list of hardware events\n");
        return E_ERROR;
    }
    event->hw_events = grown;
    event->hw_events[event->num_hw_events] = hw_event;
    event->num_hw_events++; 
    return E_SUCCESS;
}

/*
 * Detaches every hardware event from the logical `event`: each attached
 * hardware event is marked inactive, the pointer array is freed and the list
 * is reset to empty. Does nothing when no hardware event is attached.
 */
void release_all_pmc_hw_events_of_event(pmc_event_t* event)
{
    int idx;

    if (event->num_hw_events <= 0) {
        return;
    }

    for (idx = 0; idx < event->num_hw_events; idx++) {
        event->hw_events[idx]->active = 0;
    }

    free(event->hw_events);
    event->hw_events = NULL;
    event->num_hw_events = 0;
}

/*
 * Clears the `active` flag of every known logical event of `cpu` whose name
 * matches `name` case-insensitively. The hardware events attached to it are
 * not released here.
 */
void disable_pmc_event(cpu_model_t* cpu, const char* name) 
{
    pmc_event_t* cursor;

    for (cursor = cpu->pmc_events->known_events; cursor->name; cursor++) {
        if (strcasecmp(cursor->name, name) != 0) {
            continue;
        }
        if (cursor->active) {
            cursor->active = 0;
        }
    }
}


================================================
FILE: src/lib/cpu/pmc.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_PMC_H
#define __CPU_PMC_H

#include "cpu/cpu.h"

// Helpers for declaring and naming the three callbacks of a logical event.
// For a CPU `prefix` and event `name` the functions are called
// <prefix>_create_pmc_<name>, <prefix>_clear_pmc_<name> and
// <prefix>_read_pmc_<name>. The DECLARE_* forms expand to the function header
// (the body follows the macro); the *_FNAME forms expand to the bare name.
#define DECLARE_ENABLE_PMC(prefix, name) int prefix##_create_pmc_##name(struct pmc_events_s* events, struct pmc_event_s* event)
#define DECLARE_CLEAR_PMC(prefix, name) void prefix##_clear_pmc_##name(struct pmc_event_s* event)
#define DECLARE_READ_PMC(prefix, name) uint64_t prefix##_read_pmc_##name(struct pmc_event_s* event)
#define ENABLE_PMC_FNAME(prefix, name) prefix##_create_pmc_##name
#define CLEAR_PMC_FNAME(prefix, name) prefix##_clear_pmc_##name
#define READ_PMC_FNAME(prefix, name) prefix##_read_pmc_##name

// Table-row initializers (note the trailing comma):
// PMC_HW_EVENT fills name, os_name, encoding and starts with active = 0 and
// hw_cntr_id = 0; the remaining member (last_val) is implicitly zero.
// PMC_EVENT fills the stringified name, an empty hardware event list, active = 0
// and the three callbacks named after prefix/name.
#define PMC_HW_EVENT(name, os_name, encoding)  { name, os_name, encoding, 0, 0},
#define PMC_EVENT(name, prefix)  { #name, NULL, 0, 0, ENABLE_PMC_FNAME(prefix, name), CLEAR_PMC_FNAME(prefix, name), READ_PMC_FNAME(prefix, name)},

// Address of the pmc_events_t object defined by PMC_EVENTS(prefix, ...).
#define PMC_EVENTS_PTR(prefix) &prefix##_pmc_events

// Defines the three objects describing the counters of CPU `prefix`:
//   <prefix>_known_hw_event[] - one row per FOREACH_PMC_HW_EVENT entry,
//                               terminated by a row with a NULL name;
//   <prefix>_known_event[]    - one row per FOREACH_PMC_EVENT entry,
//                               terminated by a row with a NULL name;
//   <prefix>_pmc_events       - ties both tables together with the number of
//                               available hardware counters (num_hw_cntrs).
// FOREACH_PMC_HW_EVENT and FOREACH_PMC_EVENT must be defined by the including
// header before this macro is used.
#define PMC_EVENTS(prefix, num_hw_cntrs)          \
  pmc_hw_event_t prefix##_known_hw_event[] = {    \
    FOREACH_PMC_HW_EVENT(PMC_HW_EVENT)            \
    {NULL, NULL, 0, 0, 0}                         \
  };                                              \
  pmc_event_t prefix##_known_event[] = {          \
    FOREACH_PMC_EVENT(PMC_EVENT, prefix)          \
    {NULL, NULL, 0, 0, NULL, NULL, NULL}          \
  };                                              \
  pmc_events_t prefix##_pmc_events = {            \
    num_hw_cntrs,                                 \
    prefix##_known_hw_event,                      \
    prefix##_known_event                          \
  };

// For use inside an enable callback, where `events` and `event` are the
// callback's parameters: attaches hardware event `name` to `event` at position
// local_id. On failure every hardware event attached so far is released.
// NOTE(review): the macro does not leave the enclosing function on failure, so
// statements after it still run with an emptied hw_events list - confirm that
// this is intended.
#define ASSIGN_PMC_HW_EVENT_TO_ME(name, local_id)                                   \
  if (assign_pmc_hw_event_to_event(events, name, event, local_id) != E_SUCCESS) {   \
    release_all_pmc_hw_events_of_event(event);                                      \
  }

// For use inside a read callback, where `event` is the callback's parameter:
// read the hardware event at position local_id, either as the delta since the
// last read on this CPU (DIFF) or as the current raw counter value (CUR).
#define READ_MY_HW_EVENT_DIFF(local_id) read_pmc_hw_event_diff(event->hw_events[local_id])
#define READ_MY_HW_EVENT_CUR(local_id) read_pmc_hw_event_cur(event->hw_events[local_id])

// One hardware performance-monitoring event that can be programmed into a
// hardware counter.
typedef struct {
    char* name; // event name, matched case-insensitively when enabling
    char* os_name; // perf name if known
    uint64_t encoding; // value handed to set_counter() when the event is enabled
    int active; // non-zero while the event occupies a hardware counter
    int hw_cntr_id; // index of the hardware counter assigned on enable
    uint64_t* last_val; // array holding the last read values per processor (useful to calculate the diff since the last read)
} pmc_hw_event_t;

// A logical event computed from one or more hardware events.
typedef struct pmc_event_s {
    const char* name; // event name, matched case-insensitively when enabling
    pmc_hw_event_t** hw_events; // heap array of the attached hardware events
    int num_hw_events; // number of entries in hw_events
    int active; // non-zero once enable() has succeeded
    // attaches the required hardware events; returns E_SUCCESS on success
    int (*enable)(struct pmc_events_s* events, struct pmc_event_s* event);
    void (*clear)(struct pmc_event_s* event);
    // computes the event's value from its hardware events
    uint64_t (*read)(struct pmc_event_s* event);
} pmc_event_t;

// Per-CPU-model collection of hardware and logical events.
typedef struct pmc_events_s {
    int num_avail_hw_cntrs; // number of hardware counters that may be assigned
    pmc_hw_event_t* known_hw_events; // table terminated by an entry with a NULL name
    pmc_event_t* known_events; // table terminated by an entry with a NULL name
} pmc_events_t;

// Hardware event management. enable_pmc_hw_event() returns NULL on failure.
pmc_hw_event_t* enable_pmc_hw_event(pmc_events_t* events, const char* name);
void disable_pmc_hw_event(pmc_events_t* events, const char* name);
void clear_pmc_hw_event(pmc_hw_event_t* event);
// Raw counter value / delta since the last read on the calling thread's CPU.
uint64_t read_pmc_hw_event_cur(pmc_hw_event_t* event);
uint64_t read_pmc_hw_event_diff(pmc_hw_event_t* event);
// Attach one hardware event to a logical event (E_SUCCESS / E_ERROR) and
// detach all of them again.
int assign_pmc_hw_event_to_event(pmc_events_t* events, const char* name, pmc_event_t* event, int local_id);
void release_all_pmc_hw_events_of_event(pmc_event_t* event);

// Logical event management. enable_pmc_event() returns NULL on failure.
pmc_event_t* enable_pmc_event(cpu_model_t* cpu, const char* name);
void disable_pmc_event(cpu_model_t* cpu, const char* name);

// Invokes the event's own clear() callback.
static inline void clear_pmc_event(pmc_event_t* event)
{
    event->clear(event);
}

//#include "debug.h"

// Returns the value produced by the event's own read() callback.
static inline uint64_t read_pmc_event(pmc_event_t* event)
{
    return event->read(event);
}

#endif /* __CPU_PMC_H */


================================================
FILE: src/lib/cpu/sandybridge-papi.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_SANDYBRIDGE_H
#define __CPU_SANDYBRIDGE_H

#include <papi.h>
#include <math.h>
#include "debug.h"

// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with
// applications to list all available performance events with their architecture-specific
// detailed descriptions and to translate them to their respective event codes. The showevtinfo
// application lists all available performance event names with a detailed description, and the
// check_events application translates a performance event to the corresponding event code.

// These events will be initialized and started.
// Every event reading will return an array with the values for all these events.
// The array index is the same index used to define the event in the *_native_events array below
// Native events used on Sandy Bridge. The index of a name here is the index of
// its value in the array filled by pmc_events_read_local_thread().
const char *sandybridge_native_events[MAX_NUM_EVENTS] = {
    "CYCLE_ACTIVITY:STALLS_L2_PENDING",      // values[0]
    "MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS",   // values[1]
    "MEM_LOAD_UOPS_RETIRED:L3_HIT",          // values[2]
    NULL                                     // unused slot
};


// Raises *hw_latency by a fixed step of 10, but only when the raised value
// would still be strictly below target_latency; otherwise leaves it unchanged.
void sandybridge_latency_calibration_local(int *hw_latency, int target_latency) {
    const int raised = *hw_latency + 10;

    if (raised < target_latency) {
        *hw_latency = raised;
    }
}

// Raises *hw_latency by a fixed step of 30, but only when the raised value
// would still be strictly below target_latency; otherwise leaves it unchanged.
void sandybridge_latency_calibration_remote(int *hw_latency, int target_latency) {
    const int raised = *hw_latency + 30;

    if (raised < target_latency) {
        *hw_latency = raised;
    }
}

// Reads (and thereby resets) the calling thread's counters and returns the
// share of L2-pending stall cycles attributed to LLC misses:
//     stalls_l2_pending * (7 * llc_miss) / (7 * llc_miss + l3_hit)
// rounded down. Returns 0 when the counters cannot be read or when the
// denominator is zero.
uint64_t sandybridge_read_stall_events_local() {
    long long values[MAX_NUM_EVENTS];

    if (pmc_events_read_local_thread(values) != PAPI_OK) {
        DBG_LOG(DEBUG, "read stall cycles failed\n");
        return 0;
    }

    // indices follow sandybridge_native_events[]
    uint64_t l2_pending_stalls = values[0];
    uint64_t llc_misses = values[1];
    uint64_t l3_hits = values[2];

    DBG_LOG(DEBUG, "read stall L2 cycles %lu, LLC miss %lu, L3 hit %lu\n",
            l2_pending_stalls, llc_misses,
            l3_hits);

    // LLC misses are weighted 7x against L3 hits
    uint64_t weighted_total = 7.0 * llc_misses + l3_hits;
    if (weighted_total == 0) {
        return 0;
    }

    double den = weighted_total;
    double num = 7.0 * llc_misses;

    return (uint64_t) floorl(l2_pending_stalls*num/den);
}

#endif /* __CPU_SANDYBRIDGE_H */


================================================
FILE: src/lib/cpu/sandybridge.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __CPU_SANDYBRIDGE_H
#define __CPU_SANDYBRIDGE_H

#include <math.h>
#include "thread.h"
#include "cpu/pmc.h"
#include "debug.h"

// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with
// applications to list all available performance events with their architecture-specific
// detailed descriptions and to translate them to their respective event codes. The showevtinfo
// application lists all available performance event names with a detailed description, and the
// check_events application translates a performance event to the corresponding event code.

// Hardware events known on Sandy Bridge. Each ACTION row is
// (event name, OS/perf name or NULL, encoding handed to set_counter()).
// The lists are (re)defined here so that the PMC_EVENTS(sandybridge, ...) line
// at the end of this header expands them.
#undef FOREACH_PMC_HW_EVENT
#define FOREACH_PMC_HW_EVENT(ACTION)                                                                       \
  ACTION("CYCLE_ACTIVITY:STALLS_L2_PENDING", NULL, 0x55305a3)                                              \
  ACTION("MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS", NULL, 0x5302d4)                                            \
  ACTION("MEM_LOAD_UOPS_RETIRED:L3_HIT", NULL, 0x5304d1)                                                   \
  ACTION("INSTRUCTION_RETIRED", NULL, 0x5300c0)               

// Logical events offered on Sandy Bridge: only ldm_stall_cycles.
#undef FOREACH_PMC_EVENT
#define FOREACH_PMC_EVENT(ACTION, prefix)                                                                  \
  ACTION(ldm_stall_cycles, prefix)


// Enable callback of ldm_stall_cycles: attaches the three hardware events the
// read callback needs, at positions 0, 1 and 2 of event->hw_events.
// Always returns E_SUCCESS.
// NOTE(review): ASSIGN_PMC_HW_EVENT_TO_ME releases the attached events on
// failure but does not return, so a failed assignment is not reported to the
// caller - confirm this is intended.
DECLARE_ENABLE_PMC(sandybridge, ldm_stall_cycles)
{
    ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS", 1);
    //ASSIGN_PMC_HW_EVENT_TO_ME("INSTRUCTION_RETIRED", 2);
    ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_RETIRED:L3_HIT", 2);

    return E_SUCCESS;
}

// Clear callback of ldm_stall_cycles: intentionally empty, nothing is cleared.
DECLARE_CLEAR_PMC(sandybridge, ldm_stall_cycles)
{
}

// Read callback of ldm_stall_cycles. Takes the per-CPU deltas of the three
// attached hardware events and returns the share of L2-pending stall cycles
// attributed to LLC misses:
//     stalls_l2_pending * (7 * llc_miss) / (7 * llc_miss + l3_hit)
// rounded down, or 0 when the denominator is zero.
DECLARE_READ_PMC(sandybridge, ldm_stall_cycles)
{
   // positions match the order used by the enable callback
   uint64_t l2_pending_stalls = READ_MY_HW_EVENT_DIFF(0);
   uint64_t llc_misses = READ_MY_HW_EVENT_DIFF(1);
   uint64_t l3_hits = READ_MY_HW_EVENT_DIFF(2);

   // LLC misses are weighted 7x against L3 hits
   uint64_t weighted_total = 7.0 * llc_misses + l3_hits;
   if (weighted_total == 0) {
      return 0;
   }

   double den = weighted_total;
   double num = 7.0 * llc_misses;

   return (uint64_t) floorl(l2_pending_stalls*num/den);
}


// Instantiate sandybridge_known_hw_event[], sandybridge_known_event[] and the
// sandybridge_pmc_events descriptor from the FOREACH lists above, declaring 4
// available hardware counters.
PMC_EVENTS(sandybridge, 4)
#endif /* __CPU_SANDYBRIDGE_H */


================================================
FILE: src/lib/cpu/xeon-ex.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include "dev.h"

#ifdef PAPI_SUPPORT
#include "sandybridge-papi.h"
#include "ivybridge-papi.h"
#include "haswell-papi.h"
#else
#include "sandybridge.h"
#include "ivybridge.h"
#include "haswell.h"
#endif

// Writes `val` to the throttle register selected by `throttle_type` on every
// memory channel listed in `regs`.
// Register offsets: 0x190 for THROTTLE_DDR_ACT (and any unrecognized type),
// 0x192 for THROTTLE_DDR_READ, 0x194 for THROTTLE_DDR_WRITE.
// Always returns 0; the results of the individual set_pci() calls are not
// checked.
int intel_xeon_ex_set_throttle_register(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t val)
{
    int reg_offset = 0x190; // THROTTLE_DDR_ACT and fallback
    int channel;

    if (throttle_type == THROTTLE_DDR_READ) {
        reg_offset = 0x192;
    } else if (throttle_type == THROTTLE_DDR_WRITE) {
        reg_offset = 0x194;
    }

    // write the selected register on all channels
    // (a separate "activate throttling first" write to 0x190 is not performed)
    for (channel = 0; channel < regs->channels; channel++) {
        set_pci(regs->addr[channel].bus_id,
                regs->addr[channel].dev_id,
                regs->addr[channel].funct,
                reg_offset,
                (uint16_t) val);
    }

    return 0;
}

// Reads the throttle register selected by `throttle_type` into *val.
// Only the first channel entry (regs->addr[0]) is read.
// Register offsets: 0x190 for THROTTLE_DDR_ACT (and any unrecognized type),
// 0x192 for THROTTLE_DDR_READ, 0x194 for THROTTLE_DDR_WRITE.
// Always returns 0; the result of get_pci() is not checked.
int intel_xeon_ex_get_throttle_register(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t* val)
{
    int reg_offset = 0x190; // THROTTLE_DDR_ACT and fallback

    if (throttle_type == THROTTLE_DDR_READ) {
        reg_offset = 0x192;
    } else if (throttle_type == THROTTLE_DDR_WRITE) {
        reg_offset = 0x194;
    }

    get_pci(regs->addr[0].bus_id, regs->addr[0].dev_id, regs->addr[0].funct, reg_offset, val);
    return 0;
}


// desc is fixed in cpu_model() if not Xeon

// CPU model for Sandy Bridge Xeon. With PAPI_SUPPORT the counters are described
// by the native event names plus the local stall reader (no remote reader);
// otherwise by the sandybridge_pmc_events tables.
cpu_model_t cpu_model_intel_xeon_ex = {
    .microarch = SandyBridgeXeon,
#ifdef PAPI_SUPPORT
    .pmc_events = {sandybridge_native_events, sandybridge_read_stall_events_local, NULL},
#else
    .pmc_events = PMC_EVENTS_PTR(sandybridge),
#endif
    .set_throttle_register = intel_xeon_ex_set_throttle_register,
    .get_throttle_register = intel_xeon_ex_get_throttle_register
};

// CPU model for Ivy Bridge Xeon. With PAPI_SUPPORT the counters are described
// by the native event names plus local and remote stall readers; otherwise by
// the ivybridge_pmc_events tables. Throttle register access is shared with the
// other Xeon models in this file.
cpu_model_t cpu_model_intel_xeon_ex_v2 = {
    .microarch = IvyBridgeXeon,
#ifdef PAPI_SUPPORT
    .pmc_events = {ivybridge_native_events, ivybridge_read_stall_events_local, ivybridge_read_stall_events_remote},
#else
    .pmc_events = PMC_EVENTS_PTR(ivybridge),
#endif
    .set_throttle_register = intel_xeon_ex_set_throttle_register,
    .get_throttle_register = intel_xeon_ex_get_throttle_register
};

// CPU model for Haswell Xeon. With PAPI_SUPPORT the counters are described by
// the native event names plus local and remote stall readers; otherwise by the
// haswell_pmc_events tables. Throttle register access is shared with the other
// Xeon models in this file.
cpu_model_t cpu_model_intel_xeon_ex_v3 = {
    .microarch = HaswellXeon,
#ifdef PAPI_SUPPORT
    .pmc_events = {haswell_native_events, haswell_read_stall_events_local, haswell_read_stall_events_remote},
#else
    .pmc_events = PMC_EVENTS_PTR(haswell),
#endif
    .set_throttle_register = intel_xeon_ex_set_throttle_register,
    .get_throttle_register = intel_xeon_ex_get_throttle_register
};


================================================
FILE: src/lib/debug.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include "debug.h"
#include <sys/types.h>
#include <execinfo.h>
#include <unistd.h>
#include <stdio.h>
#include "config.h"


// Per-module enable flags, indexed by the dbg_module_* enum; filled in by dbg_init().
int         dbg_modules[dbg_module_count];
// Current verbosity threshold; set by dbg_init() or dbg_set_level().
int         dbg_level = 0;
// Non-zero selects the long log prefix in DBG_LOG; read from "debug.verbose" by dbg_init().
int         dbg_verbose = 0;
// Tag printed at the start of every log line.
const char* dbg_identifier = "";
// Backing storage for dbg_identifier when dbg_init() derives it from the pid.
static char dbg_identifier_buf[128];

/*
 * Copy the NUL-terminated string `source` into `target`, writing `newc`
 * wherever `source` holds `oldc`. `target` must have room for the whole
 * string including the terminator. Always returns 0.
 */
static int
strrep(char *target, char *source, char oldc, char newc)
{
	char *out = target;
	char *in;

	for (in = source; *in; in++) {
		*out++ = (*in == oldc) ? newc : *in;
	}
	*out = '\0';
	return 0;
}


// Override the global debug level (dbg_level) at run time.
void
dbg_set_level(int level)
{
	dbg_level = level;
}


/**
 * Initialize the debugging subsystem.
 *
 * \param dbg_cfg    configuration used to look up "debug.level",
 *                   "debug.verbose" and the per-module "debug.module.<name>"
 *                   flags.
 * \param level      debug level to use; a negative value means "take it from
 *                   the configuration".
 * \param identifier tag printed in front of every log line; when NULL the
 *                   DEBUG_IDENTIFIER environment variable is used, and if
 *                   that is unset too the process id is used.
 * \return always 0.
 */
int
dbg_init(config_t* dbg_cfg, int level, const char* identifier)
{
	// if the user hasn't provided a debugging level then get it from the
	// configuration env/file
	if (level < 0) {
		__cconfig_lookup_int(dbg_cfg, "debug.level", &dbg_level);
	} else {
		dbg_level = level;
	}

	__cconfig_lookup_int(dbg_cfg, "debug.verbose", &dbg_verbose);

	// if the user hasn't provided an identifier then check whether the
	// environment provides one, otherwise create one based on the process' pid
	if (!identifier) {
		dbg_identifier = getenv("DEBUG_IDENTIFIER");
		if (!dbg_identifier) {
			// bounded write: never overruns dbg_identifier_buf
			snprintf(dbg_identifier_buf, sizeof(dbg_identifier_buf),
			         "%d", (int) getpid());
			dbg_identifier = dbg_identifier_buf;
		}
	} else {
		dbg_identifier = identifier;
	}


	// read per module debugging flags: the enum-style name
	// "debug_module_<name>" is turned into the config key "debug.module.<name>"
#define STR(name) #name
#define ACTION(name)                                                           \
	do {                                                                       \
		char dotstr[128];                                                      \
		strrep(dotstr, STR(debug_module_##name), '_', '.');                    \
		__cconfig_lookup_bool(dbg_cfg, dotstr,                                 \
		                      &dbg_modules[dbg_module_##name]);                \
	} while (0);

	FOREACH_DEBUG_MODULE(ACTION)
#undef ACTION
#undef STR /* helper is local to this function; don't leak it */
	DBG_LOG(DEBUG, ""); // prevent compiler warning
	return 0;
}


/**
 * Print the current call stack to stdout, one symbolized frame per line.
 *
 * At most 10 frames are captured. If the symbol table cannot be allocated,
 * only the frame count is printed and a note is written to stderr.
 */
void
dbg_backtrace (void)
{
	enum { max_frames = 10 };
	void *frames[max_frames];
	char **symbols;
	int count;
	int i;

	count = backtrace (frames, max_frames);
	symbols = backtrace_symbols (frames, count);

	printf ("Obtained %d stack frames.\n", count);

	// backtrace_symbols() mallocs its result and returns NULL on failure
	if (symbols == NULL) {
		fprintf (stderr, "backtrace_symbols failed\n");
		return;
	}

	for (i = 0; i < count; i++)
		printf ("%s\n", symbols[i]);
	free (symbols);
}


================================================
FILE: src/lib/debug.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __DEBUG_H
#define __DEBUG_H

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include "config.h"

/*
 * X-macro listing every debug module: ACTION is applied once per module name.
 * It is expanded below to build the dbg_module_* enum, and by dbg_init() to
 * look up one boolean configuration flag per module.
 */
#define FOREACH_DEBUG_MODULE(ACTION)                        \
	ACTION(all) /* special name that covers all modules */


/* Temporary expander: turns each module name into its dbg_module_<name> enumerator. */
#define ACTION(name)                                        \
	dbg_module_##name,

/*
 * One index per debug module, followed by dbg_module_count (the number of
 * modules), which sizes the dbg_modules[] flag array.
 */
enum {
	FOREACH_DEBUG_MODULE(ACTION)
	dbg_module_count
};
#undef ACTION

#ifndef NDEBUG
/* Map a bare severity name (e.g. ERROR) to its enumerator (DBG_ERROR). */
#define DBG_CODE(code) DBG_##code

/*
 * Message severities. Smaller values are more severe; a message is printed
 * when its value is non-zero and does not exceed the active threshold.
 */
enum dbg_code {
	DBG_OFF      = 0,
	DBG_CRITICAL = 1,
	DBG_ERROR    = 2,
	DBG_WARNING  = 3,
	DBG_INFO     = 4,
	DBG_DEBUG    = 5
};

/* Printable name of each severity, indexed by its enum dbg_code value. */
static const char* dbg_code2str[] = {
	"OFF", "CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"
};

/* Messages at this severity or worse are always printed and end the process (see DBG_LOG). */
static const int dbg_terminate_level = DBG_ERROR;
/* Messages at this severity or worse go to stderr; everything else goes to stdout. */
static const int dbg_stderr_level = DBG_WARNING;

/* Debug state defined in debug.c. */
extern int         dbg_modules[];
extern int         dbg_level;
extern int         dbg_verbose;
extern const char* dbg_identifier;

/* Index into dbg_modules[] for the module called `name`. */
#define DBG_MODULE(name) dbg_module_##name

/*
 * DBG_LOG(level, format, ...): printf-style log statement.
 *
 * `level` is a bare severity name (CRITICAL, ERROR, WARNING, INFO, DEBUG).
 * The message is printed when the severity is within dbg_level, or
 * unconditionally when it is within dbg_terminate_level; in the latter case
 * the process is terminated with exit(-1) after printing. Severities within
 * dbg_stderr_level go to stderr, the rest to stdout. When dbg_verbose is set
 * the prefix also carries a timestamp and the call site.
 *
 * The macro's temporaries use a dbg_..._ naming scheme so they cannot shadow
 * libc's ctime() or names used in the caller's format arguments. The trailing
 * ';' after while(0) is kept for backward compatibility with existing uses.
 */
#define DBG_LOG(level, format, ...)                                            \
  do {                                                                         \
    if (DBG_CODE(level) && (DBG_CODE(level) <= dbg_level ||                    \
                  DBG_CODE(level) <= dbg_terminate_level))                     \
    {                                                                          \
      FILE* dbg_out_ = stdout;                                                 \
      if (DBG_CODE(level) <= dbg_stderr_level) {                               \
        dbg_out_ = stderr;                                                     \
      }                                                                        \
      if (dbg_verbose) {                                                       \
        time_t dbg_now_ = time(NULL);                                          \
        fprintf(dbg_out_, "[%s] [%lu] %s in %s <%s,%d>: " format,              \
                dbg_identifier,                                                \
                (unsigned long) dbg_now_,                                      \
                dbg_code2str[DBG_CODE(level)],                                 \
                __FUNCTION__, __FILE__, __LINE__, ##__VA_ARGS__);              \
      } else {                                                                 \
        fprintf(dbg_out_, "[%s] %s: " format,                                  \
                dbg_identifier,                                                \
                dbg_code2str[DBG_CODE(level)],                                 \
                ##__VA_ARGS__);                                                \
      }                                                                        \
      if (DBG_CODE(level) <= dbg_terminate_level) {                            \
        exit(-1);                                                              \
      }                                                                        \
    }                                                                          \
  } while(0);


/*
 * DBG_LOG2(level, module, format, ...): per-module variant of DBG_LOG.
 *
 * `module` is an index into dbg_modules[]. The message is printed when the
 * severity passes the same level test as DBG_LOG and, in addition, either
 * that module's flag or the "all" flag is set; severities within
 * dbg_terminate_level bypass both tests. The prefix always includes the call
 * site, and terminate-level messages end the process with exit(-1).
 */
#define DBG_LOG2(level, module, format, ...)                                   \
  do {                                                                         \
    FILE* ferr = stdout;                                                       \
    if (DBG_CODE(level) &&                                                     \
	    (dbg_modules[module] || dbg_modules[dbg_module_all] ||                 \
		 DBG_CODE(level) <= dbg_terminate_level) &&                            \
	    (DBG_CODE(level) <= dbg_level ||                                       \
         DBG_CODE(level) <= dbg_terminate_level))                              \
    {                                                                          \
      if (DBG_CODE(level) <= dbg_stderr_level) {                               \
        ferr=stderr;                                                           \
      }                                                                        \
 	  fprintf(ferr, "[%s] %s in %s <%s,%d>: " format,                          \
              dbg_identifier,                                                  \
              dbg_code2str[DBG_CODE(level)],                                   \
              __FUNCTION__, __FILE__, __LINE__, ##__VA_ARGS__);                \
      if (DBG_CODE(level) <= dbg_terminate_level) {                            \
        exit(-1);                                                              \
      }	                                                                       \
    }			                                                               \
  } while(0);

#else /* NDEBUG */

/*
 * With NDEBUG defined both logging macros expand to nothing: no message is
 * printed and error-level messages no longer terminate the process.
 */
#define DBG_LOG(level, format, ...)
#define DBG_LOG2(level, module, format, ...)

#endif /* NDEBUG */


/*
 * VERIFY(condition): run-time check that stays active in every build.
 *
 * When `condition` is false the failed expression and its location are
 * written to stderr, stderr is flushed, and the failure is logged at
 * CRITICAL severity. With debugging compiled in, that log call terminates
 * the process; with NDEBUG defined DBG_LOG is empty and execution continues.
 *
 * DBG_LOG expects the bare severity name and a real format string: the
 * condition text is passed as an argument so a '%' inside it is harmless.
 * The trailing ';' after while (0) is kept for backward compatibility.
 */
#define VERIFY(condition)                                                      \
  do {                                                                         \
    if (!(condition)) {                                                        \
      fprintf(stderr, "Assumption \"%s\"\nFailed in file %s: at line:%i\n",    \
              #condition,__FILE__,__LINE__);                                   \
      fflush(stderr);                                                          \
      DBG_LOG(CRITICAL, "%s\n", #condition);                                   \
    }                                                                          \
  } while (0);


/* Set up level, verbosity, identifier and per-module flags; a negative level or NULL identifier selects the configured/default value. */
int dbg_init(config_t* dbg_cfg, int level, const char* identifier);
/* Print the current call stack to stdout. */
void dbg_backtrace (void);
/* Override the active debug level. */
void dbg_set_level(int level);

#endif // __DEBUG_H


================================================
FILE: src/lib/dev.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include "dev/ioctl_query.h"
#include "error.h"
#include "dev.h"

// Device node that every ioctl helper in this file opens.
// TODO: get this value from the config file
#define DEV_PATH "/dev/nvmemul"

/**
 * Ask the emulator device driver to bind an event to a counter.
 *
 * Opens DEV_PATH, issues IOCTL_SETCOUNTER with the given ids and closes the
 * device again.
 *
 * \return E_SUCCESS on success, E_ERROR if the device cannot be opened or
 *         the ioctl fails.
 */
int set_counter(unsigned int counter_id, unsigned int event_id)
{
    ioctl_query_setcounter_t request;
    int outcome = E_SUCCESS;
    int dev_fd;

    dev_fd = open(DEV_PATH, O_RDONLY);
    if (dev_fd < 0) {
        DBG_LOG(ERROR, "Can't open %s - Is the NVM emulator device driver installed?\n", DEV_PATH);
        return E_ERROR;
    }

    request.counter_id = counter_id;
    request.event_id = event_id;

    if (ioctl(dev_fd, IOCTL_SETCOUNTER, &request) < 0) {
        outcome = E_ERROR;
    }

    // the descriptor is released on both the success and the failure path
    close(dev_fd);
    return outcome;
}


/**
 * Write a 16-bit value through the emulator device driver to the PCI
 * location identified by bus/device/function and offset.
 *
 * Opens DEV_PATH, issues IOCTL_SETPCI and closes the device again.
 *
 * \return E_SUCCESS on success, E_ERROR if the device cannot be opened or
 *         the ioctl fails.
 */
int set_pci(unsigned int bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t val)
{
    ioctl_query_setgetpci_t request;
    int outcome = E_SUCCESS;
    int dev_fd;

    dev_fd = open(DEV_PATH, O_RDONLY);
    if (dev_fd < 0) {
        DBG_LOG(ERROR, "Can't open %s - Is the NVM emulator device driver installed?\n", DEV_PATH);
        return E_ERROR;
    }

    request.bus_id = bus_id;
    request.device_id = device_id;
    request.function_id = function_id;
    request.offset = offset;
    request.val = val;

    if (ioctl(dev_fd, IOCTL_SETPCI, &request) < 0) {
        outcome = E_ERROR;
    }

    // the descriptor is released on both the success and the failure path
    close(dev_fd);
    return outcome;
}

/**
 * Read a 16-bit value through the emulator device driver from the PCI
 * location identified by bus/device/function and offset.
 *
 * Opens DEV_PATH read-write, issues IOCTL_GETPCI and closes the device
 * again. `*val` is written only when the ioctl succeeds.
 *
 * \return E_SUCCESS on success, E_ERROR if the device cannot be opened or
 *         the ioctl fails.
 */
int get_pci(unsigned int bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t* val)
{
    ioctl_query_setgetpci_t request;
    int outcome = E_ERROR;
    int dev_fd;

    dev_fd = open(DEV_PATH, O_RDWR);
    if (dev_fd < 0) {
        DBG_LOG(ERROR, "Can't open %s - Is the NVM emulator device driver installed?\n", DEV_PATH);
        return E_ERROR;
    }

    request.bus_id = bus_id;
    request.device_id = device_id;
    request.function_id = function_id;
    request.offset = offset;
    request.val = 0;

    if (ioctl(dev_fd, IOCTL_GETPCI, &request) >= 0) {
        *val = request.val;
        outcome = E_SUCCESS;
    }

    // the descriptor is released on both the success and the failure path
    close(dev_fd);
    return outcome;
}


================================================
FILE: src/lib/dev.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __DEVICE_DRIVER_API_H
#define __DEVICE_DRIVER_API_H

#include <stdint.h>

#define MAX_NUM_MC_PCI_BUS 16
#define MAX_NUM_MC_CHANNELS 16

/* Address of one PCI function: bus, device and function number. */
typedef struct {
    unsigned int bus_id;
    unsigned int dev_id;
    unsigned int funct;
} pci_addr;

/*
 * Table of PCI addresses with room for MAX_NUM_MC_CHANNELS entries.
 * `channels` presumably holds the number of entries of addr[] in use
 * (verify against the code that fills this structure).
 */
typedef struct {
    pci_addr addr[MAX_NUM_MC_CHANNELS];
    unsigned int channels;
} pci_regs_t;

/*
 * ioctl helpers for the emulator device driver; each returns E_SUCCESS or
 * E_ERROR.
 */
/* Bind event `event_id` to counter `counter_id`. */
int set_counter(unsigned int counter_id, unsigned int event_id);
/* Write `val` to the given bus/device/function at `offset`. */
int set_pci(unsigned bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t val);
/* Read the value at the given bus/device/function and `offset` into `*val`. */
int get_pci(unsigned bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t* val);

#endif /* __DEVICE_DRIVER_API_H */


================================================
FILE: src/lib/errno.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __ERRNO_H
#define __ERRNO_H

#ifdef __DEFINE_ERRNO
# error "__DEFINE_ERRNO previously defined"
#endif

/*
 * Define error codes and error messages here.
 *
 * This is an X-macro: ACTION is invoked once per (code, message) pair, in
 * order. The order fixes the numeric value of each code in the enum below,
 * so new entries should be appended rather than inserted.
 */
#define __DEFINE_ERRNO(ACTION)                                               \
	ACTION(E_SUCCESS, "Success")                                             \
	ACTION(E_ERROR, "Generic error")                                         \
	ACTION(E_NOMEM, "No memory")                                             \
    ACTION(E_EXIST, "Name already exists")                                   \
    ACTION(E_NOENT, "Name does not exist")                                   \
    ACTION(E_INVAL, "Invalid argument")                                      \
    ACTION(E_BUSY, "Resource busy")                                          \
    ACTION(E_NOTEMPTY, "Not empty")                                          \
    ACTION(E_ERRNO, "Standard C library error; check errno for details")


#ifdef __ENUM_MEMBER
# error "__ENUM_MEMBER previously defined"
#endif

/* Expands one (name, message) entry of __DEFINE_ERRNO to its enumerator. */
#define __ENUM_MEMBER(name, str)  name,

/* Error codes, numbered from 0 (E_SUCCESS) in list order; E_MAXERRNO is their count. */
enum {
	__DEFINE_ERRNO(__ENUM_MEMBER)
	E_MAXERRNO
};

#undef __ENUM_MEMBER /* don't pollute the macro namespace */

#ifdef __ERRNO_STRING
# error "__ERRNO_STRING previously defined"
#endif

/* Expands one (name, message) entry of __DEFINE_ERRNO to its message string. */
#define __ERRNO_STRING(name, str) str,

/*
    TODO: not used for now
static const char* 
ErrorToString(int err) {
	static const char* errstr[] = {
		__DEFINE_ERRNO(__ERRNO_STRING)
		"Unknown error code"
	};
	if (err >= 0 && err < E_MAXERRNO) {
		return errstr[err];
	}
	return errstr[E_MAXERRNO];
}
*/
#undef __ERRNO_STRING /* don't pollute the macro namespace */
#undef __DEFINE_ERRNO /* don't pollute the macro namespace */

#endif /* __ERRNO_H */


================================================
FILE: src/lib/error.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __ERROR_H
#define __ERROR_H

#include "errno.h"
#include "debug.h"

#endif /* __ERROR_H */


================================================
FILE: src/lib/init.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <errno.h>
#include "cpu/cpu.h"
#include "config.h"
#include "error.h"
#include "model.h"
#include "measure.h"
#include "thread.h"
#include "topology.h"
#include "interpose.h"
#include "monotonic_timer.h"
#include "pflush.h"
#include "stat.h"

// Library entry/exit hooks: run automatically when the shared object is
// loaded and unloaded.
static void init() __attribute__((constructor));
static void finalize() __attribute__((destructor));

// Defined elsewhere in the library.
int set_process_local_rank();
int unset_process_local_rank();
int partition_cpus(virtual_topology_t* virtual_topology);

// Topology created by init(); stays NULL if initialization fails before
// init_virtual_topology() is reached.
static virtual_topology_t* virtual_topology = NULL;

/**
 * Library destructor: undo what init() set up.
 *
 * Unregisters the calling thread from the latency model, resets the
 * bandwidth throttle register of every virtual node's NVRAM node, reports
 * statistics (USE_STATISTICS), shuts PAPI down (PAPI_SUPPORT) and clears the
 * process-local rank.
 *
 * init() can fail after "bandwidth.enable" has been read but before the
 * virtual topology exists; the topology is therefore checked for NULL
 * before it is walked.
 */
void finalize() {
    int i;
    if (latency_model.enabled) {
        unregister_self();
    }

    if (read_bw_model.enabled && virtual_topology != NULL) {
        for (i=0; i < virtual_topology->num_virtual_nodes; i++) {
            // FIXME: currently we keep a single bandwidth model and not per-node BW model
            physical_node_t* phys_node = virtual_topology->virtual_nodes[i].nvram_node;
            pci_regs_t *regs = phys_node->mc_pci_regs;

            // reset throttling
            phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8FFF);
        }
    }
#ifdef USE_STATISTICS
    stats_report();
#endif
    // finalize libraries and release resources
#ifdef PAPI_SUPPORT
    pmc_shutdown();
#endif

    unset_process_local_rank();

    //__cconfig_destroy(&cfg);
}

/**
 * Library constructor: bring up the emulator inside the host process.
 *
 * Steps, in order: load "nvmemul.ini", read the latency/bandwidth enable
 * flags, initialize debugging and interposition, detect the CPU model, build
 * the virtual topology and the bandwidth model, and - when the latency model
 * is enabled - the latency model, thread manager, CPU partitioning,
 * registration of the main thread, optional calibration and the pflush
 * write-latency emulation.
 *
 * LD_PRELOAD is cleared for the duration of the initialization and restored
 * on every exit path. On failure a message is printed to stderr and the
 * function returns normally. Note that when debugging is compiled in,
 * DBG_LOG(ERROR, ...) itself terminates the process.
 */
void init()
{
    config_t cfg;
    cpu_model_t* cpu;
    char* ld_preload_path;
    double start_time, end_time;
#ifdef CALIBRATION_SUPPORT
    int i;
#endif

    // FIXME: do we need to register the main thread with our system?
    // YES: for sure for single-threaded apps

    start_time = monotonic_time_us();

    // we reset LD_PRELOAD to ensure we don't get into recursive preloads when 
    // calling popen during initialization. before exiting we reactivate LD_PRELOAD 
    // to allow LD_PRELOADS on children
    ld_preload_path = getenv("LD_PRELOAD");
    unsetenv("LD_PRELOAD");

    if (__cconfig_init(&cfg, "nvmemul.ini") == CONFIG_FALSE) {
        goto error;
    }

    __cconfig_lookup_bool(&cfg, "latency.enable", &latency_model.enabled);
    __cconfig_lookup_bool(&cfg, "bandwidth.enable", &read_bw_model.enabled);

    if (dbg_init(&cfg, -1, NULL) != E_SUCCESS) {
        goto error;
    }

    if (init_interposition() != E_SUCCESS) {
        goto error;
    }

    if ((cpu = cpu_model()) == NULL) {
        DBG_LOG(ERROR, "No supported processor found\n");
        goto error;
    }

    init_virtual_topology(&cfg, cpu, &virtual_topology);

    if (init_bandwidth_model(&cfg, virtual_topology) != E_SUCCESS) {
        goto error;
    }

    if (latency_model.enabled) {
        if (init_latency_model(&cfg, cpu, virtual_topology) != E_SUCCESS) {
            goto error;
        }

        init_thread_manager(&cfg, virtual_topology);

#ifdef USE_STATISTICS
        // statistics makes use of the thread manager and is used by the register_self()
        stats_enable(&cfg);
#endif

        set_process_local_rank();

        // thread manager must be initialized and local rank set
        // CPU partitioning must be made before the first thread is registered
        if (partition_cpus(virtual_topology) != E_SUCCESS) {
            goto error;
        }

        if (register_self() != E_SUCCESS) {
            goto error;
        }

#ifdef CALIBRATION_SUPPORT
        // main thread is now tracked by the latency emulator
        // first, calibrate the latency emulation
        if (latency_model.calibration) {
            for (i = 0; i < virtual_topology->num_virtual_nodes; ++i) {
                latency_calibration(&virtual_topology->virtual_nodes[i]);
            }
        }
#endif
        // default to 0 so that a missing "latency.write" key cannot leave
        // an indeterminate value to be passed to init_pflush()
        int write_latency = 0;
        __cconfig_lookup_int(&cfg, "latency.write", &write_latency);
        init_pflush(cpu_speed_mhz(), write_latency);
    }

    end_time = monotonic_time_us();

#ifdef USE_STATISTICS
    if (latency_model.enabled) {
        stats_set_init_time(end_time - start_time);
    }
#endif

    if (ld_preload_path)
        setenv("LD_PRELOAD", ld_preload_path, 1);

    return;

error:
    /* Cannot initialize library -- catastrophic error */
    if (ld_preload_path)
        setenv("LD_PRELOAD", ld_preload_path, 1);

    fprintf(stderr, "ERROR: nvmemul: Initialization failed. Running without non-volatile memory emulation.\n");
}


================================================
FILE: src/lib/interpose.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#define _GNU_SOURCE
#include <stdio.h>
#include <dlfcn.h>
#include <pthread.h>
#include <assert.h>
#include <signal.h>
#include "error.h"
#include "model.h"
#include "thread.h"
#include "cpu/cpu.h"
#ifdef PAPI_SUPPORT
#include "cpu/pmc-papi.h"
#else
#include "cpu/pmc.h"
#endif


// WARNING: Our library MUST directly use the functions we interpose on by 
// calling __lib_X to avoid interposition on ourselves.


// Pointers to the next definitions of the interposed pthread functions in
// the symbol lookup order. They are NULL until init_interposition() has
// resolved them with dlsym(RTLD_NEXT, ...).
int (*__lib_pthread_create)(pthread_t *thread, const pthread_attr_t *attr,
                              void *(*start_routine) (void *), void *arg);
int (*__lib_pthread_mutex_lock)(pthread_mutex_t *mutex);
int (*__lib_pthread_mutex_trylock)(pthread_mutex_t *mutex);
int (*__lib_pthread_mutex_unlock)(pthread_mutex_t *mutex);
int (*__lib_pthread_detach)(pthread_t thread);

// NOTE(review): these look like the C99 idiom for emitting the out-of-line
// copy of functions defined `inline` in a header - presumably the bodies
// live in one of the included headers; confirm before changing.
extern inline hrtime_t hrtime_cycles(void);
extern inline int cycles_to_us(cpu_model_t* cpu, hrtime_t cycles);


int init_interposition()
{
	char *error;
    // if no symbol is returned then no interposition needed
    __lib_pthread_create = dlsym(RTLD_NEXT, "pthread_create");
    __lib_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");
    __lib_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock");
    __lib_pthread_mutex_unlock = dlsym(RTLD_NEXT, "pthread_mutex_unlock");
    __lib_pthread_detach = dlsym(RTLD_NEXT, "pthread_detach");

    if (__lib_pthread_mutex_lock == NULL || __lib_pthread_mutex_unlock == NULL ||
    	    __lib_pthread_create == NULL || __lib_pthread_mutex_trylock == NULL ||
    	    __lib_pthread_detach == NULL) {
    	error = dlerror();
    	DBG_LOG(ERROR, "Interposition failed: %s\n", error != NULL ? error : "unknown reason");
    	return E_ERROR;
    }

    return E_SUCCESS;
}


// Interposing on pthread_create requires interposing on the thread created as we 
// require the TID of that thread which we can only get by executing the gettid() 
// system call from that thread. So we interpose on the start_routine which is
// called by the new thread.
//
// The functor carries the caller's original start routine and argument to
// __interposed_start_routine(). It is heap-allocated by pthread_create() and
// freed by the new thread.
typedef struct {
    void *(*start_routine) (void *);
    void *arg;
} pthread_create_functor_t;

/**
 * Trampoline executed as the start routine of every tracked thread.
 *
 * Registers the new thread with the emulator, runs the caller's original
 * start routine, unregisters the thread and releases the functor.
 *
 * \param args heap-allocated pthread_create_functor_t; always freed here.
 * \return the original start routine's result, or NULL if registration
 *         failed (in which case the original routine is not run).
 */
void* __interposed_start_routine(void* args)
{
    pthread_create_functor_t* functor = (pthread_create_functor_t*) args;
    void* result = NULL;

    if (register_self() == E_SUCCESS) {
        result = functor->start_routine(functor->arg);
        // FIXME: directly calling unregister may miss cases where the
        // thread terminates prematurely (such as pthread_exit or cancel)
        // consider using a key destructor function instead
        unregister_self();
    }

    free(args);
    return result;
}

int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                   void *(*start_routine) (void *), void *arg)
{
    int ret;

    //DBG_LOG(DEBUG, "interposing pthread_create\n");

    //assert(__lib_pthread_create);
    if (__lib_pthread_create == NULL)
        init_interposition();

    if (latency_model.enabled) {
        pthread_create_functor_t *functor = malloc(sizeof(pthread_create_functor_t));
        functor->arg = arg;
        functor->start_routine = start_routine;

        if ((ret = __lib_pthread_create(thread, attr, __interposed_start_routine, (void*) functor)) != 0) {
            DBG_LOG(ERROR, "call to __lib_pthread_create failed\n");
            return ret;
        }
    } else {
        ret = __lib_pthread_create(thread, attr, start_routine, arg);
    }

    return ret;    
}

/**
 * Interposed pthread_mutex_lock.
 *
 * When the latency model is enabled and the calling thread's current epoch
 * has lasted at least the minimum duration, a new epoch is opened before the
 * lock is taken, so that only the critical-section delay is propagated to
 * other threads. The call is then forwarded to the real function.
 */
int pthread_mutex_lock(pthread_mutex_t *mutex)
{
    // the thread monitor keeps trying to create new epochs on its own unless
    // the minimum duration has not been reached
    if (latency_model.enabled && reached_min_epoch_duration(thread_self())) {
        create_latency_epoch();
    }

    // lazily resolve the real function if we are called before init()
    if (!__lib_pthread_mutex_lock) {
        init_interposition();
    }

    return __lib_pthread_mutex_lock(mutex);
}

/**
 * Interposed pthread_mutex_trylock.
 *
 * When the latency model is enabled and the calling thread's current epoch
 * has lasted at least the minimum duration, a new epoch is opened first.
 * The call is then forwarded to the real function.
 */
int pthread_mutex_trylock(pthread_mutex_t *mutex)
{
    if (latency_model.enabled && reached_min_epoch_duration(thread_self())) {
        create_latency_epoch();
    }

    // lazily resolve the real function if we are called before init()
    if (!__lib_pthread_mutex_trylock) {
        init_interposition();
    }

    return __lib_pthread_mutex_trylock(mutex);
}

/**
 * Interposed pthread_mutex_unlock.
 *
 * When the latency model is enabled and the calling thread's current epoch
 * has lasted at least the minimum duration, a new epoch is opened first.
 * The call is then forwarded to the real function.
 */
int pthread_mutex_unlock(pthread_mutex_t *mutex)
{
    if (latency_model.enabled && reached_min_epoch_duration(thread_self())) {
        create_latency_epoch();
    }

    // lazily resolve the real function if we are called before init()
    if (!__lib_pthread_mutex_unlock) {
        init_interposition();
    }

    return __lib_pthread_mutex_unlock(mutex);
}


================================================
FILE: src/lib/interpose.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __INTERPOSE_H
#define __INTERPOSE_H


/**
 * 
 * \page library_interposition Library interposition 
 * 
 * The emulator intercepts several events of interest. It achieves this
 * by interposing on corresponding functions. 
 * Currently this includes thread creation and POSIX synchronization mechanisms.
 */

/*
 * Pointers to the real (non-interposed) pthread functions. Library code must
 * call these instead of the pthread_* names to avoid interposing on itself.
 * NOTE(review): this header uses pthread types without including
 * <pthread.h>, so it relies on the includer having done so.
 */
extern int (*__lib_pthread_create)(pthread_t *thread, const pthread_attr_t *attr,
                                   void *(*start_routine) (void *), void *arg);
extern int (*__lib_pthread_mutex_lock)(pthread_mutex_t *mutex);
extern int (*__lib_pthread_mutex_trylock)(pthread_mutex_t *mutex);
extern int (*__lib_pthread_mutex_unlock)(pthread_mutex_t *mutex);
extern int (*__lib_pthread_detach)(pthread_t thread);

/* Resolve the pointers above; returns E_SUCCESS or E_ERROR. */
int init_interposition();

#endif /* __INTERPOSE_H */


================================================
FILE: src/lib/measure.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __MEASURE_H
#define __MEASURE_H

/**
 * \file 
 * 
 * Memory latency and bandwidth measurements
 */

/**
 * \brief Measure memory read bandwidth
 *
 * Measures memory read bandwidth from a local socket (cpu_node) 
 * to the memory of a remote socket (mem_node). 
 *
 * measure_bw.c contains two implementations selected at compile time:
 * when SSE4_VERSION is defined, a set of threads issues SSE loads over a
 * buffer allocated on mem_node; otherwise (the default) an OpenMP-parallel
 * STREAM-style Copy kernel is timed.
 *
 * \param cpu_node NUMA node whose CPUs run the measurement
 * \param mem_node NUMA node on which the measured memory is allocated
 * \return measured bandwidth in megabytes per second
 */
double measure_read_bw(int cpu_node, int mem_node);

/**
 * \brief Measure memory write bandwidth
 *
 * Measures memory write bandwidth from a local socket (cpu_node) 
 * to the memory of a remote socket (mem_node).
 * See measure_read_bw for how this is done.
 *
 * NOTE(review): in measure_bw.c only the SSE4_VERSION build defines this
 * function; the default (OpenMP Copy kernel) build defines
 * measure_read_bw alone.
 *
 * \param cpu_node NUMA node whose CPUs run the measurement
 * \param mem_node NUMA node on which the measured memory is allocated
 * \return measured bandwidth in megabytes per second
 */
double measure_write_bw(int cpu_node, int mem_node);


/** 
 * \brief Measure memory latency 
 * 
 * Measures memory read latency from one local socket to the memory of a 
 * remote socket. It does this using a pointer chasing microbenchmark.
 * The microbenchmark sets up an array where each element determines the
 * element to be read next. The array is sized as a multiple of the
 * last-level cache size reported by \p cpu so that accesses miss the cache.
 *
 * \param cpu          CPU model; its llc_size_bytes determines the array size
 * \param from_node_id NUMA node on which the measuring thread runs
 * \param to_node_id   NUMA node on which the chased array is allocated
 * \return average time per dependent access, in nanoseconds
 */ 
int measure_latency(cpu_model_t* cpu, int from_node_id, int to_node_id);

/**
 * \brief Calibrate memory latency
 *
 * Automatically tweaks the memory latency based on the detected hardware latency
 * on the target systems.
 *
 * NOTE(review): the definition in measure_lat.c is compiled only when
 * CALIBRATION_SUPPORT is defined and takes a `virtual_node_t *` argument;
 * this old-style prototype does not declare that parameter.
 */
void latency_calibration();

#endif


================================================
FILE: src/lib/measure_bw.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/


// Two bandwidth-measuring implementations live in this file: one based on SSE4
// instructions (built when SSE4_VERSION is defined) and one based on the
// STREAM benchmark Copy kernel (the default).


//#define SSE4_VERSION

#ifdef SSE4_VERSION

#include <math.h>
#include <assert.h>
#include <stdint.h>
#include <pthread.h>
#include <string.h>
#include <numa.h>
#include "monotonic_timer.h"
#include "interpose.h"


#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#define BYTES_PER_GB (1024*1024*1024LL)
#define BYTES_PER_MB (1024*1024LL)

// flag for terminating current test: timeitp() sets it to 1 after the last
// sample; workers check it right after the first barrier of every round
int g_done;

// global current number of threads (set by timeitp())
int g_nthreads = 0;

// synchronization barrier shared by the master thread and all workers;
// initialized and destroyed by timeitp()
pthread_barrier_t g_barrier;

// thread shared parameters for test function, all published by timeitp()
// before the worker threads are created
void* g_array;                    // base of the buffer under test
size_t g_thrsize;                 // bytes handled by each thread (size / nthreads)
int g_times;                      // kernel invocations per sample
void (*g_func)(void*, size_t);    // kernel under test

// Convert "bytes moved in secs seconds" into a rate in MB/s, where one MB
// is BYTES_PER_MB bytes.
static inline double to_bw(size_t bytes, double secs) {
  const double megabytes = ((double) bytes) / ((double) BYTES_PER_MB);
  return megabytes / secs;
}

/*
 * Body of a benchmark worker thread.
 *
 * `arg` carries the worker's index (smuggled through the pointer value).
 * Each round the worker waits on the shared barrier, leaves if the master
 * has raised g_done, otherwise runs the kernel g_times times over its own
 * g_thrsize-byte slice of g_array and waits on the barrier again so the
 * master can stop the clock.
 */
void* thread_worker(void* arg)
{
    const unsigned int slice = (uintptr_t) arg;

    for (;;) {
        int rep;

        // round start: released together with the master thread
        pthread_barrier_wait(&g_barrier);

        if (g_done) {
            return NULL;
        }

        for (rep = 0; rep < g_times; rep++) {
            g_func(&((char*) g_array)[g_thrsize * slice], g_thrsize);
        }

        // round end: lets the master take the closing timestamp
        pthread_barrier_wait(&g_barrier);
    }
}


/*
 * Time `function` running concurrently on `nthreads` threads.
 *
 * The buffer `array` of `size` bytes is split into nthreads equal slices.
 * The calling thread acts as thread 0; threads 1..nthreads-1 are created
 * through __lib_pthread_create and run thread_worker(). Each of the
 * `samples` rounds is bracketed by two barrier waits, and within a round
 * every thread invokes the kernel `times` times on its slice. The shortest
 * round is used to compute the bandwidth.
 *
 * Returns to_bw(size * times, min) converted to int, i.e. the MB/s figure
 * with its fractional part dropped by the int return type.
 */
int timeitp(void (*function)(void*, size_t), int nthreads, void* array, size_t size, int samples, int times) {
    double min = INFINITY;
    double runtime;
    size_t i, j, p;
    int thread_num;

    // globally set test function and thread number
    // (must happen before the workers are created, since they read these)
    g_func = function;
    g_nthreads = nthreads;
    g_array = array;
    g_thrsize = size / nthreads;
    g_times = times;

    // create barrier and run threads
    pthread_barrier_init(&g_barrier, NULL, nthreads);

    // thr[0] is never used: slot 0 stands for the calling (master) thread
    pthread_t thr[nthreads];
    //__lib_pthread_create(&thr[0], NULL, thread_master, new int(0));
    for (p = 1; p < nthreads; ++p) {
    	assert(__lib_pthread_create);
        __lib_pthread_create(&thr[p], NULL, thread_worker, (void *) p);
    }

    // use current thread as master thread;
    g_done = 0;
    thread_num = 0;
    for (i = 0; i < samples; i++) 
    {
        // round start: all threads leave this barrier together
        pthread_barrier_wait(&g_barrier);

        assert(!g_done);

        double ts1 = monotonic_time();

        for (j = 0; j < times; j++) {
            g_func(&((char*)g_array)[g_thrsize * thread_num], g_thrsize);
        }

        // round end: returns only once every worker finished its slice
        pthread_barrier_wait(&g_barrier);
        double ts2 = monotonic_time();

        runtime = ts2 - ts1;
        if (runtime < min) {
            min = runtime;
        }
    }
    g_done = 1;

    // release the workers one last time; they observe g_done and exit
    pthread_barrier_wait(&g_barrier);

    for (p = 1; p < nthreads; ++p) {
        pthread_join(thr[p], NULL);
    }

    pthread_barrier_destroy(&g_barrier);

    return to_bw(size * times, min);
}


/*
 * Single-threaded counterpart of timeitp(): touches the whole buffer once,
 * then runs `samples` rounds of `times` kernel invocations each and reports
 * the bandwidth of the fastest round (to_bw() result converted to int).
 */
int timeit(void (*function)(void*, size_t), void* array, size_t size, int samples, int times) {
    double best = INFINITY;
    size_t sample;

    // writing every byte forces the physical pages to be allocated
    memset(array, 0xff, size);

    for (sample = 0; sample < samples; sample++) {
        int rep;
        double start, stop, elapsed;

        start = monotonic_time();
        for (rep = 0; rep < times; rep++) {
            function(array, size);
        }
        stop = monotonic_time();

        elapsed = stop - start;
        if (elapsed < best) {
            best = elapsed;
        }
    }

    return to_bw(size * times, best);
}


#ifdef __SSE4_1__
/*
 * Write kernel using non-temporal (streaming) 128-bit stores.
 * Fills `array` (size bytes, processed in 16-byte units; any tail smaller
 * than 16 bytes is left untouched) with a value that changes on every
 * store. `array` is accessed as __m128i, so it must be 16-byte aligned.
 */
void write_memory_nontemporal_sse(void* array, size_t size) {
  __m128i* varray = (__m128i*) array;

  __m128i vals = _mm_set1_epi32(1);
  size_t i;
  for (i = 0; i < size / sizeof(__m128i); i++) {
    _mm_stream_si128(&varray[i], vals);
    // vary the stored value between iterations
    vals = _mm_add_epi16(vals, vals);
  }
}

/*
 * Write kernel using regular aligned 128-bit stores (_mm_store_si128).
 * Same access pattern as write_memory_nontemporal_sse, but the stores are
 * ordinary ones instead of streaming ones. `array` must be 16-byte aligned.
 */
void write_memory_sse(void* array, size_t size) {
  __m128i* varray = (__m128i*) array;

  __m128i vals = _mm_set1_epi32(1);
  size_t i;
  for (i = 0; i < size / sizeof(__m128i); i++) {
    _mm_store_si128(&varray[i], vals);
    // vary the stored value between iterations
    vals = _mm_add_epi16(vals, vals);
  }
}

/*
 * Read kernel: loads `array` in 16-byte units and folds every unit into an
 * accumulator with 16-bit lane additions, so that each load has a consumer.
 * `array` must be 16-byte aligned.
 */
void read_memory_sse(void* array, size_t size) {
  __m128i* varray = (__m128i*) array;
  __m128i accum = _mm_set1_epi32(0xDEADBEEF);
  size_t i;
  for (i = 0; i < size / sizeof(__m128i); i++) {
    accum = _mm_add_epi16(varray[i], accum);
  }

  // This is unlikely, and we want to make sure the reads are not optimized
  // away.
  // (_mm_testz_si128(accum, accum) is 1 only when accum is all zeros.)
  // NOTE(review): with NDEBUG the assert, and with it the only use of
  // accum, is compiled out.
  assert(!_mm_testz_si128(accum, accum));
}
#else
# error "No compiler support for SSE instructions"
#endif

//static char array[1024*1024*1024];

/*
 * SSE4 build: read bandwidth between cpu_node (where the threads run) and
 * mem_node (where a 1 GiB buffer is placed), measured with 16 threads
 * running read_memory_sse; best of 5 samples, one pass per sample.
 */
double measure_read_bw(int cpu_node, int mem_node)
{
    const size_t buf_bytes = 1024*1024*1024;
    const int worker_count = 16;
    double result;
    char* buf;

    buf = numa_alloc_onnode(buf_bytes, mem_node);
    assert(buf);
    numa_run_on_node(cpu_node);
    // touch every byte so the physical pages exist before timing starts
    memset(buf, 0xff, buf_bytes);
    result = timeitp(read_memory_sse, worker_count, buf, buf_bytes, 5, 1);
    numa_free(buf, buf_bytes);
    return result;
}

/*
 * SSE4 build: write bandwidth between cpu_node (where the threads run) and
 * mem_node (where a 1 GiB buffer is placed), measured with 16 threads
 * running write_memory_nontemporal_sse; best of 5 samples, one pass per
 * sample.
 */
double measure_write_bw(int cpu_node, int mem_node)
{
    const size_t buf_bytes = 1024*1024*1024;
    const int worker_count = 16;
    double result;
    char* buf;

    buf = numa_alloc_onnode(buf_bytes, mem_node);
    assert(buf);
    numa_run_on_node(cpu_node);
    // touch every byte so the physical pages exist before timing starts
    memset(buf, 0xff, buf_bytes);
    result = timeitp(write_memory_nontemporal_sse, worker_count, buf, buf_bytes, 5, 1);
    numa_free(buf, buf_bytes);
    return result;
}

#else // SSE4_VERSION


#include <stdio.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <sys/time.h>
#include <numa.h>
#include <numaif.h>
#include <omp.h>
#include "monotonic_timer.h"
#include "debug.h"


# define N	20000000
# define NTIMES	10
# define OFFSET	0

# define HLINE "-------------------------------------------------------------\n"

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif


/* Best (smallest) time seen per kernel; only slot 0 is used by
 * measure_read_bw(), which resets it on every call. */
static double	mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

/* Bytes moved by one pass of each kernel over N doubles. Slot 0 (the Copy
 * kernel: one read plus one write per element) is the only one used here.
 * NOTE(review): the four slots presumably mirror STREAM's Copy/Scale/Add/
 * Triad kernels -- only Copy is implemented in this file. */
static double	bytes[4] = {
    2 * sizeof(double) * N,
    2 * sizeof(double) * N,
    3 * sizeof(double) * N,
    3 * sizeof(double) * N
    };

//extern double mysecond();

double measure_read_bw(int cpu_node, int mem_node)
    {
    register int	j, k;
    double		t, times[4][NTIMES];
    double *a, *c;
    //struct bitmask* membind;

    /* --- SETUP --- determine precision and check timing --- */

    //membind = numa_allocate_nodemask();
    //numa_bitmask_setbit(membind, mem_node);
    //numa_bind(membind);
    //numa_free_nodemask(membind);
    numa_run_on_node(cpu_node);

    omp_set_num_threads(10);

    // allocate memory dynamically to make sure the data is stored on the expected NUMA node
    a = (double *)numa_alloc_onnode( (N+OFFSET) * sizeof(double), mem_node);
    c = (double *)numa_alloc_onnode( (N+OFFSET) * sizeof(double), mem_node);

    DBG_LOG(DEBUG, "Measuring read BW on cpu node %d and mem node %d\n", cpu_node, mem_node);

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = (double)random(); //1.0;
	c[j] = 0.0;
	}

    t = monotonic_time(); //mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (monotonic_time() - t);

    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = monotonic_time(); //mysecond();
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j];
	times[0][k] = monotonic_time() - times[0][k];
	}

    /*	--- SUMMARY --- */
    
    mintime[0] = FLT_MAX;
    for (k=1; k<NTIMES; k++) 
	{
	    mintime[0] = MIN(mintime[0], times[0][k]);
	}

    numa_free(a, (N+OFFSET) * sizeof(double));
    numa_free(c, (N+OFFSET) * sizeof(double));

    // reset NUMA binding
    //numa_run_on_node_mask(numa_all_nodes_ptr);
    //numa_set_membind(numa_all_nodes_ptr);
    //numa_bind(numa_all_nodes_ptr);
    numa_run_on_node(-1);

    return 1.0E-06 * bytes[0]/mintime[0]; // bytes to MiB/s 
}


#endif // SSE4_VERSION


================================================
FILE: src/lib/measure_lat.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
/*
 * Originally developed by Terence Kelly with contributions from Haris Volos
 */

#include <string.h>
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <numa.h>
#include <numaif.h>
#include <math.h>
#include "cpu/cpu.h"
#include "error.h"
#include "model.h"

#define P  (void)printf
#define FP (void)fprintf

#define PAGESZ 4096

#define MAX_NUM_CHAINS 16

#undef USE_HUGETLB

#ifdef MEMLAT_SUPPORT
extern __thread uint64_t tls_global_remote_dram;
extern __thread uint64_t tls_global_local_dram;
#endif

/*
 * One node of a pointer-chasing chain. Only the header is described here:
 * nodes are laid out chain_t::element_size bytes apart, so `padding` marks
 * where the remaining bytes of the node start.
 */
typedef struct {
	uint64_t val;     /* index of the next element to visit; 0 ends the chain */
	char padding[0];  /* zero-length array: start of the rest of the node */
} element_t;

/* Descriptor of one pointer-chasing chain built by alloc_chain(). */
typedef struct {
    uint64_t   N;             /* number of links in the chain */
    uint64_t   element_size;  /* distance in bytes between consecutive elements */
    element_t* head;          /* start of the element storage (element index 0) */
} chain_t;

/*
 * Return the smaller of two unsigned 64-bit values.
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * emit an external definition, so any call the compiler decides not to
 * inline (e.g. at -O0) would be left as an undefined reference to `min`.
 */
static inline uint64_t min(uint64_t a, uint64_t b)
{
    return a < b ? a : b;
}

/* Xorshift pseudo-random generator (G. Marsaglia, 2003, "Xorshift RNGs",
   Journal of Statistical Software v. 8 n. 14, pp. 1-6; also discussed in
   _Numerical Recipes_ 3rd ed.).
   Advances the state stored in *seed and returns the new state. */
static uint64_t prng(uint64_t* seed) {
    uint64_t state = *seed;

    state = state ^ (state >> 21);
    state = state ^ (state << 35);
    state = state ^ (state >> 4);

    return *seed = state;
}

/* Current wall-clock time from gettimeofday(), in microseconds. */
static uint64_t T(void) {
    struct timeval now;
    const int rc = gettimeofday(&now, NULL);

    assert(0 == rc);
    (void) rc;  /* rc is otherwise unused when NDEBUG removes the assert */

    return (uint64_t)(now.tv_sec) * 1000000 + now.tv_usec;
}

/* Address of element `index` of `chain`: elements sit element_size bytes
 * apart, starting at chain->head. */
element_t* element(chain_t* chain, uint64_t index) 
{
    return (element_t *) ((char*) chain->head + index * chain->element_size);
}

/*
 * Read the payload of element `index` (everything after its `val` link
 * field) by copying it into `buf` in chunks of at most `buf_size` bytes.
 * The copied data is discarded; the point is to touch the element's memory.
 *
 * Fixes over the previous version:
 *  - `static inline`, so a non-inlined call does not become an undefined
 *    reference under C99/C11 inline rules;
 *  - no size underflow when the usable chunk is smaller than the link field;
 *  - chunks never extend past the end of the element, and a trailing
 *    partial chunk is no longer skipped.
 */
static inline void read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size)
{
    const char *base = (const char *) element(chain, index);
    const uint64_t end = chain->element_size;
    uint64_t offset = sizeof(((element_t *) 0)->val);  /* skip the link field */

    if (buf_size == 0) {
        return;
    }

    while (offset < end) {
        const uint64_t remaining = end - offset;
        const uint64_t chunk = remaining < buf_size ? remaining : buf_size;

        memcpy(buf, base + offset, chunk);
        offset += chunk;
    }
}

/*
 * Build a pointer-chasing chain of N links.
 *
 * A pseudo-random permutation of 1..N (seeded by seedin) decides the visit
 * order. The chain occupies N+1 elements (indices 0..N) of element_size
 * bytes each: following `val` from element 0 visits every other element
 * exactly once and ends at an element whose `val` is 0.
 *
 * The calling thread is bound to NUMA node node_i and the element storage
 * is bound to NUMA node node_j. The storage is one anonymous mapping of
 * 2 * PAGESZ + (1+N) * element_size bytes starting at chain->head.
 *
 * Returns a malloc'ed descriptor. Failures are reported through assert(),
 * like in the rest of this file.
 */
chain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j)
{
    uint64_t sum, p, i;
    element_t *B;
    char *A, *Aaligned, *M;
    uint64_t seed = seedin;
    uint64_t nodemask;
    const uint64_t chain_bytes = (1+N) * element_size;  /* elements 0..N */
    chain_t* chain;
#ifndef NDEBUG
    long mbind_result;
#endif

    chain = (chain_t*) malloc(sizeof(chain_t));
    assert(NULL != chain);
    chain->N = N;
    chain->element_size = element_size;

    /* fill B[] with random permutation of 1..N */
    Aaligned = A = (char *) malloc(2 * PAGESZ + N * sizeof(element_t));
    assert(NULL != A);
    while ( 0 != (uintptr_t) Aaligned % PAGESZ )
        Aaligned++;
    B = (element_t *) Aaligned;
    for (i = 0; i < N; i++)
        B[i].val = 1+i;
    for (i = 0; i < N; i++) {
        uint64_t r, t;
        r = prng(&seed);
        r = r % N;  /* should be okay for N << 2^64 */
        t = B[i].val;
        B[i].val = B[r].val;
        B[r].val = t;
    }

    /* sanity check: a permutation of 1..N must sum to N*(N+1)/2 */
    sum = 0;
    for (i = 0; i < N; i++)
      sum += B[i].val;
    assert((N+1)*N/2 == sum);

    /* set up C[] such that "chasing pointers" through it visits
       every element exactly once */
#ifdef USE_HUGETLB
    M = (char*) mmap(NULL, 2 * PAGESZ + chain_bytes, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB, -1, 0);
#else
    M = (char*) mmap(NULL, 2 * PAGESZ + chain_bytes, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
#endif
    /* mmap() reports failure with MAP_FAILED, not NULL */
    assert(MAP_FAILED != (void *) M);
    while ( 0 != (uintptr_t) M % PAGESZ )
      M++;
    numa_run_on_node(node_i);

    /* build the mask in 64 bits; `1 << node_j` would be an int shift */
    assert(node_j < 64);
    nodemask = (uint64_t) 1 << node_j;
#ifndef NDEBUG
    mbind_result =
#endif
        mbind(M, chain_bytes, MPOL_BIND, &nodemask, 64, MPOL_MF_MOVE);

    assert(mbind_result == 0);

    /* force physical memory allocation for all N+1 elements */
    memset(M, 0, chain_bytes);
    chain->head = (element_t *) M;
    for (i = 0; i < N; i++) {
        element(chain, i)->val = UINT64_MAX;
    }
    /* link the elements in permutation order, starting from element 0 */
    p = 0;
    for (i = 0; i < N; i++) {
        p = element(chain, p)->val = B[i].val;
    }
    element(chain, p)->val = 0;  /* terminator */
    for (i = 0; i <= N; i++) {
        assert(N >= element(chain, i)->val);
    }
    free(A);
    return chain;
}


/*
 * Evict useful data from the CPU caches by writing and then reading a
 * scratch array of N element_t entries. The stored values depend on the
 * current time, and the sum of the array is returned so that the work has
 * an observable result.
 */
uint64_t trash_cache(uint64_t N)
{
    uint64_t salt, idx, total = 0;
    char* raw;
    char* aligned;
    element_t* scratch;

    raw = (char *) malloc(2 * PAGESZ + N * sizeof(element_t));
    aligned = raw;
    assert(NULL != aligned);
    /* advance to the next PAGESZ boundary */
    while ( 0 != (aligned - (char *)0) % PAGESZ ) {
        aligned++;
        __asm__(""); /* prevent optimizer from removing loop */
    }
    scratch = (element_t *)aligned;

    /* trash the CPU cache */
    salt = T() % 1000;
    for (idx = 0; idx < N; idx++) {
        scratch[idx].val = salt * idx + idx % (salt+1);
        __asm__(""); /* prevent optimizer from removing loop */
    }
    for (idx = 0; idx < N; idx++) {
        total += scratch[idx].val;
        __asm__(""); /* prevent optimizer from removing loop */
    }
    free(raw);
    return total;
}


int __measure_latency(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id) 
{
    uint64_t seed, j, i, T1, T2;
    uint64_t sumv[MAX_NUM_CHAINS];
    uint64_t nextp[MAX_NUM_CHAINS];
    chain_t *C[MAX_NUM_CHAINS];
    char *buf;
    uint64_t buf_size = 16384;

    assert(nelems < UINT64_MAX);
    assert(nchains < MAX_NUM_CHAINS);

    DBG_LOG(INFO, "measuring latency: nchains %d, nelems %zu, elem_sz %d, access_sz %d, from_node_id %d, to_node_id %d\n", nchains, nelems, element_size, access_size, from_node_id, to_node_id);

    for (j=0; j < nchains; j++) {
        seed = seedin + j*j;
        C[j] = alloc_chain(seed, nelems, element_size, from_node_id, to_node_id);
    }

    trash_cache(nelems);

    buf = (char*) malloc(buf_size);
    assert(buf != NULL);
#ifdef MEMLAT_SUPPORT
    tls_global_remote_dram = 0;
    tls_global_local_dram = 0;
#endif

    /* chase the pointers */
    if (nchains == 1) {
        T1 = T();
        sumv[0] = 0;
        for (i = 0; 0 != element(C[0], i)->val; i = element(C[0], i)->val) {
            sumv[0] += element(C[0], i)->val;
            if (access_size > element_size) {
                read_element(C[0], i, buf, buf_size);
            }
        }
        T2 = T();
    } else {
        T1 = T();
        for (j=0; j < nchains; j++) {
            sumv[j] = 0;
            nextp[j] = 0;
        }
        for (; 0 != element(C[0], nextp[0])->val; ) {
            for (j=0; j < nchains; j++) {
                sumv[j] += element(C[j], nextp[j])->val;
                if (access_size > element_size) {
                    read_element(C[j], nextp[j], buf, buf_size);
                }
                nextp[j] = element(C[j], nextp[j])->val;
            }
        }
        T2 = T();
    }
    assert((nelems+1)*nelems/2 == sumv[0]);  /* Euler's formula */
    uint64_t time_per_op_ns = ((T2-T1)*1000)/nelems;

    DBG_LOG(INFO, "measuring latency: latency is %lu ns\n", time_per_op_ns);

    for (j=0; j < nchains; j++) {
        free(C[j]);
    }
    free(buf);

    return time_per_op_ns;
}

/*
 * Measure the load latency seen from from_node_id when accessing memory on
 * to_node_id, using a single chain of 64-byte elements whose total size is
 * ten times the last-level cache size of `cpu`.
 */
int measure_latency(cpu_model_t* cpu, int from_node_id, int to_node_id) 
{
    /* the working set must be large enough to always miss in the LLC */
    const size_t llc_multiple = 10;
    const size_t elem_bytes = 64LLU;
    const size_t access_bytes = 8;
    const size_t elem_count = llc_multiple * cpu->llc_size_bytes / elem_bytes;

    return __measure_latency(1, 1, elem_count, elem_bytes, access_bytes, from_node_id, to_node_id);
}

/*
 * Fully parameterized variant of measure_latency(): forwards all arguments
 * to __measure_latency() after warning when the working set
 * (nelems * element_size) is smaller than the last-level cache, in which
 * case the result would not reflect memory latency.
 */
int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id) 
{
    if (nelems*element_size < cpu_llc_size_bytes()) { 
        /* nelems is a size_t, so it is printed with %zu (not PRIu64) */
        DBG_LOG(WARNING, "warning:  #elements == %zu seems small!\n", nelems);
    }

    return __measure_latency(seedin, nchains, nelems, element_size, access_size, from_node_id, to_node_id);
}

#ifdef CALIBRATION_SUPPORT

#define TOLERATED_DEVIATION_PERCENTAGE 5  // maximum deviation acceptable for the target latency
#define CALIBRATION_STEP_SIZE 0.05        // max ns step size to calibrate the CPU stalls
#define CALIBRATION_FINEST_STEP 0.01      // min (finest) ns step size to calibrate the CPU stalls
#define MAX_TOLERATED_BAD_STEPS 2         // max number of bad steps in the calibration, before the calibration inverts the value to increment
#define NELEMS 10000000
#define SEED_IN 1
#define NCHAINS 1
#define ELEM_SIZE 64LLU
#define ACCESS_SIZE 8
#define FILE_CALIB_LOCAL "/tmp/local_latency_calibration"
#define FILE_CALIB_REMOTE "/tmp/remote_latency_calibration"

/*
 * Try to load a previously saved stalls calibration factor.
 *
 * The file is FILE_CALIB_LOCAL when the virtual node's DRAM and NVRAM map
 * to the same node and FILE_CALIB_REMOTE otherwise. The factor is parsed
 * from the first line of the file and stored in
 * latency_model.stalls_calibration_factor.
 *
 * Returns E_SUCCESS when a factor was loaded, E_ERROR otherwise.
 */
static int calibrate_load_from_file(virtual_node_t *virtual_node) {
    char *path;
    char *text = NULL;
    size_t capacity;
    double factor;
    FILE *in;
    int rc = E_ERROR;

    path = (virtual_node->dram_node == virtual_node->nvram_node)
               ? FILE_CALIB_LOCAL
               : FILE_CALIB_REMOTE;

    /* the file must exist and be both readable and writable */
    if (access(path, R_OK | W_OK) != 0) {
        return rc;
    }

    in = fopen(path, "r");
    if (!in) {
        return rc;
    }

    if (getline(&text, &capacity, in) != -1 &&
            sscanf(text, "%lf", &factor) == 1) {
        /* set CPU stalls factor to the read value */
        latency_model.stalls_calibration_factor = factor;
        DBG_LOG(INFO, "CALIBRATION: factor loaded from file (%s) (%f)\n",
                path, factor);
        rc = E_SUCCESS;
    }

    free(text);
    fclose(in);
    return rc;
}

/*
 * Append `correction_factor` as one text line to the calibration file of
 * this virtual node (FILE_CALIB_LOCAL when DRAM and NVRAM share a node,
 * FILE_CALIB_REMOTE otherwise). Nothing happens if the file cannot be
 * opened for appending.
 */
static void calibrate_save_to_file(virtual_node_t *virtual_node, double correction_factor) {
	const int same_node = (virtual_node->dram_node == virtual_node->nvram_node);
	char *path = same_node ? FILE_CALIB_LOCAL : FILE_CALIB_REMOTE;
	FILE *out = fopen(path, "a");

	if (!out) {
		return;
	}

	// it is assumed this line is not yet present in the file
	fprintf(out, "%f\n", correction_factor);
	DBG_LOG(INFO, "CALIBRATION: factor saved to file (%s) (%f)\n",
                path, correction_factor);
	fclose(out);
}

/* Absolute distance between a measured latency and the target latency. */
static int diff_target_latencies(int measured_latency, int target_latency) {
    return abs(target_latency - measured_latency);
}

/*
 * One calibration sweep in a single direction.
 *
 * Repeatedly measures the latency between from_node and to_node and then
 * moves latency_model.stalls_calibration_factor by step_value (negative to
 * decrease, positive to increase). The factor whose measurement came
 * closest to latency_model.read_latency is remembered.
 *
 * The sweep ends successfully at the first non-improving step once a
 * measurement within TOLERATED_DEVIATION_PERCENTAGE of the target has been
 * seen. It gives up and returns 0 after MAX_TOLERATED_BAD_STEPS
 * consecutive non-improving steps. Otherwise the best factor is returned.
 * latency_model.stalls_calibration_factor is left wherever the sweep
 * stopped; the caller is expected to set it.
 */
static double calibrate(virtual_node_t *virtual_node, double step_value, int from_node, int to_node) {
    int smallest_gap = INT32_MAX;
    double factor_at_smallest_gap = 0;
    int misses = 0;            /* consecutive non-improving steps */
    int within_tolerance = 0;  /* a close-enough measurement was seen */
    int finished = 0;

    DBG_LOG(INFO, "CALIBRATION: for nodes (dram %d, nvram %d)\n", from_node, to_node);

    do {
        int gap;
        const int latency_ns = measure_latency2(SEED_IN, NCHAINS, NELEMS, ELEM_SIZE, ACCESS_SIZE, from_node, to_node);

        DBG_LOG(INFO, "CALIBRATION: measured latency (%d)\n", latency_ns);

        gap = diff_target_latencies(latency_ns, latency_model.read_latency);
        if (gap < smallest_gap) {
            /* best measurement so far: remember the factor that produced it */
            misses = 0;
            smallest_gap = gap;
            factor_at_smallest_gap = latency_model.stalls_calibration_factor;
            if (gap <= (latency_model.read_latency * TOLERATED_DEVIATION_PERCENTAGE / 100)) {
                DBG_LOG(INFO, "CALIBRATION: got a close latency value (factor %lf)\n", factor_at_smallest_gap);
                within_tolerance = 1;
            }
        } else if (within_tolerance && misses == 0) {
            /* getting worse right after a close value: keep that value */
            finished = 1;
        } else if (++misses >= MAX_TOLERATED_BAD_STEPS) {
            /* this direction keeps getting worse; signal failure so the
               caller can try the opposite direction */
            return 0;
        }

        /* the factor is stepped even on the final iteration */
        latency_model.stalls_calibration_factor += step_value;
    } while (!finished);

    return factor_at_smallest_gap;
}

/*
 * Calibrate with a given step size, trying both directions when needed.
 *
 * The factor is first swept downwards. An upward sweep follows when the
 * downward sweep failed (returned 0) or when calib_size is the finest
 * step; in the latter case the upward sweep starts from the best factor
 * found by the downward one. Returns the result of the last sweep run.
 */
static double calibrate_with_size(virtual_node_t *virtual_node, double calib_size, int from_node, int to_node) {
    const double downward = calibrate(virtual_node, (-calib_size), from_node, to_node);

    if (downward != 0 && calib_size != CALIBRATION_FINEST_STEP) {
        return downward;
    }

    if (downward > 0.0) {
        /* resume from the best factor of the downward sweep */
        latency_model.stalls_calibration_factor = downward;
    }

    return calibrate(virtual_node, calib_size, from_node, to_node);
}

/*
 * Determine latency_model.stalls_calibration_factor for a virtual node.
 *
 * A factor stored in the calibration file is used as-is when present.
 * Otherwise a coarse search (CALIBRATION_STEP_SIZE) is followed, if it
 * succeeded, by a fine search (CALIBRATION_FINEST_STEP) around its result.
 * When no search succeeds the factor falls back to 1.0. The resulting
 * factor is installed in the latency model and saved to the file.
 */
void latency_calibration(virtual_node_t *virtual_node) {
    const int dram_id = virtual_node->dram_node->node_id;
    const int nvram_id = virtual_node->nvram_node->node_id;
    double factor;

    /* a stored factor short-circuits the whole measurement */
    if (calibrate_load_from_file(virtual_node) == E_SUCCESS) {
        return;
    }

    factor = calibrate_with_size(virtual_node, CALIBRATION_STEP_SIZE, dram_id, nvram_id);
    if (factor != 0) {
        /* refine: start one fine step above the coarse result */
        latency_model.stalls_calibration_factor = factor + CALIBRATION_FINEST_STEP;
        factor = calibrate_with_size(virtual_node, CALIBRATION_FINEST_STEP, dram_id, nvram_id);
    }

    if (factor == 0.0) {
        factor = 1.0;
    }

    /* install the best fit value */
    latency_model.stalls_calibration_factor = factor;
    DBG_LOG(INFO, "CALIBRATION: CPU stalls correction factor is %f (dram %d, nvram %d)\n",
    		factor, dram_id, nvram_id);

    /* save file for local or remote 'correction factor' */
    calibrate_save_to_file(virtual_node, factor);
}

#endif // CALIBRATION SUPPORT


================================================
FILE: src/lib/misc.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>


#include <stdio.h>
/*
 * Parse a size such as "4096", "64K", "10 MB" or "2g".
 *
 * The leading decimal number is read with strtoull(). Any characters up to
 * the first letter are skipped, and that letter selects a binary
 * multiplier: K/k = 2^10, M/m = 2^20, G/g = 2^30. Any other letter, or no
 * letter at all, means a multiplier of 1.
 *
 * Returns the size in bytes, or 0 when str is NULL or holds no number.
 */
size_t string_to_size(char* str)
{
    unsigned long long value;
    size_t factor;
    char*  cursor = NULL;

    if (str == NULL) {
        return 0;
    }

    value = strtoull(str, &cursor, 10);

    /* skip to the unit suffix; <ctype.h> functions need an unsigned char value */
    while (*cursor != '\0' && !isalpha((unsigned char) *cursor)) {
        cursor++;
    }

    switch (*cursor) {
        case 'K': case 'k':
            factor = 1024LLU;
            break;
        case 'M': case 'm':
            factor = 1024LLU*1024LLU;
            break;
        case 'G': case 'g':
            factor = 1024LLU*1024LLU*1024LLU;
            break;
        default:
            factor = 1;
    }

    return factor * (size_t) value;
}


================================================
FILE: src/lib/misc.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __MISC_H
#define __MISC_H

/*
 * Parse a decimal size with an optional K/M/G suffix (case-insensitive,
 * powers of 1024) and return the size in bytes.
 */
size_t string_to_size(char* str);

#endif


================================================
FILE: src/lib/model.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __MODEL_H
#define __MODEL_H

#include "config.h"
#include "cpu/cpu.h"
#include "thread.h"
#ifdef PAPI_SUPPORT
#include "cpu/pmc-papi.h"
#else
#include "cpu/pmc.h"
#endif

#define MAX_EPOCH_DURATION_US 1000000
#define MIN_EPOCH_DURATION_US 1

/*
 * Global state of the latency model.
 * NOTE(review): most field meanings below are inferred from names and from
 * their use in measure_lat.c; confirm against model_lat.c.
 */
typedef struct {
	int enabled;
    int read_latency;   /* target read latency; calibration compares it to measured ns */
    int write_latency;
    int inject_delay;
#ifdef CALIBRATION_SUPPORT
    int calibration;
#endif
#ifdef PAPI_SUPPORT
    read_stalls_t pmc_stall_local;
    read_stalls_t pmc_stall_remote;
#else
    pmc_event_t* pmc_stall_cycles;
    pmc_event_t* pmc_remote_dram;
    int process_local_rank;
    int max_local_processe_ranks;
#endif

    /* correction factor for CPU stalls; loaded from a calibration file or
       searched for by latency_calibration() */
    double stalls_calibration_factor;
} latency_model_t;

extern latency_model_t latency_model;

/*
 * Bandwidth model: table of throttle register values and bandwidths.
 * NOTE(review): presumably bandwidth[i] is the bandwidth measured with
 * throttle_reg_val[i] programmed, with npoints valid entries -- confirm
 * in model_bw.c.
 */
typedef struct {
    unsigned int throttle_reg_val[MAX_THROTTLE_VALUE]; 
    double bandwidth[MAX_THROTTLE_VALUE];
    int npoints;
    int enabled;
} bw_model_t;

extern bw_model_t read_bw_model;
extern bw_model_t write_bw_model;

/* Model initialization entry points; each takes the parsed configuration
 * and the virtual topology. NOTE(review): the int return convention is not
 * visible in this header. */
int init_bandwidth_model(config_t* cfg, struct virtual_topology_s* topology);
int init_latency_model(config_t* cfg, cpu_model_t* cpu, struct virtual_topology_s* virtual_topology);
/* Per-thread latency model setup for `thread`. */
void init_thread_latency_model(thread_t *thread);

/* NOTE(review): presumably closes the current latency epoch and starts a
 * new one -- confirm in model_lat.c. */
void create_latency_epoch();

#endif /* __MODEL_H */


================================================
FILE: src/lib/model_bw.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>
#include "cpu/cpu.h"
#include "config.h"
#include "error.h"
#include "measure.h"
#include "stat.h"
#include "topology.h"
#include "monotonic_timer.h"
#include "model.h"

/**
 * \file
 * 
 * \page bandwidth_emulation Memory bandwidth emulation
 * 
 * To emulate bandwidth, we rely on memory power throttling (supported by recent memory 
 * controllers) to limit the effective bandwidth to the DRAM attached to a socket.
 * Memory power throttling is configured through the PCI configuration space. 
 * We use a kernel-module to set the proper PCI registers. 
 * 
 * Initially, we perform a series of bandwidth measurements to find out the bandwidth 
 * that corresponds to each register value. We incrementally try out each register value 
 * starting from 0x800f until we saturate memory bandwidth.
 * 
 */ 


bw_model_t read_bw_model;
bw_model_t write_bw_model;


#define THROTTLE_INCREMENT 15
#define THROTTLE_INITIAL_VALUE 0x800f

/**
 * \brief Trains a bandwidth model by sweeping the memory throttle register.
 *
 * Starting from THROTTLE_INITIAL_VALUE, the throttle register of the node's
 * memory controller is raised by THROTTLE_INCREMENT per step and the read
 * bandwidth is measured at every setting. Training stops once the slope of
 * the most recent measurements flattens out, or after MAX_THROTTLE_VALUE
 * points.
 *
 * \param phys_node  physical node whose memory controller is throttled
 * \param model_type 'r' for the read model; write-model training is not
 *                   implemented
 * \param bw_model   receives the (register value, bandwidth) points and the
 *                   point count
 * \return E_SUCCESS on success, E_ERROR for an unsupported model_type
 */
static int train_model(physical_node_t* phys_node, char model_type, bw_model_t* bw_model)
{
    double x[MAX_THROTTLE_VALUE];
    double best_rate;
    double m;
    int    i;
    int    npoints = 0;
    uint16_t    throttle_reg_val;

    int min_number_throttle_points = 10;
    double stop_slope = 0.1;
    int phys_node_id = phys_node->node_id;
    pci_regs_t *regs = phys_node->mc_pci_regs;

    // only the read model can be trained; bail out before touching the
    // hardware instead of recording an uninitialized bandwidth value
    if (model_type != 'r') {
        bw_model->npoints = 0;
        return E_ERROR;
    }

    // reset throttling
    phys_node->cpu_model->get_throttle_register(regs, THROTTLE_DDR_ACT, &throttle_reg_val);
    if (throttle_reg_val < 0x8fff)
        phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8FFF);

    DBG_LOG(INFO, "throttle bus id %d, on physical node: %d\n", regs->addr[0].bus_id, phys_node_id);

    // we run until our bandwidth curve flattens out which we find out using 
    // gradient (slope) analysis 
    for (i=0; i < MAX_THROTTLE_VALUE; i++) {
        // the register still holds the previous step's value (or the reset
        // value on the first iteration), so the next setting derives from it
        phys_node->cpu_model->get_throttle_register(regs, THROTTLE_DDR_ACT, &throttle_reg_val);
        if (throttle_reg_val >= 0x8fff) throttle_reg_val = THROTTLE_INITIAL_VALUE;
        else throttle_reg_val += THROTTLE_INCREMENT;

        phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, throttle_reg_val);
        best_rate = measure_read_bw(phys_node_id, phys_node_id);

        DBG_LOG(INFO, "throttle reg: 0x%x, %c bandwidth: %f\n", throttle_reg_val, model_type, best_rate);
        bw_model->throttle_reg_val[i] = throttle_reg_val;
        bw_model->bandwidth[i] = best_rate;
        x[i] = (double) throttle_reg_val; // slope calculation requires values of type double
        npoints = i + 1; // point i is recorded, so it counts even if we stop below
        if (i > min_number_throttle_points) {
            m = slope(&x[i-min_number_throttle_points], 
                      &bw_model->bandwidth[i-min_number_throttle_points], 
                      min_number_throttle_points);
            // fabs, not abs: abs() would truncate the slope to an integer
            // and treat every |m| < 1 as flat
            if (fabs(m) < stop_slope) {
                break;
            }
        }
    }
    bw_model->npoints = npoints;
    return E_SUCCESS;
}

/**
 * \brief Loads a previously saved bandwidth model from a text file.
 *
 * Every line containing \a prefix is parsed as "<label>\t<reg>\t<bandwidth>"
 * and appended to the model. Lines that do not parse are skipped, and
 * reading stops once the model arrays are full.
 *
 * \param path     model file to read
 * \param prefix   label selecting the lines that belong to this model
 * \param bw_model receives the parsed points and the point count
 * \return E_SUCCESS if at least one point was loaded, E_ERROR if the file
 *         cannot be opened or holds no matching point
 */
static int load_model(const char* path, const char* prefix, bw_model_t* bw_model)
{
    FILE *fp;
    char *line = NULL;
    char str[64];
    size_t len = 0;
    int x;
    double y;
    int found_points = 0;

    fp = fopen(path, "r");
    if (fp == NULL) {
        return E_ERROR;
    }

    DBG_LOG(INFO, "Loading %s bandwidth model from %s\n", prefix, path);
    // stop at MAX_THROTTLE_VALUE points: the model arrays cannot hold more
    while (found_points < MAX_THROTTLE_VALUE && getline(&line, &len, fp) != -1) {
        if (strstr(line, prefix) == NULL) {
            continue;
        }
        // %63s keeps the label inside str[]; a line that does not yield all
        // three fields would otherwise store stale x/y values
        if (sscanf(line, "%63s\t%d\t%lf", str, &x, &y) != 3) {
            continue;
        }
        DBG_LOG(INFO, "throttle reg: 0x%x, bandwidth: %f\n", x, y);
        bw_model->throttle_reg_val[found_points] = x;
        bw_model->bandwidth[found_points] = y;
        found_points++;
    }
    free(line);
    // close on every path; the file used to leak when no point was found
    fclose(fp);

    if (found_points == 0) {
        DBG_LOG(INFO, "No %s bandwidth model found in %s\n", prefix, path);
        return E_ERROR;
    }
    bw_model->npoints = found_points;
    return E_SUCCESS;
}

/**
 * \brief Appends a bandwidth model to a text file.
 *
 * One line per point is written as "<prefix>\t<reg>\t<bandwidth>". The file
 * is opened in append mode, so existing content is kept.
 *
 * \return E_SUCCESS, or E_ERROR if the file cannot be opened
 */
static int save_model(const char* path, const char* prefix, bw_model_t* bw_model)
{
    FILE *out = fopen(path, "a");
    int idx = 0;

    if (!out) {
        return E_ERROR;
    }

    DBG_LOG(INFO, "Saving %s bandwidth model into %s\n", prefix, path);
    while (idx < bw_model->npoints) {
        int reg_val = bw_model->throttle_reg_val[idx];
        double measured_bw = bw_model->bandwidth[idx];
        fprintf(out, "%s\t%d\t%f\n", prefix, reg_val, measured_bw);
        idx++;
    }
    fclose(out);
    return E_SUCCESS;
}

/**
 * \brief Finds the model point whose bandwidth is closest to a target.
 *
 * The points are not sorted, so all of them are inspected. On ties the
 * point with the lowest index wins.
 *
 * \param model     bandwidth model to search
 * \param target_bw bandwidth to approximate
 * \param point     receives the index of the closest point (0 if the model
 *                  is empty)
 * \return E_SUCCESS, or E_ERROR if the model holds no points
 */
static int find_data_point(bw_model_t* model, double target_bw, unsigned int* point)
{
    int i;
    double error;
    double best_error;

    *point = 0;
    if (model->npoints <= 0) {
        // nothing to choose from; entry 0 would be an unset register value
        return E_ERROR;
    }

    // seed with the real error of point 0 so that it competes fairly with
    // the remaining points
    best_error = fabs(model->bandwidth[0] - target_bw);
    for (i=1; i<model->npoints; i++) {
        error = fabs(model->bandwidth[i] - target_bw);
        if (error < best_error) {
            *point = i;
            best_error = error;
        }
    }
    return E_SUCCESS;
}

/**
 * \brief Programs the throttle register of a node for a target write bandwidth.
 *
 * A target of (uint64_t) -1 writes 0x8fff to the register. Any other target
 * is mapped to the closest point of the write bandwidth model. Nodes without
 * memory-controller PCI registers are left untouched.
 *
 * \return E_SUCCESS, or the error reported by the model lookup
 */
int __set_write_bw(physical_node_t* node, uint64_t target_bw)
{
    const uint64_t no_limit = (uint64_t) (-1);
    pci_regs_t *mc_regs = node->mc_pci_regs;
    unsigned int idx;
    int rc;

    if (!mc_regs) {
        return E_SUCCESS;
    }

    if (target_bw == no_limit) {
        node->cpu_model->set_throttle_register(mc_regs, THROTTLE_DDR_ACT, 0x8fff);
        return E_SUCCESS;
    }

    rc = find_data_point(&write_bw_model, (double) target_bw, &idx);
    if (rc != E_SUCCESS) {
        return rc;
    }

    DBG_LOG(INFO, "Setting throttle reg: %d (0x%x), target write bandwidth: %" PRIu64 ", actual write bandwidth: %" PRIu64 "\n", write_bw_model.throttle_reg_val[idx], write_bw_model.throttle_reg_val[idx], target_bw, (uint64_t) write_bw_model.bandwidth[idx]);
    node->cpu_model->set_throttle_register(mc_regs, THROTTLE_DDR_ACT, write_bw_model.throttle_reg_val[idx]);

    return E_SUCCESS;
}

/**
 * \brief Applies the configured write bandwidth ("bandwidth.write") to a node.
 *
 * \return the result of __set_write_bw()
 */
int set_write_bw(config_t* cfg, physical_node_t* node)
{
    // Start from -1 so the value is never read uninitialized: converted to
    // uint64_t it is the "no limit" target of __set_write_bw().
    // NOTE(review): assumes __cconfig_lookup_int leaves target_bw untouched
    // when the key is missing -- confirm.
    int target_bw = -1;
    __cconfig_lookup_int(cfg, "bandwidth.write", &target_bw);

    return __set_write_bw(node, target_bw);
}

/**
 * \brief Programs the throttle register of a node for a target read bandwidth.
 *
 * A target of (uint64_t) -1 writes 0x8fff to the register. Any other target
 * is mapped to the closest point of the read bandwidth model. Nodes without
 * memory-controller PCI registers are left untouched.
 *
 * \return E_SUCCESS, or the error reported by the model lookup
 */
int __set_read_bw(physical_node_t* node, uint64_t target_bw)
{
    const uint64_t no_limit = (uint64_t) (-1);
    pci_regs_t *mc_regs = node->mc_pci_regs;
    unsigned int idx;
    int rc;

    if (!mc_regs) {
        return E_SUCCESS;
    }

    if (target_bw == no_limit) {
        node->cpu_model->set_throttle_register(mc_regs, THROTTLE_DDR_ACT, 0x8fff);
        return E_SUCCESS;
    }

    rc = find_data_point(&read_bw_model, (double) target_bw, &idx);
    if (rc != E_SUCCESS) {
        return rc;
    }

    DBG_LOG(INFO, "Setting throttle reg: %d (0x%x), target read bandwidth: %" PRIu64 ", actual read bandwidth: %" PRIu64 "\n", read_bw_model.throttle_reg_val[idx], read_bw_model.throttle_reg_val[idx], target_bw, (uint64_t) read_bw_model.bandwidth[idx]);
    node->cpu_model->set_throttle_register(mc_regs, THROTTLE_DDR_ACT, read_bw_model.throttle_reg_val[idx]);

    return E_SUCCESS;
}

/**
 * \brief Applies the configured read bandwidth ("bandwidth.read") to a node.
 *
 * \return the result of __set_read_bw()
 */
int set_read_bw(config_t* cfg, physical_node_t* node)
{
    // Start from -1 so the value is never read uninitialized: converted to
    // uint64_t it is the "no limit" target of __set_read_bw().
    // NOTE(review): assumes __cconfig_lookup_int leaves target_bw untouched
    // when the key is missing -- confirm.
    int target_bw = -1;
    __cconfig_lookup_int(cfg, "bandwidth.read", &target_bw);

    return __set_read_bw(node, target_bw);
}

/**
 * \brief Sets up bandwidth emulation for all virtual nodes.
 *
 * With the read model disabled, the throttle registers of every DRAM node
 * are reset. Otherwise the read model is loaded from the file named by
 * "bandwidth.model" (trained and saved if loading fails) and the configured
 * read bandwidth is applied to every NVRAM node.
 *
 * \return E_SUCCESS
 */
int init_bandwidth_model(config_t* cfg, virtual_topology_t* topology)
{
    char* model_path;
    int n;

    srandom((int)monotonic_time());

    if (!read_bw_model.enabled) {
        // emulation is off: reset throttle registers
        // FIXME: currently we keep a single bandwidth model and not per-node bandwidth model
        for (n = 0; n < topology->num_virtual_nodes; n++) {
            physical_node_t* dram = topology->virtual_nodes[n].dram_node;
            __set_read_bw(dram, (uint64_t) (-1));
            __set_write_bw(dram, (uint64_t) (-1));
        }
        return E_SUCCESS;
    }

    DBG_LOG(INFO, "Initializing bandwidth model\n");

    // build the model: load it when possible, otherwise train and persist it
    // FIXME: currently we keep a single bandwidth model and not per-node bandwidth model
    for (n = 0; n < topology->num_virtual_nodes; n++) {
        physical_node_t* nvram = topology->virtual_nodes[n].nvram_node;

        if (__cconfig_lookup_string(cfg, "bandwidth.model", &model_path) != CONFIG_TRUE) {
            continue;
        }
        if (load_model(model_path, "read", &read_bw_model) == E_SUCCESS) {
            continue;
        }
        train_model(nvram, 'r', &read_bw_model);
        save_model(model_path, "read", &read_bw_model);
    }

    // apply the configured read bandwidth; write throttling is not applied here
    for (n = 0; n < topology->num_virtual_nodes; n++) {
        set_read_bw(cfg, topology->virtual_nodes[n].nvram_node);
    }

    return E_SUCCESS;
}


================================================
FILE: src/lib/model_lat.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <string.h>
#include "cpu/cpu.h"
#include "config.h"
#include "error.h"
#include "thread.h"
#include "topology.h"
#include "model.h"
#include "monotonic_timer.h"

/**
 * \file
 * 
 * \page latency_emulation Memory latency emulation
 * 
 * To emulate latency, we construct epochs and inject software created delays 
 * at the end of each epoch.
 * Epochs are created either at fixed intervals by periodically interrupting 
 * threads or on demand when a synchronization method (lock, unlock) is called.
 *
 * Delays are calculated using a simple analytic model that takes input from 
 * performance counters.
 */ 


latency_model_t latency_model;

#pragma GCC push_options
#pragma GCC optimize ("O0")
/**
 * \brief Reads the time-stamp counter with RDTSCP.
 *
 * \return the 64-bit counter value assembled from EDX:EAX
 */
inline hrtime_t hrtime_cycles(void)
{
    unsigned hi, lo, aux;
    // RDTSCP also writes IA32_TSC_AUX into ECX; binding ECX to a dummy
    // output tells the compiler that the register is overwritten
    __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi), "=c"(aux));
    (void) aux;
    return ( (hrtime_t)lo)|( ((hrtime_t)hi)<<32 );
}
#pragma GCC pop_options

/*
static inline hrtime_t ns_to_cycles(int cpu_speed_mhz, int ns)
{
    return (cpu_speed_mhz * ns) / 1000;
}
*/

/* Converts a cycle count to microseconds for a clock running at cpu_speed_mhz. */
inline hrtime_t cycles_to_us(int cpu_speed_mhz, hrtime_t cycles)
{
    hrtime_t usecs = cycles / cpu_speed_mhz;
    return usecs;
}

#pragma GCC push_options
#pragma GCC optimize ("O0")
/* Busy-waits until at least `cycles` time-stamp-counter ticks have elapsed. */
static inline void create_delay_cycles(hrtime_t cycles)
{
    const hrtime_t begin = hrtime_cycles();

    // the counter is sampled at least once before the elapsed time is checked
    for (;;) {
        hrtime_t now = hrtime_cycles();
        if (now - begin >= cycles) {
            break;
        }
    }
}
#pragma GCC pop_options

/*
static inline void create_delay_ns(cpu_model_t* cpu, int ns)
{
    hrtime_t cycles;
    cycles = ns_to_cycles(cpu, ns);
    create_delay_cycles(cycles);
}
*/

/**
 * Verifies that the target read and write latencies are strictly greater
 * than the hardware latency of the DRAM and NVRAM node of every virtual node.
 *
 * Returns 0 when all virtual nodes pass, -1 at the first one that does not.
 */
static int check_target_latency_against_hw_latency(virtual_topology_t* virtual_topology) {
    int node;

    for (node = 0; node < virtual_topology->num_virtual_nodes; ++node) {
        int dram_lat = virtual_topology->virtual_nodes[node].dram_node->latency;
        int nvram_lat = virtual_topology->virtual_nodes[node].nvram_node->latency;
        int targets_ok = dram_lat < latency_model.read_latency &&
                         dram_lat < latency_model.write_latency &&
                         nvram_lat < latency_model.read_latency &&
                         nvram_lat < latency_model.write_latency;

        if (targets_ok) {
            continue;
        }

        DBG_LOG(ERROR, "Target read (%d) and write (%d) latency to be emulated must be greater than the "
                "hardware latency dram (%d) and virtual nvram (%d) (virtual node %d)\n",
                latency_model.read_latency, latency_model.write_latency, dram_lat, nvram_lat, node);
        return -1;
    }

    return 0;
}

/**
 * \brief Initializes the global latency model from the configuration.
 *
 * Clears latency_model, marks it enabled, reads the target read/write
 * latencies and the delay-injection switch, and enables the performance
 * counter events the model reads.
 *
 * \return E_SUCCESS on success; E_INVAL if the target latencies fail the
 *         hardware-latency check; E_ERROR if pmc_init() fails (PAPI build);
 *         E_NOENT if a listed counter event cannot be enabled (non-PAPI build)
 */
int init_latency_model(config_t* cfg, cpu_model_t* cpu, virtual_topology_t* virtual_topology)
{
	int i;

    DBG_LOG(INFO, "Initializing latency model\n");

    // start from a clean model; every field not set below stays zero
    memset(&latency_model, 0, sizeof(latency_model_t));
    latency_model.enabled = 1;

    __cconfig_lookup_int(cfg, "latency.read", &latency_model.read_latency);
    __cconfig_lookup_int(cfg, "latency.write", &latency_model.write_latency);

    if (check_target_latency_against_hw_latency(virtual_topology) < 0) {
        return E_INVAL;
    }

    __cconfig_lookup_bool(cfg, "latency.inject_delay", &latency_model.inject_delay);
    if (!latency_model.inject_delay) {
        DBG_LOG(WARNING, "Latency model is enabled, but delay injection is disabled\n");
    }

#ifdef PAPI_SUPPORT
    if (pmc_init() != 0) {
        return E_ERROR;
    }

    latency_model.pmc_stall_local = cpu->pmc_events.read_stalls_events_local;
    latency_model.pmc_stall_remote = cpu->pmc_events.read_stalls_events_remote;
#else
    // walk the CPU's known events (the list ends at an entry with a NULL name)
    // and enable the two events the model uses
    for (i=0; cpu->pmc_events->known_events[i].name; ++i) {
        // LDM_STALL_CYCLES implementation for each processor is mandatory
        if (strcasecmp(cpu->pmc_events->known_events[i].name, "LDM_STALL_CYCLES") == 0) {
            if (!(latency_model.pmc_stall_cycles = enable_pmc_event(cpu, "LDM_STALL_CYCLES"))) {
                return E_NOENT;
            }
        }
        // REMOTE_DRAM is only enabled when the CPU lists it; pmc_remote_dram stays NULL otherwise
        if (strcasecmp(cpu->pmc_events->known_events[i].name, "REMOTE_DRAM") == 0) {
            if (!(latency_model.pmc_remote_dram = enable_pmc_event(cpu, "REMOTE_DRAM"))) {
                return E_NOENT;
            }
        }
    }

    assert(latency_model.pmc_stall_cycles);
#endif

#ifdef CALIBRATION_SUPPORT
    __cconfig_lookup_bool(cfg, "latency.calibration", &latency_model.calibration);
    if (latency_model.calibration) {
        latency_model.stalls_calibration_factor = 1.0;
    }
#endif

    return E_SUCCESS;
}

// per-thread emulation overhead (in cycles) not yet discounted from an injected delay
__thread uint64_t tls_overhead = 0;
// per-thread copies of the hardware latencies, set by init_thread_latency_model()
__thread int tls_hw_local_latency = 0;
__thread int tls_hw_remote_latency = 0;
#ifdef MEMLAT_SUPPORT
// NOTE(review): not referenced in this file -- presumably per-thread DRAM access totals; confirm
__thread uint64_t tls_global_remote_dram = 0;
__thread uint64_t tls_global_local_dram = 0;
#endif

/**
 * Records the hardware latencies of the thread's virtual node in
 * thread-local storage: the DRAM node as the local latency and the NVRAM
 * node as the remote latency.
 */
void init_thread_latency_model(thread_t *thread)
{
    tls_hw_remote_latency = thread->virtual_node->nvram_node->latency;
    tls_hw_local_latency = thread->virtual_node->dram_node->latency;
}

/**
 * \brief Ends the calling thread's current epoch and injects the emulated delay.
 *
 * Reads the stall-cycle counter, scales it by
 * (target_latency - hw_latency) / hw_latency to obtain the delay, discounts
 * the overhead accumulated by this function, spins for the remaining delay
 * when injection is enabled, and records the epoch timestamp. The whole
 * function runs with new-epoch creation blocked.
 */
void create_latency_epoch()
{
    uint64_t stall_cycles = 0;
    uint64_t delay_cycles = 0;
    int hw_latency;
    int target_latency;
    hrtime_t start, stop;
    double epoch_end;

    // start of the overhead measurement for this call
    start = hrtime_cycles();

    // An epoch may be created by a critical section and the static epoch
    // may interfere with the current epoch creation. Block the signal here
    // and unblock it at the end of this function.
    block_new_epoch();

    // must always be thread_self since we call core specific data through hrtime_cycles
    thread_t* thread = thread_self();

    // epoch too short: clear the pending signal flag and leave without injecting a delay
    if (!reached_min_epoch_duration(thread)) {
    	if (!thread) thread = thread_self();
    	if (thread) thread->signaled = 0;
    	unblock_new_epoch();
        return;
    }

    // NOTE(review): the early-return path above tests `thread` for NULL, but
    // everything below dereferences it unconditionally -- confirm that
    // reached_min_epoch_duration() never returns true for a NULL thread.

    //DBG_LOG(INFO, "new epoch for thread id [%i]\n", thread->tid);

#ifdef USE_STATISTICS
    if (thread->thread_manager->stats.enabled) {
        thread->stats.epochs++;
    }
#endif

    // this is the generic hardware latency for this thread (it takes into account the current virtual node latencies)
    hw_latency = thread->virtual_node->nvram_node->latency;
    target_latency = latency_model.read_latency;

    // check if the thread_self is remote (virtual topology where dram != nvram) or local (dram == nvram)
    // on this case, stall cycles will be a proportion of remote memory accesses
    // TODO: the read pmc method used below must be changed to support PAPI
    if (thread->virtual_node->dram_node != thread->virtual_node->nvram_node &&
            latency_model.pmc_remote_dram) {
        stall_cycles = read_pmc_event(latency_model.pmc_remote_dram);
	} else {
		stall_cycles = read_pmc_event(latency_model.pmc_stall_cycles);
	}

#ifdef CALIBRATION_SUPPORT
    if (latency_model.calibration) {
        stall_cycles = (uint64_t)((double)stall_cycles * latency_model.stalls_calibration_factor);
    }
#endif

    // extra cycles needed so the observed stalls match the target latency
    // NOTE(review): hw_latency is used as a divisor; a zero latency would
    // make this ratio non-finite -- confirm it is always positive.
    delay_cycles = stall_cycles * ((double)(target_latency - hw_latency) / ((double) hw_latency));

    stop = hrtime_cycles();
    tls_overhead += stop - start;

    DBG_LOG(DEBUG, "overhead cycles: %lu; immediate overhead %lu; stall cycles: %lu; delay cycles: %lu\n", tls_overhead, stop - start, stall_cycles, delay_cycles);

    // discount the accumulated overhead from the delay; overhead that does
    // not fit is carried over to the next epoch
    if (delay_cycles > tls_overhead) {
    	delay_cycles -= tls_overhead;
        tls_overhead = 0;
    }
    else {
    	tls_overhead -= delay_cycles;
    	delay_cycles = 0;
    }

#ifdef MEMLAT_SUPPORT
    thread->stall_cycles += stall_cycles;
#endif

#ifdef USE_STATISTICS
    if (thread->thread_manager->stats.enabled) {
        thread->stats.stall_cycles += stall_cycles;
        thread->stats.delay_cycles += delay_cycles;
        thread->stats.overhead_cycles = tls_overhead;
    }
#endif

    // timestamp taken before the delay is injected, so the epoch duration
    // computed below does not include the injected delay
    epoch_end = monotonic_time_us();

    DBG_LOG(DEBUG, "injecting delay of %lu cycles (%lu usec) - discounted overhead\n", delay_cycles,
                    cycles_to_us(thread->cpu_speed_mhz, delay_cycles));
    if (delay_cycles && latency_model.inject_delay) {
        create_delay_cycles(delay_cycles);
    }

#ifdef USE_STATISTICS
    if (thread->thread_manager->stats.enabled) {
    	uint64_t older_epoch_timestamp = thread->stats.last_epoch_timestamp;
    	uint64_t diff_epoch_timestamp = epoch_end - older_epoch_timestamp;

    	if (diff_epoch_timestamp < thread->stats.shortest_epoch_duration_us) {
    	    thread->stats.shortest_epoch_duration_us = diff_epoch_timestamp;
    	}

    	if (diff_epoch_timestamp > thread->stats.longest_epoch_duration_us) {
		    thread->stats.longest_epoch_duration_us = diff_epoch_timestamp;
    	}

    	thread->stats.overall_epoch_duration_us += diff_epoch_timestamp;
    	thread->stats.last_epoch_timestamp = monotonic_time_us();
    } else {
    	// last epoch timestamp must always be updated
        thread->stats.last_epoch_timestamp = monotonic_time_us();
    }
#else
    thread->last_epoch_timestamp = monotonic_time_us();
#endif
    // this must be the last step, since this function is called also from the signal handler
    // and the monitor thread sets this flag, we must make sure race conditions are prevented
    thread->signaled = 0;

    unblock_new_epoch();
}


================================================
FILE: src/lib/monotonic_timer.c
================================================
// Copyright 2013 Alex Reece.
//
// A cross platform monotonic timer.

#include <unistd.h>
#include "monotonic_timer.h"

#if _POSIX_TIMERS > 0 && defined(_POSIX_MONOTONIC_CLOCK)
  // If we have it, use clock_gettime and CLOCK_MONOTONIC.

  #include <time.h>

  // Seconds elapsed on CLOCK_MONOTONIC, as a double.
  double monotonic_time() {
    struct timespec now;
    double whole_secs;
    double frac_secs;

    // Note: Make sure to link with -lrt to define clock_gettime.
    clock_gettime(CLOCK_MONOTONIC, &now);
    whole_secs = (double) now.tv_sec;
    frac_secs = (double) now.tv_nsec / (NANOS_PER_SECF);
    return whole_secs + frac_secs;
  }

  // Microseconds elapsed on CLOCK_MONOTONIC, as a double.
  double monotonic_time_us() {
    struct timespec time;
    // Note: Make sure to link with -lrt to define clock_gettime.
    clock_gettime(CLOCK_MONOTONIC, &time);
    // Convert tv_sec to double before scaling: multiplying in time_t
    // arithmetic overflows where time_t is 32 bits wide.
    return ((double) time.tv_sec * USECS_PER_SEC) + ((double) time.tv_nsec / NANOS_PER_USECF);
  }

#else
  // Fall back to rdtsc. The reason we don't use clock() is this scary message
  // from the man page:
  //     "On several other implementations, the value returned by clock() also
  //      includes the times of any children whose status has been collected via
  //      wait(2) (or another wait-type call)."
  //
  // Also, clock() only has microsecond accuracy.
  //
  // This whitepaper offered excellent advice on how to use rdtscp for
  // profiling: http://download.intel.com/embedded/software/IA/324264.pdf
  //
  // Unfortunately, we can't follow its advice exactly with our semantics,
  // so we're just going to use rdtscp with cpuid.
  //
  // Note that rdtscp will only be available on new processors.

  #include <stdint.h>

  // Reads the time-stamp counter with rdtscp, copies EDX/EAX into the output
  // operands, then executes cpuid. rax/rbx/rcx/rdx are declared clobbered, so
  // the outputs are placed in other registers.
  static inline uint64_t rdtsc() {
    uint32_t hi, lo;
    asm volatile("rdtscp\n"
                 "movl %%edx, %0\n"
                 "movl %%eax, %1\n"
                 "cpuid"
                 : "=r" (hi), "=r" (lo) : : "%rax", "%rbx", "%rcx", "%rdx");
    // combine the two 32-bit halves into the 64-bit counter value
    return (((uint64_t)hi) << 32) | (uint64_t)lo;
  }

  // Counter ticks per second and per microsecond, calibrated once at load time.
  static uint64_t rdtsc_per_sec = 0;
  static uint64_t rdtsc_per_usec = 0;

  // Constructor: calibrates the counter rate by sleeping for one second.
  static void __attribute__((constructor)) init_rdtsc_per_sec() {
    uint64_t before, after;

    before = rdtsc();
    usleep(USECS_PER_SEC);
    after = rdtsc();

    rdtsc_per_sec = after - before;

    // Derive the per-microsecond rate from the one-second measurement.
    // Timing a usleep(1) call instead measures scheduling latency, because
    // usleep only guarantees a minimum sleep and typically takes much
    // longer than a single microsecond.
    rdtsc_per_usec = rdtsc_per_sec / USECS_PER_SEC;
  }

  // Seconds derived from the counter and the calibrated ticks-per-second rate.
  double monotonic_time() {
    const double ticks = (double) rdtsc();
    return ticks / (double) rdtsc_per_sec;
  }

  // TODO: not tested, it is core specific and callers must be aware
  double monotonic_time_us() {
    // counter value scaled by the calibrated ticks-per-microsecond rate
    const double ticks = (double) rdtsc();
    return (ticks / (double) rdtsc_per_usec);
  }

#endif


================================================
FILE: src/lib/monotonic_timer.h
================================================
// Copyright 2013 Alex Reece.
//
// A cross platform monotonic timer.

#ifndef MONOTONIC_TIMER_H_
#define MONOTONIC_TIMER_H_

#define NANOS_PER_SECF 1000000000.0
#define NANOS_PER_USECF 1000.0
#define NANOS_PER_USEC 1000
#define USECS_PER_SEC 1000000

// Returns seconds since some unspecified start time (guaranteed to be
// monotonically increasing).
double monotonic_time();
// Same clock as monotonic_time(), with the result expressed in microseconds.
double monotonic_time_us();

#endif  // MONOTONIC_TIMER_H_


================================================
FILE: src/lib/pflush.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include "pflush.h"

#include <stdint.h>

typedef uint64_t hrtime_t;

#if defined(__i386__)

/* i386: reads the time-stamp counter. The bytes 0x0f 0x31 encode RDTSC and
 * the "=A" constraint binds the 64-bit result to the EDX:EAX pair. */
static inline unsigned long long asm_rdtsc(void)
{
    unsigned long long int x;
    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
    return x;
}

/* i386: reads the time-stamp counter with RDTSCP. ECX is declared clobbered
 * because the instruction overwrites it. */
static inline unsigned long long asm_rdtscp(void)
{
        unsigned hi, lo;
    __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"ecx");
    /* combine the two 32-bit halves into the 64-bit counter value */
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );

}
#elif defined(__x86_64__)

/* x86_64: reads the time-stamp counter with RDTSC; the low half arrives in
 * EAX and the high half in EDX. */
static inline unsigned long long asm_rdtsc(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    /* combine the two 32-bit halves into the 64-bit counter value */
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

/* x86_64: reads the time-stamp counter with RDTSCP. RCX is declared
 * clobbered because the instruction overwrites it. */
static inline unsigned long long asm_rdtscp(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"rcx");
    /* combine the two 32-bit halves into the 64-bit counter value */
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
#else
#error "What architecture is this???"
#endif

/* Flush cacheline: issues CLFLUSH on the memory operand *addr.
 * addr must be a pointer expression; it is dereferenced by the macro. */
#define asm_clflush(addr)                   \
({                              \
    __asm__ __volatile__ ("clflush %0" : : "m"(*addr)); \
})

/* Memory fence: invokes PM_FENCE() and then issues MFENCE.
 * NOTE(review): PM_FENCE is not defined in this file -- confirm that it is
 * provided elsewhere before using this macro. */
#define asm_mfence()                \
({                      \
    PM_FENCE();             \
    __asm__ __volatile__ ("mfence");    \
})

/* Parameters used by pflush(); both stay 0 until init_pflush() is called. */
static int global_cpu_speed_mhz = 0;
static int global_write_latency_ns = 0;

/* Stores the CPU clock speed (MHz) and the write latency (ns) that pflush()
 * uses to compute the delay it injects. */
void init_pflush(int cpu_speed_mhz, int write_latency_ns)
{
    global_write_latency_ns = write_latency_ns;
    global_cpu_speed_mhz = cpu_speed_mhz;
}

/* Converts a cycle count to nanoseconds for a clock running at cpu_speed_mhz. */
inline hrtime_t cycles_to_ns(int cpu_speed_mhz, hrtime_t cycles)
{
    const hrtime_t scaled = cycles * 1000;
    return scaled / cpu_speed_mhz;
}

/* Converts nanoseconds to a cycle count for a clock running at cpu_speed_mhz. */
inline hrtime_t ns_to_cycles(int cpu_speed_mhz, hrtime_t ns)
{
    const hrtime_t scaled = ns * cpu_speed_mhz;
    return scaled / 1000;
}

/* Busy-waits for `ns` nanoseconds, measured with the time-stamp counter and
 * the clock speed stored by init_pflush(). */
static inline
void
emulate_latency_ns(int ns)
{
    const hrtime_t t0 = asm_rdtsc();
    const hrtime_t budget = ns_to_cycles(global_cpu_speed_mhz, ns);

    /* RDTSC doesn't necessarily wait for previous instructions to complete
     * so a serializing instruction is usually used to ensure previous
     * instructions have completed. However, in our case this is a desirable
     * property since we want to overlap the latency we emulate with the
     * actual latency of the emulated instruction.
     */
    while (asm_rdtsc() - t0 < budget) {
        /* spin */
    }
}

void
pflush(uint64_t *addr)
{
    if (global_write_latency_ns == 0) {
        return;
    }

    /* Measure the latency of a clflush and add an additional delay to
     * meet the latency to write to NVM */
    hrtime_t start;
    hrtime_t stop;
    start = asm_rdtscp();
    asm_clflush(addr);  
    stop = asm_rdtscp();
    int to_insert_ns = global_write_latency_ns - cycles_to_ns(global_cpu_speed_mhz, stop-start);
    if (to_insert_ns <= 0) {
        return;
    }
    emulate_latency_ns(to_insert_ns);
}


================================================
FILE: src/lib/pflush.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __PFLUSH_H
#define __PFLUSH_H

/**
 * \file
 * 
 * \page pflush_api Persistent Memory API 
 *
 * Method to be used by client to inject a write latency.
 */

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * \brief Sets the CPU clock speed (MHz) and the write latency (ns) used by
 * pflush() to compute the delay it injects.
 */
void init_pflush(int cpu_speed_mhz, int write_latency_ns);

/**
 * \brief Flush the cacheline containing address addr.
 *
 * Returns without flushing while the configured write latency is 0.
 */
void pflush(uint64_t *addr);

#ifdef __cplusplus
}
#endif

#endif /* __PFLUSH_H */


================================================
FILE: src/lib/pmalloc.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <numa.h>
#include "topology.h"
#include "pmalloc.h"
#include "thread.h"
#include "debug.h"

// pmalloc should be implemented as a separate library

// FIXME: pmalloc currently uses numa_alloc_onnode() which is slower than regular malloc.
// Consider layering another malloc on top of a emulated nvram 


/**
 * Allocates `size` bytes on the NVRAM node of the calling thread's virtual
 * node. A thread unknown to the emulator is registered first. Returns NULL
 * if the thread still cannot be resolved.
 */
void* pmalloc(size_t size)
{
    thread_t* self = thread_self();

    if (!self) {
        // FIXME: JVM for instance create threads using a mechanism not traced by this emulator
        //        for now we make sure the current thread is registered right when it makes the
        //        first explicit NVM allocation. A better solution is to trace the thread creation
        //        done by JVM.
        register_self();
        self = thread_self();

        if (!self) {
            DBG_LOG(ERROR, "pmalloc called with NULL thread\n");
            return NULL;
        }
    }

    return numa_alloc_onnode(size, self->virtual_node->nvram_node->node_id);
}

/* Resizes a region from old_size to new_size bytes; a direct wrapper around
 * numa_realloc(), whose result is returned unchanged. */
void *prealloc(void *old_addr, size_t old_size, size_t new_size)
{
    return numa_realloc(old_addr, old_size, new_size);
}

/* Releases `size` bytes starting at `start`; a direct wrapper around
 * numa_free(), which requires the size to be passed by the caller. */
void pfree(void* start, size_t size)
{
    numa_free(start, size);
}


================================================
FILE: src/lib/pmalloc.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __PMALLOC_H
#define __PMALLOC_H

/**
 * \file
 * 
 * \page pmalloc_api Persistent Memory API 
 *
 * Methods to be used by clients to allocate and free emulated NVRAM.
 */

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Allocates size bytes of emulated NVRAM; returns NULL if the calling thread cannot be resolved. */
void *pmalloc(size_t size);
/* Resizes a region from old_size to new_size bytes. */
void *prealloc(void *old_addr, size_t old_size, size_t new_size);
/* Frees size bytes starting at start; the caller must supply the size. */
void pfree(void *start, size_t size);

#ifdef __cplusplus
}
#endif

#endif /* __PMALLOC_H */


================================================
FILE: src/lib/process_rank.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
/*
 * process_rank.c
 *
 *  Created on: Jun 16, 2015
 *      Author: root
 */


#include <unistd.h>
#include "model.h"
#include "error.h"

#define EMUL_LOCAL_PROCESSES_VAR "EMUL_LOCAL_PROCESSES"

#define EMUL_LOCK_FILE "/tmp/emul_lock_file"
#define EMUL_PROCESS_LOCAL_RANK_FILE "/tmp/emul_process_local_rank"
#define LOCKED_WAIT_US 1000
#define MAX_LOCKED_RETRIES 50

extern latency_model_t latency_model;

/**
 * Assign this process a rank among the emulated processes running on the
 * local system.
 *
 * The number of local processes is parsed from the EMUL_LOCAL_PROCESSES
 * environment variable into latency_model.max_local_processe_ranks. When the
 * variable is missing, malformed or smaller than 2, nothing else is done and
 * E_SUCCESS is returned.
 *
 * Otherwise the counter kept in EMUL_PROCESS_LOCAL_RANK_FILE is updated
 * while holding a lock that is implemented by exclusively creating
 * EMUL_LOCK_FILE:
 *  - if the counter file is not accessible, it is created holding the value
 *    1 and latency_model.process_local_rank is left untouched;
 *  - otherwise the stored value becomes latency_model.process_local_rank and
 *    the stored value plus one is written back.
 *
 * The lock file is always closed and removed before returning once it has
 * been acquired, including on I/O failures.
 *
 * \return E_SUCCESS on success or when rank setting is skipped; E_ERROR when
 *         the lock cannot be acquired within MAX_LOCKED_RETRIES attempts,
 *         when the counter file cannot be opened, read or written, or when
 *         the incremented counter reaches max_local_processe_ranks.
 */
int set_process_local_rank()
{
    FILE *lock_fp = NULL;
    FILE *counter_fp = NULL;
    int attempts;
    int process_id = 0;
    char *processes;
    int ret = E_SUCCESS;
#ifndef NDEBUG
    char hname[64];
#endif

    processes = getenv(EMUL_LOCAL_PROCESSES_VAR);
    if (!processes) {
        DBG_LOG(WARNING, "No %s variable set, skipping rank setting\n", EMUL_LOCAL_PROCESSES_VAR);
        return E_SUCCESS;
    }
    if (sscanf(processes, "%d", &latency_model.max_local_processe_ranks) != 1) {
        DBG_LOG(WARNING, "Ignoring %s variable with invalid value '%s'\n",
                EMUL_LOCAL_PROCESSES_VAR, processes);
        return E_SUCCESS;
    }

    if (latency_model.max_local_processe_ranks < 2) {
        DBG_LOG(WARNING, "%s value is %d, skipping rank setting\n",
                EMUL_LOCAL_PROCESSES_VAR, latency_model.max_local_processe_ranks);
        return E_SUCCESS;
    }

    DBG_LOG(DEBUG, "setting process local rank for %d local processes\n",
            latency_model.max_local_processe_ranks);

    // "wx" fails when the file already exists, so a successful open means
    // this process now owns the lock
    for (attempts = 0; attempts < MAX_LOCKED_RETRIES; ++attempts) {
        lock_fp = fopen(EMUL_LOCK_FILE, "wx");
        if (lock_fp) {
            break;
        }
        usleep(LOCKED_WAIT_US);
    }
    if (!lock_fp) {
        DBG_LOG(ERROR, "failed to set process local rank\n");
        return E_ERROR;
    }

    // lock acquired, read process counter file
    if (access(EMUL_PROCESS_LOCAL_RANK_FILE, R_OK | W_OK) < 0) {
        // rank file is not accessible: create it and store 1 as the value
        // the next process will read
        process_id = 1;
        counter_fp = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, "w");
        if (!counter_fp ||
            fwrite(&process_id, sizeof(process_id), 1, counter_fp) != 1) {
            DBG_LOG(ERROR, "failed to create process local rank file %s\n",
                    EMUL_PROCESS_LOCAL_RANK_FILE);
            ret = E_ERROR;
        }
    } else {
        // rank file exists: the stored value is this process' local rank and
        // the incremented value is stored for the next process
        counter_fp = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, "r+");
        if (!counter_fp ||
            fread(&process_id, sizeof(process_id), 1, counter_fp) != 1) {
            DBG_LOG(ERROR, "failed to read process local rank file %s\n",
                    EMUL_PROCESS_LOCAL_RANK_FILE);
            ret = E_ERROR;
        } else {
            DBG_LOG(DEBUG, "read from file current max rank %d\n", process_id);
            latency_model.process_local_rank = process_id;
            process_id++;
            // NOTE(review): the process that reads max-1 is rejected here
            // although rank max-1 is below the limit; confirm whether '>'
            // was intended.
            if (process_id >= latency_model.max_local_processe_ranks) {
                DBG_LOG(ERROR, "process rank %d exceeded limit of %d max emulated processes\n",
                        process_id, latency_model.max_local_processe_ranks);
                ret = E_ERROR;
            } else {
                DBG_LOG(DEBUG, "write to file new max rank %d\n", process_id);
                rewind(counter_fp);
                if (fwrite(&process_id, sizeof(process_id), 1, counter_fp) != 1) {
                    DBG_LOG(ERROR, "failed to update process local rank file %s\n",
                            EMUL_PROCESS_LOCAL_RANK_FILE);
                    ret = E_ERROR;
                }
            }
        }
    }

    // fclose flushes the buffered counter, so its failure loses the update
    if (counter_fp && fclose(counter_fp) != 0 && ret == E_SUCCESS) {
        DBG_LOG(ERROR, "failed to flush process local rank file %s\n",
                EMUL_PROCESS_LOCAL_RANK_FILE);
        ret = E_ERROR;
    }

    // close and delete lock file
    fclose(lock_fp);
    remove(EMUL_LOCK_FILE);

#ifndef NDEBUG
    gethostname(hname, sizeof(hname));
    hname[sizeof(hname) - 1] = '\0'; // gethostname may truncate without terminating
    DBG_LOG(DEBUG, "process local rank is %d on system %s\n", latency_model.process_local_rank, hname);
#endif

    return ret;
}

/**
 * Undo the effect of set_process_local_rank() on the shared counter file.
 *
 * Does nothing when latency_model.max_local_processe_ranks is below 2.
 * Otherwise the lock file EMUL_LOCK_FILE is created exclusively and, if
 * EMUL_PROCESS_LOCAL_RANK_FILE is accessible, the value stored in it is
 * decremented (never below zero) and written back. The lock file is always
 * closed and removed before returning once it has been acquired.
 *
 * \return E_SUCCESS on success or when there is nothing to do; E_ERROR when
 *         the lock cannot be acquired within MAX_LOCKED_RETRIES attempts or
 *         the counter file cannot be opened, read or written.
 */
int unset_process_local_rank()
{
    FILE *lock_fp = NULL;
    FILE *counter_fp = NULL;
    int attempts;
    int process_id = 0;
    int ret = E_SUCCESS;

    if (latency_model.max_local_processe_ranks < 2) {
        return E_SUCCESS;
    }

    DBG_LOG(DEBUG, "Unsetting process local rank\n");

    // "wx" fails when the file already exists, so a successful open means
    // this process now owns the lock
    for (attempts = 0; attempts < MAX_LOCKED_RETRIES; ++attempts) {
        lock_fp = fopen(EMUL_LOCK_FILE, "wx");
        if (lock_fp) {
            break;
        }
        usleep(LOCKED_WAIT_US);
    }
    if (!lock_fp) {
        DBG_LOG(ERROR, "failed to unset process local rank\n");
        return E_ERROR;
    }

    // lock acquired; if the rank file is not accessible nothing is to be done
    if (access(EMUL_PROCESS_LOCAL_RANK_FILE, R_OK | W_OK) == 0) {
        counter_fp = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, "r+");
        if (!counter_fp ||
            fread(&process_id, sizeof(process_id), 1, counter_fp) != 1) {
            DBG_LOG(ERROR, "failed to read process local rank file %s\n",
                    EMUL_PROCESS_LOCAL_RANK_FILE);
            ret = E_ERROR;
        } else {
            char hname[64];

            DBG_LOG(DEBUG, "Exiting process and reading current rank max %d\n", process_id);
            if (process_id > 0) {
                process_id--;
            }
            gethostname(hname, sizeof(hname));
            hname[sizeof(hname) - 1] = '\0'; // gethostname may truncate without terminating
            DBG_LOG(DEBUG, "Exiting process and writing new rank max %d on %s\n", process_id, hname);

            rewind(counter_fp);
            if (fwrite(&process_id, sizeof(process_id), 1, counter_fp) != 1) {
                DBG_LOG(ERROR, "failed to update process local rank file %s\n",
                        EMUL_PROCESS_LOCAL_RANK_FILE);
                ret = E_ERROR;
            }
        }
        // fclose flushes the buffered counter, so its failure loses the update
        if (counter_fp && fclose(counter_fp) != 0 && ret == E_SUCCESS) {
            DBG_LOG(ERROR, "failed to flush process local rank file %s\n",
                    EMUL_PROCESS_LOCAL_RANK_FILE);
            ret = E_ERROR;
        }
    }

    // close and delete lock file
    fclose(lock_fp);
    remove(EMUL_LOCK_FILE);

    return ret;
}


================================================
FILE: src/lib/stat.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <stdio.h>
#include <math.h>
#include <sys/types.h>
#include <unistd.h>

#include "utlist.h"
#include "stat.h"
#include "thread.h"
#include "interpose.h"
#include "model.h"

thread_manager_t* get_thread_manager();
hrtime_t cycles_to_us(int cpu_speed_mhz, hrtime_t cycles);

#ifdef USE_STATISTICS
/**
 * Record how long the emulator initialisation took.
 *
 * The value is stored in the thread manager's statistics while holding the
 * manager mutex.
 *
 * \param init_time_us initialisation duration in microseconds
 */
void stats_set_init_time(double init_time_us) {
    thread_manager_t* mgr = get_thread_manager();

    __lib_pthread_mutex_lock(&mgr->mutex);
    mgr->stats.init_time_us = init_time_us;
    __lib_pthread_mutex_unlock(&mgr->mutex);
}

/**
 * Load the statistics settings from the configuration.
 *
 * "statistics.enable" is looked up into stats.enabled and "statistics.file"
 * into stats.output_file. When the file lookup yields CONFIG_FALSE the
 * output file is reset to NULL under the manager mutex.
 *
 * \param cfg configuration to read from
 */
void stats_enable(config_t *cfg) {
    thread_manager_t* mgr = get_thread_manager();

    __cconfig_lookup_bool(cfg, "statistics.enable", &mgr->stats.enabled);

    if (__cconfig_lookup_string(cfg, "statistics.file", &mgr->stats.output_file) != CONFIG_FALSE) {
        return;
    }

    // no output file configured
    __lib_pthread_mutex_lock(&mgr->mutex);
    mgr->stats.output_file = NULL;
    __lib_pthread_mutex_unlock(&mgr->mutex);
}

/**
 * Return the current calendar time as text, without the trailing newline
 * that ctime() appends.
 *
 * \return pointer to the static buffer owned by ctime() (overwritten by the
 *         next ctime()/asctime() call; the caller must not free it), or a
 *         constant placeholder string when ctime() fails.
 */
static char *get_current_time() {
    time_t now;
    char *text;
    size_t len;

    time(&now);
    text = ctime(&now);
    if (text == NULL) {
        // ctime() may fail; never hand a NULL to the "%s" in the report
        return "unknown time";
    }

    len = strlen(text);
    if (len > 0 && text[len - 1] == '\n') {
        text[len - 1] = '\0';
    }

    return text;
}

static inline hrtime_t ns_to_cycles(int cpu_speed_mhz, int ns)
{
    return (cpu_speed_mhz * ns) / 1000;
}

extern __thread int tls_hw_local_latency;
extern __thread int tls_hw_remote_latency;

/**
 * Print the statistics collected for one thread.
 *
 * \param thread   thread descriptor whose statistics are printed
 * \param out_file stream the report is written to
 */
static void show_thread_stats(thread_t *thread, FILE *out_file) {
    uint64_t value;
    uint64_t access_cycles;
    int hw_latency_ns;

    fprintf(out_file, "\tThread id [%d]\n", thread->tid);
    fprintf(out_file, "\t\t: cpu id: %d\n", thread->cpu_id);
    fprintf(out_file, "\t\t: spawn timestamp: %lu\n", thread->stats.register_timestamp);
    fprintf(out_file, "\t\t: termination timestamp: %lu\n", thread->stats.unregister_timestamp);

    // a thread that has not been unregistered yet has no execution time
    value = 0;
    if (thread->stats.unregister_timestamp > 0) {
        value = thread->stats.unregister_timestamp - thread->stats.register_timestamp;
    }
    fprintf(out_file, "\t\t: execution time: %lu usecs\n", value);
    fprintf(out_file, "\t\t: stall cycles: %lu\n", thread->stats.stall_cycles);

    // the access count is the stall cycles divided by the cycles of one
    // hardware access: remote latency when DRAM and NVRAM nodes differ and
    // remote DRAM counters are in use, local latency otherwise
    if (thread->virtual_node->dram_node != thread->virtual_node->nvram_node &&
                latency_model.pmc_remote_dram) {
        hw_latency_ns = tls_hw_remote_latency;
    } else {
        hw_latency_ns = tls_hw_local_latency;
    }
    access_cycles = ns_to_cycles(thread->cpu_speed_mhz, hw_latency_ns);
    value = 0;
    if (access_cycles) {
        value = thread->stats.stall_cycles / access_cycles;
    }
    fprintf(out_file, "\t\t: NVM accesses: %lu\n", value);

    fprintf(out_file, "\t\t: latency calculation overhead cycles: %lu\n", thread->stats.overhead_cycles);
    fprintf(out_file, "\t\t: injected delay cycles: %lu\n", thread->stats.delay_cycles);
    if (thread->cpu_speed_mhz) {
        fprintf(out_file, "\t\t: injected delay in usec: %lu\n", cycles_to_us(thread->cpu_speed_mhz, thread->stats.delay_cycles));
    }
    fprintf(out_file, "\t\t: longest epoch duration: %lu usec\n", thread->stats.longest_epoch_duration_us);

    // UINT64_MAX is the "no epoch seen yet" marker
    value = thread->stats.shortest_epoch_duration_us;
    if (value == UINT64_MAX) {
        value = 0;
    }
    fprintf(out_file, "\t\t: shortest epoch duration: %lu usec\n", value);

    value = thread->stats.overall_epoch_duration_us;
    if (thread->stats.epochs) {
        value = thread->stats.overall_epoch_duration_us / thread->stats.epochs;
    }
    fprintf(out_file, "\t\t: average epoch duration: %lu usec\n", value);
    fprintf(out_file, "\t\t: number of epochs: %lu\n", thread->stats.epochs);
    fprintf(out_file, "\t\t: epochs which didn't reach min duration: %lu\n", thread->stats.min_epoch_not_reached);
    fprintf(out_file, "\t\t: static epochs requested: %lu\n", thread->stats.signals_sent);
}

/**
 * Write the statistics report for the whole process.
 *
 * Nothing is written when there is no thread manager or statistics are
 * disabled. The report is appended to the configured output file, or goes
 * to stdout when no file is configured. It lists global figures followed by
 * the per-thread statistics of the running and of the terminated threads.
 */
void stats_report() {
    thread_manager_t* mgr = get_thread_manager();
    thread_t *cur;
    FILE *sink = stdout;
    uint64_t n_running = 0;
    uint64_t n_terminated = 0;

    if (!mgr || !mgr->stats.enabled) {
        return;
    }

    if (mgr->stats.output_file) {
        sink = fopen(mgr->stats.output_file, "a");
        if (!sink) {
            fprintf(stderr, "Failed to open statistics file for writing: %s\n", mgr->stats.output_file);
            return;
        }
    }

    // count the threads that are still registered
    __lib_pthread_mutex_lock(&mgr->mutex);
    LL_FOREACH(mgr->thread_list, cur) {
        n_running++;
    }
    __lib_pthread_mutex_unlock(&mgr->mutex);

    fprintf(sink, "\n\n===== STATISTICS (%s) =====\n\n", get_current_time());
    if (!latency_model.inject_delay) {
        fprintf(sink, "WARNING: delay injection is disabled\n");
    }
    fprintf(sink, "PID: %d\n", getpid());
    fprintf(sink, "Initialization duration: %lu usec\n", mgr->stats.init_time_us);
    fprintf(sink, "Running threads: %lu\n", n_running);
    if (mgr->stats.n_threads > 0) {
        n_terminated = mgr->stats.n_threads - n_running;
    }
    fprintf(sink, "Terminated threads: %lu\n", n_terminated);
    fprintf(sink, "\n");

    fprintf(sink, "== Running threads == \n");

    __lib_pthread_mutex_lock(&mgr->mutex);
    LL_FOREACH(mgr->thread_list, cur) {
        show_thread_stats(cur, sink);
    }
    __lib_pthread_mutex_unlock(&mgr->mutex);

    fprintf(sink, "\n== Terminated threads == \n");

    __lib_pthread_mutex_lock(&mgr->mutex);
    LL_FOREACH(mgr->stats.thread_list, cur) {
        show_thread_stats(cur, sink);
    }
    __lib_pthread_mutex_unlock(&mgr->mutex);

    if (sink != stdout) {
        fclose(sink);
    }
}
#endif

/**
 * Add up the first \p n elements of \p array, left to right.
 *
 * \return the total, or 0 when \p n is not positive
 */
double sum(double array[], int n)
{
    double total = 0;
    int idx = 0;

    while (idx < n) {
        total += array[idx];
        ++idx;
    }
    return total;
}

// returns sum of x . y
/**
 * Sum of the element-wise products x[i] * y[i] over the first \p n
 * elements, accumulated left to right.
 *
 * \return the total, or 0 when \p n is not positive
 */
double sumxy(double x[], double y[], int n)
{
    double total = 0;
    int idx = 0;

    while (idx < n) {
        total += x[idx] * y[idx];
        ++idx;
    }
    return total;
}


/**
 * Arithmetic mean of the first \p n elements of \p array, computed as
 * sum(array, n) divided by \p n.
 */
double avg(double array[], int n)
{
    return sum(array, n) / n;
}

/**
 * Slope of the least-squares line through the points (x[i], y[i]):
 *
 *   (n * sum(x*y) - sum(x) * sum(y)) / (n * sum(x*x) - sum(x)^2)
 *
 * \param x x coordinates
 * \param y y coordinates
 * \param n number of points
 */
double slope(double x[], double y[], int n)
{
    double sx = sum(x, n);
    double sy = sum(y, n);
    double sxy = sumxy(x, y, n);
    double sxx = sumxy(x, x, n);
    double numerator = n * sxy - sx * sy;
    double denominator = n * sxx - sx*sx;

    return numerator / denominator;
}


================================================
FILE: src/lib/stat.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __STATISTICS_H
#define __STATISTICS_H

//#include <sys/types.h>
#include <stdint.h>
#include "config.h"

#ifdef USE_STATISTICS
struct thread_s;

/** Process-wide statistics state kept inside the thread manager. */
typedef struct {
    int enabled;                  /* filled from the "statistics.enable" configuration lookup */
    struct thread_s* thread_list; /* threads appended on unregistration; reported as terminated threads */
    uint64_t n_threads;           /* incremented for every thread registered while statistics are enabled */
    uint64_t init_time_us;        /* value stored by stats_set_init_time() */
    char *output_file;            /* from "statistics.file"; NULL makes the report go to stdout */
} stats_t;

/** Per-thread statistics; the report labels are quoted where a field is only printed. */
typedef struct {
    uint64_t stall_cycles;               /* reported as "stall cycles" */
    uint64_t overhead_cycles;            /* reported as "latency calculation overhead cycles" */
    uint64_t delay_cycles;               /* reported as "injected delay cycles" */
    uint64_t signals_sent;               /* incremented each time the monitor signals the thread */
    uint64_t epochs;                     /* reported as "number of epochs" */
    double last_epoch_timestamp;         /* monotonic_time_us() value; set at registration */
    uint64_t shortest_epoch_duration_us; /* starts at UINT64_MAX, which is reported as 0 */
    uint64_t longest_epoch_duration_us;  /* reported as "longest epoch duration" */
    uint64_t overall_epoch_duration_us;  /* divided by epochs to report the average epoch duration */
    uint64_t min_epoch_not_reached;      /* incremented when the min epoch duration check fails */
    uint64_t register_timestamp;         /* monotonic_time_us() at registration */
    uint64_t unregister_timestamp;       /* monotonic_time_us() at unregistration; 0 while running */
} thread_stats_t;

/* Reads "statistics.enable" and "statistics.file" from cfg into the thread manager. */
void stats_enable(config_t *cfg);
/* Stores the initialisation duration (microseconds) in the thread manager statistics. */
void stats_set_init_time(double init_time_us);
/* Writes the statistics report to the configured file, or to stdout when none is set. */
void stats_report();
#endif

/* Sum of the first n elements of array. */
double sum(double array[], int n);
/* Sum of x[i] * y[i] over the first n elements. */
double sumxy(double x[], double y[], int n);
/* sum(array, n) divided by n. */
double avg(double array[], int n);
/* Least-squares slope of the points (x[i], y[i]), i in [0, n). */
double slope(double x[], double y[], int n);

#endif /* __STATISTICS_H */


================================================
FILE: src/lib/thread.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <stdlib.h>
#include "cpu/cpu.h"
#include "utlist.h"
#include "error.h"
#include "interpose.h"
#include "model.h"
#include "thread.h"
#include "topology.h"
#include "monotonic_timer.h"

// process-wide thread manager; NULL until init_thread_manager() succeeds
static thread_manager_t* thread_manager = NULL;
// descriptor of the calling thread; NULL while the thread is not registered
__thread thread_t* tls_thread = NULL;

extern inline hrtime_t hrtime_cycles(void);

// assign a virtual/physical node using a round-robin policy
/**
 * Hand out the next virtual node / cpu pair using a round-robin policy.
 *
 * The pair currently stored in the manager is returned through the output
 * parameters, then the manager is advanced: to the next cpu of the current
 * virtual node's DRAM node, or, when that node has no further cpu, to the
 * first cpu of the following virtual node (wrapping around).
 *
 * \param thread_manager        manager holding the round-robin cursor
 * \param next_virtual_node_idp receives the virtual node id to use
 * \param next_cpu_idp          receives the cpu id to use
 */
static void rr_next_cpu_id(thread_manager_t* thread_manager, int* next_virtual_node_idp, int* next_cpu_idp)
{
    virtual_topology_t* topo = thread_manager->virtual_topology;
    int vnode_idx = thread_manager->next_virtual_node_id;
    physical_node_t* dram;

    // report the current cursor position to the caller
    *next_virtual_node_idp = vnode_idx;
    *next_cpu_idp = thread_manager->next_cpu_id;

    // threads run on the dram node; try the next cpu of the same node first
    dram = topo->virtual_nodes[vnode_idx].dram_node;
    thread_manager->next_cpu_id = next_cpu(dram->cpu_bitmask, thread_manager->next_cpu_id + 1);
    if (thread_manager->next_cpu_id >= 0) {
        return;
    }

    // node exhausted: move on to the first cpu of the following virtual node
    vnode_idx = (vnode_idx + 1) % topo->num_virtual_nodes;
    dram = topo->virtual_nodes[vnode_idx].dram_node;
    thread_manager->next_cpu_id = first_cpu(dram->cpu_bitmask);
    thread_manager->next_virtual_node_id = vnode_idx;
}

/**
 * Position the round-robin cursor according to this process' rank.
 *
 * The cursor is reset to virtual node 0 / cpu 0 and then advanced
 * rank + 1 times, so processes with different ranks start handing out
 * cpus at different positions.
 *
 * \param rank     local rank of this process
 * \param max_rank unused
 */
void rr_set_next_cpu_based_on_rank(int rank, int max_rank)
{
    // initialised so the log below never reads indeterminate values when the
    // loop does not execute (rank < 0)
    int cpu_id = 0;
    int virtual_node_id = 0;
    int i;

    (void) max_rank;

    // set the next CPU id based on this process rank id
    thread_manager->next_virtual_node_id = 0;
    thread_manager->next_cpu_id = 0;
    for (i = 0; i <= rank; ++i) {
        rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id);
    }

    DBG_LOG(DEBUG, "no partitioning of CPUs, set next CPU "
                   "to vnode %d and cpu %d\n", virtual_node_id, cpu_id);
}

/**
 * Restrict this process to its own slice of the cpus.
 *
 * The cpus are split into max_rank slices of num_cpus / max_rank entries;
 * slice number \p rank belongs to this process. The round-robin cursor is
 * reset and walked over num_cpus positions, and every cpu that falls outside
 * the slice has its bit cleared in the cpu bitmask of the DRAM node it
 * belongs to.
 *
 * \param rank             local rank of this process
 * \param max_rank         number of slices
 * \param num_cpus         number of round-robin positions to walk
 * \param virtual_topology topology whose cpu bitmasks are updated
 */
void partition_cpus_based_on_rank(int rank, int max_rank, int num_cpus,
                                  virtual_topology_t* virtual_topology)
{
    // assumed the number of cpus/2 is greater or equal to max_rank
    const int slice = num_cpus/max_rank;
    const int first = rank * slice;
    const int last = first + slice - 1;
    int pos;
    int cpu_id = 0;
    int virtual_node_id = 0;

    DBG_LOG(DEBUG, "partitioning CPUS, this process has CPUs from %d and %d\n",
            first, last);

    thread_manager->next_virtual_node_id = 0;
    thread_manager->next_cpu_id = 0;
    for (pos = 0; pos < num_cpus; ++pos) {
        physical_node_t* owner;

        rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id);
        if (pos >= first && pos <= last) {
            // inside this process' slice, keep the cpu enabled
            continue;
        }

        // this cpu belongs to another process, disable it
        owner = virtual_topology->virtual_nodes[virtual_node_id].dram_node;

        DBG_LOG(DEBUG, "disabling CPU %d\n", cpu_id);

        if (numa_bitmask_isbitset(owner->cpu_bitmask, cpu_id)) {
            numa_bitmask_clearbit(owner->cpu_bitmask, cpu_id);
        }
    }
}

/**
 * Attach a thread to a virtual node and pin it on one cpu.
 *
 * \param thread_manager  manager owning the virtual topology
 * \param thread          thread descriptor; its virtual_node is set here
 * \param virtual_node_id index of the virtual node the thread belongs to
 * \param cpu_id          cpu the thread is pinned on
 * \return E_SUCCESS, or E_ERROR when numa_sched_setaffinity() fails
 */
int bind_thread_on_cpu(thread_manager_t* thread_manager, thread_t* thread, int virtual_node_id, int cpu_id)
{
    int status = E_SUCCESS;
    struct bitmask* single_cpu;

    thread->virtual_node = &thread_manager->virtual_topology->virtual_nodes[virtual_node_id];
    DBG_LOG(INFO, "Binding thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, cpu_id);

    // affinity mask holding only the requested cpu
    single_cpu = numa_allocate_cpumask();
    numa_bitmask_setbit(single_cpu, cpu_id);
    if (numa_sched_setaffinity(thread->tid, single_cpu) != 0) {
        DBG_LOG(ERROR, "Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, cpu_id);
        status = E_ERROR;
    }
    numa_bitmask_free(single_cpu);

    return status;
}

/**
 * Set the memory binding of the calling thread to the DRAM node of a
 * virtual node, using numa_set_membind().
 *
 * \param thread_manager  manager owning the virtual topology
 * \param thread          unused
 * \param virtual_node_id index of the virtual node whose DRAM node is used
 * \param cpu_id          unused
 * \return always E_SUCCESS
 */
int bind_thread_on_mem(thread_manager_t* thread_manager, thread_t* thread, int virtual_node_id, int cpu_id)
{
    struct bitmask* dram_only = numa_allocate_nodemask();
    virtual_node_t* vnode = &thread_manager->virtual_topology->virtual_nodes[virtual_node_id];

    numa_bitmask_setbit(dram_only, vnode->dram_node->node_id);
    numa_set_membind(dram_only);
    numa_free_nodemask(dram_only);

    return E_SUCCESS;
}

/**
 * Return the descriptor stored in the calling thread's thread-local slot,
 * or NULL when the slot has not been set.
 */
thread_t* thread_self()
{
    return tls_thread;
}

void thread_interrupt_handler(int signum)
{
    DBG_LOG(DEBUG, "Handling interrupt thread [%d] pthread: 0x%lx\n", thread_self()->tid, thread_self()->pthread);

    create_latency_epoch();
}

#ifdef PAPI_SUPPORT
/**
 * Set up performance counter events for the calling thread: create the
 * event set, register every non-NULL entry of \p native_events, start
 * counting and register the thread with the pmc layer.
 *
 * \param thread        descriptor of the calling thread (used for logging)
 * \param native_events array of MAX_NUM_EVENTS event names; NULL entries are skipped
 * \return 0 on success, -1 when the event set cannot be created, E_ERROR
 *         when an event cannot be registered or counting cannot be started
 */
static int setup_events_thread_self(thread_t *thread, const char **native_events) {
    int slot;

    // create event set for this thread
    if (pmc_create_event_set_local_thread() != 0) {
        return -1;
    }

    // register events for this thread
    for (slot = 0; slot < MAX_NUM_EVENTS; ++slot) {
        if (!native_events[slot]) {
            continue;
        }
        DBG_LOG(INFO, "registering event %s, thread id [%d]\n", native_events[slot], thread->tid);
        if (pmc_register_event_local_thread(native_events[slot]) != 0) {
            return E_ERROR;
        }
    }

    // start event counting for this thread
    if (pmc_events_start_local_thread() != 0) {
        return E_ERROR;
    }

    pmc_register_thread();

    return 0;
}
#endif

/**
 * Create a descriptor for a thread and register it with the thread manager.
 *
 * The SIGUSR1 handler is installed, the thread is bound to the next cpu and
 * memory node chosen by the round-robin policy, appended to the manager's
 * thread list, its latency model is initialised and the descriptor is
 * stored in the calling thread's thread-local slot.
 *
 * \param thread_manager manager to register with; when NULL nothing is done
 * \param pthread        pthread handle of the thread
 * \param tid            kernel thread id of the thread
 * \return E_SUCCESS on success or when \p thread_manager is NULL (no
 *         descriptor is created in that case); an error code when the
 *         descriptor cannot be allocated or the thread cannot be bound or
 *         set up
 */
int register_thread(thread_manager_t* thread_manager, pthread_t pthread, pid_t tid)
{
    int ret = E_SUCCESS;
    int cpu_id;
    int virtual_node_id;
    thread_t* thread;
    struct sigaction sa;

    if (thread_manager == NULL) {
        // this is possible if both BW and latency modeling are enabled and the BW model is not yet created.
        // the BW modeling will spawn threads which will attempt to register with the thread manager if the
        // latency modeling is enabled. However the thread manager is instantiated later.
        // checked before allocating so that no descriptor is leaked on this path
        return E_SUCCESS;
    }

    thread = calloc(1, sizeof(*thread));
    if (thread == NULL) {
        DBG_LOG(ERROR, "thread id [%d] failed to allocate thread descriptor\n", tid);
        return E_ERROR;
    }

    thread->pthread = pthread;
    thread->tid = tid;
    thread->thread_manager = thread_manager;

#ifdef USE_STATISTICS
    if (thread_manager->stats.enabled) {
        thread->stats.last_epoch_timestamp = monotonic_time_us();
        thread->stats.shortest_epoch_duration_us = UINT64_MAX;
    }
#endif

    /* install thread interrupt handler as the signal handler for SIGUSR1. */
    memset (&sa, 0, sizeof(sa));
    sa.sa_handler = &thread_interrupt_handler;
    sa.sa_flags = SA_RESTART;
    sigaction (SIGUSR1, &sa, NULL);

    // bind the thread on a cpu and memory node and
    // link the thread to the list of threads
    assert(__lib_pthread_mutex_lock);
    __lib_pthread_mutex_lock(&thread_manager->mutex);
    rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id);
    if ((ret = bind_thread_on_cpu(thread_manager, thread, virtual_node_id, cpu_id)) != E_SUCCESS) {
        __lib_pthread_mutex_unlock(&thread_manager->mutex);
        DBG_LOG(ERROR, "thread id [%d] failed to bind to CPU\n", tid);
        goto error;
    }
    if ((ret = bind_thread_on_mem(thread_manager, thread, virtual_node_id, cpu_id)) != E_SUCCESS) {
        __lib_pthread_mutex_unlock(&thread_manager->mutex);
        DBG_LOG(ERROR, "thread id [%d] failed to bind to Memory\n", tid);
        goto error;
    }
    thread->cpu_id = cpu_id;
    thread->cpu_speed_mhz = cpu_speed_mhz();
#ifdef PAPI_SUPPORT
    cpu_model_t *cpu = thread_manager->virtual_topology->virtual_nodes[virtual_node_id].dram_node->cpu_model;
    if (setup_events_thread_self(thread, cpu->pmc_events.native_events) != 0) {
        ret = E_ERROR;
        __lib_pthread_mutex_unlock(&thread_manager->mutex);
        goto error;
    }
#endif
    LL_APPEND(thread_manager->thread_list, thread);
#ifdef USE_STATISTICS
    if (thread_manager->stats.enabled) {
        thread_manager->stats.n_threads++;
        thread->stats.register_timestamp = monotonic_time_us();
    }
#endif
    __lib_pthread_mutex_unlock(&thread_manager->mutex);

    init_thread_latency_model(thread);

    tls_thread = thread;

    return E_SUCCESS;

error:
    // log with the tid parameter: the descriptor must not be read after free
    DBG_LOG(ERROR, "thread id [%d] failed to register with Monitor Thread\n", tid);
    free(thread);
    return ret;
}


/**
 * Remove a thread from the manager's list of registered threads.
 *
 * When statistics are enabled the descriptor is time-stamped and moved to
 * the list of terminated threads instead of being dropped. With PAPI
 * support the calling thread's event counting is stopped and torn down.
 * The descriptor itself is not freed here.
 *
 * \param thread_manager manager the thread is registered with; when NULL
 *                       nothing is done
 * \param thread         descriptor to unregister
 * \return always E_SUCCESS
 */
int unregister_thread(thread_manager_t* thread_manager, thread_t * thread)
{
    // must be tested before the mutex is touched: the lock lives inside the
    // manager, and returning after locking would leave it held
    if (thread_manager == NULL) {
        return E_SUCCESS;
    }

    __lib_pthread_mutex_lock(&thread_manager->mutex);

    LL_DELETE(thread_manager->thread_list, thread);

#ifdef USE_STATISTICS
    if (thread_manager->stats.enabled) {
        thread->stats.unregister_timestamp = monotonic_time_us();
        LL_APPEND(thread_manager->stats.thread_list, thread);
    }
#endif

    __lib_pthread_mutex_unlock(&thread_manager->mutex);

#ifdef PAPI_SUPPORT
    pmc_events_stop_local_thread();
    pmc_destroy_event_set_local_thread();
    pmc_unregister_thread();
#endif

    return E_SUCCESS;
}


/**
 * Register the calling thread with the thread manager unless it already
 * has a descriptor.
 *
 * \return E_SUCCESS when the thread is already registered, otherwise the
 *         result of register_thread()
 */
int register_self()
{
    pid_t tid;

    if (thread_self() != NULL) {
        // already registered, nothing to do
        return E_SUCCESS;
    }

    tid = (pid_t) syscall(SYS_gettid);
    DBG_LOG(INFO, "Registering thread tid [%d]\n", tid);
    return register_thread(thread_manager, pthread_self(), tid);
}

int unregister_self()
{
	if (tls_thread) {
	    unregister_thread(thread_manager, tls_thread);

#ifdef USE_STATISTICS
	    if (!thread_manager->stats.enabled) {
		    // statistics makes use of the thread descriptor
            free(tls_thread);
	    }
#else
	    free(tls_thread);
#endif
        tls_thread = NULL;
	}

    return E_SUCCESS;
}

static int reached_max_epoch_duration(thread_t* thread);
/**
 * Send SIGUSR1 to every registered thread that has not been signalled yet
 * and whose current epoch has lasted at least the maximum epoch duration.
 * The thread list is walked while holding the manager mutex.
 *
 * \param manager thread manager whose threads are inspected
 */
void interrupt_threads(thread_manager_t* manager)
{
    thread_t* cur;

    assert(__lib_pthread_mutex_lock);
    __lib_pthread_mutex_lock(&manager->mutex);
    LL_FOREACH(manager->thread_list, cur)
    {
        assert(cur);
        if (cur->signaled == 0) {
            if (reached_max_epoch_duration(cur)) {
                DBG_LOG(DEBUG, "interrupting thread [%d]\n", cur->tid);
#ifdef USE_STATISTICS
                if (manager->stats.enabled) {
                    cur->stats.signals_sent++;
                }
#endif
                // the flag has to be raised before the signal goes out,
                // otherwise there would be a race condition
                cur->signaled = 1;
                pthread_kill(cur->pthread, SIGUSR1);
            }
        }
    }
    assert(__lib_pthread_mutex_unlock);
    __lib_pthread_mutex_unlock(&manager->mutex);
}

/**
 * Body of the monitoring thread: forever sleeps for MIN_EPOCH_DURATION_US
 * microseconds and then asks interrupt_threads() to signal the threads
 * whose epoch is due.
 *
 * \param arg the thread manager (thread_manager_t*)
 * \return never returns in practice
 */
void* monitor_thread(void* arg)
{
    thread_manager_t* manager = (thread_manager_t*) arg;
    struct timespec tick;

    tick.tv_sec = 0;
    tick.tv_nsec = MIN_EPOCH_DURATION_US * 1000;

    for (;;) {
        nanosleep(&tick, NULL);
        interrupt_threads(manager);
    }
    return NULL;
}

/**
 * Read an epoch duration from the configuration.
 *
 * The default is used when the setting is missing, or (with a warning) when
 * it lies outside [MIN_EPOCH_DURATION_US, MAX_EPOCH_DURATION_US].
 *
 * \param cfg              configuration to read from
 * \param config_str       name of the setting
 * \param epoch_us         receives the duration in microseconds
 * \param default_epoch_us fallback value
 */
static void set_epoch_duration(config_t* cfg, const char *config_str, int *epoch_us, int default_epoch_us) {
    if (__cconfig_lookup_int(cfg, config_str, epoch_us) != CONFIG_TRUE) {
        // setting not present
        *epoch_us = default_epoch_us;
        return;
    }

    if (*epoch_us <= MAX_EPOCH_DURATION_US &&
            *epoch_us >= MIN_EPOCH_DURATION_US) {
        // configured value is acceptable
        return;
    }

    DBG_LOG(WARNING, "%s is out of supported bounds [%i, %i], setting it to %i\n",
            config_str,
            MIN_EPOCH_DURATION_US,
            MAX_EPOCH_DURATION_US,
            default_epoch_us);
    *epoch_us = default_epoch_us;
}

/**
 * Create the process-wide thread manager and start its monitoring thread.
 *
 * The epoch durations are read from "latency.max_epoch_duration_us" and
 * "latency.min_epoch_duration_us"; the round-robin cursor starts at the
 * first cpu of virtual node 0's DRAM node. The manager is published in the
 * file-scope thread_manager only after the monitoring thread was created.
 *
 * \param cfg              configuration to read the epoch durations from
 * \param virtual_topology topology the manager schedules threads on
 * \return E_SUCCESS, or E_ERROR when the manager cannot be allocated or the
 *         monitoring thread cannot be created
 */
int init_thread_manager(config_t* cfg, virtual_topology_t* virtual_topology)
{
    pthread_t monitor_tid;
    thread_manager_t* mgr;
    virtual_node_t* virtual_node;
    physical_node_t* physical_node;

    mgr = calloc(1, sizeof(*mgr));
    if (mgr == NULL) {
        return E_ERROR;
    }

    mgr->thread_list = NULL;
    mgr->virtual_topology = virtual_topology;
    mgr->next_virtual_node_id = 0;

    set_epoch_duration(cfg, "latency.max_epoch_duration_us", &mgr->max_epoch_duration_us, MAX_EPOCH_DURATION_US);
    set_epoch_duration(cfg, "latency.min_epoch_duration_us", &mgr->min_epoch_duration_us, MIN_EPOCH_DURATION_US);

    if (mgr->min_epoch_duration_us > mgr->max_epoch_duration_us) {
        DBG_LOG(WARNING, "latency.min_epoch_duration_us is greater than latency.max_epoch_duration_us, setting it to %i\n",
                MIN_EPOCH_DURATION_US);
        mgr->min_epoch_duration_us = MIN_EPOCH_DURATION_US;
    }

    virtual_node = &virtual_topology->virtual_nodes[mgr->next_virtual_node_id];
    physical_node = virtual_node->dram_node;
    mgr->next_cpu_id = first_cpu(physical_node->cpu_bitmask);
    pthread_mutex_init(&mgr->mutex, NULL);

    // fire a monitoring thread that periodically interrupts threads
    assert(__lib_pthread_create);
    assert(__lib_pthread_detach);
    if (__lib_pthread_create(&monitor_tid, NULL, monitor_thread, (void*) mgr) != 0) {
        // without a monitor thread no epoch would ever be forced, and
        // monitor_tid is not a valid thread id to detach
        DBG_LOG(ERROR, "failed to create the monitor thread\n");
        pthread_mutex_destroy(&mgr->mutex);
        free(mgr);
        return E_ERROR;
    }
    __lib_pthread_detach(monitor_tid);

    thread_manager = mgr;
    return E_SUCCESS;
}

/**
 * Tell whether at least the minimum epoch duration has elapsed since the
 * thread's last epoch.
 *
 * \param thread descriptor of the calling thread, or NULL to have the
 *               calling thread registered on the fly
 * \return 1 when the minimum epoch duration was reached; 0 otherwise,
 *         including when no descriptor is available for the thread
 */
int reached_min_epoch_duration(thread_t* thread) {
    double current_time;
    uint64_t diff_us;
    int result = 0;

    if (thread == NULL) {
        // FIXME: JVM for instance create threads using a mechanism not traced by this emulator
        //        for now we make sure the current thread is registered right when it makes the
        //        first explicit NVM allocation or when interposed functions are called. A
        //        better solution is to trace the thread creation done by JVM.
        if (register_self() != E_SUCCESS) {
            // if the thread could not be registered, exit this function
            return 0;
        }
        thread = thread_self();
        if (thread == NULL) {
            // registration reports success without creating a descriptor
            // while the thread manager does not exist yet
            return 0;
        }
    }

    current_time = monotonic_time_us();

#ifdef USE_STATISTICS
    diff_us = (uint64_t) (current_time - thread->stats.last_epoch_timestamp);
#else
    diff_us = (uint64_t) (current_time - thread->last_epoch_timestamp);
#endif

    DBG_LOG(DEBUG, "thread id [%d] last epoch was %lu usec ago\n", thread->tid, diff_us);

    if(diff_us >= thread_manager->min_epoch_duration_us) {
        DBG_LOG(DEBUG, "thread id [%d] reached min epoch duration (%i usec)\n", thread->tid,
                thread_manager->min_epoch_duration_us);
        result = 1;
    }
#ifdef USE_STATISTICS
    if (thread_manager->stats.enabled && ! result) {
        thread->stats.min_epoch_not_reached++;
    }
#endif
    return result;
}

/**
 * Tell whether at least the maximum epoch duration has elapsed since the
 * thread's last epoch.
 *
 * \param thread descriptor of the thread to check
 * \return 1 when the maximum epoch duration was reached, 0 otherwise
 */
static int reached_max_epoch_duration(thread_t* thread) {
    // last_epoch_timestamp is written by another thread, so the comparison
    // has to rely on a system time and not on CPU cycles/time registers
    double now = monotonic_time_us();
    uint64_t elapsed_us;

#ifdef USE_STATISTICS
    elapsed_us = (uint64_t) (now - thread->stats.last_epoch_timestamp);
#else
    elapsed_us = (uint64_t) (now - thread->last_epoch_timestamp);
#endif

    DBG_LOG(DEBUG, "thread id [%d] last epoch was %lu usec ago\n", thread->tid, elapsed_us);

    if (elapsed_us < thread_manager->max_epoch_duration_us) {
        return 0;
    }

    DBG_LOG(DEBUG, "thread id [%d] reached max epoch duration (%i usec)\n", thread->tid,
            thread_manager->max_epoch_duration_us);
    return 1;
}

/* Adds SIGUSR1 to the set of signals blocked for the calling thread. */
void block_new_epoch() {
    sigset_t epoch_signal;

    sigemptyset(&epoch_signal);
    sigaddset(&epoch_signal, SIGUSR1);
    pthread_sigmask(SIG_BLOCK, &epoch_signal, NULL);
}

/* Removes SIGUSR1 from the set of signals blocked for the calling thread. */
void unblock_new_epoch() {
    sigset_t epoch_signal;

    sigemptyset(&epoch_signal);
    sigaddset(&epoch_signal, SIGUSR1);
    pthread_sigmask(SIG_UNBLOCK, &epoch_signal, NULL);
}

// Returns the file-level thread_manager pointer (assigned at the end of
// init_thread_manager on success).
thread_manager_t* get_thread_manager() {
	return thread_manager;
}


================================================
FILE: src/lib/thread.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __THREAD_H
#define __THREAD_H

#include <sys/types.h>
#include <stdint.h>
#include <numa.h>
#include <pthread.h>
#include <libconfig.h>
#include "topology.h"
#include "cpu/cpu.h"
#include "stat.h"


struct thread_manager_s; // opaque

typedef uint64_t hrtime_t;

// TODO: Used by memlat benchmark, should be disabled on a release version
#define MEMLAT_SUPPORT

// Per-thread record kept by the emulator for each registered thread.
typedef struct thread_s {
    struct virtual_node_s* virtual_node;
    pthread_t pthread;
    pid_t tid; // thread id, printed in the epoch debug logs
    int cpu_id; // the processor the thread is bound on
    int cpu_speed_mhz;
    struct thread_manager_s* thread_manager;
    struct thread_s* next; // presumably links the records of thread_manager_s.thread_list -- TODO confirm
    int signaled;
#ifdef MEMLAT_SUPPORT
	uint64_t stall_cycles;
#endif
#ifdef USE_STATISTICS
    thread_stats_t stats;
#else
    // time of the last epoch; subtracted from monotonic_time_us() to get the
    // microseconds elapsed since that epoch
    double last_epoch_timestamp;
#endif
} thread_t;

// Process-wide state shared by all registered threads.
typedef struct thread_manager_s {
    pthread_mutex_t mutex;
    thread_t* thread_list;
    int max_epoch_duration_us; // maximum epoch duration in microseconds
    int min_epoch_duration_us; // minimum epoch duration in microseconds
    int next_virtual_node_id; // used by the round-robin policy -- next virtual node to run on 
    int next_cpu_id; // used by the round-robin policy -- next cpu to run on
    struct virtual_topology_s* virtual_topology;   
#ifdef USE_STATISTICS
    stats_t stats;
#endif
} thread_manager_t; 

// Sets up the thread manager; returns E_SUCCESS on success.
int init_thread_manager(config_t* cfg, struct virtual_topology_s* virtual_topology);
// Registers the calling thread with the emulator; returns E_SUCCESS on success.
int register_self();
int unregister_self();
// Returns the thread_t record of the calling thread.
thread_t* thread_self();
// Returns 1 if at least min_epoch_duration_us have elapsed since the thread's
// last epoch, 0 otherwise. A NULL thread makes the function register and use
// the calling thread.
int reached_min_epoch_duration(thread_t* thread);
// Block / unblock delivery of SIGUSR1 to the calling thread.
void block_new_epoch();
void unblock_new_epoch();

#endif /* __THREAD_H */


================================================
FILE: src/lib/topology.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
/**
 *  \file
 * 
 *  Constructs a virtual topology
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <limits.h>
#include <numa.h>
#include "cpu/cpu.h"
#include "error.h"
#include "measure.h"
#include "topology.h"
#include "model.h"

// capacity of the memory-controller pci bus list handed to get_mc_pci_bus_list()
#define MAX_NUM_MC_PCI_BUS 16

// defined in another translation unit; this file reads its
// max_local_processe_ranks and process_local_rank fields
extern latency_model_t latency_model;

// CPU placement helpers called by select_cpus_based_on_local_rank(); they are
// not defined in this file
void rr_set_next_cpu_based_on_rank(int rank, int max_rank);
void partition_cpus_based_on_rank(int rank, int max_rank, int num_cpus,
                                  virtual_topology_t* virtual_topology);

/**
 *  \brief Chooses the CPUs this process runs on, based on its local rank
 *
 *  Sums the CPUs of the DRAM node of every virtual node and then either
 *  partitions them among the ranks or falls back to a round-robin placement
 *  that starts from this process' rank.
 *
 *  \return E_SUCCESS, or E_ERROR when the local rank is not below the
 *          configured maximum number of local ranks
 */
int select_cpus_based_on_local_rank(virtual_topology_t* virtual_topology)
{
    const int max_ranks = latency_model.max_local_processe_ranks;
    const int my_rank = latency_model.process_local_rank;
    int total_cpus = 0;
    int idx;

    if (my_rank >= max_ranks) {
        DBG_LOG(ERROR, "process rank %d exceeded limit of %d max emulated processes\n",
                       my_rank, max_ranks);
        return E_ERROR;
    }

    for (idx = 0; idx < virtual_topology->num_virtual_nodes; idx++) {
        total_cpus += virtual_topology->virtual_nodes[idx].dram_node->num_cpus;
    }

    DBG_LOG(DEBUG, "number of cpus is %d\n", total_cpus);

    if (max_ranks <= (total_cpus/2)) {
        // enough CPUs to give every rank its own partition; some CPUs may end
        // up idle/without bound processes, if the number of ranks is not a
        // multiple of 2
        // TODO: warn or avoid idle CPUs
        partition_cpus_based_on_rank(my_rank, max_ranks, total_cpus, virtual_topology);
        return E_SUCCESS;
    }

    // too many ranks to partition: bind this process to the CPU indicated by
    // its rank; new threads are then bound to the next available CPU on a
    // round robin policy from the max rank
    rr_set_next_cpu_based_on_rank(my_rank, max_ranks);
    return E_SUCCESS;
}

/** 
 *  \brief Returns a list of memory-controller pci buses
 *
 *  Runs `lspci` and collects every line that mentions "Thermal Control" and
 *  starts with a `bus:dev.funct` address. Consecutive lines that share a bus id
 *  are stored as channels of the same list entry.
 *
 *  \param bus_id_list   output array; each used slot receives a heap-allocated
 *                       pci_regs_t owned by the caller
 *  \param max_list_size capacity of bus_id_list
 *  \param dev_countp    receives the number of entries stored; it is set to 0
 *                       whenever the function fails
 *  \return E_SUCCESS, or E_ERROR if lspci cannot be run, an allocation fails or
 *          more than max_list_size buses are found (in which case every entry
 *          allocated so far is released)
 */
int get_mc_pci_bus_list(pci_regs_t *bus_id_list[], int max_list_size, int* dev_countp)
{
    FILE* fp;
    char  buf[2048];
    unsigned int bus_id, dev_id, funct; // %x requires unsigned int destinations
    unsigned int last_bus_id = 0;
    int   have_last_bus = 0;
    int   channel = 0;
    char  dontcare[512];
    int   dev_count = 0;
    int   ret = E_SUCCESS;
    int   i;
    pci_regs_t* regs;

    // never leave the caller's counter unset, even on the failure paths
    *dev_countp = 0;

    fp = popen("lspci", "r");
    if (fp == NULL) {
        return E_ERROR;
    }

    while (fgets(buf, sizeof(buf), fp) != NULL) {
        if (!strstr(buf, "Thermal Control")) {
            continue;
        }
        // the field width keeps the scanned word inside dontcare[]
        if (sscanf(buf, "%x:%x.%x %511s", &bus_id, &dev_id, &funct, dontcare) != 4) {
            continue;
        }

        if (!have_last_bus || bus_id != last_bus_id) {
            // a new bus starts a new list entry
            if (dev_count >= max_list_size) {
                ret = E_ERROR;
                break;
            }
            regs = calloc(1, sizeof(*regs));
            if (regs == NULL) {
                ret = E_ERROR;
                break;
            }
            bus_id_list[dev_count++] = regs;
            have_last_bus = 1;
            last_bus_id = bus_id;
            channel = 0;
        }

        // NOTE(review): channel is not checked against the capacity of addr[],
        // which is declared outside this file -- confirm it cannot overflow
        regs = bus_id_list[dev_count-1];
        regs->addr[channel].bus_id = bus_id;
        regs->addr[channel].dev_id = dev_id;
        regs->addr[channel].funct = funct;
        ++channel;
        regs->channels = channel;
    }
    pclose(fp);

    if (ret != E_SUCCESS) {
        // the caller cannot tell how many slots were filled, so release them here
        for (i = 0; i < dev_count; i++) {
            free(bus_id_list[i]);
            bus_id_list[i] = NULL;
        }
        return ret;
    }

    *dev_countp = dev_count;
    return E_SUCCESS;
}


/**
 *  \brief Discovers the physical memory-controller pci bus topology of the 
 *  machine, which includes the socket each memory controller is attached to
 * 
 *  To discover where a memory controller is connected to, we throttle the rest of 
 *  the memory controllers and measure local bandwidth of each node. The unthrottled 
 *  memory controller is attached to the node with the highest local bandwidth
 *
 *  \param cpu_model          provides the get/set throttle register operations
 *  \param physical_nodes     nodes whose mc_pci_regs field is filled in
 *  \param num_physical_nodes number of entries in physical_nodes
 *  \return E_SUCCESS, or E_ERROR when the list of memory-controller pci buses
 *          cannot be obtained
 */
int discover_mc_pci_topology(cpu_model_t* cpu_model, physical_node_t* physical_nodes[], int num_physical_nodes)
{
    pci_regs_t *regs_addr[MAX_NUM_MC_PCI_BUS];
    int dev_count = 0;
    physical_node_t* local_node;
    int b, i;
    double max_local_rbw;
    double rbw;
    int count = 0;
    uint16_t throttle_reg_val;

    // without a valid bus list dev_count is meaningless, so stop here instead
    // of iterating over unset entries
    if (get_mc_pci_bus_list(regs_addr, MAX_NUM_MC_PCI_BUS, &dev_count) != E_SUCCESS) {
        DBG_LOG(WARNING, "Could not get the list of memory-controller pci buses.\n");
        return E_ERROR;
    }

    if (dev_count < num_physical_nodes) {
        // TODO: application is terminated on error only if in DEBUG mode
        DBG_LOG(WARNING, "The number of physical nodes is greater than the number of memory-controller pci buses.\n");
    }

    for (b=0; b<dev_count; b++) {
        // throttle all other buses except the one we are currently trying 
        // to figure out where it is attached
        for (i=0; i<dev_count; i++) {
            if (i == b) {
                throttle_reg_val = 0;
                cpu_model->get_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, &throttle_reg_val);
                if (throttle_reg_val < 0x8fff)
                    cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x8fff);
            } else {
                cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x800f);
            }
        }
        // measure local bandwidth of each node; the winner is decided per bus,
        // so a result from a previous bus must not carry over
        local_node = NULL;
        max_local_rbw = 0;
        for (i=0; i<num_physical_nodes; i++) {
            physical_node_t* node_i = physical_nodes[i];
            rbw = measure_read_bw(node_i->node_id, node_i->node_id);
            if (rbw > max_local_rbw) {
                max_local_rbw = rbw;
                local_node = node_i;
            }
        }
        if (local_node) {
            DBG_LOG(DEBUG, "setting node_id %d to bus %X\n", local_node->node_id, regs_addr[b]->addr[0].bus_id);
            local_node->mc_pci_regs = regs_addr[b];
            if (++count == num_physical_nodes) break;
        }
    }

    // undo the throttling applied during discovery
    for (i=0; i<dev_count; i++) {
        throttle_reg_val = 0;
        cpu_model->get_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, &throttle_reg_val);
        if (throttle_reg_val < 0x8fff)
            cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x8fff);
    }

    return E_SUCCESS;
}

/** 
 * \brief Loads the memory controller pci topology from a file
 *
 * Each line is expected to look like "<node_id>\t<bus>:<dev>.<funct>" (bus,
 * dev and funct in hexadecimal), which is the format written by
 * save_mc_pci_topology(). Consecutive lines with the same bus id become
 * channels of one pci_regs_t, which is attached to every physical node whose
 * node_id matches. Lines that do not match the format are ignored.
 *
 * \return E_SUCCESS, or E_ERROR when the file cannot be opened or memory
 *         cannot be allocated
 */
static int load_mc_pci_topology(const char* path, physical_node_t* physical_nodes[], int num_physical_nodes)
{
    FILE *fp;
    char *line = NULL;
    size_t len = 0;
    int j;
    unsigned int bus_id, dev_id, funct; // %x requires unsigned int destinations
    int node_id;
    int dev_count = 0;
    pci_regs_t *regs = NULL;
    int channel = 0;
    unsigned int last_bus_id = 0;
    int have_last_bus = 0;
    int ret = E_SUCCESS;

    fp = fopen(path, "r");
    if (fp == NULL) {
        return E_ERROR;
    }

    DBG_LOG(INFO, "Loading memory-controller pci topology from %s\n", path);
    while (getline(&line, &len, fp) != -1) {
        // skip blank or malformed lines instead of using unset values
        if (sscanf(line, "%d\t%x:%x.%x", &node_id, &bus_id, &dev_id, &funct) != 4) {
            continue;
        }
        DBG_LOG(INFO, "node: %d, pci addr: %x:%x.%x\n", node_id, bus_id, dev_id, funct);
        if (!have_last_bus || bus_id != last_bus_id) {
            have_last_bus = 1;
            last_bus_id = bus_id;
            regs = calloc(1, sizeof(*regs));
            if (regs == NULL) {
                ret = E_ERROR;
                break;
            }
            channel = 0;
            dev_count++;

            for (j=0; j<num_physical_nodes; j++) {
                if (node_id == physical_nodes[j]->node_id) {
                    physical_nodes[j]->mc_pci_regs = regs;
                    DBG_LOG(INFO, "node: %d, pci bus: 0x%x\n", physical_nodes[j]->node_id, bus_id);
                }
            }
        }

        regs->addr[channel].bus_id = bus_id; 
        regs->addr[channel].dev_id = dev_id; 
        regs->addr[channel].funct = funct;
        ++channel;
        regs->channels = channel;
    }
    free(line);
    fclose(fp);

    if (ret != E_SUCCESS) {
        return ret;
    }
    if (dev_count < num_physical_nodes) {
        DBG_LOG(WARNING, "No complete memory-controller pci topology found in %s\n", path);
    }
    return E_SUCCESS;
}


/** 
 * \brief Saves the memory controller pci topology in a file for later reuse
 *
 * Writes one "<node_id>\t<bus>:<dev>.<funct>" line per channel of every
 * physical node that has memory-controller registers attached.
 *
 * \return E_SUCCESS, or E_ERROR when path is NULL, the file cannot be opened,
 *         or writing/closing the file fails
 */
static int save_mc_pci_topology(const char* path, physical_node_t* physical_nodes[], int num_physical_nodes)
{
    int i, j;
    int ret = E_SUCCESS;
    FILE *fp;

    if (path == NULL) {
        return E_ERROR;
    }

    fp = fopen(path, "w");
    if (fp == NULL) {
        return E_ERROR;
    }

    DBG_LOG(INFO, "Saving memory-controller pci topology into %s\n", path);
    for (i=0; i<num_physical_nodes; i++) {
        pci_regs_t *regs = physical_nodes[i]->mc_pci_regs;
        int node_id = physical_nodes[i]->node_id;
        for (j=0; regs != NULL && j < regs->channels; ++j) {
            DBG_LOG(INFO, "node: %d, pci addr: %x:%x.%x\n", node_id, regs->addr[j].bus_id, regs->addr[j].dev_id, regs->addr[j].funct);
            if (fprintf(fp, "%d\t%x:%x.%x\n", node_id, regs->addr[j].bus_id, regs->addr[j].dev_id, regs->addr[j].funct) < 0) {
                ret = E_ERROR;
            }
        }
    }
    // buffered data is flushed on close, so a write error may only show up here
    if (fclose(fp) != 0) {
        ret = E_ERROR;
    }
    return ret;
}

// Counts the bits set in a cpu bitmask, over the configured CPU range.
int num_cpus(struct bitmask* bitmask) 
{
    int count = 0;
    int cpu = 0;

    // the bitmask is treated as an opaque structure, so every bit is probed
    // individually rather than counted with a faster word-level method
    while (cpu < numa_num_configured_cpus()) {
        if (numa_bitmask_isbitset(bitmask, cpu)) {
            ++count;
        }
        ++cpu;
    }
    return count;
}

// number of cpus in the system, as reported by sysconf(_SC_NPROCESSORS_ONLN)
int system_num_cpus()
{
    long online_cpus = sysconf( _SC_NPROCESSORS_ONLN );

    return (int) online_cpus;
}

// Logs, at INFO level, the index of every bit set in the bitmask.
void print_bitmask(struct bitmask* bitmask) {
    int bit = 0;

    while (bit < numa_num_configured_cpus()) {
        if (numa_bitmask_isbitset(bitmask, bit)) {
            DBG_LOG(INFO, "bit %d\n", bit);
        }
        bit++;
    }
}

// Returns the first cpu at or after cpu_id whose bit is set in the bitmask,
// or -1 when there is none within the configured CPU range.
int next_cpu(struct bitmask* bitmask, int cpu_id)
{
    int candidate = cpu_id;

    // the bitmask is treated as an opaque structure, so bits are probed one by one
    while (candidate < numa_num_configured_cpus()) {
        if (numa_bitmask_isbitset(bitmask, candidate)) {
            return candidate;
        }
        candidate++;
    }
    return -1;
}

// Returns the lowest cpu whose bit is set in the bitmask, or -1 if no bit is
// set within the configured CPU range (see next_cpu).
int first_cpu(struct bitmask* bitmask)
{
    return next_cpu( bitmask, 0);
}

// Splits the available CPUs among the emulated processes according to the
// current local rank. With a single emulated process there is nothing to
// split and E_SUCCESS is returned right away.
int partition_cpus(virtual_topology_t* virtual_topology)
{
    if (latency_model.max_local_processe_ranks <= 1) {
        return E_SUCCESS;
    }

    return select_cpus_based_on_local_rank(virtual_topology);
}

/**
 * \brief Construct a virtual topology
 *
 * Constructs a NUMA virtual topology where two physical sockets are fused into a 
 * single virtual node
 *
 * The physical nodes to use are read from the "topology.physical_nodes"
 * configuration string (comma separated node ids) and filtered by the set of
 * nodes the process is allowed to allocate memory from. Each node is paired
 * with the nearest remaining node (by numa_distance); a node left without a
 * partner becomes a virtual node on its own.
 *
 * If the memory-controller pci topology cannot be loaded from the file named
 * by "topology.mc_pci", it is discovered, saved into that file and the process
 * exits so that it can be restarted. If no file is configured, the discovered
 * topology is used directly.
 *
 * \param cfg               configuration to read the topology settings from
 * \param cpu_model         cpu model attached to every physical node
 * \param virtual_topologyp receives the newly allocated virtual topology
 * \return E_SUCCESS, or E_ERROR when the physical nodes setting is missing or
 *         lists more nodes than the system can have
 */
int init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp)
{
    char* mc_pci_file = NULL;
    char* str;
    char* saveptr = NULL;
    char* token;
    int* physical_node_ids;
    physical_node_t** physical_nodes = NULL;
    int num_physical_nodes;
    int max_physical_nodes;
    int n, v, i, j, sibling_idx;
    int node_id;
    physical_node_t* node_i, *node_j, *sibling_node;
    int ret;
    int min_distance;
    int distance;
    // used when the setting is absent: keep every CPU of the node
    // NOTE(review): confirm this is the desired default
    int hyperthreading = 1;
    int mc_pci_lookup;
    struct bitmask* mem_nodes = NULL;
    virtual_topology_t* virtual_topology;

    if (__cconfig_lookup_string(cfg, "topology.physical_nodes", &str) == CONFIG_FALSE) {
        return E_ERROR;
    }

    // nodes we can run on (e.g. not constrained by any numactl); the mask is
    // allocated by libnuma and released below
    mem_nodes = numa_get_mems_allowed();
    max_physical_nodes = numa_num_possible_nodes();

    DBG_LOG(DEBUG, "Possible NUMA nodes are %d\n", max_physical_nodes);
    DBG_LOG(DEBUG, "NUMA nodes allowed are %lu\n", mem_nodes->size);
    DBG_LOG(DEBUG, "NUMA configured CPUs are %d\n", numa_num_configured_cpus());

    // parse the physical nodes string
    physical_node_ids = calloc(max_physical_nodes, sizeof(*physical_node_ids));
    if (!physical_node_ids) {
        DBG_LOG(ERROR, "Failed physical node ids allocation\n");
        abort();
    }
    num_physical_nodes = 0;

    while ((token = strtok_r(str, ",", &saveptr))) {
        str = NULL;
        // check the capacity before storing so the array is never overrun
        if (num_physical_nodes >= max_physical_nodes) {
            // we are being asked to run on more nodes than available
            free(physical_node_ids);
            ret = E_ERROR;
            goto done;
        }
        physical_node_ids[num_physical_nodes++] = atoi(token);
    }
    if (!(physical_nodes = calloc(num_physical_nodes, sizeof(*physical_nodes)))) {
        DBG_LOG(ERROR, "Failed physical nodes allocation\n");
        abort();
    }

    __cconfig_lookup_bool(cfg, "topology.hyperthreading", &hyperthreading);

    // select those nodes we can run on
    for (i=0, n=0; i<num_physical_nodes; i++) {
        node_id = physical_node_ids[i];
        if (numa_bitmask_isbitset(mem_nodes, node_id)) {
            physical_nodes[n] = calloc(1, sizeof(**physical_nodes));
            if (!physical_nodes[n]) {
                DBG_LOG(ERROR, "Failed physical nodes allocation\n");
                abort();
            }
            physical_nodes[n]->node_id = node_id;
            physical_nodes[n]->cpu_bitmask = numa_allocate_cpumask();
            physical_nodes[n]->cpu_model = cpu_model;
            numa_node_to_cpus(node_id, physical_nodes[n]->cpu_bitmask);
            if (hyperthreading) {
                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask);
            } else {
                DBG_LOG(INFO, "Not using hyperthreading.\n");
                // disable the upper half of the processors in the bitmask
                // NOTE(review): assumes the sibling hardware threads are numbered
                // system_num_cpus()/2 above the node's first cpu -- confirm
                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask) / 2;
                int fc = first_cpu(physical_nodes[n]->cpu_bitmask);
                for (j=fc+system_num_cpus()/2; j<fc+system_num_cpus()/2+physical_nodes[n]->num_cpus; j++) {
                    if (numa_bitmask_isbitset(physical_nodes[n]->cpu_bitmask, j)) {
                        numa_bitmask_clearbit(physical_nodes[n]->cpu_bitmask, j);
                    }
                }
            }
            DBG_LOG(INFO, "%d CPUs on physical node %d\n", physical_nodes[n]->num_cpus, n);
            n++;
        }
    }
    free(physical_node_ids);
    num_physical_nodes = n;
    numa_bitmask_free(mem_nodes);
    mem_nodes = NULL;

    // If pci bus topology of each physical node is not provided then discover it.
    // The bus topology must be always known even if BW model is disabled.
    mc_pci_lookup = __cconfig_lookup_string(cfg, "topology.mc_pci", &mc_pci_file);
    if (mc_pci_lookup == CONFIG_FALSE) {
        mc_pci_file = NULL;
    }
    if (mc_pci_lookup == CONFIG_FALSE ||
          (mc_pci_lookup == CONFIG_TRUE &&
          load_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes) != E_SUCCESS))
    {
        discover_mc_pci_topology(cpu_model, physical_nodes, num_physical_nodes);
        if (mc_pci_file != NULL) {
            save_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes);
            DBG_LOG(INFO, "Topology MC PCI file saved, restart the process\n");
            exit(0);
        }
        // there is no file to save into, so carry on with what was discovered
        DBG_LOG(WARNING, "topology.mc_pci is not set, the discovered memory-controller pci topology is not saved\n");
    }

    // form virtual nodes by grouping physical nodes that are close to each other
    virtual_topology = malloc(sizeof(*virtual_topology));
    if (!virtual_topology) {
        DBG_LOG(ERROR, "Failed virtual topology allocation\n");
        abort();
    }
    virtual_topology->num_virtual_nodes = num_physical_nodes / 2 + num_physical_nodes % 2;
    virtual_topology->virtual_nodes = calloc(virtual_topology->num_virtual_nodes, 
                                             sizeof(*(virtual_topology->virtual_nodes)));
    if (!virtual_topology->virtual_nodes && virtual_topology->num_virtual_nodes > 0) {
        DBG_LOG(ERROR, "Failed virtual nodes allocation\n");
        abort();
    }

    DBG_LOG(INFO, "Number of physical nodes %d\n", num_physical_nodes);
    DBG_LOG(INFO, "Number of virtual nodes %d\n", virtual_topology->num_virtual_nodes);

    for (i=0, v=0; i<num_physical_nodes; i++) {
        min_distance = INT_MAX;
        sibling_node = NULL;
        sibling_idx = -1;
        if ((node_i = physical_nodes[i]) == NULL) {
            continue;
        }

        for (j=i+1; j<num_physical_nodes; j++) {
            if ((node_j = physical_nodes[j]) == NULL) {
                continue;
            }
            distance = numa_distance(node_i->node_id, node_j->node_id);
            if (distance <= 0) {
                // numa_distance() returns '0' on error: treat the distance as
                // unknown, i.e. farthest, but keep the node eligible
                distance = INT_MAX;
            }
            // '<=' keeps the last candidate among equally distant nodes
            if (distance <= min_distance) {
                min_distance = distance;
                sibling_node = node_j;
                sibling_idx = j;
            }
        }

        if (sibling_node) {
            physical_nodes[i] = physical_nodes[sibling_idx] = NULL;
            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];
            virtual_node->dram_node = node_i;
            virtual_node->nvram_node = sibling_node;
            virtual_node->dram_node->latency = measure_latency(cpu_model,
                                                               virtual_node->dram_node->node_id,
                                                               virtual_node->dram_node->node_id);
            virtual_node->nvram_node->latency = measure_latency(cpu_model,
                                                                virtual_node->dram_node->node_id,
                                                                virtual_node->nvram_node->node_id);
            virtual_node->node_id = v;
            DBG_LOG(INFO, "Fusing physical nodes %d %d into virtual node %d\n", 
                    node_i->node_id, sibling_node->node_id, virtual_node->node_id);
            v++;
        }
    }

    // any physical node that is not paired with another physical node is 
    // formed into a virtual node on its own
    if (2*v < num_physical_nodes) {
        for (i=0; i<num_physical_nodes && v<virtual_topology->num_virtual_nodes; i++) {
            // entries of paired nodes were cleared above
            if ((node_i = physical_nodes[i]) == NULL) {
                continue;
            }
            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];
            virtual_node->dram_node = virtual_node->nvram_node = node_i;
            virtual_node->node_id = v;
            virtual_node->dram_node->latency = measure_latency(cpu_model,
                                                               virtual_node->dram_node->node_id,
                                                               virtual_node->dram_node->node_id);
            DBG_LOG(WARNING, "Forming physical node %d into virtual node %d without a sibling node.\n",
                    node_i->node_id, virtual_node->node_id);
            v++;
        }
    }

    *virtual_topologyp = virtual_topology;
    ret = E_SUCCESS;

done:
    if (mem_nodes) {
        numa_bitmask_free(mem_nodes);
    }
    // only the lookup array is released; the nodes themselves now belong to
    // the virtual topology
    free(physical_nodes);
    return ret;
}


================================================
FILE: src/lib/topology.h
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __TOPOLOGY_H
#define __TOPOLOGY_H

#include <numa.h>
#include "config.h"
#include "cpu/cpu.h"
#include "dev.h"

/* DOXYGEN Documentation : */

/**
    \page virtual_topology Virtual topology
 
    The emulator constructs a topology of virtual nodes out of physical nodes
    (i.e., NUMA sockets) that represents the arrangement of processors, DRAM, 
    and NVRAM of the virtual machine that the emulator emulates. 

    Currently, the emulator supports a NUMA virtual topology where essentially
    two physical sockets are fused into a single virtual node. Each virtual 
    node comprises the processors from one socket only (active socket), and 
    DRAM from both physical sockets. The DRAM attached to the active socket
    is used as the virtual node's locally attached DRAM and the DRAM of the other 
    socket (passive) is used as the virtual node's locally attached NVRAM.
    This topology allows us to emulate a machine that has both DRAM and NVRAM but
    reduces the computation capacity of the machine to half.
    
    In the future we would like to support a topology that matches the shared NVRAM
    storage of The Machine.

 */
 

// A physical NUMA node (socket) selected for emulation.
typedef struct {
    int node_id; // NUMA node id, as given in the topology.physical_nodes setting
    cpu_model_t* cpu_model;
    pci_regs_t  *mc_pci_regs; // memory-controller pci registers attached to this node (may be NULL)
    int num_cpus; // number of node's cpus
    struct bitmask* cpu_bitmask; // a bitmask of the node's CPUs 

    // this is actual physical latency. the latency number though depends on 
    // whether the node corresponds to a dram node or a nvram node. 
    // if dram then latency is the measured local latency to dram.
    // if nvram then latency is the measured remote latency to the sibling nvram node
    int latency; 
} physical_node_t;

// A virtual node: a DRAM node fused with the physical node that plays the
// NVRAM role. Both pointers refer to the same physical node when the node has
// no sibling.
typedef struct virtual_node_s {
    int node_id; // index of this virtual node within the virtual topology
    physical_node_t* dram_node;
    physical_node_t* nvram_node;
    //cpu_model_t* cpu_model;
} virtual_node_t;

// The set of virtual nodes built by init_virtual_topology().
typedef struct virtual_topology_s {
    virtual_node_t* virtual_nodes; // pointer to an array of virtual nodes
    int num_virtual_nodes; // number of entries in virtual_nodes
} virtual_topology_t;

// Builds the virtual topology and stores it in *virtual_topologyp; returns
// E_SUCCESS on success.
int init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp);
// Number of cpus in the system (sysconf(_SC_NPROCESSORS_ONLN)).
int system_num_cpus();
// Lowest cpu set in the bitmask, or -1 if none.
int first_cpu(struct bitmask* bitmask);
// First cpu set in the bitmask at or after cpu_id, or -1 if none.
int next_cpu(struct bitmask* bitmask, int cpu_id);

#endif /* __TOPOLOGY_H */


================================================
FILE: test/CMakeLists.txt
================================================
# Header search paths: bundled googletest and the emulator library sources.
include_directories(${CMAKE_SOURCE_DIR}/third_party/gtest-1.7.0/include)
include_directories(${CMAKE_SOURCE_DIR}/src/lib)

# Build the tests with debug information and all warnings enabled.
add_definitions(-g)
add_definitions(-Wall)
#add_definitions(-DNDEBUG)

# One executable per test source; only test_interpose links against gtest.
add_executable(test_interpose ${CMAKE_CURRENT_SOURCE_DIR}/test_interpose.cc)
target_link_libraries(test_interpose pthread gtest)

add_executable(test_dev ${CMAKE_CURRENT_SOURCE_DIR}/test_dev.cc)
target_link_libraries(test_dev pthread nvmemul)

add_executable(test_thread ${CMAKE_CURRENT_SOURCE_DIR}/test_thread.cc)
target_link_libraries(test_thread nvmemul pthread)

add_executable(test_mutex ${CMAKE_CURRENT_SOURCE_DIR}/test_mutex.cc)
target_link_libraries(test_mutex nvmemul pthread)

add_executable(test_nvm_remote_dram ${CMAKE_CURRENT_SOURCE_DIR}/test_nvm_remote_dram.c)
target_link_libraries(test_nvm_remote_dram nvmemul)

add_executable(test_nvm ${CMAKE_CURRENT_SOURCE_DIR}/test_nvm.c)
target_link_libraries(test_nvm nvmemul)

add_executable(test_multithread ${CMAKE_CURRENT_SOURCE_DIR}/test_multithread.c)
#target_link_libraries(test_multithread rt)
target_link_libraries(test_multithread nvmemul pthread)

# Only the interposition test is registered with CTest.
add_test(NAME interpose COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_interpose)

# The emulator library is injected into the test through LD_PRELOAD.
# NOTE(review): the listed source tree has src/lib but no src/emul -- confirm
# that this is where libnvmemul.so is actually built.
set(ENV_COMMON "LD_PRELOAD=${CMAKE_BINARY_DIR}/src/emul/libnvmemul.so")

# NOTE(review): confirm that ENUM_INI is the variable name the library reads.
SET_PROPERTY(TEST interpose PROPERTY ENVIRONMENT ${ENV_COMMON} "ENUM_INI=emul.ini")


================================================
FILE: test/test_dev.cc
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include "gtest/gtest.h"
#include "pmalloc.h"

// Manual smoke test: prints the process id and the addresses returned by two
// malloc calls and one pmalloc call.
int main(int argc, char** argv)
{
//    ::testing::InitGoogleTest(&argc, argv);
//    return RUN_ALL_TESTS();
    printf("PID: %d\n", getpid());

    void* first_block = malloc(8);
    printf("malloc: %p\n", first_block);

    void* second_block = malloc(8);
    printf("malloc: %p\n", second_block);

    printf("pmalloc: %p\n", pmalloc(8));
    return 0;
}


================================================
FILE: test/test_interpose.cc
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include "gtest/gtest.h"

static int interpose_pthread_create_success = 0;


// Ugly hack: we want to test whether interposition works. To do this we 
// hook on the functions that the interposition code calls by redefining these
// functions. As those functions are written in C, we need to make sure we force
// the C++ compiler use C linkage.

#ifdef __cplusplus
extern "C" {
#endif

// this function is called when interposition of pthread_create is successful
// The test's replacement only records that it was reached (the thread argument
// is ignored) and reports success with 0.
int register_thread(pthread_t thread)
{
    interpose_pthread_create_success = 1;
    return 0;
}

#ifdef __cplusplus
}
#endif

// Start routine of the thread created by the test; it does nothing and
// terminates immediately.
void* interpose_pthread_create_start_routine(void* args)
{
    return NULL;
}

// Creates a thread that exits immediately and waits for it to finish, so that
// the (possibly interposed) pthread_create path is exercised.
void interpose_pthread_create()
{
    pthread_t thread;

    // join only a thread that was really created: on failure `thread` is left
    // unset and must not be handed to pthread_join
    if (pthread_create(&thread, NULL, &interpose_pthread_create_start_routine, NULL) != 0) {
        return;
    }

    pthread_join(thread, NULL);
}

// Thin wrapper that locks the given mutex through pthread_mutex_lock.
void interpose_pthread_mutex_lock(pthread_mutex_t* lock)
{
    pthread_mutex_lock(lock);
}

// Thin wrapper that unlocks the given mutex through pthread_mutex_unlock.
void interpose_pthread_mutex_unlock(pthread_mutex_t* lock)
{
    pthread_mutex_unlock(lock);
}

// The success flag must be clear before a thread is created and set afterwards,
// which shows that the register_thread hook above was called.
TEST(Interpose, pthread_create)
{
    EXPECT_EQ(0, interpose_pthread_create_success);
    interpose_pthread_create();
    EXPECT_EQ(1, interpose_pthread_create_success);
}

// Placeholder: this test currently checks nothing.
TEST(Interpose, pthread_mutex_lock)
{
    //EXPECT_EQ(1, 0);
}


// Runs every registered googletest case and returns the overall result.
// The mutex lock/unlock calls that used to follow the return statement could
// never execute and were dropped.
int main(int argc, char** argv)
{
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}


================================================
FILE: test/test_multithread.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>

#include "thread.h"
#include <sys/time.h>
#include "pmalloc.h"
#include "debug.h"
//#include "stat.h"


#ifndef NDEBUG
#include <sys/syscall.h>
#endif

// Per-thread test parameters.
// NOTE(review): the field meanings below are inferred from their names only
// (cs presumably stands for critical section) -- confirm against the code that uses them.
typedef struct {
	int cs_n;
	int cs_duration;
	int out_cs_duration;
	int from_node;
	int to_node;
} arg_s;

/* Upper bound on worker threads; manage_threads() rejects larger requests. */
#define MAX_NUM_THREADS 50
/* Handles of the worker threads, written by pthread_create() and later joined. */
pthread_t thread_desc[MAX_NUM_THREADS];


#include <inttypes.h>
/* One link of a pointer-chasing chain: force_ldm_stalls() uses `val` as the
 * index of the next element to visit. */
typedef struct {
	uint64_t val;
	/* zero-length array (compiler extension); presumably marks where the
	 * per-element padding begins -- confirm against alloc_chain() */
	char padding[0];
} element_t;

/* Descriptor of one chain as returned by alloc_chain(); the code that fills
 * it in is not part of this file, so the field notes below are assumptions. */
typedef struct {
    uint64_t   N;             /* presumably the number of elements -- confirm */
    uint64_t   element_size;  /* presumably the size of one element in bytes -- confirm */
    element_t* head;          /* presumably the first element of the chain -- confirm */
} chain_t;
/* Chain helpers used below. They are only declared here; their definitions
 * are not part of this file. */
uint64_t trash_cache(uint64_t N);
chain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j);
element_t* element(chain_t* chain, uint64_t index);
void inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size);


// Number of 64-byte elements per chain: 10 times a CPU cache size that is
// hardcoded as 20 MB for now (the factor could be larger), so that the buffer
// is much bigger than the cache.
// Each thread allocates its own chain(s): the buffer is NOT shared among threads.
#define NELEMS (10 * 20480000 / 64LLU)
// Not referenced anywhere else in this file.
#define PAGESZ 4096
// Capacity of the per-thread chain arrays.
#define MAX_NUM_CHAINS 16
//#undef USE_HUGETLB
// Base seed passed to alloc_chain().
#define SEED_IN 1
// Number of chains each thread allocates in iter().
#define NCHAINS 1

// Guards the critical section executed by the worker threads in iter().
pthread_mutex_t mutex;

/*
 * Returns the size of the kernel CPU mask in bits, or 0 (after printing a
 * message) when it cannot be determined.
 *
 * The raw sched_getaffinity system call is used instead of the library
 * wrapper because the library version does not return the size of cpumask_t.
 * When the call fails, the mask is doubled (up to 1M CPUs) and the call is
 * retried with the size of the NEW mask.
 */
static int max_number_of_cpus(void)
{
    int n, cpus = 2048;
    size_t setsize = CPU_ALLOC_SIZE(cpus);
    cpu_set_t *set = CPU_ALLOC(cpus);

    if (!set)
        goto err;

    for (;;) {
        CPU_ZERO_S(setsize, set);
        /* the library version does not return size of cpumask_t */
        n = syscall(SYS_sched_getaffinity, 0, setsize, set);
        if (n >= 0)
            break;

        /* The call failed: give up once the mask has reached the cap,
         * otherwise retry with a mask twice as large. */
        CPU_FREE(set);
        set = NULL;
        if (cpus >= 1024 * 1024)
            goto err;

        cpus *= 2;
        setsize = CPU_ALLOC_SIZE(cpus);   /* keep the size in step with the mask */
        set = CPU_ALLOC(cpus);
        if (!set)
            goto err;
    }

    CPU_FREE(set);
    return n * 8;   /* n is a byte count; 8 bits per byte */

err:
    printf("cannot determine NR_CPUS");
    return 0;
}

/*
 * Pins `thread` to the single CPU named by thread->cpu_id.
 *
 * Returns 0 when the thread is bound, when its affinity already consists of
 * exactly that CPU, or when `thread` is NULL (the emulator is disabled, so
 * there is nothing to bind).
 * Returns 1 when the CPU count cannot be determined, a CPU set cannot be
 * allocated, or the affinity cannot be read or changed.
 *
 * Both CPU sets are released on every path.
 */
static int bind_cpu(thread_t *thread) {
    size_t setsize;
    cpu_set_t *cur_cpuset = NULL;
    cpu_set_t *new_cpuset = NULL;
    int ncpus;
    int rc = 1;

    if (thread == NULL) {
        // if thread is NULL it means the emulator is disabled, return without setting CPU affinity
        return 0;
    }

    ncpus = max_number_of_cpus();
    if (ncpus <= 0) {
        return 1;
    }

    setsize = CPU_ALLOC_SIZE(ncpus);
    cur_cpuset = CPU_ALLOC(ncpus);
    new_cpuset = CPU_ALLOC(ncpus);
    if (cur_cpuset == NULL || new_cpuset == NULL) {
        DBG_LOG(ERROR, "Cannot allocate CPU sets for thread tid [%d]\n", thread->tid);
        goto out;
    }

    CPU_ZERO_S(setsize, cur_cpuset);
    CPU_ZERO_S(setsize, new_cpuset);
    CPU_SET_S(thread->cpu_id, setsize, new_cpuset);

    if (pthread_getaffinity_np(thread->pthread, setsize, cur_cpuset) != 0) {
        DBG_LOG(ERROR, "Cannot get thread tid [%d] affinity, pthread: 0x%lx on processor %d\n",
                thread->tid, thread->pthread, thread->cpu_id);
        goto out;
    }

    /* The sets are dynamically sized, so the _S variant is required: plain
     * CPU_EQUAL compares a whole cpu_set_t and may read past the allocation. */
    if (CPU_EQUAL_S(setsize, cur_cpuset, new_cpuset)) {
        rc = 0;
        goto out;
    }

    DBG_LOG(INFO, "Binding thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id);

    if (pthread_setaffinity_np(thread->pthread, setsize, new_cpuset) != 0) {
        DBG_LOG(ERROR, "Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id);
        goto out;
    }

    rc = 0;

out:
    /* CPU_FREE accepts NULL, so a partially failed allocation is fine here. */
    CPU_FREE(cur_cpuset);
    CPU_FREE(new_cpuset);
    return rc;
}

/*
 * Walks a pointer-chasing chain to issue a series of dependent memory loads.
 *
 * Single-chain path (the one taken here, because nchains is 1): `duration`
 * elements of C[0] are visited. The first index is
 *     start = (it_n % max(1, nelems / duration)) * duration,
 * reset to 0 if start + duration would exceed nelems. Each visited element's
 * `val` is added to a running sum and is then used as the index of the next
 * element to visit.
 *
 * Multi-chain path: all chains are advanced in lockstep from index 0 until
 * the element chain 0 currently points at holds 0.
 *
 * read_element() is only called when access_size > element_size.
 *
 * Returns the sum accumulated over chain 0, or 0 when duration <= 0.
 *
 * NOTE(review): nchains is initialised from SEED_IN, not NCHAINS. Both are 1
 * today so the result is the same, but this looks unintended -- confirm.
 */
uint64_t force_ldm_stalls(chain_t **C,
                          int element_size,
                          int access_size,
                          int duration,             // number of pointers/elements to chase
                          uint64_t nelems,          // max number of available elements/pointers
                          int it_n) {               // seed to calculate the first pointer to chase, used to avoid repeating
                                                    // pointers during consecutive calls
    uint64_t j, i;
    int nchains = SEED_IN;
    uint64_t sumv[MAX_NUM_CHAINS];
    uint64_t nextp[MAX_NUM_CHAINS];
    char *buf;
    uint64_t buf_size = 16384;
    int count = 0;
    uint64_t start;
    uint64_t it_limit;

    assert(nchains < MAX_NUM_CHAINS);

    if (duration <= 0) return 0;

    // TODO: ignore the use of buf?
    // TODO: ignore more than one chain?
    // scratch buffer handed to read_element(); allocated and freed on every call
    buf = (char*) malloc(buf_size);
    assert(buf != NULL);

    // it_limit is the number of whole 'duration'-sized windows that fit in
    // 'nelems' (at least 1); it_n is folded into that range to pick a window
    if (nelems > duration) {
        it_limit = nelems / duration;
    } else {
    	it_limit = 1;
    }
    it_n = it_n % it_limit;
    start = it_n * duration;
    if ((start + duration) > nelems) {
    	start = 0;
    }

    /* chase the pointers */
    if (nchains == 1) {
        sumv[0] = 0;
        // chase pointers until the 'duration' count, the pointer chasing will restart from beginning if duration
        // is greater than 'nelems'
        for (count = 0, i = start; count < duration; i = element(C[0], i)->val, ++count) {
            // empty asm statement; presumably keeps the compiler from
            // collapsing the loop -- TODO confirm
            __asm__("");
            sumv[0] += element(C[0], i)->val;
            if (access_size > element_size) {
                read_element(C[0], i, buf, buf_size);
            }
        }
    } else {
        for (j=0; j < nchains; j++) {
            sumv[j] = 0;
            nextp[j] = 0;
        }
        // stop once the element chain 0 currently points at stores 0
        for (; 0 != element(C[0], nextp[0])->val; ) {
            for (j=0; j < nchains; j++) {
                sumv[j] += element(C[j], nextp[j])->val;
                if (access_size > element_size) {
                    read_element(C[j], nextp[j], buf, buf_size);
                }
                nextp[j] = element(C[j], nextp[j])->val;
            }
        }
    }

    free(buf);
    return sumv[0];
}

/*
 * Work done by one worker thread.
 *
 * Allocates NCHAINS chains of NELEMS 64-byte elements (from_node and to_node
 * are handed to alloc_chain()), binds the calling thread with bind_cpu(),
 * calls trash_cache(NELEMS) and then runs cs_n rounds. Each round chases
 * cs_duration elements while holding the global mutex, followed by
 * out_cs_duration elements after releasing it. Unless NDEBUG is defined, the
 * time spent inside each critical section is logged in microseconds.
 * The chain descriptors are released with free() before returning.
 */
void iter(int cs_n, int cs_duration, int out_cs_duration, int from_node, int to_node) {
	chain_t *chains[MAX_NUM_CHAINS];
	struct timespec cs_begin, cs_end;
	unsigned long elapsed_us;
	uint64_t chain_seed;
	uint64_t c;
	long round;
#ifndef NDEBUG
	pid_t tid = (pid_t) syscall(SYS_gettid);
#endif

	DBG_LOG(INFO, "\t: from node: %d to node: %d\n", from_node, to_node);

	assert(NELEMS < UINT64_MAX);

	/* One chain per slot, each built from its own seed. */
	c = 0;
	while (c < NCHAINS) {
		chain_seed = SEED_IN + c * c;
		chains[c] = alloc_chain(chain_seed, NELEMS, 64LLU, from_node, to_node);
		__asm__("");
		++c;
	}

	bind_cpu(thread_self());

	trash_cache(NELEMS);

	round = 0;
	while (round < cs_n) {
		__asm__("");
		pthread_mutex_lock(&mutex);
#ifndef NDEBUG
		clock_gettime(CLOCK_MONOTONIC, &cs_begin);
#endif
		/* Inside the critical section: cs_duration chained accesses. */
		force_ldm_stalls(chains, 64LLU, 8, cs_duration, NELEMS, round);
#ifndef NDEBUG
		clock_gettime(CLOCK_MONOTONIC, &cs_end);
#endif
		pthread_mutex_unlock(&mutex);

		/* Outside the critical section: out_cs_duration chained accesses. */
		force_ldm_stalls(chains, 64LLU, 8, out_cs_duration, NELEMS, (round + 1) * 2);

#ifndef NDEBUG
		elapsed_us = ((cs_end.tv_sec * 1000000) + (cs_end.tv_nsec / 1000)) -
		             ((cs_begin.tv_sec * 1000000) + (cs_begin.tv_nsec / 1000));
		DBG_LOG(INFO, "\tthread [%d] critical section took %lu usec\n", tid, elapsed_us);
#endif
		++round;
	}

	for (c = 0; c < NCHAINS; c++) {
		free(chains[c]);
	}
}

void *thread_fn(void *arg) {
	int cs_n = ((arg_s *) arg)->cs_n;
	int cs_duration = ((arg_s *) arg)->cs_duration;
	int out_cs_duration = ((arg_s *) arg)->out_cs_duration;
	int from_node = ((arg_s *) arg)->from_node;
	int to_node = ((arg_s *) arg)->to_node;

	iter(cs_n, cs_duration, out_cs_duration, from_node, to_node);

	return 0;
}

/*
 * Validates the run parameters, starts n_threads workers running thread_fn()
 * with a shared arg_s, waits for all of them and tears down the mutex.
 *
 * Exits the process with status -1 when a parameter is out of range
 * (n_threads must be in 1..MAX_NUM_THREADS, cs_n and cs_duration must be
 * positive, out_cs_duration must not be negative), when the mutex or the
 * thread attributes cannot be initialised, or when a thread cannot be
 * created. In the last case the threads that did start are joined first, so
 * pthread_join() is never called on a handle that was not set.
 */
void manage_threads(int n_threads, int cs_n, int cs_duration, int out_cs_duration, int from_node, int to_node)
{
    pthread_attr_t attr;
    arg_s args;
    int created = 0;
    int create_failed = 0;
    int i;

    if ((n_threads > MAX_NUM_THREADS) || (n_threads <= 0)) {
        printf("INVALID RANGE:\n");
        printf("\tMax number of threads is %d\n", MAX_NUM_THREADS);
        exit(-1);
    }

    if (cs_n <= 0 || cs_duration <= 0 || out_cs_duration < 0) {
        printf("INVALID RANGE:\n");
        printf("\tcritical sections: %d, cs level: %d, out cs level: %d\n", cs_n, cs_duration, out_cs_duration);
        exit(-1);
    }

    if (pthread_mutex_init(&mutex, NULL) != 0) {
        printf("pthread_mutex_init failed");
        exit(-1);
    }

    if (pthread_attr_init(&attr) != 0) {
        printf("pthread_attr_init failed");
        exit(-1);
    }

    srand(time(NULL));

    /* Every worker reads the same argument block; it is not modified after
     * this point and outlives the threads because they are joined below. */
    args.cs_duration = cs_duration;
    args.cs_n = cs_n;
    args.out_cs_duration = out_cs_duration;
    args.from_node = from_node;
    args.to_node = to_node;

    for (i = 0; i < n_threads; ++i) {
        int rc = pthread_create(&thread_desc[i], &attr, thread_fn, (void *)&args);
        if (rc != 0) {
            printf("pthread_create failed for thread %d (error %d)\n", i, rc);
            create_failed = 1;
            break;
        }
        ++created;
    }

    pthread_attr_destroy(&attr);

    /* Join only the threads that were actually started. */
    for (i = 0; i < created; ++i) {
        pthread_join(thread_desc[i], NULL);
    }

    pthread_mutex_destroy(&mutex);

    if (create_failed) {
        exit(-1);
    }
}

/*
 * Command-line entry point.
 *
 * Expects exactly six arguments: thread count, critical sections per thread,
 * size of each critical section, size of the work outside the critical
 * section, from_node and to_node. Each is converted with atoi() and passed
 * to manage_threads(). Prints a usage message and returns -1 when the
 * argument count is wrong; returns 0 otherwise.
 */
int main(int argn, char **argv)
{
    if (argn != 7) {
        printf("INVALID ARGUMENTS:\n");
        printf("\t%s [# threads] [# critical sections per thread] [size of each critical section] "
               "[size of computation outside critical section] [from_node] [to_node]\n", argv[0]);
        return -1;
    }

    const int n_threads       = atoi(argv[1]);
    const int cs_n            = atoi(argv[2]);
    const int cs_duration     = atoi(argv[3]);
    const int out_cs_duration = atoi(argv[4]);
    const int from_node       = atoi(argv[5]);
    const int to_node         = atoi(argv[6]);

    manage_threads(n_threads, cs_n, cs_duration, out_cs_duration, from_node, to_node);

    return 0;
}


================================================
FILE: test/test_mutex.cc
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>
#include "gtest/gtest.h"

// Size of the thread-handle array in main(); only thread_count entries are used.
#define MAX_NUM_THREADS 128

// Mutex locked and unlocked once by main() and once by every worker.
pthread_mutex_t mutex;

// Thread body: takes the global mutex and releases it straight away.
// `args` is not used; the thread's result is always NULL.
void* worker(void* args)
{
    (void) args;

    pthread_mutex_lock(&mutex);
    pthread_mutex_unlock(&mutex);

    return NULL;
}


// Initialises the global mutex, locks and unlocks it once from the main
// thread, then starts thread_count workers that do the same and waits for
// them. Only threads that were actually created are joined.
// Returns 0 when every worker was started, 1 otherwise.
int main(int argc, char** argv)
{
    pthread_t thread[MAX_NUM_THREADS];
    int thread_count = 4;
    int created = 0;
    int i;

    (void) argc;
    (void) argv;

    pthread_mutex_init(&mutex, NULL);
    pthread_mutex_lock(&mutex);
    pthread_mutex_unlock(&mutex);

    for (i = 0; i < thread_count; i++) {
        if (pthread_create(&thread[i], NULL, worker, NULL) != 0) {
            fprintf(stderr, "pthread_create failed for thread %d\n", i);
            break;
        }
        created++;
    }

    for (i = 0; i < created; i++)
        pthread_join(thread[i], NULL);

    pthread_mutex_destroy(&mutex);

    return (created == thread_count) ? 0 : 1;
}


================================================
FILE: test/test_nvm.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/

#include <stdio.h>
#include <unistd.h>


/* Side length of the square matrix below (2048 x 2048 entries). */
#define BUF_SIZE (2048)

/* Statically allocated matrix that iter() fills once and then sweeps forever. */
unsigned long mem[BUF_SIZE][BUF_SIZE];

/*
 * Endless memory workload; this function never returns.
 *
 * First stores i * j into every mem[i][j]. Then, forever, sweeps the matrix
 * with the FIRST index varying fastest (mem[row][col] with row in the inner
 * loop), adding each cell plus col * row to a running total and writing the
 * total back into the cell. Sleeps for 1000 microseconds after each sweep.
 */
void iter()
{
	unsigned long total;
	int row;
	int col;

	for (row = 0; row < BUF_SIZE; ++row) {
		for (col = 0; col < BUF_SIZE; ++col) {
			mem[row][col] = row * col;
		}
	}

	total = 0;
	for (;;) {
		for (col = 0; col < BUF_SIZE; ++col) {
			__asm__ __volatile__("");
			for (row = 0; row < BUF_SIZE; ++row) {
				total += mem[row][col] + col * row;
				mem[row][col] = total;
			}
		}
		usleep(1000);
	}
}

/* Entry point: runs iter(). iter() loops forever, so the return below is not
 * reached in practice. */
int main()
{
    iter();
    return 0;
}


================================================
FILE: test/test_nvm_remote_dram.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/

#include <stdio.h>
#include <unistd.h>
#include "pmalloc.h"


/* Number of rows and of columns of the matrix (4096 x 4096 entries). */
#define BUF_SIZE (4 * 1024)

/* Table of row pointers; the table and every row are allocated with pmalloc()
 * in iter(). */
unsigned long **mem;

/*
 * Endless memory workload on pmalloc()-allocated memory; never returns.
 *
 * Allocates a table of BUF_SIZE row pointers and BUF_SIZE rows of BUF_SIZE
 * unsigned longs, storing i * j into every mem[i][j]. Then, forever, sweeps
 * the matrix with the FIRST index varying fastest, adding each cell plus
 * col * row to a running total and writing the total back into the cell.
 *
 * The pfree() calls at the end sit after an endless loop and are therefore
 * never executed.
 *
 * NOTE(review): the pmalloc() results are used without being checked; how
 * pmalloc() reports failure is not visible from this file -- confirm.
 */
void iter()
{
	const size_t table_bytes = BUF_SIZE * sizeof(unsigned long *);
	const size_t row_bytes = BUF_SIZE * sizeof(unsigned long);
	unsigned long total;
	int row;
	int col;

	mem = (unsigned long **) pmalloc(table_bytes);
	for (row = 0; row < BUF_SIZE; ++row) {
		mem[row] = (unsigned long *) pmalloc(row_bytes);
		for (col = 0; col < BUF_SIZE; ++col) {
			mem[row][col] = row * col;
		}
	}

	total = 0;
	for (;;) {
		for (col = 0; col < BUF_SIZE; ++col) {
			__asm__ __volatile__("");
			for (row = 0; row < BUF_SIZE; ++row) {
				total += mem[row][col] + col * row;
				mem[row][col] = total;
			}
		}
	}

	for (row = 0; row < BUF_SIZE; ++row) {
		pfree(mem[row], row_bytes);
	}
	pfree(mem, table_bytes);
}

/* Entry point: runs iter(). iter() loops forever, so the return below is not
 * reached in practice. */
int main()
{
    iter();
    return 0;
}


================================================
FILE: test/test_thread.cc
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.  
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>
#include "gtest/gtest.h"

// Size of the thread-handle array in main(); only thread_count entries are used.
#define MAX_NUM_THREADS 128

// Thread body: allocates a 1 MiB zero-initialised buffer, increments every
// byte once, releases the buffer, prints "exiting" and returns NULL.
// `args` is not used.
//
// calloc() is used so that the increments never read indeterminate bytes.
// If the allocation fails, a message is written to stderr and the thread
// returns without touching memory.
void* worker(void* args)
{
    const int buf_size = 1024 * 1024;
    char* array = (char*) calloc(buf_size, 1);
    int i;

    (void) args;

    if (array == NULL) {
        fprintf(stderr, "worker: cannot allocate %d bytes\n", buf_size);
        return NULL;
    }

    for (i = 0; i < buf_size; i++) {
        array[i] += 1;
    }

    free(array);

    printf("exiting\n");
    return NULL;
}


// Starts thread_count workers and waits for them. Only threads that were
// actually created are joined.
// Returns 0 when every worker was started, 1 otherwise.
int main(int argc, char** argv)
{
    pthread_t thread[MAX_NUM_THREADS];
    int thread_count = 4;
    int created = 0;
    int i;

    (void) argc;
    (void) argv;

    for (i = 0; i < thread_count; i++) {
        if (pthread_create(&thread[i], NULL, worker, NULL) != 0) {
            fprintf(stderr, "pthread_create failed for thread %d\n", i);
            break;
        }
        created++;
    }

    for (i = 0; i < created; i++)
        pthread_join(thread[i], NULL);

    return (created == thread_count) ? 0 : 1;
}