[
  {
    "path": "AUTHORS",
    "content": "Haris Volos           (haris.volos@hpe.com)\nGuilherme Magalhaes   (guilherme.magalhaes@hpe.com)\nLucy Cherkasova       (lucy.cherkasova@gmail.com)\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 2.8)\n\n#add_subdirectory(third_party)\nadd_subdirectory(src)\nadd_subdirectory(bench)\nenable_testing()\n#add_subdirectory(test)\n"
  },
  {
    "path": "Doxyfile",
    "content": "# Doxyfile 1.4.7\n\n# This file describes the settings to be used by the documentation system\n# doxygen (www.doxygen.org) for a project\n#\n# All text after a hash (#) is considered a comment and will be ignored\n# The format is:\n#       TAG = value [value, ...]\n# For lists items can also be appended using:\n#       TAG += value [value, ...]\n# Values that contain spaces should be placed between quotes (\" \")\n\n#---------------------------------------------------------------------------\n# Project related configuration options\n#---------------------------------------------------------------------------\n\n# The PROJECT_NAME tag is a single word (or a sequence of words surrounded \n# by quotes) that should identify the project.\n\nPROJECT_NAME           = \"Quartz\"\n\n# The PROJECT_NUMBER tag can be used to enter a project or revision number. \n# This could be handy for archiving the generated documentation or \n# if some version control system is used.\n\nPROJECT_NUMBER         = \n\n# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) \n# base path where the generated documentation will be put. \n# If a relative path is entered, it will be relative to the location \n# where doxygen was started. If left blank the current directory will be used.\n\nOUTPUT_DIRECTORY       = ./doc\n\n# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create \n# 4096 sub-directories (in 2 levels) under the output directory of each output \n# format and will distribute the generated files over these directories. \n# Enabling this option can be useful when feeding doxygen a huge amount of \n# source files, where putting all generated files in the same directory would \n# otherwise cause performance problems for the file system.\n\nCREATE_SUBDIRS         = NO\n\n# The OUTPUT_LANGUAGE tag is used to specify the language in which all \n# documentation generated by doxygen is written. 
Doxygen will use this \n# information to generate all constant output in the proper language. \n# The default language is English, other supported languages are: \n# Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, \n# Dutch, Finnish, French, German, Greek, Hungarian, Italian, Japanese, \n# Japanese-en (Japanese with English messages), Korean, Korean-en, Norwegian, \n# Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, \n# Swedish, and Ukrainian.\n\nOUTPUT_LANGUAGE        = English\n\n# This tag can be used to specify the encoding used in the generated output. \n# The encoding is not always determined by the language that is chosen, \n# but also whether or not the output is meant for Windows or non-Windows users. \n# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES \n# forces the Windows encoding (this is the default for the Windows binary), \n# whereas setting the tag to NO uses a Unix-style encoding (the default for \n# all platforms other than Windows).\n\nUSE_WINDOWS_ENCODING   = NO\n\n# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will \n# include brief member descriptions after the members that are listed in \n# the file and class documentation (similar to JavaDoc). \n# Set to NO to disable this.\n\nBRIEF_MEMBER_DESC      = YES\n\n# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend \n# the brief description of a member or function before the detailed description. \n# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the \n# brief descriptions will be completely suppressed.\n\nREPEAT_BRIEF           = YES\n\n# This tag implements a quasi-intelligent brief description abbreviator \n# that is used to form the text in various listings. 
Each string \n# in this list, if found as the leading text of the brief description, will be \n# stripped from the text and the result after processing the whole list, is \n# used as the annotated text. Otherwise, the brief description is used as-is. \n# If left blank, the following values are used (\"$name\" is automatically \n# replaced with the name of the entity): \"The $name class\" \"The $name widget\" \n# \"The $name file\" \"is\" \"provides\" \"specifies\" \"contains\" \n# \"represents\" \"a\" \"an\" \"the\"\n\nABBREVIATE_BRIEF       = \n\n# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then \n# Doxygen will generate a detailed section even if there is only a brief \n# description.\n\nALWAYS_DETAILED_SEC    = NO\n\n# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all \n# inherited members of a class in the documentation of that class as if those \n# members were ordinary class members. Constructors, destructors and assignment \n# operators of the base classes will not be shown.\n\nINLINE_INHERITED_MEMB  = NO\n\n# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full \n# path before files name in the file list and in the header files. If set \n# to NO the shortest path that makes the file name unique will be used.\n\nFULL_PATH_NAMES        = YES\n\n# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag \n# can be used to strip a user-defined part of the path. Stripping is \n# only done if one of the specified strings matches the left-hand part of \n# the path. The tag can be used to show relative paths in the file list. \n# If left blank the directory from which doxygen is run is used as the \n# path to strip.\n\nSTRIP_FROM_PATH        = \n\n# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of \n# the path mentioned in the documentation of a class, which tells \n# the reader which header file to include in order to use a class. 
\n# If left blank only the name of the header file containing the class \n# definition is used. Otherwise one should specify the include paths that \n# are normally passed to the compiler using the -I flag.\n\nSTRIP_FROM_INC_PATH    = \n\n# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter \n# (but less readable) file names. This can be useful if your file system \n# doesn't support long names like on DOS, Mac, or CD-ROM.\n\nSHORT_NAMES            = NO\n\n# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen \n# will interpret the first line (until the first dot) of a JavaDoc-style \n# comment as the brief description. If set to NO, the JavaDoc \n# comments will behave just like the Qt-style comments (thus requiring an \n# explicit @brief command for a brief description).\n\nJAVADOC_AUTOBRIEF      = NO\n\n# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen \n# treat a multi-line C++ special comment block (i.e. a block of //! or /// \n# comments) as a brief description. This used to be the default behaviour. \n# The new default is to treat a multi-line C++ comment block as a detailed \n# description. Set this tag to YES if you prefer the old behaviour instead.\n\nMULTILINE_CPP_IS_BRIEF = NO\n\n# If the DETAILS_AT_TOP tag is set to YES then Doxygen \n# will output the detailed description near the top, like JavaDoc.\n# If set to NO, the detailed description appears after the member \n# documentation.\n\nDETAILS_AT_TOP         = NO\n\n# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented \n# member inherits the documentation from any documented member that it \n# re-implements.\n\nINHERIT_DOCS           = YES\n\n# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce \n# a new page for each member. 
If set to NO, the documentation of a member will \n# be part of the file/class/namespace that contains it.\n\nSEPARATE_MEMBER_PAGES  = NO\n\n# The TAB_SIZE tag can be used to set the number of spaces in a tab. \n# Doxygen uses this value to replace tabs by spaces in code fragments.\n\nTAB_SIZE               = 8\n\n# This tag can be used to specify a number of aliases that acts \n# as commands in the documentation. An alias has the form \"name=value\". \n# For example adding \"sideeffect=\\par Side Effects:\\n\" will allow you to \n# put the command \\sideeffect (or @sideeffect) in the documentation, which \n# will result in a user-defined paragraph with heading \"Side Effects:\". \n# You can put \\n's in the value part of an alias to insert newlines.\n\nALIASES                = \n\n# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C \n# sources only. Doxygen will then generate output that is more tailored for C. \n# For instance, some of the names that are used will be different. The list \n# of all members will be omitted, etc.\n\nOPTIMIZE_OUTPUT_FOR_C  = NO\n\n# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java \n# sources only. Doxygen will then generate output that is more tailored for Java. \n# For instance, namespaces will be presented as packages, qualified scopes \n# will look different, etc.\n\nOPTIMIZE_OUTPUT_JAVA   = NO\n\n# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to \n# include (a tag file for) the STL sources as input, then you should \n# set this tag to YES in order to let doxygen match functions declarations and \n# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. \n# func(std::string) {}). 
This also makes the inheritance and collaboration \n# diagrams that involve STL classes more complete and accurate.\n\nBUILTIN_STL_SUPPORT    = NO\n\n# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC \n# tag is set to YES, then doxygen will reuse the documentation of the first \n# member in the group (if any) for the other members of the group. By default \n# all members of a group must be documented explicitly.\n\nDISTRIBUTE_GROUP_DOC   = NO\n\n# Set the SUBGROUPING tag to YES (the default) to allow class member groups of \n# the same type (for instance a group of public functions) to be put as a \n# subgroup of that type (e.g. under the Public Functions section). Set it to \n# NO to prevent subgrouping. Alternatively, this can be done per class using \n# the \\nosubgrouping command.\n\nSUBGROUPING            = YES\n\n#---------------------------------------------------------------------------\n# Build related configuration options\n#---------------------------------------------------------------------------\n\n# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in \n# documentation are documented, even if no documentation was available. \n# Private class members and static file members will be hidden unless \n# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES\n\nEXTRACT_ALL            = NO\n\n# If the EXTRACT_PRIVATE tag is set to YES all private members of a class \n# will be included in the documentation.\n\nEXTRACT_PRIVATE        = NO\n\n# If the EXTRACT_STATIC tag is set to YES all static members of a file \n# will be included in the documentation.\n\nEXTRACT_STATIC         = NO\n\n# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) \n# defined locally in source files will be included in the documentation. \n# If set to NO only classes defined in header files are included.\n\nEXTRACT_LOCAL_CLASSES  = YES\n\n# This flag is only useful for Objective-C code. 
When set to YES local \n# methods, which are defined in the implementation section but not in \n# the interface are included in the documentation. \n# If set to NO (the default) only methods in the interface are included.\n\nEXTRACT_LOCAL_METHODS  = NO\n\n# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all \n# undocumented members of documented classes, files or namespaces. \n# If set to NO (the default) these members will be included in the \n# various overviews, but no documentation section is generated. \n# This option has no effect if EXTRACT_ALL is enabled.\n\nHIDE_UNDOC_MEMBERS     = NO\n\n# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all \n# undocumented classes that are normally visible in the class hierarchy. \n# If set to NO (the default) these classes will be included in the various \n# overviews. This option has no effect if EXTRACT_ALL is enabled.\n\nHIDE_UNDOC_CLASSES     = NO\n\n# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all \n# friend (class|struct|union) declarations. \n# If set to NO (the default) these declarations will be included in the \n# documentation.\n\nHIDE_FRIEND_COMPOUNDS  = NO\n\n# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any \n# documentation blocks found inside the body of a function. \n# If set to NO (the default) these blocks will be appended to the \n# function's detailed documentation block.\n\nHIDE_IN_BODY_DOCS      = NO\n\n# The INTERNAL_DOCS tag determines if documentation \n# that is typed after a \\internal command is included. If the tag is set \n# to NO (the default) then the documentation will be excluded. \n# Set it to YES to include the internal documentation.\n\nINTERNAL_DOCS          = NO\n\n# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate \n# file names in lower-case letters. If set to YES upper-case letters are also \n# allowed. 
This is useful if you have classes or files whose names only differ \n# in case and if your file system supports case sensitive file names. Windows \n# and Mac users are advised to set this option to NO.\n\nCASE_SENSE_NAMES       = YES\n\n# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen \n# will show members with their full class and namespace scopes in the \n# documentation. If set to YES the scope will be hidden.\n\nHIDE_SCOPE_NAMES       = NO\n\n# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen \n# will put a list of the files that are included by a file in the documentation \n# of that file.\n\nSHOW_INCLUDE_FILES     = YES\n\n# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] \n# is inserted in the documentation for inline members.\n\nINLINE_INFO            = YES\n\n# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen \n# will sort the (detailed) documentation of file and class members \n# alphabetically by member name. If set to NO the members will appear in \n# declaration order.\n\nSORT_MEMBER_DOCS       = YES\n\n# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the \n# brief documentation of file, namespace and class members alphabetically \n# by member name. If set to NO (the default) the members will appear in \n# declaration order.\n\nSORT_BRIEF_DOCS        = NO\n\n# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be \n# sorted by fully-qualified names, including namespaces. If set to \n# NO (the default), the class list will be sorted only by class name, \n# not including the namespace part. \n# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.\n# Note: This option applies only to the class list, not to the \n# alphabetical list.\n\nSORT_BY_SCOPE_NAME     = NO\n\n# The GENERATE_TODOLIST tag can be used to enable (YES) or \n# disable (NO) the todo list. 
This list is created by putting \\todo \n# commands in the documentation.\n\nGENERATE_TODOLIST      = YES\n\n# The GENERATE_TESTLIST tag can be used to enable (YES) or \n# disable (NO) the test list. This list is created by putting \\test \n# commands in the documentation.\n\nGENERATE_TESTLIST      = YES\n\n# The GENERATE_BUGLIST tag can be used to enable (YES) or \n# disable (NO) the bug list. This list is created by putting \\bug \n# commands in the documentation.\n\nGENERATE_BUGLIST       = YES\n\n# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or \n# disable (NO) the deprecated list. This list is created by putting \n# \\deprecated commands in the documentation.\n\nGENERATE_DEPRECATEDLIST= YES\n\n# The ENABLED_SECTIONS tag can be used to enable conditional \n# documentation sections, marked by \\if sectionname ... \\endif.\n\nENABLED_SECTIONS       = \n\n# The MAX_INITIALIZER_LINES tag determines the maximum number of lines \n# the initial value of a variable or define consists of for it to appear in \n# the documentation. If the initializer consists of more lines than specified \n# here it will be hidden. Use a value of 0 to hide initializers completely. \n# The appearance of the initializer of individual variables and defines in the \n# documentation can be controlled using \\showinitializer or \\hideinitializer \n# command in the documentation regardless of this setting.\n\nMAX_INITIALIZER_LINES  = 30\n\n# Set the SHOW_USED_FILES tag to NO to disable the list of files generated \n# at the bottom of the documentation of classes and structs. If set to YES the \n# list will mention the files that were used to generate the documentation.\n\nSHOW_USED_FILES        = YES\n\n# If the sources in your project are distributed over multiple directories \n# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy \n# in the documentation. 
The default is NO.\n\nSHOW_DIRECTORIES       = NO\n\n# The FILE_VERSION_FILTER tag can be used to specify a program or script that \n# doxygen should invoke to get the current version for each file (typically from the \n# version control system). Doxygen will invoke the program by executing (via \n# popen()) the command <command> <input-file>, where <command> is the value of \n# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file \n# provided by doxygen. Whatever the program writes to standard output \n# is used as the file version. See the manual for examples.\n\nFILE_VERSION_FILTER    = \n\n#---------------------------------------------------------------------------\n# configuration options related to warning and progress messages\n#---------------------------------------------------------------------------\n\n# The QUIET tag can be used to turn on/off the messages that are generated \n# by doxygen. Possible values are YES and NO. If left blank NO is used.\n\nQUIET                  = NO\n\n# The WARNINGS tag can be used to turn on/off the warning messages that are \n# generated by doxygen. Possible values are YES and NO. If left blank \n# NO is used.\n\nWARNINGS               = YES\n\n# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings \n# for undocumented members. If EXTRACT_ALL is set to YES then this flag will \n# automatically be disabled.\n\nWARN_IF_UNDOCUMENTED   = YES\n\n# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for \n# potential errors in the documentation, such as not documenting some \n# parameters in a documented function, or documenting parameters that \n# don't exist or using markup commands wrongly.\n\nWARN_IF_DOC_ERROR      = YES\n\n# This WARN_NO_PARAMDOC option can be enabled to get warnings for \n# functions that are documented, but have no documentation for their parameters \n# or return value. 
If set to NO (the default) doxygen will only warn about \n# wrong or incomplete parameter documentation, but not about the absence of \n# documentation.\n\nWARN_NO_PARAMDOC       = NO\n\n# The WARN_FORMAT tag determines the format of the warning messages that \n# doxygen can produce. The string should contain the $file, $line, and $text \n# tags, which will be replaced by the file and line number from which the \n# warning originated and the warning text. Optionally the format may contain \n# $version, which will be replaced by the version of the file (if it could \n# be obtained via FILE_VERSION_FILTER)\n\nWARN_FORMAT            = \"$file:$line: $text\"\n\n# The WARN_LOGFILE tag can be used to specify a file to which warning \n# and error messages should be written. If left blank the output is written \n# to stderr.\n\nWARN_LOGFILE           = \n\n#---------------------------------------------------------------------------\n# configuration options related to the input files\n#---------------------------------------------------------------------------\n\n# The INPUT tag can be used to specify the files and/or directories that contain \n# documented source files. You may enter file names like \"myfile.cpp\" or \n# directories like \"/usr/src/myproject\". Separate the files or directories \n# with spaces.\n\nINPUT                  = nvmemul.dox TODO.dox src/\n\n# If the value of the INPUT tag contains directories, you can use the \n# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp \n# and *.h) to filter out the source-files in the directories. If left \n# blank the following patterns are tested: \n# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx \n# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py\n\nFILE_PATTERNS          = \n\n# The RECURSIVE tag can be used to specify whether or not subdirectories \n# should be searched for input files as well. Possible values are YES and NO. 
\n# If left blank NO is used.\n\nRECURSIVE              = YES\n\n# The EXCLUDE tag can be used to specify files and/or directories that should be \n# excluded from the INPUT source files. This way you can easily exclude a \n# subdirectory from a directory tree whose root is specified with the INPUT tag.\n\nEXCLUDE                = \n\n# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or \n# directories that are symbolic links (a Unix filesystem feature) are excluded \n# from the input.\n\nEXCLUDE_SYMLINKS       = NO\n\n# If the value of the INPUT tag contains directories, you can use the \n# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude \n# certain files from those directories. Note that the wildcards are matched \n# against the file with absolute path, so to exclude all test directories \n# for example use the pattern */test/*\n\nEXCLUDE_PATTERNS       = \n\n# The EXAMPLE_PATH tag can be used to specify one or more files or \n# directories that contain example code fragments that are included (see \n# the \\include command).\n\nEXAMPLE_PATH           = \n\n# If the value of the EXAMPLE_PATH tag contains directories, you can use the \n# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp \n# and *.h) to filter out the source-files in the directories. If left \n# blank all files are included.\n\nEXAMPLE_PATTERNS       = \n\n# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be \n# searched for input files to be used with the \\include or \\dontinclude \n# commands irrespective of the value of the RECURSIVE tag. \n# Possible values are YES and NO. 
If left blank NO is used.\n\nEXAMPLE_RECURSIVE      = NO\n\n# The IMAGE_PATH tag can be used to specify one or more files or \n# directories that contain image that are included in the documentation (see \n# the \\image command).\n\nIMAGE_PATH             = ./doc/figures\n\n# The INPUT_FILTER tag can be used to specify a program that doxygen should \n# invoke to filter for each input file. Doxygen will invoke the filter program \n# by executing (via popen()) the command <filter> <input-file>, where <filter> \n# is the value of the INPUT_FILTER tag, and <input-file> is the name of an \n# input file. Doxygen will then use the output that the filter program writes \n# to standard output.  If FILTER_PATTERNS is specified, this tag will be \n# ignored.\n\nINPUT_FILTER           = \n\n# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern \n# basis.  Doxygen will compare the file name with each pattern and apply the \n# filter if there is a match.  The filters are a list of the form: \n# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further \n# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER \n# is applied to all files.\n\nFILTER_PATTERNS        = \n\n# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using \n# INPUT_FILTER) will be used to filter the input files when producing source \n# files to browse (i.e. when SOURCE_BROWSER is set to YES).\n\nFILTER_SOURCE_FILES    = NO\n\n#---------------------------------------------------------------------------\n# configuration options related to source browsing\n#---------------------------------------------------------------------------\n\n# If the SOURCE_BROWSER tag is set to YES then a list of source files will \n# be generated. Documented entities will be cross-referenced with these sources. 
\n# Note: To get rid of all source code in the generated output, make sure also \n# VERBATIM_HEADERS is set to NO.\n\nSOURCE_BROWSER         = YES\n\n# Setting the INLINE_SOURCES tag to YES will include the body \n# of functions and classes directly in the documentation.\n\nINLINE_SOURCES         = NO\n\n# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct \n# doxygen to hide any special comment blocks from generated source code \n# fragments. Normal C and C++ comments will always remain visible.\n\nSTRIP_CODE_COMMENTS    = YES\n\n# If the REFERENCED_BY_RELATION tag is set to YES (the default) \n# then for each documented function all documented \n# functions referencing it will be listed.\n\nREFERENCED_BY_RELATION = YES\n\n# If the REFERENCES_RELATION tag is set to YES (the default) \n# then for each documented function all documented entities \n# called/used by that function will be listed.\n\nREFERENCES_RELATION    = YES\n\n# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)\n# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from\n# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will\n# link to the source code.  Otherwise they will link to the documentation.\n\nREFERENCES_LINK_SOURCE = YES\n\n# If the USE_HTAGS tag is set to YES then the references to source code \n# will point to the HTML generated by the htags(1) tool instead of doxygen \n# built-in source browser. The htags tool is part of GNU's global source \n# tagging system (see http://www.gnu.org/software/global/global.html). You \n# will need version 4.8.6 or higher.\n\nUSE_HTAGS              = NO\n\n# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen \n# will generate a verbatim copy of the header file for each class for \n# which an include is specified. 
Set to NO to disable this.\n\nVERBATIM_HEADERS       = YES\n\n#---------------------------------------------------------------------------\n# configuration options related to the alphabetical class index\n#---------------------------------------------------------------------------\n\n# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index \n# of all compounds will be generated. Enable this if the project \n# contains a lot of classes, structs, unions or interfaces.\n\nALPHABETICAL_INDEX     = YES\n\n# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then \n# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns \n# in which this list will be split (can be a number in the range [1..20])\n\nCOLS_IN_ALPHA_INDEX    = 5\n\n# In case all classes in a project start with a common prefix, all \n# classes will be put under the same header in the alphabetical index. \n# The IGNORE_PREFIX tag can be used to specify one or more prefixes that \n# should be ignored while generating the index headers.\n\nIGNORE_PREFIX          = \n\n#---------------------------------------------------------------------------\n# configuration options related to the HTML output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_HTML tag is set to YES (the default) Doxygen will \n# generate HTML output.\n\nGENERATE_HTML          = YES\n\n# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. \n# If a relative path is entered the value of OUTPUT_DIRECTORY will be \n# put in front of it. If left blank `html' will be used as the default path.\n\nHTML_OUTPUT            = html\n\n# The HTML_FILE_EXTENSION tag can be used to specify the file extension for \n# each generated HTML page (for example: .htm,.php,.asp). 
If it is left blank \n# doxygen will generate files with .html extension.\n\nHTML_FILE_EXTENSION    = .html\n\n# The HTML_HEADER tag can be used to specify a personal HTML header for \n# each generated HTML page. If it is left blank doxygen will generate a \n# standard header.\n\nHTML_HEADER            = \n\n# The HTML_FOOTER tag can be used to specify a personal HTML footer for \n# each generated HTML page. If it is left blank doxygen will generate a \n# standard footer.\n\nHTML_FOOTER            = \n\n# The HTML_STYLESHEET tag can be used to specify a user-defined cascading \n# style sheet that is used by each HTML page. It can be used to \n# fine-tune the look of the HTML output. If the tag is left blank doxygen \n# will generate a default style sheet. Note that doxygen will try to copy \n# the style sheet file to the HTML output directory, so don't put your own \n# stylesheet in the HTML output directory as well, or it will be erased!\n\nHTML_STYLESHEET        = \n\n# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, \n# files or namespaces will be aligned in HTML using tables. If set to \n# NO a bullet list will be used.\n\nHTML_ALIGN_MEMBERS     = YES\n\n# If the GENERATE_HTMLHELP tag is set to YES, additional index files \n# will be generated that can be used as input for tools like the \n# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) \n# of the generated HTML documentation.\n\nGENERATE_HTMLHELP      = YES\n\n# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can \n# be used to specify the file name of the resulting .chm file. You \n# can add a path in front of the file if the result should not be \n# written to the html output directory.\n\nCHM_FILE               = \n\n# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can \n# be used to specify the location (absolute path including file name) of \n# the HTML help compiler (hhc.exe). 
If non-empty doxygen will try to run \n# the HTML help compiler on the generated index.hhp.\n\nHHC_LOCATION           = \n\n# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag \n# controls if a separate .chi index file is generated (YES) or that \n# it should be included in the master .chm file (NO).\n\nGENERATE_CHI           = NO\n\n# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag \n# controls whether a binary table of contents is generated (YES) or a \n# normal table of contents (NO) in the .chm file.\n\nBINARY_TOC             = NO\n\n# The TOC_EXPAND flag can be set to YES to add extra items for group members \n# to the contents of the HTML help documentation and to the tree view.\n\nTOC_EXPAND             = YES\n\n# The DISABLE_INDEX tag can be used to turn on/off the condensed index at \n# top of each HTML page. The value NO (the default) enables the index and \n# the value YES disables it.\n\nDISABLE_INDEX          = NO\n\n# This tag can be used to set the number of enum values (range [1..20]) \n# that doxygen will group on one line in the generated HTML documentation.\n\nENUM_VALUES_PER_LINE   = 4\n\n# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be\n# generated containing a tree-like index structure (just like the one that \n# is generated for HTML Help). For this to work a browser that supports \n# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, \n# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). 
Windows users are \n# probably better off using the HTML help feature.\n\nGENERATE_TREEVIEW      = YES\n\n# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be \n# used to set the initial width (in pixels) of the frame in which the tree \n# is shown.\n\nTREEVIEW_WIDTH         = 250\n\n#---------------------------------------------------------------------------\n# configuration options related to the LaTeX output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will \n# generate Latex output.\n\nGENERATE_LATEX         = NO\n\n# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. \n# If a relative path is entered the value of OUTPUT_DIRECTORY will be \n# put in front of it. If left blank `latex' will be used as the default path.\n\nLATEX_OUTPUT           = latex\n\n# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be \n# invoked. If left blank `latex' will be used as the default command name.\n\nLATEX_CMD_NAME         = latex\n\n# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to \n# generate index for LaTeX. If left blank `makeindex' will be used as the \n# default command name.\n\nMAKEINDEX_CMD_NAME     = makeindex\n\n# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact \n# LaTeX documents. This may be useful for small projects and may help to \n# save some trees in general.\n\nCOMPACT_LATEX          = NO\n\n# The PAPER_TYPE tag can be used to set the paper type that is used \n# by the printer. Possible values are: a4, a4wide, letter, legal and \n# executive. 
If left blank a4wide will be used.\n\nPAPER_TYPE             = a4wide\n\n# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX \n# packages that should be included in the LaTeX output.\n\nEXTRA_PACKAGES         = \n\n# The LATEX_HEADER tag can be used to specify a personal LaTeX header for \n# the generated latex document. The header should contain everything until \n# the first chapter. If it is left blank doxygen will generate a \n# standard header. Notice: only use this tag if you know what you are doing!\n\nLATEX_HEADER           = \n\n# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated \n# is prepared for conversion to pdf (using ps2pdf). The pdf file will \n# contain links (just like the HTML output) instead of page references \n# This makes the output suitable for online browsing using a pdf viewer.\n\nPDF_HYPERLINKS         = NO\n\n# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of \n# plain latex in the generated Makefile. Set this option to YES to get a \n# higher quality PDF documentation.\n\nUSE_PDFLATEX           = NO\n\n# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\\\batchmode. \n# command to the generated LaTeX files. This will instruct LaTeX to keep \n# running if errors occur, instead of asking the user for help. \n# This option is also used when generating formulas in HTML.\n\nLATEX_BATCHMODE        = NO\n\n# If LATEX_HIDE_INDICES is set to YES then doxygen will not \n# include the index chapters (such as File Index, Compound Index, etc.) 
\n# in the output.\n\nLATEX_HIDE_INDICES     = NO\n\n#---------------------------------------------------------------------------\n# configuration options related to the RTF output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output \n# The RTF output is optimized for Word 97 and may not look very pretty with \n# other RTF readers or editors.\n\nGENERATE_RTF           = NO\n\n# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. \n# If a relative path is entered the value of OUTPUT_DIRECTORY will be \n# put in front of it. If left blank `rtf' will be used as the default path.\n\nRTF_OUTPUT             = rtf\n\n# If the COMPACT_RTF tag is set to YES Doxygen generates more compact \n# RTF documents. This may be useful for small projects and may help to \n# save some trees in general.\n\nCOMPACT_RTF            = NO\n\n# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated \n# will contain hyperlink fields. The RTF file will \n# contain links (just like the HTML output) instead of page references. \n# This makes the output suitable for online browsing using WORD or other \n# programs which support those fields. \n# Note: wordpad (write) and others do not support links.\n\nRTF_HYPERLINKS         = NO\n\n# Load stylesheet definitions from file. Syntax is similar to doxygen's \n# config file, i.e. a series of assignments. You only have to provide \n# replacements, missing definitions are set to their default value.\n\nRTF_STYLESHEET_FILE    = \n\n# Set optional variables used in the generation of an rtf document. 
\n# Syntax is similar to doxygen's config file.\n\nRTF_EXTENSIONS_FILE    = \n\n#---------------------------------------------------------------------------\n# configuration options related to the man page output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_MAN tag is set to YES (the default) Doxygen will \n# generate man pages\n\nGENERATE_MAN           = NO\n\n# The MAN_OUTPUT tag is used to specify where the man pages will be put. \n# If a relative path is entered the value of OUTPUT_DIRECTORY will be \n# put in front of it. If left blank `man' will be used as the default path.\n\nMAN_OUTPUT             = man\n\n# The MAN_EXTENSION tag determines the extension that is added to \n# the generated man pages (default is the subroutine's section .3)\n\nMAN_EXTENSION          = .3\n\n# If the MAN_LINKS tag is set to YES and Doxygen generates man output, \n# then it will generate one additional man file for each entity \n# documented in the real man page(s). These additional files \n# only source the real man page, but without them the man command \n# would be unable to find the correct page. The default is NO.\n\nMAN_LINKS              = NO\n\n#---------------------------------------------------------------------------\n# configuration options related to the XML output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_XML tag is set to YES Doxygen will \n# generate an XML file that captures the structure of \n# the code including all documentation.\n\nGENERATE_XML           = NO\n\n# The XML_OUTPUT tag is used to specify where the XML pages will be put. \n# If a relative path is entered the value of OUTPUT_DIRECTORY will be \n# put in front of it. 
If left blank `xml' will be used as the default path.\n\nXML_OUTPUT             = xml\n\n# The XML_SCHEMA tag can be used to specify an XML schema, \n# which can be used by a validating XML parser to check the \n# syntax of the XML files.\n\nXML_SCHEMA             = \n\n# The XML_DTD tag can be used to specify an XML DTD, \n# which can be used by a validating XML parser to check the \n# syntax of the XML files.\n\nXML_DTD                = \n\n# If the XML_PROGRAMLISTING tag is set to YES Doxygen will \n# dump the program listings (including syntax highlighting \n# and cross-referencing information) to the XML output. Note that \n# enabling this will significantly increase the size of the XML output.\n\nXML_PROGRAMLISTING     = YES\n\n#---------------------------------------------------------------------------\n# configuration options for the AutoGen Definitions output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will \n# generate an AutoGen Definitions (see autogen.sf.net) file \n# that captures the structure of the code including all \n# documentation. Note that this feature is still experimental \n# and incomplete at the moment.\n\nGENERATE_AUTOGEN_DEF   = NO\n\n#---------------------------------------------------------------------------\n# configuration options related to the Perl module output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_PERLMOD tag is set to YES Doxygen will \n# generate a Perl module file that captures the structure of \n# the code including all documentation. 
Note that this \n# feature is still experimental and incomplete at the \n# moment.\n\nGENERATE_PERLMOD       = NO\n\n# If the PERLMOD_LATEX tag is set to YES Doxygen will generate \n# the necessary Makefile rules, Perl scripts and LaTeX code to be able \n# to generate PDF and DVI output from the Perl module output.\n\nPERLMOD_LATEX          = NO\n\n# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be \n# nicely formatted so it can be parsed by a human reader.  This is useful \n# if you want to understand what is going on.  On the other hand, if this \n# tag is set to NO the size of the Perl module output will be much smaller \n# and Perl will parse it just the same.\n\nPERLMOD_PRETTY         = YES\n\n# The names of the make variables in the generated doxyrules.make file \n# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. \n# This is useful so different doxyrules.make files included by the same \n# Makefile don't overwrite each other's variables.\n\nPERLMOD_MAKEVAR_PREFIX = \n\n#---------------------------------------------------------------------------\n# Configuration options related to the preprocessor   \n#---------------------------------------------------------------------------\n\n# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will \n# evaluate all C-preprocessor directives found in the sources and include \n# files.\n\nENABLE_PREPROCESSING   = YES\n\n# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro \n# names in the source code. If set to NO (the default) only conditional \n# compilation will be performed. 
Macro expansion can be done in a controlled \n# way by setting EXPAND_ONLY_PREDEF to YES.\n\nMACRO_EXPANSION        = NO\n\n# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES \n# then the macro expansion is limited to the macros specified with the \n# PREDEFINED and EXPAND_AS_DEFINED tags.\n\nEXPAND_ONLY_PREDEF     = NO\n\n# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files \n# in the INCLUDE_PATH (see below) will be search if a #include is found.\n\nSEARCH_INCLUDES        = YES\n\n# The INCLUDE_PATH tag can be used to specify one or more directories that \n# contain include files that are not input files but should be processed by \n# the preprocessor.\n\nINCLUDE_PATH           = \n\n# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard \n# patterns (like *.h and *.hpp) to filter out the header-files in the \n# directories. If left blank, the patterns specified with FILE_PATTERNS will \n# be used.\n\nINCLUDE_FILE_PATTERNS  = \n\n# The PREDEFINED tag can be used to specify one or more macro names that \n# are defined before the preprocessor is started (similar to the -D option of \n# gcc). The argument of the tag is a list of macros of the form: name \n# or name=definition (no spaces). If the definition and the = are \n# omitted =1 is assumed. To prevent a macro definition from being \n# undefined via #undef or recursively expanded use the := operator \n# instead of the = operator.\n\nPREDEFINED             = \n\n# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then \n# this tag can be used to specify a list of macro names that should be expanded. \n# The macro definition that is found in the sources will be used. 
\n# Use the PREDEFINED tag if you want to use a different macro definition.\n\nEXPAND_AS_DEFINED      = \n\n# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then \n# doxygen's preprocessor will remove all function-like macros that are alone \n# on a line, have an all uppercase name, and do not end with a semicolon. Such \n# function macros are typically used for boiler-plate code, and will confuse \n# the parser if not removed.\n\nSKIP_FUNCTION_MACROS   = YES\n\n#---------------------------------------------------------------------------\n# Configuration::additions related to external references   \n#---------------------------------------------------------------------------\n\n# The TAGFILES option can be used to specify one or more tagfiles. \n# Optionally an initial location of the external documentation \n# can be added for each tagfile. The format of a tag file without \n# this location is as follows: \n#   TAGFILES = file1 file2 ... \n# Adding location for the tag files is done as follows: \n#   TAGFILES = file1=loc1 \"file2 = loc2\" ... \n# where \"loc1\" and \"loc2\" can be relative or absolute paths or \n# URLs. If a location is present for each tag, the installdox tool \n# does not have to be run to correct the links.\n# Note that each tag file must have a unique name\n# (where the name does NOT include the path)\n# If a tag file is not located in the directory in which doxygen \n# is run, you must also specify the path to the tagfile here.\n\nTAGFILES               = \n\n# When a file name is specified after GENERATE_TAGFILE, doxygen will create \n# a tag file that is based on the input files it reads.\n\nGENERATE_TAGFILE       = \n\n# If the ALLEXTERNALS tag is set to YES all external classes will be listed \n# in the class index. If set to NO only the inherited external classes \n# will be listed.\n\nALLEXTERNALS           = NO\n\n# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed \n# in the modules index. 
If set to NO, only the current project's groups will \n# be listed.\n\nEXTERNAL_GROUPS        = YES\n\n# The PERL_PATH should be the absolute path and name of the perl script \n# interpreter (i.e. the result of `which perl').\n\nPERL_PATH              = /usr/bin/perl\n\n#---------------------------------------------------------------------------\n# Configuration options related to the dot tool   \n#---------------------------------------------------------------------------\n\n# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will \n# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base \n# or super classes. Setting the tag to NO turns the diagrams off. Note that \n# this option is superseded by the HAVE_DOT option below. This is only a \n# fallback. It is recommended to install and use dot, since it yields more \n# powerful graphs.\n\nCLASS_DIAGRAMS         = YES\n\n# If set to YES, the inheritance and collaboration graphs will hide \n# inheritance and usage relations if the target is undocumented \n# or is not a class.\n\nHIDE_UNDOC_RELATIONS   = YES\n\n# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is \n# available from the path. This tool is part of Graphviz, a graph visualization \n# toolkit from AT&T and Lucent Bell Labs. The other options in this section \n# have no effect if this option is set to NO (the default)\n\nHAVE_DOT               = NO\n\n# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen \n# will generate a graph for each documented class showing the direct and \n# indirect inheritance relations. 
Setting this tag to YES will force the \n# the CLASS_DIAGRAMS tag to NO.\n\nCLASS_GRAPH            = YES\n\n# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen \n# will generate a graph for each documented class showing the direct and \n# indirect implementation dependencies (inheritance, containment, and \n# class references variables) of the class with other documented classes.\n\nCOLLABORATION_GRAPH    = YES\n\n# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen \n# will generate a graph for groups, showing the direct groups dependencies\n\nGROUP_GRAPHS           = YES\n\n# If the UML_LOOK tag is set to YES doxygen will generate inheritance and \n# collaboration diagrams in a style similar to the OMG's Unified Modeling \n# Language.\n\nUML_LOOK               = NO\n\n# If set to YES, the inheritance and collaboration graphs will show the \n# relations between templates and their instances.\n\nTEMPLATE_RELATIONS     = NO\n\n# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT \n# tags are set to YES then doxygen will generate a graph for each documented \n# file showing the direct and indirect include dependencies of the file with \n# other documented files.\n\nINCLUDE_GRAPH          = YES\n\n# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and \n# HAVE_DOT tags are set to YES then doxygen will generate a graph for each \n# documented header file showing the documented files that directly or \n# indirectly include this file.\n\nINCLUDED_BY_GRAPH      = YES\n\n# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will \n# generate a call dependency graph for every global function or class method. \n# Note that enabling this option will significantly increase the time of a run. 
\n# So in most cases it will be better to enable call graphs for selected \n# functions only using the \\callgraph command.\n\nCALL_GRAPH             = NO\n\n# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then doxygen will \n# generate a caller dependency graph for every global function or class method. \n# Note that enabling this option will significantly increase the time of a run. \n# So in most cases it will be better to enable caller graphs for selected \n# functions only using the \\callergraph command.\n\nCALLER_GRAPH           = NO\n\n# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen \n# will graphical hierarchy of all classes instead of a textual one.\n\nGRAPHICAL_HIERARCHY    = YES\n\n# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES \n# then doxygen will show the dependencies a directory has on other directories \n# in a graphical way. The dependency relations are determined by the #include\n# relations between the files in the directories.\n\nDIRECTORY_GRAPH        = YES\n\n# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images \n# generated by dot. Possible values are png, jpg, or gif\n# If left blank png will be used.\n\nDOT_IMAGE_FORMAT       = png\n\n# The tag DOT_PATH can be used to specify the path where the dot tool can be \n# found. If left blank, it is assumed the dot tool can be found in the path.\n\nDOT_PATH               = \n\n# The DOTFILE_DIRS tag can be used to specify one or more directories that \n# contain dot files that are included in the documentation (see the \n# \\dotfile command).\n\nDOTFILE_DIRS           = \n\n# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width \n# (in pixels) of the graphs generated by dot. If a graph becomes larger than \n# this value, doxygen will try to truncate the graph, so that it fits within \n# the specified constraint. 
Beware that most browsers cannot cope with very \n# large images.\n\nMAX_DOT_GRAPH_WIDTH    = 1024\n\n# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allows height \n# (in pixels) of the graphs generated by dot. If a graph becomes larger than \n# this value, doxygen will try to truncate the graph, so that it fits within \n# the specified constraint. Beware that most browsers cannot cope with very \n# large images.\n\nMAX_DOT_GRAPH_HEIGHT   = 1024\n\n# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the \n# graphs generated by dot. A depth value of 3 means that only nodes reachable \n# from the root by following a path via at most 3 edges will be shown. Nodes \n# that lay further from the root node will be omitted. Note that setting this \n# option to 1 or 2 may greatly reduce the computation time needed for large \n# code bases. Also note that a graph may be further truncated if the graph's \n# image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH \n# and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default), \n# the graph is not depth-constrained.\n\nMAX_DOT_GRAPH_DEPTH    = 0\n\n# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent \n# background. This is disabled by default, which results in a white background. \n# Warning: Depending on the platform used, enabling this option may lead to \n# badly anti-aliased labels on the edges of a graph (i.e. they become hard to \n# read).\n\nDOT_TRANSPARENT        = NO\n\n# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output \n# files in one run (i.e. multiple -o and -T options on the command line). 
This \n# makes dot run faster, but since only newer versions of dot (>1.8.10) \n# support this, this feature is disabled by default.\n\nDOT_MULTI_TARGETS      = NO\n\n# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will \n# generate a legend page explaining the meaning of the various boxes and \n# arrows in the dot generated graphs.\n\nGENERATE_LEGEND        = YES\n\n# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will \n# remove the intermediate dot files that are used to generate \n# the various graphs.\n\nDOT_CLEANUP            = YES\n\n#---------------------------------------------------------------------------\n# Configuration::additions related to the search engine   \n#---------------------------------------------------------------------------\n\n# The SEARCHENGINE tag specifies whether or not a search engine should be \n# used. If set to NO the values of all tags below this one will be ignored.\n\nSEARCHENGINE           = NO\n"
  },
  {
    "path": "README-BENCHMARKS-TESTING.md",
    "content": "**For testing whether your environment is configured correctly for\nrunning Quartz** (e.g., whether you set all the required environmental\nvariables, etc.) **we have created a few scripts with benchmarks, which\ncan be executed automatically** and which can provide you with a\nfeedback on Quartz performance in your environment.\n\n**The directory with these scripts is called: *benchmark-tests*. There are three scripts which you can run:**\n- **bandwidth-model-building.sh**\n\n   This script will execute for approximately **10 min** and will build a memory\n   bandwidth model that can be used in the experiments with memory bandwidth\n   throttling. The configuration file uses a \"debug\" mode on purpose -- that\n   you can see the messages on the screen about the progress of the memory\n   bandwidth  model building, which can be found at */tmp/bandwidth_model*\n\n- **memlat-orig-lat-test.sh**\n\n    This script will measure your server hardware *memory access latency* in nanoseconds: local\n    and remote (for two sockets servers).  It will execute the test 20 times, and   write the results in directory *ORIG-lat-test*.\n    You can find the summary of the results in the file *ORIG-lat-test/final-hw-latency.txt*.\n    It will have measurements like:\n    \n               FORMAT:  1_min_local  2_aver_local  3_max_local  4_min_remote  5_aver_remote  6_max_remote\n                           91             91.9           92           152        163.9           176\n   \n    First three numbers show: minimal, average and maximum measured local\n    memory access latency (in ns, over 20 measurements). 
The last three numbers\n    show similar measurements for  access latency of the remote memory,\n    i.e., in the second socket.\n\n-  **memlat-bench-test-10M.sh**\n\n    This script will execute memlat benchmark (pointer-chasing benchmark) with\n    nine emulated memory access latencies: 200 ns, 300 ns,..., 1000 ns.\n    It will run the benchmark with these emulated latencies in two settings:\n    in the local socket (i.e., emulating a higher memory access latency in the\n    local socket) and similarly, in the remote socket.\n    Each test is repeated 10 times: this is used for assessing the variability\n    of  your environment. In some cases, we had issues with TurboBoost mode, \\\n    which did impact the quality of the emulation...\n    This test might take **approx. 30 min to finish** (since it executes 180 tests),\n    and will create two output directories:  *FULL-RESULTS-test*  and\n    *SUMMARY-RESULTS-test*\n    In the directory SUMMARY-RESULTS-test, you will find two files that\n    summarize the outcome of the experiments in the local and remote sockets.\n    The outcome should look like this:\n    \n          FORMAT: 1_emul_lat  2_min_meas_lat  3_aver_meas_lat  4_max_meas_lat  5_aver_error(%) 6_max_error(%)\n                   200           177            197.9             204              1.05            11.5\n                   300           259            289.5             300              3.5             13.6  \n                   400           354            382.6             395              4.3             11.5\n                   500           468            485.8             490              2.8             6.4\n                   600           554            575.3             585              4.1             7.6\n                   700           640            666.6             681              4.7             8.5\n                   800           749            766.4             776              4.2             6.3\n                  
 900           851            866.2             871              3.7             5.4\n                   1000          926            956.5             966              4.35            7.4\n    \n          The format is the following:\n          1st column:    emulated latency (in nanoseconds)\n          2nd column:    minimum measured  latency (across 10 tests, in ns)\n          3rd column:    average measured  latency (across 10 tests, in ns)\n          4th column:    maximum measured  latency (across 10 tests, in ns)\n          5th column:    average error (between emulated and measured latencies, in %)\n          6th column:    max error (between emulated and measured latencies, in %)\n\nOne of the goals of the designed performance emulator is to provide a\nframework for application sensitivity studies under different\nlatencies and memory bandwidth. Even if you have 15% deviation (error) from\nthe targeted emulated latencies, but the benchmark measurements are\nconsistent -- this is a good sign that you can perform a good\nsensitivity study.\n"
  },
  {
    "path": "README.md",
    "content": "\nQuartz: A DRAM-based performance emulator for NVM\n----------------------\n\nQuartz leverages features available in commodity hardware to emulate\ndifferent latency and bandwidth characteristics of future\nbyte-addressable NVM technologies.\n\nQuartz's design, implementation details, evaluation, and overhead  can be found \nin the following research paper:\n - **H. Volos, G. Magalhaes, L. Cherkasova, J. Li: Quartz: A Lightweight \n   Performance Emulator for Persistent Memory Software. In Proc. of the \n   16th ACM/IFIP/USENIX International Middleware Conference, (Middleware'2015),\n   Vancouver, Canada, December 8-11, 2015.  and can be downloaded from:\n   http://www.jahrhundert.net/papers/middleware2015.pdf**\n\nWhile the emulator is designed to cover three processor families:\n*Sandy Bridge, Ivy Bridge*, and *Haswell* -- we have had the best results\non the *Ivy Bridge* platform. Haswell processor has a TurboBoost feature\nthat cause higher variance and deviations when emulating higher range\nlatencies (above 600 ns).\n\nContributors\n----------------------\nFor a list of contributors see [AUTHORS](https://github.hpe.com/labs/quartz/blob/master/AUTHORS). \n\nExtended documentation\n----------------------\nExtended documentation available in Doxygen form. To build and view:\n\n    doxygen\n    xdg-open doc/html/index.html\n\n\nDependencies\n------------\nThis is the list of libraries and tools used by Quartz:\n\nOn RPM based distributions:\n- cmake 2.8\n- libconfig and libconfig-devel\n- numactl-devel\n- uthash-devel\n- kernel-devel\n\nOn Debian based distributions:\n- cmake 2.8\n- libconfig-dev\n- libnuma-dev\n- uthash-dev\n- linux-headers\n\nYou can run 'sudo scripts/install.sh' in order to automatically install these \ndependencies.\n\n\nSupported environment\n---------------------\nCurrently the latency emulator can be used on Linux with *Sandy Bridge, \nIvy Bridge*, and *Haswell* Intel processors. 
For bandwidth emulation support, Intel \nThermal Memory Controller device is required.\nNo specific Linux distribution or kernel version is required.\n\n\nSource code tree overview\n-------------------------\n\n    bench             Benchmarks\n    doc               Documentation, including Doxygen generated documentation (doc/html)\n    src/lib           Emulator main library code\n    src/dev           Kernel-module for accessing performance counters and \n                      memory-controller PCI registers\n    scripts           Helper scripts to run a program using the emulator and install \n                      dependencies\n    test              Several tests and application code examples\n    benchmark-tests   Several automated tests with benchmark runs and output analysis \n                      for testing the correctness of configured emulation environment and \n                      the accuracy of expected results\n\nFor more details, please see the extended documentation generated using Doxygen.\n\nBuilding\n--------\nAfter installing the dependencies, go to the emulator's source code root folder \nand execute the following steps:\n\n    mkdir build\n    cd build\n    cmake ..\n    make clean all\n\nIn order to disable statistics support, replace the third step above with:\n\n    cmake .. -DSTATISTICS=OFF\nSee more details about statistics on the respective section below.\nThe emulator library, benchmark and test binaries resulted from the build \nprocess will be available in the respective subfolder inside the 'build' folder.\n\n\nUsage\n-----\nFirst, load the emulator's kernel module. From the emulator's source code root \nfolder, execute:\n\n    sudo scripts/setupdev.sh load\n\nSet your processor to run at maximum frequency to ensure fixed cycle \nrate (as the cycle counter is used to project delay time). 
You can \nuse the scaling governor:\n\n    echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor\n\nSet the LD_PRELOAD and NVMEMUL_INI environment variables to point respectively \nto the emulator's library and the configuration file to be used. The LD_PRELOAD \nis used for automatically loading the emulator's library when the user \napplication is executed. Thus, there is no need to statically link the library \nto the user application. See below details about the configuration file in the \nrespective section.\n\nRather than configuring the scaling governor and the environment variables \nmanually as indicated above, you can use the scripts/runenv.sh script. See \nbelow.\n\nAn additional configuration step may be required depending on the Linux Kernel\nversion. This emulator makes use of rdpmc x86 instruction to read CPU counters.\nBefore kernel 4.0, when rdpmc support was enabled, any process (not just ones\nwith an active perf event) could use the rdpmc instruction to access the counters.\nStarting with Linux 4.0 rdpmc support is only allowed if an event is currently\nenabled in a process's context. To restore the old behavior, write the value 2\nto /sys/devices/cpu/rdpmc if kernel version is 4.0 or greater:\n\n    echo 2 | sudo tee /sys/devices/cpu/rdpmc\n\nRun your application:\n\n    scripts/runenv.sh <your_app>\n\nThe runenv.sh script runs an application in a new shell environment that\nproperly sets LD_PRELOAD to the library available in the build folder. We do\nnot modify the current shell environment to avoid getting other applications \ninterposed by the emulator unexpectedly. 
\n\nAlternatively, you may directly link \nthe library to your application but the nvmemul library must come first in the \nlinking order to ensure we properly interpose on necessary functions.\nAdditionally, this script sets the NVMEMUL_INI environment variable to point\nto the nvmemul.ini configuration file available in the emulator's source code \nroot folder.\n\n\nConfiguration file\n------------------\nEmulator runtime parameters can be defined in a configuration file. \n\nThe default path is ./nvmemul.ini but you may change the path through the \nenvironment variable $NVMEMUL_INI (see scripts/runenv.sh).\n\nThe main available parameters are:\n\n    - Latency:\n      enable                  True means the latency emulation is on, false,\n                              the latency emulation is disabled.\n      inject_delay            True means the delay injection is on, false,\n                              the emulator will skip the delay injection\n      read                    The target read latency in nanoseconds. It must \n                              be greater than the hardware latency. This value\n                              is automatically validated by the emulator.\n      write                   The target write latency in nanoseconds. It must \n                              be greater than the hardware latency. This value\n                              is automatically validated by the emulator.\n      max_epoch_duration_us   This is the epoch duration in microseconds. \n                              Eventually an epoch may be greater than this value\n                              depending on signal delivery managed by Kernel.\n      min_epoch_duration_us   The minimum epoch duration. 
\n    - Bandwidth:\n      enable                  True means the bandwidth emulation is on, false, \n                              it is disabled.\n      model                   File path used by the emulator to cache the \n                              detected hardware bandwidth characteristics.\n      read                    Target read bandwidth in MB/s.\n      write                   Target write bandwidth in MB/s.\n    - Topology:\n      mc_pci                  File path used by the emulator to cache the PCI \n                              bus topology. It is not required if bandwidth \n                              emulation is disabled.\n      physical_nodes          List all CPU sockets ids to be added to the known\n                              topology. An odd number of CPU sockets means it\n                              will not be possible to configure all CPUs in\n                              pairs and then a single CPU will be used as NVM\n                              only. See Emulation modes section below.\n    - Statistics:\n      enable                  True means the statistics collection and report is\n                              enabled, false, it is disabled. See the Statistics\n                              section below.\n      file                    File path used by the emulator to write the \n                              statistics report. 
If not provided, emulator will \n                              use stdout.\n    - Debug:\n      level                   Shows debugging message with level up to this \n                              value, the greater this value is, the more verbose \n                              the debug log will be.\n                              0: off; 1: critical; 2: error; 3: warning; 4: info;\n                              5: debugging.\n      verbose                 If greater than zero shows source code information\n                              along with the debugging message.\n\n\nLatency emulation modes\n-----------------------\nThe emulator may run application threads on a *NVM only* mode or *DRAM+NVM* mode.\nIt depends if the system has more than one CPU socket and if the topology \nconfiguration enables multiple CPU socket.\n\nFor *NVM only* mode, the emulator will use a CPU socket with no sibling node and\nmake use of the DRAM available in that socket to emulate NVM. Any DRAM memory \naccess on this socket will produce delays injection to emulate the target \nlatency.\n\nFor *DRAM+NVM* mode, the emulator will differentiate DRAM from virtual NVM \nlatencies. It is supported only on IvyBridge, Haswell (and higher) Intel processor \nsystems with 2 CPU sockets or more. A proper configuration as mentioned above and \nexplicit calls to NVM memory allocation in the application’s source code is required.\n- The emulator will bind application threads to node 0 CPU and DRAM. The \n other CPU socket will not be used for application threads and the DRAM \nfrom this second socket will be used as virtual NVM;\n- The application must explicitly allocate virtual NVRAM memory using \npmalloc(size) and pfree(pointer, size) API provided by the emulator. \n\nSee the NVM programming section below.\n\n\nNVM programming\n---------------\nThe emulator provides an API for allocating and deallocating memory from NVM\nspace. 
It is possible to use this API on both NVM only and DRAM+NVM modes. \nHowever, it is really required to use this API in the DRAM+NVM mode so the \nemulator can clearly differentiate DRAM from NVM memory access latencies.\nThis is the API available for user applications:\n\n    void *pmalloc(size_t size);\n    void pfree(void *start, size_t size);\n\nThe application can include the NVM_EMUL/src/lib/pmalloc.h header file to\nproperly define these headers.\nSee test/test_nvm.c and test/test_nvm_remote_dram.c for an example on how to\nallocate memory on respectively local DRAM or virtual NVM on a DRAM+NVM \nemulation mode.\n\n\nStatistics\n----------\nThe emulator collects statistical data to help on emulation accuracy validation.\nIf enabled, by default the emulator will show the statistics report when the \nuser application terminates to the standard output. Some applications suppress\noutput to stdout, you can still see the reports by defining a target file for \nthe report in the configuration file. When using a file as output, the emulator\nappends the result to the file and then previous reports are not overwritten.\nThe statistics source code can also be statically removed at compile time. See \nBuilding section.\n\nThese are the reported statistics:\n\n    - initialization duration   Time in micro seconds took by the emulator to \n                                initialize.\n    - running threads           The number of threads still running. 
If the report\n                                was called automatically by the emulator, all user \n                                threads are already terminated.\n    - terminated threads        Number of terminated threads, including the main\n                                thread.\n    For each application thread:\n    - thread id                 Thread id.\n    - cpu id                    CPU id where the user thread was bind to.\n    - spawn timestamp           Thread spawn timestamp as reported by the\n                                monotonic time.\n    - termination timestamp     Thread termination timestamp as reported by the\n                                monotonic time.\n    - execution time\n    - stall cycles              Total number of CPU stalls caused by memory \n                                accesses made by this thread.\n    - NVM accesses              Number of effective NVM accesses performed by\n                                the application.\n    - latency calculation overhead cycles     Overhead cycles caused by the \n                                              emulator and that could not be\n                                              amortized. 
Zero is expected.\n                                              Otherwise, consider increasing\n                                              the epoch duration.\n    - injected delay cycles     Total number of cycles injected by the emulator\n                                to emulate the target latency.\n    - injected delay in usec    Same value as above, but shown in micro seconds.\n    - longest epoch duration    The effective longest epoch duration ever \n                                performed for this thread.\n    - shortest epoch duration   The effective shortest epoch duration ever \n                                performed for this thread.\n    - average epoch duration    The average epoch duration for this thread.\n    - number of epochs          Total number of epochs performed for this \n                                thread.\n    - epochs which didn't reach min duration   Number of epochs requested by \n                                               either Thread Monitor or thread \n                                               synchronizations, but were not \n                                               open since the epoch durations\n                                               didn't reach the minimum epoch\n                                               duration.\n    - static epochs requested   Number of epochs requested by the Thread Monitor.\n\n\nSupport to PAPI\n---------------\nPerformance API (PAPI) library may be used with the emulator and there are some \nhooks to switch the current CPU counters reading method to PAPI. Up to the time \nof this writing, there was no way to make PAPI CPU counter reading to perform \nat the performance level required by the emulation. 
In the future, if it is \ndesired to switch to PAPI, follow these steps:\n - Device pmc_ioctl_setcounter() and emulator lib set_counter() in dev/pmc.c \n   calls can be deleted.\n - Define PAPI_SUPPORT for src/lib/* source code.\n - Compile with lib/cpu/pmc-papi.c rather than lib/cpu/pmc.c.\n - Link code with PAPI and add PAPI include directory.\n - Some extra tweaks may be required, check TODOs in the code.\n\n\nMultiple emulated processes and MPI programs\n--------------------------------------------\nThe emulator needs to bind user threads to specific CPU cores in order to \noptimize emulation results. It is required to export the EMUL_LOCAL_PROCESSES \nenvironment variable with the number of emulated processes on the host. The \nemulator will manage each emulated process to partition the available CPUs in \na coordinated way. It is recommended to set EMUL_LOCAL_PROCESSES with up to half \nthe number of available CPU cores (note DRAM+NVM mode already reserves half of \navailable CPU cores).\n\nIf EMUL_LOCAL_PROCESSES is not set or set with a value lower than 2, the \nemulator will not partition CPU cores per process.\n\nIf some process crashes the emulator might not have cleaned up the environment\nand the process rank ids will not be correctly managed. In this case, close all\nemulated processes and delete files /tmp/emul_lock_file and \n/tmp/emul_process_local_rank if they exist.\n\n\nBandwidth emulation\n-------------------\nQuartz supports an emulation mode with \"throttled\" memory bandwidth. \n\nThe memory bandwidth emulation  makes use of the copy kernel from the Stream benchmark, \nopenMP version. When the bandwidth emulation is enabled for the first time, Quartz\ncreates a memory bandwidth model by utilizing the available *Thermal Registers* in the \nMemory Controller and measuring the corresponding memory bandwidth. 
This initial step of \nbuilding a model might take several minutes **(~10min)**.\n\nFor the memory bandwidth emulation, *turn off the latency modeling*\nin the configuration file and select all available NUMA nodes in the \nconfiguration file in order to prepare the model for any combination of NUMA\nnodes selection.\n\nModeling data will be cached to these files:\n\n    /tmp/bandwidth_model\n    /tmp/mc_pci_bus\nAs a first step, the emulator will detect the Memory Controller Thermal Registers\nControl PCI addresses and cache them to /tmp/mc_pci_bus. After this step, the \nemulator will close the current execution to safely clear NUMA bindings. Rerun\nthe process to resume the work. \n\nQuartz will create the file: **/tmp/bandwidth_model**. \n\nIt reflects the relationship between Thermal Registers and achievable memory \nbandwidth (in a single socket). The line format in this file is:\n\n    read <thermal register value> <memory bandwidth MB/s>\nThis file should present ascending values of memory bandwidth ranging from\nhundreds of MiB/s to tens of GiB/s. These values (or their approximations) \ncan be used for the experiments with memory bandwidth throttling. Note that \nthe model is built once: it is cached and then used for all later experiments.\n(You can also run a specially prepared  automated script *bandwidth-model-building.sh* \nin directory *benchmark-tests*. For details see [README-BENCHMARKS-TESTING.md]\n(https://github.hpe.com/labs/quartz/blob/master/README-BENCHMARKS-TESTING.md).)\n\nFor example, to enable memory bandwidth throttling at 2 GB/s, you should change\nthe emulator configuration file  \"nvmemul.ini\" using the following settings:\n\n    bandwidth:\n    {\n    enable = true;\n    model = \"/tmp/bandwidth_model\";\n    read = 2000;\n    write = 2000;\n    };\n\nBoth read and write bandwidth values must be set to the same value since the \nemulator does not model read/write independently in the current version. 
\nSee the Limitations section.\n\nThe pmalloc() family is not intended to be used with the bandwidth modeling. Use\nnumactl for instance to bind CPU and memory of the used application to the \nintended NUMA node. The bandwidth emulator considers the virtual NVRAM \nnode only (in the configuration with two sockets). So the application is required \nto keep processes/threads and data on the same NUMA node for bandwidth experiments.\n\nAutomated Benchmark Runs\n-------------------------\nWe have created several automated tests with benchmark runs and output analysis \nfor testing the correctness of configured emulation environment and the accuracy \nof expected results. For details see [README-BENCHMARKS-TESTING.md]\n(https://github.hpe.com/labs/quartz/blob/master/README-BENCHMARKS-TESTING.md).\n\nLimitations\n-----------\nThe emulator functionality may be affected by certain conditions in user \napplications:\n - application sets threads CPU and memory affinity.\n - application opens many more concurrent threads than available cores per \n   socket. Note that on DRAM+NVM emulation mode, half of the available CPU \n   cores are not used for user threads.\n - application sets handler for SIGUSR1.\nOther:\n - Write memory latency is not yet implemented.\n - Write/Read memory bandwidth emulation cannot be set independently.\n - The signal handler may cause syscalls in the application to fail. It is\n   recommended to implement retries at the application level as a good practice \n   for syscalls.\n - Child processes from fork() calls are not tracked by the emulator. As a\n   workaround, the emulator could make the library initialization function \n   available in the external API. 
Applications then should call this function\n   in the beginning of the child process.\n - OpenMP applications may use synchronization primitives not based on\n   pthreads which are currently not supported.\n - See Todo list section for details.\n\n\nTodo list\n---------\nPlease see the accompanying TODO.dox or extended documentation for an extensive \nlist.\n\n#License\n\n    This program is free software; you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation; either version 2 of the License, or (at\n    your option) any later version. This program is distributed in the\n    hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n    the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n    PURPOSE. See the GNU General Public License for more details. You\n    should have received a copy of the GNU General Public License along\n    with this program; if not, write to the Free Software Foundation,\n    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n\n    \n#Copyright\n\n\t    (c) Copyright 2016 Hewlett Packard Enterprise Development LP\n\n**NOTE**: This software depends on other packages that may be licensed under different open source licenses.\n\n"
  },
  {
    "path": "TODO.dox",
    "content": "/**\n\\file\n\n\\todo Improve performance counter API by making it more generic. For example, autogenerate pmc event_id using perf.\n\\todo Currently we may interrupt a thread to form a new epoch while it is blocked. This might cause accumulation of overhead cycles.\n\\todo Currently our bandwidth model cannot independently throttle read and write bandwidth as it relies on throttling DDR ACT transactions. We tried throttling DDR READ and DDR WRITE transactions but this didn't work.\n\\todo Extend library to interpose on other synchronization events we care about: semaphores, barriers, context switches, openMP sync primitives, etc.\n\\todo Currently our library does not support context switching. Extend the device driver to properly handle context switching: keep track of per-thread cpu counters, introduce proper delay at context switch points.\n\\todo Support uncacheable and write-through memory.\n\\todo Signal SIGUSR1 should be dedicated to the emulator. If the application makes use of this signal, the emulator will not work. Figure out a way to fix this limitation.\n\\todo Interpose pthread_cancel() and pthread_exit() to make sure the thread is always deregistered internally to the emulator?\n\\todo CPU counters overflow is not currently handled.\n\\todo Multiple processes emulation must be reviewed: log file per process, statistics report by process, process id and thread id indications in the log messages.\n\\todo See Limitations section in the README file.\n*/\n"
  },
  {
    "path": "bench/CMakeLists.txt",
    "content": "add_subdirectory(memlat)\nadd_subdirectory(new_memlat)\nadd_subdirectory(multilat)\n"
  },
  {
    "path": "bench/memlat/CMakeLists.txt",
    "content": "include_directories(${CMAKE_SOURCE_DIR}/src/lib)\nadd_executable(memlat memlat.c)\ntarget_link_libraries(memlat nvmemul pthread)\n"
  },
  {
    "path": "bench/memlat/memlat.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <stddef.h>\n#include <stdint.h>\n#include <stdio.h>\n#include <assert.h>\n#include <pthread.h>\n\n#define MAX_NUM_THREADS 512\n\nuint64_t g_seed, g_nchains, g_nelems, g_from_node_id, g_to_node_id, g_element_size, g_access_size;\n\nextern int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id);\n\nstatic uint64_t safe_strtoull(const char *s) {\n    char *ep;\n    uint64_t r;\n    assert(NULL != s && '\\0' != *s);\n    r = strtoull(s, &ep, 10);\n    assert('\\0' == *ep);\n    return r;\n}\n\n\nvoid* worker(void* arg) \n{\n    int latency_ns;\n\n    latency_ns = measure_latency2(g_seed, g_nchains, g_nelems, g_element_size, g_access_size, g_from_node_id, g_to_node_id);\n    printf(\"latency_ns: %d\\n\", latency_ns);\n\n    return NULL;\n}\nint main(int argc, char *argv[]) {\n\tint i;\n    uint64_t nthreads;\n    pthread_t thread[MAX_NUM_THREADS];\n\n    if (9 != argc) {\n        fprintf(stderr, \"usage: %s PRNGseed Nthreads Nchains Nelems SZelem SZaccess from_node to_node\\n\", 
argv[0]);\n        return 1;\n    }\n    g_seed  = safe_strtoull(argv[1]);\n    nthreads = safe_strtoull(argv[2]);\n    g_nchains = safe_strtoull(argv[3]);\n    g_nelems = safe_strtoull(argv[4]);\n    g_element_size = safe_strtoull(argv[5]);\n    g_access_size = safe_strtoull(argv[6]);\n    g_from_node_id = safe_strtoull(argv[7]);\n    g_to_node_id = safe_strtoull(argv[8]);\n\n\tfor (i = 0; i< nthreads; i++) {\n\t\tpthread_create(&thread[i], NULL, worker, NULL);\n    }\n\tfor(i = 0 ; i < nthreads; i++) {\n\t\tpthread_join(thread[i], NULL);\n    }\n    return 0;\n}\n"
  },
  {
    "path": "bench/multilat/CMakeLists.txt",
    "content": "include_directories(${CMAKE_SOURCE_DIR}/src/lib)\n\nadd_executable(multilat multilat.c)\ntarget_link_libraries(multilat nvmemul pthread)\n"
  },
  {
    "path": "bench/multilat/multilat.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#define _GNU_SOURCE\n#include <pthread.h>\n#include <sched.h>\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <unistd.h>\n//#include <pthread.h>\n\n#include \"thread.h\"\n#include <sys/time.h>\n#include \"pmalloc.h\"\n#include \"debug.h\"\n//#include \"stat.h\"\n\n\n#define NDEBUG\n\n//#ifndef NDEBUG\n#include <sys/syscall.h>\n//#endif\n\n// packs the arguments received from user\ntypedef struct {\n\tint mem_refs_dram;\n\tint mem_refs_nvm;\n\tint interleave_dram;\n\tint interleave_nvm;\n\t//int from_node;\n\t//int to_node;\n} arg_s;\n\n\n// for multi thread management\n#define MAX_NUM_THREADS 50\npthread_t thread_desc[MAX_NUM_THREADS];\n//pthread_mutex_t mutex;\n\n\n// for CPU cache trashing and pointer chasing\n#include <inttypes.h>\ntypedef struct {\n\tuint64_t val;\n\tchar padding[0];\n} element_t;\n\ntypedef struct {\n    uint64_t   N;\n    uint64_t   element_size;\n    element_t* head;\n} chain_t;\nuint64_t trash_cache(uint64_t N);\nchain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j);\nelement_t* 
element(chain_t* chain, uint64_t index);\nvoid inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size);\n\n// factor is 10 (could be more), to make sure we have a buffer much bigger than CPU cache\n// the memory buffer is NOT shared among threads\n// for now the cache size is hardcoded as 20 MB\n#define NELEMS (10 * 20480000 / 64LLU)\n#define PAGESZ 4096\n#define MAX_NUM_CHAINS 16\n//#undef USE_HUGETLB\n#define SEED_IN 1\n#define NCHAINS 1\n\n\n/*extern inline hrtime_t hrtime_cycles(void);\nstatic inline void delay_cycles(hrtime_t cycles)\n{\n    hrtime_t start, stop;\n\n    start = hrtime_cycles();\n    do {\n        stop = hrtime_cycles();\n    } while (stop - start < cycles);\n}*/\n\n\n// for fixing thread affinity to a single CPU after allocating memory chains and binding it to the local or remote nodes\nstatic int max_number_of_cpus(void)\n{\n    int n, cpus = 2048;\n    size_t setsize =  CPU_ALLOC_SIZE(cpus);\n    cpu_set_t *set = CPU_ALLOC(cpus);\n    if (!set)\n        goto err;\n\n\tfor (;;) {\n\t\tCPU_ZERO_S(setsize, set);\n\t\t/* the library version does not return size of cpumask_t */\n\t\tn = syscall(SYS_sched_getaffinity, 0, setsize, set);\n\t\tif (n < 0 && cpus < 1024 * 1024) {\n\t\t        CPU_FREE(set);\n\t\t\tcpus *= 2;\n\t\t\tset = CPU_ALLOC(cpus);\n\t\t\tif (!set)\n\t\t\t\tgoto err;\n\t\t\tcontinue;\n\t\t}\n\n\tCPU_FREE(set);\n\treturn n * 8;\n\t}\nerr:\n\tprintf(\"cannot determine NR_CPUS\");\n\treturn 0;\n}\n\nstatic int bind_cpu(thread_t *thread) {\n    size_t setsize;\n    cpu_set_t *cur_cpuset;\n    cpu_set_t *new_cpuset;\n\n    int ncpus = max_number_of_cpus();\n\n    if (thread == NULL) {\n        // if thread is NULL it means the emulator is disabled, return without setting CPU affinity\n        //printf(\"thread self is null\");\n        return 0;\n    }\n\n    if (ncpus == 0) {\n    \treturn 1;\n    }\n\n    setsize = CPU_ALLOC_SIZE(ncpus);\n    cur_cpuset = CPU_ALLOC(ncpus);\n    new_cpuset = 
CPU_ALLOC(ncpus);\n    CPU_ZERO_S(setsize, cur_cpuset);\n    CPU_ZERO_S(setsize, new_cpuset);\n    CPU_SET_S(thread->cpu_id, setsize, new_cpuset);\n\n    if (pthread_getaffinity_np(thread->pthread, setsize, cur_cpuset) != 0) {\n        DBG_LOG(ERROR, \"Cannot get thread tid [%d] affinity, pthread: 0x%lx on processor %d\\n\",\n        \t\tthread->tid, thread->pthread, thread->cpu_id);\n        return 1;\n    }\n\n    if (CPU_EQUAL(cur_cpuset, new_cpuset)) {\n        //printf(\"No need to bind CPU\\n\");\n    \treturn 0;\n    }\n\n    DBG_LOG(INFO, \"Binding thread tid [%d] pthread: 0x%lx on processor %d\\n\", thread->tid, thread->pthread, thread->cpu_id);\n\n    if (pthread_setaffinity_np(thread->pthread, setsize, new_cpuset) != 0) {\n        DBG_LOG(ERROR, \"Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\\n\", thread->tid, thread->pthread, thread->cpu_id);\n        return 1;\n    }\n\n    return 0;\n}\n\nuint64_t force_ldm_stalls(chain_t **C,\n                          int element_size,\n                          int access_size,\n                          int mem_refs,               // number of pointers/elements to chase\n                          uint64_t max_nelems,        // max number of available elements/pointers\n                          int it_n,                   // seed to calculate the first pointer to chase, used to avoid repeating\n                                                      // pointers during consecutive calls\n\t                      unsigned long *time_diff_ns) {\n    uint64_t j, i;\n    int nchains = SEED_IN;\n    uint64_t sumv[MAX_NUM_CHAINS];\n    uint64_t nextp[MAX_NUM_CHAINS];\n    char *buf;\n    uint64_t buf_size = 16384;\n    int count = 0;\n    uint64_t start;\n    uint64_t it_limit;\n    struct timespec time_start, time_end;\n\n    assert(nchains < MAX_NUM_CHAINS);\n\n    if (mem_refs <= 0) return 0;\n\n    buf = (char*) malloc(buf_size);\n    assert(buf != NULL);\n\n    if (max_nelems > mem_refs) {\n        
it_limit = max_nelems / mem_refs;\n    } else {\n    \tit_limit = 1;\n    }\n    it_n = it_n % it_limit;\n    start = it_n * mem_refs;\n    if ((start + mem_refs) > max_nelems) {\n    \tstart = 0;\n    }\n\n    /* chase the pointers */\n    if (nchains == 1) {\n    \tclock_gettime(CLOCK_MONOTONIC, &time_start);\n        sumv[0] = 0;\n        // chase pointers until the 'mem_refs' count, the pointer chasing will restart from beginning if 'mem_refs'\n        // is greater than 'nelems'\n        for (count = 0, i = start; count < mem_refs; i = element(C[0], i)->val, ++count) {\n            __asm__(\"\");\n            sumv[0] += element(C[0], i)->val;\n            if (access_size > element_size) {\n                read_element(C[0], i, buf, buf_size);\n            }\n        }\n        clock_gettime(CLOCK_MONOTONIC, &time_end);\n    }\n//    else {\n//        for (j=0; j < nchains; j++) {\n//            sumv[j] = 0;\n//            nextp[j] = 0;\n//        }\n//        for (; 0 != element(C[0], nextp[0])->val; ) {\n//            for (j=0; j < nchains; j++) {\n//                sumv[j] += element(C[j], nextp[j])->val;\n//                if (access_size > element_size) {\n//                    read_element(C[j], nextp[j], buf, buf_size);\n//                }\n//                nextp[j] = element(C[j], nextp[j])->val;\n//            }\n//        }\n//    }\n\n    *time_diff_ns = ((time_end.tv_sec * 1000000000) + time_end.tv_nsec) -\n                    ((time_start.tv_sec * 1000000000) + time_start.tv_nsec);\n\n    free(buf);\n    return sumv[0];\n}\n\nvoid thread_iter(int dram_refs, int nvm_refs, int interleave_dram, int interleave_nvm) {\n\tlong it_n;\n\tunsigned long time_dram, time_nvm, total_time_dram_ns, total_time_nvm_ns;\n\tuint64_t seed;\n\tuint64_t j;\n\tchain_t *C_dram[MAX_NUM_CHAINS];\n\tchain_t *C_nvm[MAX_NUM_CHAINS];\n\tint missing_dram_refs, missing_nvm_refs;\n\tint dram_stalls, nvm_stalls;\n\tstruct timespec task_time_start, task_time_end;\n\tunsigned long 
task_time_diff_ns;\n#ifndef NDEBUG\n\tpid_t tid = (pid_t) syscall(SYS_gettid);\n#endif\n\n\tassert(NELEMS < UINT64_MAX);\n\n    for (j=0; j < NCHAINS; j++) {\n        seed = SEED_IN + j*j;\n        C_dram[j] = alloc_chain(seed, NELEMS, 64LLU, 0, 0);\n        C_nvm[j] = alloc_chain(seed, NELEMS, 64LLU, 0, 1);\n        __asm__(\"\");\n    }\n\n    bind_cpu(thread_self());\n\n    // cache must be trashed after bind_cpu() call\n    trash_cache(NELEMS);\n\n    total_time_dram_ns = 0;\n    total_time_nvm_ns = 0;\n\n    missing_dram_refs = dram_refs;\n    missing_nvm_refs = nvm_refs;\n\n#ifndef NDEBUG\n    printf(\"DRAM accesses to be made: %ld\\n\", dram_refs);\n    printf(\"NVM accesses to be made: %ld\\n\", nvm_refs);\n#endif\n\n    //delay_cycles(8000000000);\n    //printf(\"STARTING MEASURES\\n\");\n\n    clock_gettime(CLOCK_MONOTONIC, &task_time_start);\n\n    for (it_n = 0; (missing_dram_refs > 0) || (missing_nvm_refs > 0); ++it_n) {\n    \t__asm__(\"\");\n\n    \t// calculate the number o memory accesses to be made on each memory type\n    \tif (missing_dram_refs > interleave_dram) {\n    \t\tmissing_dram_refs -= interleave_dram;\n    \t\tdram_stalls = interleave_dram;\n    \t} else {\n    \t\tdram_stalls = missing_dram_refs;\n    \t\tmissing_dram_refs = 0;\n    \t}\n\n    \tif (missing_nvm_refs > interleave_nvm) {\n\t\t\tmissing_nvm_refs -= interleave_nvm;\n\t\t\tnvm_stalls = interleave_nvm;\n\t\t} else {\n\t\t\tnvm_stalls = missing_nvm_refs;\n\t\t\tmissing_nvm_refs = 0;\n\t\t}\n\n    \ttime_dram = 0;\n    \ttime_nvm = 0;\n\n    \t// do memory accesses interleaved by dividing the number of accesses in smaller amount\n    \t// as configured by user\n        force_ldm_stalls((chain_t **)&C_dram, 64LLU, 8, dram_stalls, NELEMS, it_n, &time_dram);\n        force_ldm_stalls((chain_t **)&C_nvm, 64LLU, 8, nvm_stalls, NELEMS, it_n, &time_nvm);\n\n        total_time_dram_ns += time_dram;\n        total_time_nvm_ns += time_nvm;\n#ifndef NDEBUG\n        printf(\"%ld DRAM 
accesses took: %ld ns\\n\", dram_stalls, time_dram);\n        printf(\"%ld NVM accesses took: %ld ns\\n\", nvm_stalls, time_nvm);\n#endif\n    }\n\n    clock_gettime(CLOCK_MONOTONIC, &task_time_end);\n    task_time_diff_ns = ((task_time_end.tv_sec * 1000000000) + task_time_end.tv_nsec) -\n                        ((task_time_start.tv_sec * 1000000000) + task_time_start.tv_nsec);\n\n    // the memory latency is the total time divided by the number of accesses for each memory type\n    if (dram_refs > 0)\n        total_time_dram_ns /= dram_refs;\n    else\n        total_time_dram_ns = 0;\n    if (nvm_refs > 0)\n        total_time_nvm_ns /= nvm_refs;\n    else\n        total_time_nvm_ns = 0;\n\n    printf(\"DRAM latency: %ld ns\\n\", total_time_dram_ns);\n    printf(\"NVM latency: %ld ns\\n\", total_time_nvm_ns);\n    printf(\"Measure time: %.3lf ms\\n\", (double)task_time_diff_ns/1000000.0);\n    \n    printf(\"Expected time: %.3ld ms\\n\", ((total_time_dram_ns * dram_refs) + (total_time_nvm_ns * nvm_refs)) / 1000000);\n\n    for (j=0; j < NCHAINS; j++) {\n        free(C_dram[j]);\n        free(C_nvm[j]);\n    }\n}\n\nvoid *thread_fn(void *arg) {\n\tint interleave_dram = ((arg_s *) arg)->interleave_dram;\n\tint interleave_nvm = ((arg_s *) arg)->interleave_nvm;\n\tint dram_refs = ((arg_s *) arg)->mem_refs_dram;\n\tint nvm_refs = ((arg_s *) arg)->mem_refs_nvm;\n\n\tthread_iter(dram_refs, nvm_refs, interleave_dram, interleave_nvm);\n\n\treturn 0;\n}\n\nvoid run_threads(int n_threads, int dram_refs, int nvm_refs, int interleaved_dram, int interleaved_nvm)\n{\n\tpthread_attr_t attr;\n    int i;\n    arg_s args;\n\n    if ((n_threads > MAX_NUM_THREADS) || (n_threads <= 0)) {\n    \tprintf(\"INVALID RANGE:\\n\");\n    \tprintf(\"\\tMax number of threads is %d\\n\", MAX_NUM_THREADS);\n    \texit(-1);\n    }\n\n    if (dram_refs < 0 || nvm_refs < 0 || interleaved_dram < 0 || interleaved_nvm < 0) {\n    \tprintf(\"INVALID RANGE:\\n\");\n    \tprintf(\"\\tdram refs: %d, nvm 
refs: %d, interleaved dram refs: %d, interleaved nvm refs: %d\\n\",\n    \t\t\tdram_refs, nvm_refs, interleaved_dram, interleaved_nvm);\n    \texit(-1);\n    }\n\n    if ((dram_refs > 0 && interleaved_dram == 0) || (nvm_refs > 0 && interleaved_nvm == 0)) {\n    \tprintf(\"INVALID ARGUMENTS:\\n\");\n    \tprintf(\"\\tnumber of accesses in sequence cannot be zero if the number of accesses for the same memory type is greater than zero.\\n\");\n    \texit(-1);\n    }\n\n    if (dram_refs < interleaved_dram) {\n    \tprintf(\"INVALID ARGUMENTS:\\n\");\n    \tprintf(\"\\tnumber of DRAM accesses cannot be lower than the number of DRAM accesses in sequence\\n\");\n    \texit(-1);\n    }\n    if (nvm_refs < interleaved_nvm) {\n    \tprintf(\"INVALID ARGUMENTS:\\n\");\n    \tprintf(\"\\tnumber of NVM accesses cannot be lower than the number of NVM accesses in sequence\\n\");\n    \texit(-1);\n    }\n\n    if (pthread_attr_init(&attr) != 0) {\n\t\tprintf(\"pthread_attr_init failed\");\n\t\texit(-1);\n\t}\n\n    //srand(time(NULL));\n\n    args.interleave_dram = interleaved_dram;\n    args.interleave_nvm = interleaved_nvm;\n    args.mem_refs_dram = dram_refs;\n    args.mem_refs_nvm = nvm_refs;\n\n    for (i = 0; i < n_threads; ++i) {\n\t    pthread_create(&thread_desc[i], &attr, thread_fn, (void *)&args);\n\t}\n\n    pthread_attr_destroy(&attr);\n\n    for (i = 0; i < n_threads; ++i) {\n        pthread_join(thread_desc[i], NULL);\n    }\n}\n\nint main(int argn, char **argv)\n{\n    int dram_refs;\n    int nvm_refs;\n    int interleaved_dram;\n    int interleaved_nvm;\n    int n_threads;\n\n    if (argn != 6) {\n        printf(\"INVALID ARGUMENTS:\\n\");\n        printf(\"\\t%s [# threads] [# total dram accesses] [# total nvm accesses] [# dram accesses in sequence] [# nvm accesses in sequence]\\n\", argv[0]);\n        return -1;\n    }\n\n    n_threads = atoi(argv[1]);\n    dram_refs = atoi(argv[2]);\n    nvm_refs = atoi(argv[3]);\n    interleaved_dram = atoi(argv[4]);\n    
interleaved_nvm = atoi(argv[5]);\n\n    run_threads(n_threads, dram_refs, nvm_refs, interleaved_dram, interleaved_nvm);\n\n    return 0;\n}\n"
  },
  {
    "path": "bench/new_memlat/CMakeLists.txt",
    "content": "include_directories(${CMAKE_SOURCE_DIR}/src/lib)\nadd_executable(new_memlat memlat.c)\ntarget_link_libraries(new_memlat nvmemul pthread)\n"
  },
  {
    "path": "bench/new_memlat/memlat.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <stddef.h>\n#include <stdint.h>\n#include <stdio.h>\n#include <assert.h>\n#include <pthread.h>\n#include \"model.h\"\n#include \"thread.h\"\n\n#define MAX_NUM_THREADS 512\n\nuint64_t g_seed, g_nchains, g_nelems, g_from_node_id, g_to_node_id, g_element_size, g_access_size;\n\nextern int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id);\n\nstatic uint64_t safe_strtoull(const char *s) {\n    char *ep;\n    uint64_t r;\n    assert(NULL != s && '\\0' != *s);\n    r = strtoull(s, &ep, 10);\n    assert('\\0' == *ep);\n    return r;\n}\n\nextern latency_model_t latency_model;\n\n#ifdef MEMLAT_SUPPORT\nextern __thread int tls_hw_local_latency;\nextern __thread int tls_hw_remote_latency;\nextern __thread uint64_t tls_global_remote_dram;\nextern __thread uint64_t tls_global_local_dram;\n\nstatic inline uint64_t ns_to_cycles(int cpu_speed_mhz, int ns)\n{\n    return (cpu_speed_mhz * ns) / 1000;\n}\n#endif\n\nvoid* worker(void* arg) \n{\n    int latency_ns;\n#ifdef MEMLAT_SUPPORT\n    
uint64_t exp_stalls;\n    uint64_t calc_nvm_accesses;\n    uint64_t detected_hw_lat;\n    uint64_t actual_lat = 0;\n    uint64_t total_time;\n    uint64_t fixed_latency_ns = 0;\n    uint64_t nvm_accesses = 0;\n    uint64_t nvm_hw_latency;\n#endif\n\n    latency_ns = measure_latency2(g_seed, g_nchains, g_nelems, g_element_size, g_access_size, g_from_node_id, g_to_node_id);\n    printf(\"latency_ns: %d ns\\n\", latency_ns);\n\n#ifdef MEMLAT_SUPPORT\n    total_time = g_nelems * latency_ns;\n    if (thread_self()->virtual_node->dram_node != thread_self()->virtual_node->nvram_node) {\n        detected_hw_lat = ns_to_cycles(thread_self()->cpu_speed_mhz, tls_hw_remote_latency);\n        if (tls_global_remote_dram > 0) {\n    \t    actual_lat = thread_self()->stall_cycles / tls_global_remote_dram;\n    \t    fixed_latency_ns = total_time / tls_global_remote_dram;\n    \t    nvm_accesses = tls_global_remote_dram;\n    \t}\n    \tnvm_hw_latency = tls_hw_remote_latency;\n    } else {\n        detected_hw_lat = ns_to_cycles(thread_self()->cpu_speed_mhz, tls_hw_local_latency);\n        if (tls_global_local_dram > 0) {\n    \t    actual_lat = thread_self()->stall_cycles / tls_global_local_dram;\n    \t    fixed_latency_ns = total_time / tls_global_local_dram;\n    \t    nvm_accesses = tls_global_local_dram;\n    \t}\n    \tnvm_hw_latency = tls_hw_local_latency;\n    }\n    exp_stalls = g_nelems * detected_hw_lat;\n    calc_nvm_accesses = thread_self()->stall_cycles / detected_hw_lat;\n\n    printf(\"target latency: %d ns\\n\", latency_model.read_latency);\n    printf(\"Error: %3.1f%%\\n\", (double)(abs(latency_model.read_latency - latency_ns)*100) / (double)latency_model.read_latency);\n    printf(\"target NVM accesses: %ld\\n\", g_nelems);\n    printf(\"detected HW latency: %ld ns\\n\", nvm_hw_latency);\n    printf(\"detected HW latency: %ld cycles (detected_hw_lat making use of cpu_speed_mhz)\\n\", detected_hw_lat);\n    printf(\"expected CPU stalls: %ld cycles 
(target_nvm_accesses * detected_hw_lat)\\n\", exp_stalls);\n    printf(\"actual CPU stalls: %ld cycles\\n\", thread_self()->stall_cycles);\n    printf(\"calculated NVM accesses: %ld (actual_cpu_stalls / detected_hw_lat)\\n\", calc_nvm_accesses);\n    if (nvm_accesses != 0) {\n        printf(\"actual NVM accesses: %ld\\n\", nvm_accesses);\n        printf(\"actual latency: %ld cycles (actual_stalls / actual_nvm_accesses)\\n\", actual_lat);\n        printf(\"fixed measured latency: %ld ns (total_chasing_time / actual_nvm_accesses)\\n\", fixed_latency_ns);\n        printf(\"fixed latency error: %3.1f%%\\n\", (double)(abs(latency_model.read_latency - fixed_latency_ns)*100) / (double)latency_model.read_latency);\n    } else {\n        fixed_latency_ns = total_time / calc_nvm_accesses;\n        printf(\"fixed measured latency: %ld ns (total_chasing_time / calculated_nvm_accesses)\\n\", fixed_latency_ns);\n        printf(\"fixed latency error: %3.1f%%\\n\", (double)(abs(latency_model.read_latency - fixed_latency_ns)*100) / (double)latency_model.read_latency);\n    }\n#endif\n    return NULL;\n}\nint main(int argc, char *argv[]) {\n\tint i;\n    uint64_t nthreads;\n    pthread_t thread[MAX_NUM_THREADS];\n\n    if (9 != argc) {\n        fprintf(stderr, \"usage: %s PRNGseed Nthreads Nchains Nelems SZelem SZaccess from_node to_node\\n\", argv[0]);\n        return 1;\n    }\n    g_seed  = safe_strtoull(argv[1]);\n    nthreads = safe_strtoull(argv[2]);\n    g_nchains = safe_strtoull(argv[3]);\n    g_nelems = safe_strtoull(argv[4]);\n    g_element_size = safe_strtoull(argv[5]);\n    g_access_size = safe_strtoull(argv[6]);\n    g_from_node_id = safe_strtoull(argv[7]);\n    g_to_node_id = safe_strtoull(argv[8]);\n\n\tfor (i = 0; i< nthreads; i++) {\n\t\tpthread_create(&thread[i], NULL, worker, NULL);\n    }\n\tfor(i = 0 ; i < nthreads; i++) {\n\t\tpthread_join(thread[i], NULL);\n    }\n    return 0;\n}\n"
  },
  {
    "path": "bench/new_memlat/memlat.sh",
    "content": "#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n#!/bin/bash\n\n# percentage of error as threshold to discard outliers, anything above this percentage will be discarded\nMAX_ERROR_PERCENTAGE=10\n# max number of tries to execute memlat\nMAX_TRIES=10\n\n\nTEMP_FILE=/tmp/tmp_memlat.out\n\n\nNVM_EMUL_PATH=\"`dirname $0`/../..\"\nNELEMS=$1\nTARGET_DRAM=$2\n\n\nfunction usage()\n{\n    echo \"$0 [number of elements] [0=local dram|1=remote dram]\"\n    exit 1\n}\n\nfunction validate_decimal()\n{\n    re='^[0-9]+$'\n    if ! [[ $1 =~ $re ]] ; then\n        return 1\n    fi\n    return 0\n}\n\nfunction check_parameters()\n{\n    if [ $# -ne 2 ]; then\n        echo \"Incorrect arguments\"\n        usage\n    fi\n\n    validate_decimal ${NELEMS}\n\n    if [ $? 
-ne 0 ]; then\n        echo \"Invalid number of elements\"\n        usage\n    fi\n\n    if [ ${TARGET_DRAM} -ne 0 -a ${TARGET_DRAM} -ne 1 ]; then\n        echo \"Incorrect dram target\"\n        usage\n    fi\n}\n\nfunction verify_run\n{\n    target=$(cat ${TEMP_FILE} | grep \"target latency\" | awk '{ print $3 }')\n    measured=$(cat ${TEMP_FILE} | grep \"measured latency\" | awk '{ print $4 }')\n\n    if [ ${measured} -gt ${target} ]; then\n        delta=$(expr ${measured} - ${target});\n    else\n        delta=$(expr ${target} - ${measured});\n    fi\n\n    if [ ${target} -gt 0 ]; then\n        error=$(expr ${delta} \\* 100)\n        error=$(expr ${error} \\/ ${target})\n    else\n        error=0\n    fi\n\n\n    if [ ${error} -gt ${MAX_ERROR_PERCENTAGE} ]; then\n        return 1\n    fi\n\n    return 0\n}\n\n############ MAIN ######################\n\ncheck_parameters $*\n\n# execute memlat in loop until the result is within the threshold or the max tries is reached\nfor (( c=0; c<${MAX_TRIES}; c++ )); do\n    ${NVM_EMUL_PATH}/scripts/runenv.sh ${NVM_EMUL_PATH}/build/bench/new_memlat/new_memlat 1 1 1 ${NELEMS} 64 8 0 ${TARGET_DRAM} &> ${TEMP_FILE}\n\n    verify_run\n\n    ret=$?\n\n    if [ ${ret} -eq 0 ]; then\n        cat ${TEMP_FILE} | grep \"measured latency\"\n        break\n    fi\ndone\n\nif [ ${ret} -ne 0 ]; then\n    echo \"Could not produce a valid run\"\nfi\n\nrm -f ${TEMP_FILE}\n\nexit ${ret}\n"
  },
  {
    "path": "benchmark-tests/bandwidth-model-building.sh",
    "content": "#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n#!/bin/bash\n\necho performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor\n\ncp  nvmemul-bandwidth.ini  nvmemul.ini\nrm /tmp/bandwidth_model\n../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0\n../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0\n"
  },
  {
    "path": "benchmark-tests/memlat-bench-test-10M-single-socket.sh",
    "content": "#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n#!/bin/bash\n\n#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini\n\necho performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor\n\ndir_name_res=FULL-RESULTS-test\ndir_name_sum=SUMMARY-RESULTS-test\n\nrm -rf $dir_name_sum\nmkdir  $dir_name_sum\n\nrm -f foo*\nrm -rf $dir_name_res\nmkdir $dir_name_res\n\ncat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor >> $dir_name_res/foo-runs-test\n\ncp nvmemul-orig.ini nvmemul.ini\n../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0 >foo\n\n\n    for numchains in 1 \n    do\n\tfor epoch in 10000 \n\tdo \n\t    echo \"#FORMAT #1_emul_lat(ns) #2_min_meas_lat(ns)  #3_aver_meas_lat(ns)  #4_max_meas_lat(ns)  #5_aver_error(%) #6_max_error(%)\" >  $dir_name_sum/summary-nvm-lat-accuracy-epoch-$epoch-numchains-$numchains.txt\n\n\t    for lat in 200 300 400 500 600 700 800 900 1000\n\t    do\n\t\tawk 'BEGIN {read_lat = substr(ARGV[2],3); epoch_lat = substr(ARGV[3],3);}\n(!(NR==7 || NR==9 || NR==10 || $1~/physical_nodes/)){ print;}\n(NR==7){ print $1,$2, read_lat,\";\";}\n(NR==9){ print $1,$2, 
epoch_lat,\";\";}\n(NR==10){ print $1,$2, epoch_lat,\";\";}\n($1~/physical_nodes/) {print $1,$2,\"\\\"0\\\"\"\";\";}\n' nvmemul-orig.ini v=$lat v=$epoch > foo-nvmemul-$lat-$epoch.ini\n\t\tmv foo-nvmemul-$lat-$epoch.ini  nvmemul.ini\n\t\techo \"lat epoch chains\" $lat $epoch $numchains >>   $dir_name_res/foo-runs\n\t\t\n\t\tfor time in 1 2 3 4 5 6 7 8 9 10\n\t\tdo\n\t\t    ../build/bench/memlat/memlat 1 1 $numchains 10000000 64 8 0 0 >> $dir_name_res/full_results-$lat-$epoch-$numchains.txt\n \t\tdone\n                grep latency_ns $dir_name_res/full_results-$lat-$epoch-$numchains.txt > $dir_name_res/results-$lat-$epoch-$numchains.txt\n\t\tawk 'BEGIN {max = 0; min = 1000000; sum = 0; aver=0.0; max_error=0.0; aver_error=0.0;read_lat = substr(ARGV[2],3);epoch_lat = substr(ARGV[3],3); MPL = substr(ARGV[4],3); }\n($2 > max){max = $2;}\n($2 < min){min = $2;}\n{sum=sum+$2; if ($2 < read_lat*1.0) {error=read_lat -$2} else {error=$2 - read_lat}; if (error > max_error) max_error=error;}\nEND {aver=sum/NR; if (aver < read_lat*1.0) {aver_error = (read_lat - aver)*100.0/read_lat} else {aver_error = (aver - read_lat )*100.0/read_lat}; print read_lat, min,aver,max, aver_error,max_error*100.0/read_lat;} '   $dir_name_res/results-$lat-$epoch-$numchains.txt v=$lat v=$epoch v=$numchains >> $dir_name_sum/summary-nvm-lat-accuracy-epoch-$epoch-numchains-$numchains.txt\n\t\t\n\t    done\n\tdone\n    done\n\n\n#FORMAT_summary-results: #1_nvm_lat(ns) #2_min_nvm_lat(ns)  #3_aver_nvm_lat(ns)  #4_max_nvm_lat(ns)  #5_aver_error(%) #6_max_error(%)\n\n#parameter is nvm_lat\n\n\n\n"
  },
  {
    "path": "benchmark-tests/memlat-bench-test-10M.sh",
    "content": "#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n#!/bin/bash\n\n#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini\n\nnum_sockets=$(cat /proc/cpuinfo | grep \"physical id\" | sort -u | wc -l)\nif [ $num_sockets -eq 1 ]; \nthen\necho \"Single Socket\"\n./memlat-bench-test-10M-single-socket.sh\nexit 0\nfi\n\necho performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor\n\ndir_name_res=FULL-RESULTS-test\ndir_name_sum=SUMMARY-RESULTS-test\n\nrm -rf $dir_name_sum\nmkdir  $dir_name_sum\n\nrm -f foo*\nrm -rf $dir_name_res\nmkdir $dir_name_res\n\ncat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor >> $dir_name_res/foo-runs-test\n\ncp nvmemul-orig.ini nvmemul.ini\n../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1 >foo\n\nfor conf in local remote\ndo\n    if [ $conf = local ]; then confpar=0 \n    else confpar=1\n    fi\n    for numchains in 1 \n    do\n\tfor epoch in 10000 \n\tdo \n\t    echo \"#FORMAT #1_emul_lat(ns) #2_min_meas_lat(ns)  #3_aver_meas_lat(ns)  #4_max_meas_lat(ns)  #5_aver_error(%) #6_max_error(%)\" >  
$dir_name_sum/summary-nvm-lat-accuracy-$conf-epoch-$epoch-numchains-$numchains.txt\n\n\t    for lat in 200 300 400 500 600 700 800 900 1000\n\t    do\n\t\tawk 'BEGIN {read_lat = substr(ARGV[2],3); epoch_lat = substr(ARGV[3],3); config = substr(ARGV[4],3);}\n(!(NR==7 || NR==9 || NR==10 || $1~/physical_nodes/)){ print;}\n(NR==7){ print $1,$2, read_lat,\";\";}\n(NR==9){ print $1,$2, epoch_lat,\";\";}\n(NR==10){ print $1,$2, epoch_lat,\";\";}\n($1~/physical_nodes/ && config ~ /local/) {print $1,$2,\"\\\"0\\\"\"\";\";}\n($1~/physical_nodes/ && config ~ /remote/) {print $1,$2,\"\\\"0,1\\\"\"\";\";}\n' nvmemul-orig.ini v=$lat v=$epoch v=$conf > foo-nvmemul-$lat-$epoch.ini\n\t\tmv foo-nvmemul-$lat-$epoch.ini  nvmemul.ini\n\t\techo \"lat epoch chains\" $lat $epoch $numchains >>   $dir_name_res/foo-runs\n\t\t\n\t\tfor time in 1 2 3 4 5 6 7 8 9 10\n\t\tdo\n\t\t    ../build/bench/memlat/memlat 1 1 $numchains 10000000 64 8 0 $confpar >> $dir_name_res/full_results-$conf-$lat-$epoch-$numchains.txt\n \t\tdone\n                grep latency_ns $dir_name_res/full_results-$conf-$lat-$epoch-$numchains.txt > $dir_name_res/results-$conf-$lat-$epoch-$numchains.txt\n\t\tawk 'BEGIN {max = 0; min = 1000000; sum = 0; aver=0.0; max_error=0.0; aver_error=0.0;read_lat = substr(ARGV[2],3);epoch_lat = substr(ARGV[3],3); MPL = substr(ARGV[4],3); }\n($2 > max){max = $2;}\n($2 < min){min = $2;}\n{sum=sum+$2; if ($2 < read_lat*1.0) {error=read_lat -$2} else {error=$2 - read_lat}; if (error > max_error) max_error=error;}\nEND {aver=sum/NR; if (aver < read_lat*1.0) {aver_error = (read_lat - aver)*100.0/read_lat} else {aver_error = (aver - read_lat )*100.0/read_lat}; print read_lat, min,aver,max, aver_error,max_error*100.0/read_lat;} '   $dir_name_res/results-$conf-$lat-$epoch-$numchains.txt v=$lat v=$epoch v=$numchains >> $dir_name_sum/summary-nvm-lat-accuracy-$conf-epoch-$epoch-numchains-$numchains.txt\n\t\t\n\t    done\n\tdone\n    done\ndone\n\n\n#FORMAT_summary-results: #1_nvm_lat(ns) 
#2_min_nvm_lat(ns)  #3_aver_nvm_lat(ns)  #4_max_nvm_lat(ns)  #5_aver_error(%) #6_max_error(%)\n\n#parameter is nvm_lat\n\n\n\n"
  },
  {
    "path": "benchmark-tests/memlat-orig-lat-test-single-socket.sh",
    "content": "#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n#!/bin/bash\n\n#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini\n\necho performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor\n\ndir_name_res=ORIG-lat-test\n\nrm -f foo*\nrm -rf $dir_name_res\nmkdir $dir_name_res\n\n\ncp  nvmemul-debug.ini  nvmemul.ini\n../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0\n\nfor time in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20\ndo\n    ../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0 > $dir_name_res/foo-hw-latency.txt\n    grep \"measuring latency: latency is\" $dir_name_res/foo-hw-latency.txt > $dir_name_res/foo\n    awk 'NR==1 {local=$7;}\n         END {print local}'  $dir_name_res/foo >>  $dir_name_res/list-hw-latency.txt\ndone\n\necho \"#FORMAT:#1_min #2_aver #3_max\" > $dir_name_res/final-hw-latency.txt  \n\nawk 'BEGIN {max1 = 0.0; min1 = 10000000.0; sum1 = 0.0;}\n         ($1 > max1){max1 = $1;}\n         ($1 < min1){min1 = $1;}\n         {sum1=sum1+$1;sum2=sum2+$2;}\n         END {print min1, sum1/NR, max1;}'  $dir_name_res/list-hw-latency.txt  >> $dir_name_res/final-hw-latency.txt  \n\nrm  
$dir_name_res/foo*\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
  },
  {
    "path": "benchmark-tests/memlat-orig-lat-test.sh",
    "content": "#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n#!/bin/bash\n\n#awk '($1~/physical_nodes/) {print;}'  nvmemul.ini\n\nnum_sockets=$(cat /proc/cpuinfo | grep \"physical id\" | sort -u | wc -l)\nif [ $num_sockets -eq 1 ]; \nthen\necho \"Single Socket\"\n./memlat-orig-lat-test-single-socket.sh\nexit 0\nfi\n\necho performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor\n\ndir_name_res=ORIG-lat-test\n\nrm -f foo*\nrm -rf $dir_name_res\nmkdir $dir_name_res\n\n\ncp  nvmemul-debug.ini  nvmemul.ini\n../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1\n\n#FORMAT: ns\n#FORMAT: min_local #2_aver_local max_local min_remote #5_aver_remote max_remote \n#FORMAT: \n\nfor time in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20\ndo\n    ../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1 > $dir_name_res/foo-hw-latency.txt\n    grep \"measuring latency: latency is\" $dir_name_res/foo-hw-latency.txt > $dir_name_res/foo\n    awk 'NR==1 {local=$7;}\n         NR==2 {remote=$7;}\n         END {print local , remote}'  $dir_name_res/foo >>  $dir_name_res/list-hw-latency.txt\ndone\n\necho \"#FORMAT:#1_min_local #2_aver_local 
#3_max_local #4_min_remote #5_aver_remote #6_max_remote\" > $dir_name_res/final-hw-latency.txt  \n\nawk 'BEGIN {max1 = 0.0; min1 = 10000000.0; max2 = 0.0; min2 = 10000000.0; sum1 = 0.0; sum2 = 0.0;}\n         ($1 > max1){max1 = $1;}\n         ($1 < min1){min1 = $1;}\n         ($2 > max2){max2 = $2;}\n         ($2 < min2){min2 = $2;}\n         {sum1=sum1+$1;sum2=sum2+$2;}\n         END {print min1, sum1/NR, max1,  min2, sum2/NR, max2 ;}'  $dir_name_res/list-hw-latency.txt  >> $dir_name_res/final-hw-latency.txt  \n\nrm  $dir_name_res/foo*\n\n#FORMAT:   ns\n#FORMAT:#1_min_local #2_aver_local #3_max_local #4_min_remote #5_aver_remote #6_max_remote \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
  },
  {
    "path": "benchmark-tests/nvmemul-bandwidth.ini",
    "content": "# Configuration file \n\nlatency:\n{\n    enable = true;\n    inject_delay = true;\n    read = 1000;\n    write = 1000;\n    max_epoch_duration_us = 10000;\n    min_epoch_duration_us = 10000;\n    calibration = false;\n};\n\nbandwidth:\n{\n    enable = true;\n    model = \"/tmp/bandwidth_model\";\n    read = 2000;\n    write = 2000;\n};\n\ntopology:\n{\n    mc_pci = \"/tmp/mc_pci_bus\";\n    physical_nodes = \"0\";\n    hyperthreading = true; # do not use multiple hardware threads per core\n};\n\nstatistics:\n{\n    enable = true;\n    #file = \"/tmp/statistics\";\n};\n\ndebug:\n{\n    # debugging level\n    level = 5;\n    verbose = 0;\n\n    # modules set to True produce debugging output\n    module:\n    {\n        all = False;\n    };\n};\n"
  },
  {
    "path": "benchmark-tests/nvmemul-debug.ini",
    "content": "# Configuration file \n\nlatency:\n{\n    enable = true;\n    inject_delay = true;\nread = 1000 ;\n    write = 1000;\nmax_epoch_duration_us = 10000 ;\nmin_epoch_duration_us = 10000 ;\n    calibration = false;\n};\n\nbandwidth:\n{\n    enable = false;\n    model = \"/tmp/bandwidth_model\";\n    read = 2000;\n    write = 2000;\n};\n\ntopology:\n{\n    mc_pci = \"/tmp/mc_pci_bus\";\nphysical_nodes = \"0,1\";\n    hyperthreading = true; # do not use multiple hardware threads per core\n};\n\nstatistics:\n{\n    enable = true;\n    #file = \"/tmp/statistics\";\n};\n\ndebug:\n{\n    # debugging level\n    level = 5;\n    verbose = 0;\n\n    # modules set to True produce debugging output\n    module:\n    {\n        all = False;\n    };\n};\n"
  },
  {
    "path": "benchmark-tests/nvmemul-orig.ini",
    "content": "# Configuration file \n\nlatency:\n{\n    enable = true;\n    inject_delay = true;\nread = 1000 ;\n    write = 1000;\nmax_epoch_duration_us = 10000 ;\nmin_epoch_duration_us = 10000 ;\n    calibration = false;\n};\n\nbandwidth:\n{\n    enable = false;\n    model = \"/tmp/bandwidth_model\";\n    read = 2000;\n    write = 2000;\n};\n\ntopology:\n{\n    mc_pci = \"/tmp/mc_pci_bus\";\nphysical_nodes = \"0,1\";\n    hyperthreading = true; # do not use multiple hardware threads per core\n};\n\nstatistics:\n{\n    enable = true;\n    #file = \"/tmp/statistics\";\n};\n\ndebug:\n{\n    # debugging level\n    level = 3;\n    verbose = 0;\n\n    # modules set to True produce debugging output\n    module:\n    {\n        all = False;\n    };\n};\n"
  },
  {
    "path": "benchmark-tests/nvmemul.ini",
    "content": "# Configuration file \n\nlatency:\n{\n    enable = true;\n    inject_delay = true;\nread = 300 ;\n    write = 200;\nmax_epoch_duration_us = 10000 ;\nmin_epoch_duration_us = 10000 ;\n    calibration = false;\n};\n\nbandwidth:\n{\n    enable = false;\n    model = \"/tmp/bandwidth_model\";\n    read = 2000;\n    write = 2000;\n};\n\ntopology:\n{\n    mc_pci = \"/tmp/mc_pci_bus\";\nphysical_nodes = \"0,1\";\n    hyperthreading = true; # do not use multiple hardware threads per core\n};\n\nstatistics:\n{\n    enable = true;\n    #file = \"/tmp/statistics\";\n};\n\ndebug:\n{\n    # debugging level\n    level = 5;\n    verbose = 0;\n\n    # modules set to True produce debugging output\n    module:\n    {\n        all = False;\n    };\n};\n"
  },
  {
    "path": "license.txt",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n\n\n"
  },
  {
    "path": "nvmemul-orig.ini",
    "content": "# Configuration file \n\nlatency:\n{\n    enable = true;\n    inject_delay = true;\nread = 1000 ;\n    write = 1000;\nmax_epoch_duration_us = 10000 ;\nmin_epoch_duration_us = 10000 ;\n    calibration = false;\n};\n\nbandwidth:\n{\n    enable = false;\n    model = \"/tmp/bandwidth_model\";\n    read = 2000;\n    write = 2000;\n};\n\ntopology:\n{\n    mc_pci = \"/tmp/mc_pci_bus\";\nphysical_nodes = \"0,1\";\n    hyperthreading = true; # do not use multiple hardware threads per core\n};\n\nstatistics:\n{\n    enable = true;\n    #file = \"/tmp/statistics\";\n};\n\ndebug:\n{\n    # debugging level\n    level = 3;\n    verbose = 0;\n\n    # modules set to True produce debugging output\n    module:\n    {\n        all = False;\n    };\n};\n"
  },
  {
    "path": "nvmemul.dox",
    "content": "/**\n\n@mainpage Quartz:  A Lightweight  Performance Emulator for  Persistent Memory Software.\n\n\n\\section section-intro Introduction\n\nQuartz: A DRAM-based performance emulation platform that leverages features \navailable in commodity hardware to emulate different latency and bandwidth \ncharacteristics of future byte-addressable NVM technologies.\n\n*/\n\n    \n\n\n\n\n\n\n\n"
  },
  {
    "path": "nvmemul.ini",
    "content": "# Configuration file \n\nlatency:\n{\n    enable = true;\n    inject_delay = true;\nread = 1000 ;\n    write = 1000;\nmax_epoch_duration_us = 10000 ;\nmin_epoch_duration_us = 10000 ;\n    calibration = false;\n};\n\nbandwidth:\n{\n    enable = false;\n    model = \"/tmp/bandwidth_model\";\n    read = 500;\n    write = 500;\n};\n\ntopology:\n{\n    mc_pci = \"/tmp/mc_pci_bus\";\nphysical_nodes = \"0,1\";\n    hyperthreading = true; # do not use multiple hardware threads per core\n};\n\nstatistics:\n{\n    enable = true;\n    #file = \"/tmp/statistics\";\n};\n\ndebug:\n{\n    # debugging level\n    level = 1;\n    verbose = 0;\n\n    # modules set to True produce debugging output\n    module:\n    {\n        all = False;\n    };\n};\n"
  },
  {
    "path": "scripts/install.sh",
    "content": "#!/bin/bash\n#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n\nPAPI_MAJOR=5\nPAPI_MINOR=1\nPAPI_RELEASE=1\n\nCMAKE_MAJOR=2\nCMAKE_MINOR=8\n\nfunction install_deps_rpm() {\n    yum install -q -y numactl-devel libconfig libconfig-devel cmake kernel-devel-`uname -r` msr-tools uthash-devel\n\n    if [ $? -ne 0 ]; then\n        echo \"Dependencies installation failed\"\n        exit -1\n    fi\n}\n\nfunction install_deps_deb() {\n    apt-get install -y libnuma-dev libconfig-dev cmake  msr-tools uthash-dev\n\n    if [ $? -ne 0 ]; then\n        echo \"Dependencies installation failed\"\n        exit -1\n    fi\n}\n\nfunction check_supported_papi() {\n    major=`papi_version | cut -d ' ' -f3 | cut -d '-' -f1 | cut -d '.' -f1`\n    minor=`papi_version | cut -d ' ' -f3 | cut -d '-' -f1 | cut -d '.' -f2`\n    release=`papi_version | cut -d ' ' -f3 | cut -d '-' -f1 | cut -d '.' 
-f3`\n\n    if [ ${major} -ne ${PAPI_MAJOR} ]; then\n        echo \"PAPI version (${major}.${minor}.${release}) not supported (=${PAPI_MAJOR}.${PAPI_MINOR}.${PAPI_RELEASE})\"\n        exit -1\n    fi\n    if [ ${minor} -ne ${PAPI_MINOR} ]; then\n        echo \"PAPI version (${major}.${minor}.${release}) not supported (=${PAPI_MAJOR}.${PAPI_MINOR}.${PAPI_RELEASE})\"\n        exit -1\n    fi\n    if [ ${release} -ne ${PAPI_RELEASE} ]; then\n        echo \"PAPI version (${major}.${minor}.${release}) not supported (=${PAPI_MAJOR}.${PAPI_MINOR}.${PAPI_RELEASE})\"\n        exit -1\n    fi\n}\n\nfunction check_supported_cmake() {\n    major=`cmake -version | head -1 | cut -d ' ' -f3 | cut -d '-' -f1 | cut -d '.' -f1`\n    minor=`cmake -version | head -1 | cut -d ' ' -f3 | cut -d '-' -f1 | cut -d '.' -f2`\n    \n    if [ ${major} -lt ${CMAKE_MAJOR} ]; then\n        echo \"CMake version (${major}.${minor}) not supported (>=${CMAKE_MAJOR}.${CMAKE_MINOR})\"\n        exit -1\n    fi\n    if [ ${major} -eq ${CMAKE_MAJOR} ]; then\n        if [ ${minor} -lt ${CMAKE_MINOR} ]; then\n            echo \"CMake version (${major}.${minor}) not supported (>=${CMAKE_MAJOR}.${CMAKE_MINOR})\"\n            exit -1\n        fi\n    fi\n}\n\nfunction check_supported_versions() {\n    check_supported_cmake\n#    check_supported_papi\n}\n\n\n#################### MAIN ####################\n\nif [ $(id -u) -ne 0 ]; then\n   echo \"You must be root to execute this script\"\n   exit -1\nfi\n\nif [ -f /etc/redhat-release ]; then\n    install_deps_rpm\nelif [ -f /etc/centos-release ]; then\n    install_deps_rpm\nelif [ -f /etc/debian_version -o -f /etc/debian-release ]; then\n    install_deps_deb\nelse\n    echo \"Linux distribution not supported\"\n    exit -1\nfi\n\ncheck_supported_versions\n\n"
  },
  {
    "path": "scripts/runenv.sh",
    "content": "#!/bin/bash\n#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n\nNVM_EMUL_PATH=\"`dirname $0`/..\"\n\n\nif [ -z \"$1\" ]; then\n    echo \"runenv.sh [cmd to run]\"\n    exit 1\nfi\n\nrootdir=\"$NVM_EMUL_PATH\"\nbindir=$rootdir\"/build\"\n\nif [ -f /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor ]; then\n    current_scaling=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor);\n\n    if [ \"${current_scaling}\" != \"performance\" ]; then\n        file_list=$(ls /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor)\n        for cpu_file in ${file_list}; do\n            echo \"performance\" | sudo tee ${cpu_file} > /dev/null\n        done\n    fi\nfi\n\n$rootdir/scripts/turboboost.sh disable\n\nv=$(uname -r | cut -d '.' -f1)\nif [ $v -ge 4 ]; then\n    echo \"2\" | sudo tee /sys/bus/event_source/devices/cpu/rdpmc\nfi\n\nexport LD_PRELOAD=$bindir\"/src/lib/libnvmemul.so\"\nexport NVMEMUL_INI=$rootdir\"/nvmemul.ini\"\n\nif [ ! -f ${LD_PRELOAD} ]; then\n    echo \"Library not found. 
Compile the emulator's library first.\"\n    exit -1\nfi\n\necho $LD_PRELOAD\necho $NVMEMUL_INI\n\n# execute the command passed as argument\n$@\n\n"
  },
  {
    "path": "scripts/setupdev.sh",
    "content": "#!/bin/bash\n#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n\nNVM_EMUL_PATH=\"`dirname $0`/..\"\n\ndevice_name=\"nvmemul\"\ndevice_module_name=${device_name}\".ko\"\ndevice_path=\"/dev/${device_name}\"\ndevice_module_path=`find ${NVM_EMUL_PATH}/build -name ${device_module_name}`\n\n\nfunction loaddev {\n    if [ -z \"${device_module_path}\" ]; then\n        echo \"Module not found. Compile the emulator's source code first.\"\n        exit -1\n    fi\n\n    /sbin/insmod ${device_module_path} 2> /dev/null\n\n    if [ $? -ne 0 ]; then\n        lsmod | grep ${device_name} > /dev/null\n        if [ $? -eq 0 ]; then\n            echo \"Kernel module already loaded, please reload it.\"\n            exit 1\n        fi\n        echo \"Kernel module loading failed\"\n        exit 1\n    fi\n\n    device_major=`grep ${device_name} /proc/devices | awk '{ print $1 }'`\n    if [ $? -ne 0 -o -z \"${device_major}\" ]; then\n        echo \"Failed to detect module major\"\n        exit 1\n    fi\n\n    rm -f ${device_path}\n    if [ $? 
-ne 0 ]; then\n        echo \"Failed to delete kernel module device file\"\n        exit 1\n    fi\n\n    mknod ${device_path} c ${device_major} 0\n    chmod a+wr ${device_path}\n\n    lsmod | grep ${device_name} > /dev/null\n\n    if [ $? -eq 0 ]; then\n        echo \"Kernel module loaded successfully\"\n    else\n        echo \"kernel module loading failed\"\n        exit 1\n    fi\n}\n\nfunction unloaddev {\n    /sbin/rmmod ${device_name} 2> /dev/null\n    rm -f ${device_path}\n    if [ $? -eq 0 ]; then\n        echo \"Kernel module unloaded successfully\"\n    else\n        echo \"Failed to delete kernel module device file\"\n        exit 1\n    fi\n}\n\nfunction help() {\n    echo \"$0 <load|unload|reload>\"\n}\n\n### MAIN ###\n\nif [ $(id -u) -ne 0 ]; then\n   echo \"You must be root to execute this script\"\n   exit -1\nfi\n\nif [ $# -eq 0 ]; then\n    help\n    exit 1\nfi\n\nif [ \"$1\" = \"load\" ] || [ \"$1\" = \"l\" ]; then\n    loaddev\nelif [ \"$1\" = \"unload\" ] || [ \"$1\" = \"u\" ]; then\n    unloaddev\nelif [ \"$1\" = \"reload\" ] || [ \"$1\" = \"r\" ]; then\n    unloaddev\n    loaddev\nelse\n    help\n    exit 1\nfi\n\nexit 0\n"
  },
  {
    "path": "scripts/turboboost.sh",
    "content": "#!/bin/bash\n#################################################################\n#Copyright 2016 Hewlett Packard Enterprise Development LP.  \n#This program is free software; you can redistribute it and/or modify\n#it under the terms of the GNU General Public License as published by\n#the Free Software Foundation; either version 2 of the License, or (at\n#your option) any later version. This program is distributed in the\n#hope that it will be useful, but WITHOUT ANY WARRANTY; without even\n#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n#PURPOSE. See the GNU General Public License for more details. You\n#should have received a copy of the GNU General Public License along\n#with this program; if not, write to the Free Software Foundation,\n#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n#################################################################\n\nfunction usage()\n{\n    echo \"$0 <function> [target CPU id]\"\n    echo -e \"\\tfunctions:\"\n    echo -e \"\\t\\t check: verifies if a given CPU id has Turbo Boost enabled\"\n    echo -e \"\\t\\t disable: disables a given CPU id or all CPUs if not specified\"\n    echo -e \"\\t\\t enable: enables a given CPU id or all CPUs if not specified\"\n}\n\nfunction verify_cpu_id()\n{\n    re='^[0-9]+$'\n    if ! [[ $1 =~ $re ]]; then\n        echo \"CPU id is not a number\"\n        exit 1\n    fi\n}\n\nfunction check_msr_module()\n{\n    lsmod | grep msr > /dev/null\n    if [ $? -ne 0 ]; then\n         # some systems need this, others don't\n        sudo modprobe msr &> /dev/null\n        #if [ $? 
-ne 0 ]; then\n        #    echo \"Failed to load MSR module\"\n        #    exit 1\n        #fi\n    fi\n}\n\nfunction check()\n{\n    cpu=$1\n\n    if [ -z \"${cpu}\" ]; then\n        usage\n        exit 1\n    fi\n\n    cpus=$(lscpu | sed -n 4p | awk '{ print $2 }')\n\n    if [ ${cpu} -ge ${cpus} ]; then\n        echo \"CPU id out of range\"\n        exit 1\n    fi\n\n    disabled=$(sudo rdmsr -p${cpu} 0x1a0 -f 38:38)\n\n    if [ \"${disabled}\" == \"1\" ]; then\n        echo \"Turbo Boost for processor ${cpu} is disabled\"\n    else\n        echo \"Turbo Boost for processor ${cpu} is enabled\"\n    fi\n}\n\nfunction enable()\n{\n    cpu=$1\n\n    cpus=$(lscpu | sed -n 4p | awk '{ print $2 }')\n\n    if [ -z \"${cpu}\" ]; then\n        for (( i=0; i<${cpus}; i++ )); do \n            sudo wrmsr -p$i 0x1a0 0x850089\n        done\n        echo \"Turbo Boost enabled for all CPUs\"\n    else\n        if [ ${cpu} -ge ${cpus} ]; then\n            echo \"CPU id out of range\"\n            exit 1\n        fi\n        sudo wrmsr -p${cpu} 0x1a0 0x850089\n        echo \"Turbo Boost enabled for CPU ${cpu}\"\n    fi\n}\n\nfunction disable()\n{\n    cpu=$1\n\n    cpus=$(lscpu | sed -n 4p | awk '{ print $2 }')\n\n    if [ -z \"${cpu}\" ]; then\n        for (( i=0; i<${cpus}; i++ )); do \n            sudo wrmsr -p$i 0x1a0 0x4000850089;\n        done\n        echo \"Turbo Boost disabled for all CPUs\"\n    else\n        if [ ${cpu} -ge ${cpus} ]; then\n            echo \"CPU id out of range\"\n            exit 1\n        fi\n        sudo wrmsr -p${cpu} 0x1a0 0x4000850089;\n        echo \"Turbo Boost disabled for CPU ${cpu}\"\n    fi\n}\n\n\n\n### MAIN ###\n\nif [ $# -eq 0 ]; then\n    usage\n    exit 1\nfi\n\nfunct=$1\ntarget_cpu=$2\n\ncheck_msr_module\n\nif [ ! 
-z \"${target_cpu}\" ]; then\n    verify_cpu_id ${target_cpu}\nfi\n\ncase ${funct} in\n    \"enable\")\n        enable ${target_cpu}\n        ;;\n    \"disable\")\n        disable ${target_cpu}\n        ;;\n    \"check\")\n        check ${target_cpu}\n        ;;\n    *)\n        usage\n        exit 1\nesac\n\nexit 0\n\n"
  },
  {
    "path": "src/CMakeLists.txt",
    "content": "add_subdirectory(lib)\nadd_subdirectory(dev)\n"
  },
  {
    "path": "src/dev/CMakeLists.txt",
    "content": "# Build NVM Emulation device driver (using Kbuild Makefile)\n\nset(DEV_DIR \"${CMAKE_CURRENT_SOURCE_DIR}\")\nset(DEV_BIN_DIR \"${CMAKE_CURRENT_BINARY_DIR}\")\nset(DEV_KERNEL_MODULE \"${DEV_BIN_DIR}/nvmemul.ko\")\nmark_as_advanced(DEV_DIR DEV_BIN_DIR)\n\n# We invoke make in build folder to keep the glog's source folder clean.\nfile(MAKE_DIRECTORY ${DEV_BIN_DIR})\nadd_custom_command(OUTPUT ${DEV_KERNEL_MODULE}\n    COMMAND ${CMAKE_COMMAND} -E copy_directory ${DEV_DIR} ${DEV_BIN_DIR}\n    COMMAND ${CMAKE_MAKE_PROGRAM} -j\n    COMMENT [Build-NVM Emulation Device]\n    WORKING_DIRECTORY \"${DEV_BIN_DIR}\"\n    DEPENDS ${DEV_DIR}/pmc.c # just to see if it has been overwritten\n)\n\n# we use add_custom_command for the build itself because otherwise we have to build it\n# every time. the following add_custom_target gives a name for the output.\nadd_custom_target(dev_build ALL DEPENDS ${DEV_KERNEL_MODULE})\n"
  },
  {
    "path": "src/dev/Makefile",
    "content": "# build modules\nobj-m = nvmemul.o\nnvmemul-objs = pmc.o\n\n# use the kernel build system\nKERNEL_VERSION := `uname -r`\nKERNEL_SOURCE := /lib/modules/$(KERNEL_VERSION)/build\n\nSRCDIR=`pwd`\nOBJDIR=`pwd`\n\nall:\n\tmake -C $(KERNEL_SOURCE)  M=$(OBJDIR) modules\n\nclean: \n\tmake -C $(KERNEL_SOURCE) M=$(OBJDIR) clean\n"
  },
  {
    "path": "src/dev/ioctl_query.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __IOCTL_QUERY_H\n#define __IOCTL_QUERY_H\n\n#include <linux/ioctl.h>\n\n#define MYDEV_MAGIC (0xAA)\n\ntypedef struct { \n    unsigned int counter_id;\n    unsigned int event_id;\n} ioctl_query_setcounter_t;\n\ntypedef struct { \n    unsigned int bus_id;\n    unsigned int device_id;\n    unsigned int function_id;\n    unsigned int offset;\n    unsigned int val;\n} ioctl_query_setgetpci_t;\n\n#define IOCTL_SETCOUNTER _IOR(MYDEV_MAGIC, 0, ioctl_query_setcounter_t *) \n#define IOCTL_SETPCI     _IOR(MYDEV_MAGIC, 1, ioctl_query_setgetpci_t *) \n#define IOCTL_GETPCI     _IOWR(MYDEV_MAGIC, 2, ioctl_query_setgetpci_t *) \n\n\n#endif /* __IOCTL_QUERY_H */\n"
  },
  {
    "path": "src/dev/pmc.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <linux/init.h>\n#include <linux/pci.h>\n#include <linux/module.h>\n#include <linux/moduleparam.h>\n#include <linux/major.h>\n#include <linux/kernel.h>\n#include <linux/slab.h>\n#include <linux/fs.h>\n#include <linux/errno.h>\n#include <linux/types.h>\n#include <linux/proc_fs.h>\n#include <linux/fcntl.h>\n#include <linux/smp.h>\n#include <linux/uaccess.h>\n\n#include <asm/msr.h>\n#include <asm/uaccess.h>\n\n#include \"ioctl_query.h\"\n\nstatic long pmc_ioctl(struct file *f, unsigned int cmd, unsigned long arg);\n//unsigned long read_cr4(void);\n//void write_cr4(unsigned long);\n#ifndef read_cr4\n#define read_cr4 native_read_cr4\n#endif\n#ifndef write_cr4\n#define write_cr4 native_write_cr4\n#endif\n\nstruct file_operations pmc_fops = {\n\t.unlocked_ioctl = pmc_ioctl,\n\t.compat_ioctl = pmc_ioctl,\n};\n\nstatic const char* module_name = \"nvmemul\";\nstatic int mod_major = 0;\nstatic const int NVMEMUL_MAJOR = 0;\nconst const int PERFCTR0 = 0xc1;\nconst const int PERFEVENTSEL0 = 0x186;\n\n\nvoid pmc_set_pce_bit(void* arg) \n{\n\tunsigned long cr4reg;\n\n   
 cr4reg = read_cr4();\n\tcr4reg |= 0x100; // setting the PCE bit\n\twrite_cr4(cr4reg);\n}\n\nint pmc_init_module(void)\n{\n \tprintk(KERN_INFO \"%s: Loading. Initializing...\\n\", module_name);\n\tif ((mod_major = register_chrdev(NVMEMUL_MAJOR, module_name, &pmc_fops)) == -EBUSY) {\n\t\tprintk(KERN_INFO \"%s: Unable to get major for %s device\\n\", module_name, module_name);\n\t\treturn -EIO;\n\t}\n\n\tif (mod_major <= 0) {\n\t\tprintk(KERN_INFO \"%s: Unable to get major for %s device\\n\", module_name, module_name);\n\t\treturn -EIO;\n\t}\n\n\tprintk(KERN_INFO \"%s: major is %d\\n\", module_name, mod_major);\n\n\t/*\n\t * In order to use the rdpmc instruction in user mode, we need to set the\n\t * PCE bit of CR4. PCE is 8th bit of cr4, and 256 is 2 << 8\n\t */\n\n    pmc_set_pce_bit(NULL);\n    smp_call_function(pmc_set_pce_bit, NULL, 1);\n\n\treturn 0;\n}\t\n\nvoid pmc_exit_module(void) {\n \tprintk(KERN_INFO \"%s: Unloading. Cleaning up...\\n\", module_name);\n\t/* Freeing the major number */\n\tunregister_chrdev(mod_major, module_name);\n}\t\n\nstruct counter_s {\n    int counter_id;\n    unsigned long val; \n};\n\n\n/* \n * pmc_clear clears the PMC specified by counter\n * counter = 0 => perfctr0\n * counter = 1 => perfctr1\n * it uses WRMSR to write the values in the counters\n */\nstatic void __pmc_clear(int counter_id) {\n\tint counterRegister = PERFCTR0 + counter_id;\n\t/* clear the old register */\n\n\t__asm__ __volatile__(\"mov %0, %%ecx\\n\\t\"\n\t        \"xor %%edx, %%edx\\n\\t\"\n            \"xor %%eax, %%eax\\n\\t\"\n            \"wrmsr\\n\\t\"\n\t        : /* no outputs */\n\t        : \"m\" (counterRegister)\n\t        : \"eax\", \"ecx\", \"edx\" /* all clobbered */);\n}\n\nstatic void pmc_clear(void* arg) {\n    struct counter_s* counter = (struct counter_s*) arg;\n    __pmc_clear(counter->counter_id);\n}\n\nvoid pmc_clear_all_cpu(int counter_id)\n{\n    struct counter_s counter = { counter_id, 0};\n    pmc_clear((void*) &counter);\n    
smp_call_function(pmc_clear, (void*) &counter, 1);\n}\n\n/* \n * This function writes the value specified by the arg to the counter\n * indicated by counter \n */\n\nstatic void __set_counter(int counter_id, unsigned long val) \n{\n    int selectionRegister = PERFEVENTSEL0 + counter_id;\n    __pmc_clear(counter_id);\n\n    /* set the value */\n\n    __asm__ __volatile__(\"mov %0, %%ecx\\n\\t\" /* ecx contains the number of the MSR to set */\n            \"xor %%edx, %%edx\\n\\t\"/* edx contains the high bits to set the MSR to */\n            \"mov %1, %%eax\\n\\t\" /* eax contains the low bits to set the MSR to */\n            \"wrmsr\\n\\t\"\n            : /* no outputs */\n            : \"m\" (selectionRegister), \"m\" (val)\n            : \"eax\", \"ecx\", \"edx\" /* clobbered */);\n}\n\nvoid set_counter(void* arg)\n{\n    struct counter_s* counter = (struct counter_s*) arg;\n\n    __set_counter(counter->counter_id, counter->val);\n}\n\nvoid set_counter_all_cpu(int counter_id, unsigned long arg)\n{\n    struct counter_s counter = { counter_id, arg};\n\n    set_counter((void*) &counter);    \n    smp_call_function(set_counter, (void*) &counter, 1);\n}\n\nstatic long pmc_ioctl_setcounter(struct file* f, unsigned int cmd, unsigned long arg)\n{\n    ioctl_query_setcounter_t q;\n\n    if (copy_from_user(&q, (ioctl_query_setcounter_t*) arg, sizeof(ioctl_query_setcounter_t))) {\n        return -EFAULT;\n    }\n\n\tif ((q.counter_id < 0) || (q.counter_id > 3)) {\n\t\tprintk(KERN_INFO \"%s: set_counter illegal value 0x%x for counter\\n\", module_name, q.counter_id);\n        return -ENXIO;\n    }\n    /* disable counter */\n    set_counter_all_cpu(q.counter_id, 0);\n    pmc_clear_all_cpu(q.counter_id);\n\t/* set counter */\n\tset_counter_all_cpu(q.counter_id, q.event_id);\n    printk(KERN_INFO \"%s: setcounter counter_id: 0x%x event_id=0x%x\\n\", module_name, q.counter_id, q.event_id); \n    return 0;\n}\n\nstatic long pmc_ioctl_setpci(struct file* f, unsigned int cmd, 
unsigned long arg)\n{\n    ioctl_query_setgetpci_t q;\n    struct pci_bus *bus = NULL;\n\n    if (copy_from_user(&q, (ioctl_query_setgetpci_t*) arg, sizeof(ioctl_query_setgetpci_t))) {\n        return -EFAULT;\n    }\n\n    while ((bus = pci_find_next_bus(bus))) {\n        if (q.bus_id == bus->number) {\n            pci_bus_write_config_word(bus, PCI_DEVFN(q.device_id, q.function_id), q.offset, (u16) q.val);\n            printk(KERN_INFO \"%s: setpci bus_id=0x%x device_id=0x%x, function_id=0x%x, val=0x%x\\n\",\n                    module_name, q.bus_id, q.device_id, q.function_id, q.val);\n            return 0;\n        }\n    }\n    return -ENXIO;\n}\n\nstatic long pmc_ioctl_getpci(struct file* f, unsigned int cmd, unsigned long arg)\n{\n    ioctl_query_setgetpci_t q;\n    struct pci_bus *bus = NULL;\n\n    if (copy_from_user(&q, (ioctl_query_setgetpci_t*) arg, sizeof(ioctl_query_setgetpci_t))) {\n        return -EFAULT;\n    }\n\n    while ((bus = pci_find_next_bus(bus))) {\n        if (q.bus_id == bus->number) {\n            unsigned int val = 0;\n            pci_bus_read_config_word(bus, PCI_DEVFN(q.device_id, q.function_id), q.offset, (u16*) &val);\n            printk(KERN_INFO \"%s: getpci bus_id 0x%x device_id 0x%x, function_id 0x%x, offset 0x%x, val 0x%x\\n\",\n                    module_name, q.bus_id, q.device_id, q.function_id, q.offset, val);\n            q.val = val;\n            if (copy_to_user((ioctl_query_setgetpci_t*) arg, &q, sizeof(ioctl_query_setgetpci_t))) {\n                return -EFAULT;\n            }\n            return 0;\n        }\n    }\n    return -ENXIO;\n}\n\nstatic long pmc_ioctl(struct file *f, unsigned int cmd, unsigned long arg) \n{\n    int ret = -1;\n\n\tprintk(KERN_INFO \"%s: ioctl command: 0x%x\\n\", module_name, cmd);\n\tswitch (cmd) {\n\t\tcase IOCTL_SETCOUNTER:\n            ret = pmc_ioctl_setcounter(f, cmd, arg);\n            break;\n        case IOCTL_SETPCI:\n            ret = pmc_ioctl_setpci(f, cmd, arg);\n          
  break;\n        case IOCTL_GETPCI:\n            ret = pmc_ioctl_getpci(f, cmd, arg);\n            break;\n\t\tdefault:\n\t\t\tprintk(KERN_INFO \"%s: ioctl illegal command: 0x%x\\n\", module_name, cmd);\n\t\t\tbreak;\n\t}\n\treturn ret;\n}\n\n\n/* Declaration of the init and exit functions */\nmodule_init(pmc_init_module);\nmodule_exit(pmc_exit_module);\n\nMODULE_LICENSE(\"GPL\");\nMODULE_AUTHOR(\"HPLabs\");\n"
  },
  {
    "path": "src/lib/CMakeLists.txt",
    "content": "project(nvmemul)\n\noption(STATISTICS \"Enable statistics report\" ON)\n\nif(STATISTICS)\n  message(STATUS \"WITH STATISTICS\")\n  add_definitions(-DUSE_STATISTICS)\nelse()\n  message(STATUS \"WITHOUT STATISTICS\")\nendif()\n\nset(nvmemul_src\n    config.c\n    debug.c\n    dev.c\n    init.c\n    interpose.c\n    measure_bw.c\n    measure_lat.c\n    misc.c\n    monotonic_timer.c\n    model_bw.c\n    model_lat.c\n    pflush.c\n    pmalloc.c\n    stat.c\n    thread.c\n    topology.c\n    process_rank.c\n)\n\ninclude_directories(${CMAKE_SOURCE_DIR}/third_party)\ninclude_directories(${CMAKE_SOURCE_DIR}/src)\ninclude_directories(${CMAKE_SOURCE_DIR}/src/lib)\nadd_definitions(-g)\nadd_definitions(-O2)\nadd_definitions(-fPIC)\nadd_definitions(-Wall)\nadd_definitions(-march=native)\nadd_definitions(-fopenmp)\nadd_definitions(-std=gnu89)\n#add_definitions(-DNDEBUG)\n#add_definitions(-std=c99)\nadd_definitions(-msse4)\nadd_subdirectory(cpu)\nadd_library(nvmemul SHARED ${nvmemul_src} $<TARGET_OBJECTS:cpu>)\ntarget_link_libraries(nvmemul dl)\ntarget_link_libraries(nvmemul config)\ntarget_link_libraries(nvmemul numa)\ntarget_link_libraries(nvmemul rt)\ntarget_link_libraries(nvmemul m)\ntarget_link_libraries(nvmemul gomp)\n"
  },
  {
    "path": "src/lib/config.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include \"config.h\"\n#include <libconfig.h>\n#include <string.h>\n#include <stdlib.h>\n#include <stdarg.h>\n#include <ctype.h>\n\n#define ENVVAR_MAX_LEN 128\n\nstatic char* __getenv(const char* prefix, const char* name)\n{\n\tchar normalized_name[ENVVAR_MAX_LEN];\n\n\tif ((strlen(name) + strlen(prefix) + 1) > ENVVAR_MAX_LEN) {\n\t\treturn NULL;\n\t}\n\t\n    strcpy(normalized_name, prefix);\n    strcat(normalized_name, \"_\");\n    strcat(normalized_name, name);\n\n    return getenv(normalized_name);\n}\n\nstatic inline int \nenv_setting_lookup(const char *name, char **value_str)\n{\n\tchar *val;\n\tchar normalized_name[ENVVAR_MAX_LEN];\n\tint  i;\n\n\tif ((strlen(name)) > ENVVAR_MAX_LEN) {\n\t\treturn CONFIG_FALSE;\n\t}\n\t\n\tfor (i=0; name[i]; i++) {\n\t\tif (name[i] == '.') {\n\t\t\tnormalized_name[i] = '_';\n\t\t} else {\n\t\t\tnormalized_name[i] = toupper(name[i]);\n\t\t}\n\t}\n\tnormalized_name[i] = '\\0';\n\t\n\tval = __getenv(ENVVAR_PREFIX, normalized_name);\n\tif (val) {\n\t\t*value_str = val;\n\t\treturn CONFIG_TRUE;\n\t} else {\n\t\treturn 
CONFIG_FALSE;\n\t}\n}\n\n\nstatic inline int\nenv_setting_lookup_int(const char *name, int *value)\n{\n\tchar *value_str;\n\n\tif (env_setting_lookup(name, &value_str) == CONFIG_FALSE) {\n\t\treturn CONFIG_FALSE;\n\t}\n\n\tif (value_str) {\n\t\t*value = atoi(value_str);\n\t\treturn CONFIG_TRUE;\n\t} else {\n\t\treturn CONFIG_FALSE;\n\t}\n}\n\n\nstatic inline int\nenv_setting_lookup_bool(const char *name, int *value)\n{\n\treturn env_setting_lookup_int(name, value);\n}\n\n\nstatic inline int \nenv_setting_lookup_string(const char *name, char **value)\n{\n\treturn env_setting_lookup(name, value);\n}\n\n\nint\n__cconfig_lookup_bool(config_t *cfg, const char *name, int *value) \n{\n\tint val;\n\tint found_val = 0;\n\n\tif (env_setting_lookup_bool(name, &val) == CONFIG_TRUE) {\n\t\tfound_val = 1;\n\t} else {\n\t    if (config_lookup_bool(cfg, name, &val) == CONFIG_TRUE) {\n\t\t\tfound_val = 1;\n\t\t}\n\t}\n\n\tif (found_val)\t{\n\t\t*value = val;\n\t\treturn CONFIG_TRUE;\n\t}\n\treturn CONFIG_FALSE;\n}\n\n\nint\n__cconfig_lookup_valid_bool(config_t *cfg, \n                     const char *name, \n                     int *value, \n                     int validity_check, ...)\n{\n\treturn __cconfig_lookup_bool(cfg, name, value);\n}\n\n\nint\n__cconfig_lookup_int(config_t *cfg, const char *name, int *value)\n{\n\tint val;\n\tint found_val = 0;\n\n\tif (env_setting_lookup_int(name, &val) == CONFIG_TRUE) {\n\t\tfound_val = 1;\n\t} else {\n\t\t// third parameter changed from libconfig 1.3 to 1.4, it was 'long' and now it is 'int'\n\t    if (config_lookup_int(cfg, name, &val) == CONFIG_TRUE) {\n\t\t\tfound_val = 1;\n\t\t}\n\t}\n\n\tif (found_val)\t{\n\t\t*value = val;\n\t\treturn CONFIG_TRUE;\n\t}\n\treturn CONFIG_FALSE;\n}\n\n\nint\n__cconfig_lookup_valid_int(config_t *cfg, \n                           const char *name, \n                           int *value, \n                           int validity_check, ...)\n{\n\tint              min;\n\tint              max;\n\tint   
           list_length;\n\tint              i;\n\tint              val;\n\tint              listval;\n\tva_list          ap;\n\n\tif (__cconfig_lookup_int(cfg, name, &val) == CONFIG_TRUE) {\n\t\tswitch (validity_check) {\n\t\t\tcase CONFIG_NO_CHECK:\n\t\t\t\t*value = val;\n\t\t\t\treturn CONFIG_TRUE;\n\t\t\tcase CONFIG_RANGE_CHECK:\n\t\t\t\tva_start(ap, validity_check);\n\t\t\t\tmin = va_arg(ap, int);\n\t\t\t\tmax = va_arg(ap, int);\n\t\t\t\tva_end(ap);\n\t\t\t\tif (*value >= min && *value <= max) {\n\t\t\t\t\t*value = val;\n\t\t\t\t\treturn CONFIG_TRUE;\n\t\t\t\t}\n\t\t\t\tbreak;\n\t\t\tcase CONFIG_LIST_CHECK:\n\t\t\t\tva_start(ap, validity_check);\n\t\t\t\tlist_length = va_arg(ap, int);\n\t\t\t\tfor (i=0; i<list_length; i++) {\n\t\t\t\t\tlistval = va_arg(ap, int);\n\t\t\t\t\tif (val == listval) {\n\t\t\t\t\t\t*value = val;\n\t\t\t\t\t\treturn CONFIG_TRUE;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tva_end(ap);\n\t\t\t\tbreak;\n\t\t}\n\t}\n\treturn CONFIG_FALSE;\n}\n\n\nint\n__cconfig_lookup_string(config_t *cfg, const char *name, char **value)\n{\n\tchar *val;\n\tint  found_val = 0;\n\n\tif (env_setting_lookup_string(name, &val) == CONFIG_TRUE) {\n\t\tfound_val = 1;\n\t} else {\t\n\t    if (config_lookup_string(cfg, name, (const char**) &val) == CONFIG_TRUE) {\n\t\t\tfound_val = 1;\n\t\t}\n\t}\n\n\tif (found_val)\t{\n\t\t*value = val;\n\t\treturn CONFIG_TRUE;\n\t}\n\treturn CONFIG_FALSE;\n}\n\n\nint\n__cconfig_lookup_valid_string(config_t *cfg, \n                              const char *name, \n                              char **value, \n                              int validity_check, ...)\n{\n\tint       list_length;\n\tint       i;\n\tchar      *val;\n\tva_list   ap;\n\n\tif (__cconfig_lookup_string(cfg, name, &val) == CONFIG_TRUE) {\n\t\tswitch (validity_check) {\n\t\t\tcase CONFIG_NO_CHECK:\n\t\t\t\t*value = val;\n\t\t\t\treturn CONFIG_TRUE;\n\t\t\tcase CONFIG_RANGE_CHECK:\n\t\t\t\tbreak;\n\t\t\tcase CONFIG_LIST_CHECK:\n\t\t\t\tva_start(ap, 
validity_check);\n\t\t\t\tlist_length = va_arg(ap, int);\n\t\t\t\tfor (i=0; i<list_length; i++) {\n\t\t\t\t\tif (strcmp(val, va_arg(ap, char *))==0) {\n\t\t\t\t\t\t*value = val;\n\t\t\t\t\t\treturn CONFIG_TRUE;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tva_end(ap);\n\t\t\t\tbreak;\n\t\t}\n\t}\n\treturn CONFIG_FALSE;\n}\n\n\nint \n__cconfig_init(config_t *cfg, const char *config_file)\n{\n    int ret;\n\tchar* env_config_file;\n\n\tif ((env_config_file = __getenv(ENVVAR_PREFIX, \"INI\"))) {\n\t\tconfig_file = env_config_file;\n\t}\n\t\n\tconfig_init(cfg);\n\tif ((ret = config_read_file(cfg, config_file)) == CONFIG_FALSE) {\n        fprintf(stderr, \"ERROR: nvmemul: Configuration file %s not found.\\n\", config_file);\n    }\n    return ret;\n}\n"
  },
  {
    "path": "src/lib/config.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CONFIG_H\n#define __CONFIG_H\n\n/**\n * \\file \n * \n * Runtime configuration parameters\n */\n\n\n#include <stdio.h>\n#include <libconfig.h>\n\n#define ENVVAR_PREFIX \"NVMEMUL\"\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/* Make sure we don't redefine a macro already defined in libconfig.h */\n\n#ifdef CONFIG_NO_CHECK\n# error \"ERROR: Redefining previously defined CONFIG_NO_CHECK\"\n#else\n# define CONFIG_NO_CHECK    0\n#endif\n\n#ifdef CONFIG_RANGE_CHECK\n# error \"ERROR: Redefining previously defined CONFIG_RANGE_CHECK\"\n#else\n# define CONFIG_RANGE_CHECK 1\n#endif\n\n#ifdef CONFIG_LIST_CHECK\n# error \"ERROR: Redefining previously defined CONFIG_LIST_CHECK\"\n#else\n# define CONFIG_LIST_CHECK  2\n#endif\n\n\n\n/** \n * The lookup functions return the value of a configuration variable based on \n * the following order: \n *  1) value of environment variable\n *  2) value in configuration file variable\n *  \n * If the variable is not found then a lookup function does not set the value.\n */\n\nint __cconfig_lookup_bool(config_t *cfg, const 
char *name, int *value);\nint __cconfig_lookup_int(config_t *cfg, const char *name, int *value);\nint __cconfig_lookup_string(config_t *cfg, const char *name, char **value);\nint __cconfig_lookup_valid_bool(config_t *cfg, const char *name, int *value, int validity_check, ...);\nint __cconfig_lookup_valid_int(config_t *cfg, const char *name, int *value, int validity_check, ...);\nint __cconfig_lookup_valid_string(config_t *cfg, const char *name, char **value, int validity_check, ...);\nint __cconfig_init(config_t *cfg, const char *config_file);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* __CONFIG_H */\n"
  },
  {
    "path": "src/lib/cpu/CMakeLists.txt",
    "content": "set(nvmemul_cpu_src\n    cpu.c\n    pmc.c\n)\n\nadd_library(cpu OBJECT ${nvmemul_cpu_src})\n"
  },
  {
    "path": "src/lib/cpu/cpu.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <stdio.h>\n#include <stdlib.h>\n#include <regex.h>\n#include <string.h>\n#include \"cpu.h\"\n#include \"dev.h\"\n#include \"error.h\"\n#include \"misc.h\"\n#include \"known_cpus.h\"\n#include \"xeon-ex.h\"\n#include <cpuid.h>\n\n// Mainline architectures and processors available here:\n// https://software.intel.com/en-us/articles/intel-architecture-and-processor-identification-with-cpuid-model-and-family-numbers\n//\n// It turns out that CPUID is not an accurate approach to identifying a\n// processor as different processors may have the same CPUID.\n// So instead we rely on the brand string returned by /proc/cpuinfo:model_name\n\n#define MASK(msb, lsb) (~((~0) << (msb + 1)) & ((~0) << lsb))\n#define EXTRACT(val, msb, lsb) ((MASK(msb, lsb) & val) >> lsb)\n#define MODEL(eax) EXTRACT(eax, 7, 4)\n#define EXTENDED_MODEL(eax) EXTRACT(eax, 19, 16)\n#define MODEL_NUMBER(eax) ((EXTENDED_MODEL(eax) << 4) | MODEL(eax))\n#define FAMILY(eax) EXTRACT(eax, 11, 8)\n#define Extended_Family(eax) EXTRACT(eax, 27, 20)\n#define Family_Number(eax) (FAMILY(eax) + 
Extended_Family(eax))\n\nvoid cpuid(unsigned int info, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)\n{\n    __asm__(\n        \"cpuid;\"\n        : \"=a\"(*eax), \"=b\"(*ebx), \"=c\"(*ecx), \"=d\"(*edx)\n        : \"a\"(info));\n}\n\nvoid get_family_model(int *family, int *model)\n{\n    unsigned int eax, ebx, ecx, edx;\n    int success = __get_cpuid(1, &eax, &ebx, &ecx, &edx);\n    if (family != NULL)\n    {\n        *family = success ? Family_Number(eax) : 0;\n    }\n\n    if (model != NULL)\n    {\n        *model = success ? MODEL_NUMBER(eax) : 0;\n    }\n}\n\n// caller is responsible for freeing memory allocated by this function\nchar *cpuinfo(char *valname)\n{\n    FILE *fp;\n    char *line = NULL;\n    size_t len = 0;\n    ssize_t read;\n\n    fp = fopen(\"/proc/cpuinfo\", \"r\");\n    if (fp == NULL)\n    {\n        return NULL;\n    }\n\n    while ((read = getline(&line, &len, fp)) != -1)\n    {\n        if (strstr(line, valname))\n        {\n            char *colon = strchr(line, ':');\n            int len = colon - line;\n            char *buf = malloc(strlen(line) - len);\n            strcpy(buf, &line[len + 2]);\n            free(line);\n            fclose(fp);\n            return buf;\n        }\n    }\n\n    free(line);\n    fclose(fp);\n    return NULL;\n}\n\n// reads current cpu frequency through the /proc/cpuinfo file\n// avoid calling this function often\nint cpu_speed_mhz()\n{\n    size_t val;\n    char *str = cpuinfo(\"cpu MHz\");\n    val = string_to_size(str);\n    free(str);\n    return val;\n}\n\n// reads cpu LLC cache size through the /proc/cpuinfo file\n// avoid calling this function often\nsize_t cpu_llc_size_bytes()\n{\n    size_t val;\n    char *str = cpuinfo(\"cache size\");\n    val = string_to_size(str);\n    free(str);\n    return val;\n}\n\n// caller is responsible for freeing memory allocated by this function\nchar *cpu_model_name()\n{\n    return cpuinfo(\"model name\");\n}\n\nint match(const char 
*to_match, const char *regex_text)\n{\n    int ret;\n    const char *p = to_match;\n    regex_t regex;\n    regmatch_t m[1];\n\n    if ((ret = regcomp(&regex, regex_text, REG_EXTENDED | REG_NEWLINE)) != 0)\n    {\n        return E_ERROR;\n    }\n    if ((ret = regexec(&regex, p, 1, m, 0)))\n    {\n        regfree(&regex);\n        return E_ERROR; // no match\n    }\n    regfree(&regex);\n    return E_SUCCESS;\n}\n\nint is_Xeon()\n{\n    char *model_name;\n    if ((model_name = cpu_model_name()) == NULL)\n    {\n        return 0;\n    }\n\n    if (match(model_name, \"Xeon\") == E_SUCCESS)\n    {\n        free(model_name);\n        return 1;\n    }\n    else\n    {\n        free(model_name);\n        return 0;\n    }\n}\n\nint is_Intel()\n{\n    char *model_name;\n    if ((model_name = cpu_model_name()) == NULL)\n    {\n        return 0;\n    }\n\n    if (match(model_name, \"Intel\") == E_SUCCESS)\n    {\n        free(model_name);\n        return 1;\n    }\n    else\n    {\n        free(model_name);\n        return 0;\n    }\n}\n\ncpu_model_t *cpu_model()\n{\n    int i, family, model;\n    cpu_model_t *cpu_model = NULL;\n\n    if (!is_Intel())\n        return NULL;\n\n    get_family_model(&family, &model);\n\n    int isXeon = is_Xeon();\n\n    for (i = 0; known_cpus[i].microarch != Invalid; i++)\n    {\n        microarch_ID_t c = known_cpus[i];\n\n        if (c.family == family && c.model == model)\n        {\n            switch (c.microarch)\n            {\n            case SandyBridge:\n                cpu_model = &cpu_model_intel_xeon_ex;\n                break;\n            case IvyBridge:\n                cpu_model = &cpu_model_intel_xeon_ex_v2;\n                break;\n            case Haswell:\n                cpu_model = &cpu_model_intel_xeon_ex_v3;\n                break;\n            default:\n                return NULL;\n            }\n\n            if (!isXeon)\n                cpu_model->microarch = (microarch_t)(cpu_model->microarch - 1);\n\n           
 DBG_LOG(INFO, \"Detected CPU model '%s'\\n\", microarch_strings[cpu_model->microarch]);\n            break;\n        }\n    }\n\n    if (!cpu_model)\n    {\n        return NULL;\n    }\n\n    // complete the model with some runtime information\n    cpu_model->llc_size_bytes = cpu_llc_size_bytes();\n    //    cpu_model->speed_mhz = cpu_speed_mhz();\n\n    return cpu_model;\n}\n"
  },
  {
    "path": "src/lib/cpu/cpu.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_H\n#define __CPU_H\n\n#include <stddef.h>\n#include <stdint.h>\n#include \"dev.h\"\n\n#define MAX_THROTTLE_VALUE 1023\n\nint set_throttle_register(int node, uint64_t val);\nsize_t cpu_llc_size_bytes();\n\nstruct pmc_set_s;\n\ntypedef enum {\n    THROTTLE_DDR_ACT = 0,\n    THROTTLE_DDR_READ,\n    THROTTLE_DDR_WRITE\n} throttle_type_t;\n\n// order matters. 
see cpu_model()\ntypedef enum {\n    Invalid,\n    SandyBridge,\n    SandyBridgeXeon,\n    IvyBridge,\n    IvyBridgeXeon,\n    Haswell,\n    HaswellXeon\n} microarch_t;\n\ntypedef struct\n{\n    int family;\n    int model;\n    microarch_t microarch;\n} microarch_ID_t;\n\n/**\n *  CPU object that encapsulates processor-specific methods for accessing\n *  performance counters and memory controller PCI registers\n */\ntypedef struct cpu_model_s {\n    microarch_t microarch; // processor description\n    size_t llc_size_bytes; // last level cache size\n//    int speed_mhz; // cpu clock frequency\n    struct pmc_events_s* pmc_events; // performance monitoring events supported by the processor\n    int (*set_throttle_register)(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t val);\n    int (*get_throttle_register)(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t* val);\n} cpu_model_t;\n\ncpu_model_t* cpu_model();\nint cpu_speed_mhz();\n\n#endif /* __CPU_H */\n"
  },
  {
    "path": "src/lib/cpu/haswell-papi.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_HASWELL_H\r\n#define __CPU_HASWELL_H\r\n\r\n#include <papi.h>\r\n#include \"debug.h\"\r\n\r\n// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with\r\n// applications to list all available performance events with their architecture specific\r\n// detailed description and translate them to their respective event code. 'showevtinfo' application can\r\n// be used to list all available performance event names with detailed description and 'check_events' application\r\n// can be used to translate the performance event to the corresponding event code.  
\r\n\r\n// These events will be initialized and started.\r\n// Every event reading will return an array with the values for all these events.\r\n// The array index is the same index used to define the event in the *_native_events array below\r\nconst char *haswell_native_events[MAX_NUM_EVENTS] = {\r\n    \"CYCLE_ACTIVITY:STALLS_L2_PENDING\",\r\n    \"MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE\",\r\n    \"MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM\",\r\n    \"MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM\"\r\n};\r\n\r\nuint64_t haswell_read_stall_events_local() {\r\n    long long values[MAX_NUM_EVENTS];\r\n    uint64_t events = 0;\r\n\r\n    if (pmc_events_read_local_thread(values) == PAPI_OK) {\r\n\t\tuint64_t l2_pending = values[0];\r\n\t\tuint64_t llc_hit  = values[1];\r\n\t\tuint64_t remote_dram = values[2];\r\n\t\tuint64_t local_dram  = values[3];\r\n\r\n\t\tDBG_LOG(DEBUG, \"read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\\n\",\r\n\t\t\tl2_pending, llc_hit, remote_dram, local_dram);\r\n\r\n\t\tdouble num = remote_dram + local_dram;\r\n\t\tdouble den = num + llc_hit;\r\n\t\tif (den == 0) return 0;\r\n\r\n\t\tevents = (uint64_t)((double)l2_pending * ((double)num / den));\r\n    } else {\r\n        DBG_LOG(ERROR, \"read stall cycles failed\\n\");\r\n    }\r\n\r\n    return events;\r\n}\r\n\r\nuint64_t haswell_read_stall_events_remote() {\r\n    long long values[MAX_NUM_EVENTS];\r\n    uint64_t events = 0;\r\n\r\n    if (pmc_events_read_local_thread(values) == PAPI_OK) {\r\n\t\tuint64_t l2_pending = values[0];\r\n\t\tuint64_t llc_hit  = values[1];\r\n\t\tuint64_t remote_dram = values[2];\r\n\t\tuint64_t local_dram  = values[3];\r\n\r\n\t\tDBG_LOG(DEBUG, \"read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\\n\",\r\n\t\t\tl2_pending, llc_hit, remote_dram, local_dram);\r\n\r\n\t\t// calculate stalls based on l2 stalls and LLC miss/hit\r\n\t\tdouble num = remote_dram + local_dram;\r\n\t\tdouble den = num + llc_hit;\r\n\t\tif (den == 0) 
return 0;\r\n\t\tdouble stalls = (double)l2_pending * ((double)num / den);\r\n\r\n\t\t// calculate remote dram stalls based on total stalls and local/remote dram accesses\r\n\t\tden = remote_dram + local_dram;\r\n\t\tif (den == 0) return 0;\r\n\t\tevents = (uint64_t) (stalls * ((double)remote_dram / den));\r\n    } else {\r\n        DBG_LOG(ERROR, \"read stall cycles failed\\n\");\r\n    }\r\n\r\n    return events;\r\n}\r\n\r\n#endif /* __CPU_HASWELL_H */\r\n"
  },
  {
    "path": "src/lib/cpu/haswell.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_HASWELL_H\n#define __CPU_HASWELL_H\n\n#include <math.h>\n#include \"thread.h\"\n#include \"cpu/pmc.h\"\n#include \"debug.h\"\n\n// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with\n// applications to list all available performance events with their architecture specific\n// detailed description and translate them to their respective event code. 'showevtinfo' application can\n// be used to list all available performance event names with detailed description and 'check_events' application\n// can be used to translate the performance event to the corresponding event code.  
\n\nextern __thread int tls_hw_local_latency;\nextern __thread int tls_hw_remote_latency;\n#ifdef MEMLAT_SUPPORT\nextern __thread uint64_t tls_global_remote_dram;\nextern __thread uint64_t tls_global_local_dram;\n#endif\n\n#undef FOREACH_PMC_HW_EVENT\n#define FOREACH_PMC_HW_EVENT(ACTION)                                                                       \\\n  ACTION(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", NULL, 0x55305a3)                                              \\\n  ACTION(\"MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE\", NULL, 0x5308d2)                                        \\\n  ACTION(\"MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM\", NULL, 0x530cd3)                                     \\\n  ACTION(\"MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM\", NULL, 0x5303d3)\n\n#undef FOREACH_PMC_EVENT\n#define FOREACH_PMC_EVENT(ACTION, prefix)                                                                  \\\n  ACTION(ldm_stall_cycles, prefix)                                                                         \\\n  ACTION(remote_dram, prefix)\n\n#define L3_FACTOR 7.0\n\nDECLARE_ENABLE_PMC(haswell, ldm_stall_cycles)\n{\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", 0);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE\", 1);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM\", 2);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM\", 3);\n\n    return E_SUCCESS;\n}\n\nDECLARE_CLEAR_PMC(haswell, ldm_stall_cycles)\n{\n}\n\nDECLARE_READ_PMC(haswell, ldm_stall_cycles)\n{\n   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);\n   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);\n   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);\n   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);\n\n   DBG_LOG(DEBUG, \"read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\\n\",\n\t\t   l2_pending_diff, llc_hit_diff, remote_dram_diff, 
local_dram_diff);\n\n   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;\n#ifdef MEMLAT_SUPPORT\n   tls_global_local_dram += local_dram_diff;\n#endif\n\n   // calculate stalls based on L2 stalls and LLC miss/hit\n   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);\n   double den = num + llc_hit_diff;\n   if (den == 0) return 0;\n   return (uint64_t) ((double)l2_pending_diff * (num / den));\n}\n\n\nDECLARE_ENABLE_PMC(haswell, remote_dram)\n{\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", 0);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE\", 1);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM\", 2);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM\", 3);\n\n    return E_SUCCESS;\n}\n\nDECLARE_CLEAR_PMC(haswell, remote_dram)\n{\n}\n\nDECLARE_READ_PMC(haswell, remote_dram)\n{\n   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);\n   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);\n   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);\n   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);\n\n   DBG_LOG(DEBUG, \"read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\\n\",\n\t\t   l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);\n\n   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;\n#ifdef MEMLAT_SUPPORT\n   tls_global_remote_dram += remote_dram_diff;\n#endif\n\n   // calculate stalls based on L2 stalls and LLC miss/hit\n   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);\n   double den = num + llc_hit_diff;\n   if (den == 0) return 0;\n   double stalls = (double)l2_pending_diff * (num / den);\n\n   // calculate remote dram stalls based on total stalls and local/remote dram accesses\n   // also consider the weight of remote memory access against local memory access\n   den = (remote_dram_diff * tls_hw_remote_latency) + (local_dram_diff * 
tls_hw_local_latency);\n   if (den == 0) return 0;\n   return (uint64_t) (stalls * ((double)(remote_dram_diff * tls_hw_remote_latency) / den));\n}\n\n\nPMC_EVENTS(haswell, 4)\n#endif /* __CPU_HASWELL_H */\n"
  },
  {
    "path": "src/lib/cpu/ivybridge-papi.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_IVYBRIDGE_H\r\n#define __CPU_IVYBRIDGE_H\r\n\r\n#include <papi.h>\r\n#include \"debug.h\"\r\n\r\n// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with\r\n// applications to list all available performance events with their architecture specific\r\n// detailed description and translate them to their respective event code. 'showevtinfo' application can\r\n// be used to list all available performance event names with detailed description and 'check_events' application\r\n// can be used to translate the performance event to the corresponding event code.  
\r\n\r\n// These events will be initialized and started.\r\n// Every event reading will return an array with the values for all these events.\r\n// The array index is the same index used to define the event in the *_native_events array below\r\nconst char *ivybridge_native_events[MAX_NUM_EVENTS] = {\r\n    \"CYCLE_ACTIVITY:STALLS_L2_PENDING\",\r\n    \"MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE\",\r\n    \"MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM\",\r\n    \"MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM\"\r\n};\r\n\r\nuint64_t ivybridge_read_stall_events_local() {\r\n    long long values[MAX_NUM_EVENTS];\r\n    uint64_t events = 0;\r\n\r\n    if (pmc_events_read_local_thread(values) == PAPI_OK) {\r\n\t\tuint64_t l2_pending = values[0];\r\n\t\tuint64_t llc_hit  = values[1];\r\n\t\tuint64_t remote_dram = values[2];\r\n\t\tuint64_t local_dram  = values[3];\r\n\r\n\t\tDBG_LOG(DEBUG, \"read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\\n\",\r\n\t\t\tl2_pending, llc_hit, remote_dram, local_dram);\r\n\r\n\t\tdouble num = remote_dram + local_dram;\r\n\t\tdouble den = num + llc_hit;\r\n\t\tif (den == 0) return 0;\r\n\r\n\t\tevents = (uint64_t)((double)l2_pending * ((double)num / den));\r\n    } else {\r\n        DBG_LOG(ERROR, \"read stall cycles failed\\n\");\r\n    }\r\n\r\n    return events;\r\n}\r\n\r\nuint64_t ivybridge_read_stall_events_remote() {\r\n    long long values[MAX_NUM_EVENTS];\r\n    uint64_t events = 0;\r\n\r\n    if (pmc_events_read_local_thread(values) == PAPI_OK) {\r\n\t\tuint64_t l2_pending = values[0];\r\n\t\tuint64_t llc_hit  = values[1];\r\n\t\tuint64_t remote_dram = values[2];\r\n\t\tuint64_t local_dram  = values[3];\r\n\r\n\t\tDBG_LOG(DEBUG, \"read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\\n\",\r\n\t\t\tl2_pending, llc_hit, remote_dram, local_dram);\r\n\r\n\t\t// calculate stalls based on l2 stalls and LLC miss/hit\r\n\t\tdouble num = remote_dram + local_dram;\r\n\t\tdouble den = num + llc_hit;\r\n\t\tif 
(den == 0) return 0;\r\n\t\tdouble stalls = (double)l2_pending * ((double)num / den);\r\n\r\n\t\t// calculate remote dram stalls based on total stalls and local/remote dram accesses\r\n\t\tden = remote_dram + local_dram;\r\n\t\tif (den == 0) return 0;\r\n\t\tevents = (uint64_t) (stalls * ((double)remote_dram / den));\r\n    } else {\r\n        DBG_LOG(ERROR, \"read stall cycles failed\\n\");\r\n    }\r\n\r\n    return events;\r\n}\r\n\r\n#endif /* __CPU_IVYBRIDGE_H */\r\n"
  },
  {
    "path": "src/lib/cpu/ivybridge.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_IVYBRIDGE_H\n#define __CPU_IVYBRIDGE_H\n\n#include <math.h>\n#include \"thread.h\"\n#include \"cpu/pmc.h\"\n#include \"debug.h\"\n\n// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with\n// applications to list all available performance events with their architecture specific\n// detailed description and translate them to their respective event code. 'showevtinfo' application can\n// be used to list all available performance event names with detailed description and 'check_events' application\n// can be used to translate the performance event to the corresponding event code.  
\n\nextern __thread int tls_hw_local_latency;\nextern __thread int tls_hw_remote_latency;\n#ifdef MEMLAT_SUPPORT\nextern __thread uint64_t tls_global_remote_dram;\nextern __thread uint64_t tls_global_local_dram;\n#endif\n\n#undef FOREACH_PMC_HW_EVENT\n#define FOREACH_PMC_HW_EVENT(ACTION)                                                                       \\\n  ACTION(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", NULL, 0x55305a3)                                              \\\n  ACTION(\"MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE\", NULL, 0x5308d2)                                        \\\n  ACTION(\"MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM\", NULL, 0x530cd3)                                     \\\n  ACTION(\"MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM\", NULL, 0x5303d3)\n\n#undef FOREACH_PMC_EVENT\n#define FOREACH_PMC_EVENT(ACTION, prefix)                                                                  \\\n  ACTION(ldm_stall_cycles, prefix)                                                                         \\\n  ACTION(remote_dram, prefix)\n\n\n#define L3_FACTOR 7.0\n\nDECLARE_ENABLE_PMC(ivybridge, ldm_stall_cycles)\n{\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", 0);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE\", 1);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM\", 2);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM\", 3);\n\n    return E_SUCCESS;\n}\n\nDECLARE_CLEAR_PMC(ivybridge, ldm_stall_cycles)\n{\n}\n\nDECLARE_READ_PMC(ivybridge, ldm_stall_cycles)\n{\n   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);\n   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);\n   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);\n   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);\n\n   DBG_LOG(DEBUG, \"read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\\n\",\n\t\t   l2_pending_diff, llc_hit_diff, remote_dram_diff, 
local_dram_diff);\n\n   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;\n#ifdef MEMLAT_SUPPORT\n   tls_global_local_dram += local_dram_diff;\n#endif\n\n   // calculate stalls based on L2 stalls and LLC miss/hit\n   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);\n   double den = num + llc_hit_diff;\n   if (den == 0) return 0;\n   return (uint64_t) ((double)l2_pending_diff * (num / den));\n}\n\n\nDECLARE_ENABLE_PMC(ivybridge, remote_dram)\n{\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", 0);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE\", 1);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM\", 2);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM\", 3);\n\n    return E_SUCCESS;\n}\n\nDECLARE_CLEAR_PMC(ivybridge, remote_dram)\n{\n}\n\nDECLARE_READ_PMC(ivybridge, remote_dram)\n{\n   uint64_t l2_pending_diff  = READ_MY_HW_EVENT_DIFF(0);\n   uint64_t llc_hit_diff     = READ_MY_HW_EVENT_DIFF(1);\n   uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);\n   uint64_t local_dram_diff  = READ_MY_HW_EVENT_DIFF(3);\n\n   DBG_LOG(DEBUG, \"read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\\n\",\n\t\t   l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);\n\n   if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;\n#ifdef MEMLAT_SUPPORT\n   tls_global_remote_dram += remote_dram_diff;\n#endif\n\n   // calculate stalls based on L2 stalls and LLC miss/hit\n   double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);\n   double den = num + llc_hit_diff;\n   if (den == 0) return 0;\n   double stalls = (double)l2_pending_diff * (num / den);\n\n   // calculate remote dram stalls based on total stalls and local/remote dram accesses\n   // also consider the weight of remote memory access against local memory access\n   den = (remote_dram_diff * tls_hw_remote_latency) + (local_dram_diff * 
tls_hw_local_latency);\n   if (den == 0) return 0;\n   return (uint64_t) (stalls * ((double)(remote_dram_diff * tls_hw_remote_latency) / den));\n}\n\n\nPMC_EVENTS(ivybridge, 4)\n#endif /* __CPU_IVYBRIDGE_H */\n"
  },
  {
    "path": "src/lib/cpu/known_cpus.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __KNOWN_CPUS_H\n#define __KNOWN_CPUS_H\n\n#include \"cpu.h\"\n\n// later, cpu_model_name() is used to distinguish between\n// Xeon and non-Xeon processors. 
It's much easier here\n// to consider all processors non-Xeon.\n// references:\n// 1- http://a4lg.com/tech/x86/database/x86-families-and-models.en.html\n// 2- Intel® Xeon® Processor E7-8800/4800 v3 Product Family Specification\n// 3- https://software.intel.com/en-us/articles/intel-architecture-and-processor-identification-with-cpuid-model-and-family-numbers\nmicroarch_ID_t known_cpus[] =\n    {\n        // order does not matter\n        {.family = 0x06, .model = 0x2A, .microarch = SandyBridge},\n        {.family = 0x06, .model = 0x2D, .microarch = SandyBridge},\n\n        {.family = 0x06, .model = 0x3A, .microarch = IvyBridge},\n        {.family = 0x06, .model = 0x3E, .microarch = IvyBridge},\n\n        {.family = 0x06, .model = 0x3C, .microarch = Haswell},\n        {.family = 0x06, .model = 0x3F, .microarch = Haswell},\n        {.family = 0x06, .model = 0x45, .microarch = Haswell},\n        {.family = 0x06, .model = 0x46, .microarch = Haswell},\n\n        // must be the last element\n        {.family = 0x0, .model = 0x0, .microarch = Invalid}};\n\n// order must correspond to microarch_t\nchar *microarch_strings[] =\n    {\n        \"Invalid\",\n        \"Sandy Bridge\",\n        \"Sandy Bridge Xeon\",\n        \"Ivy Bridge\",\n        \"Ivy Bridge Xeon\",\n        \"Haswell\",\n        \"Haswell Xeon\"};\n\n#endif /* __KNOWN_CPUS_H */\n"
  },
  {
    "path": "src/lib/cpu/pmc-papi.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <papi.h>\n#include <pthread.h>\n#include <sys/syscall.h>\n#include \"cpu/pmc-papi.h\"\n#include \"debug.h\"\n\n__thread int tls_event_set = PAPI_NULL;\n\n#define STR_MAX_SIZE 256\n\nstatic void log_papi_critical(int ret_val, const char *msg) {\n\t//char papi_str[STR_MAX_SIZE];\n\t//PAPI_perror(ret_val, (char *)papi_str, sizeof(papi_str));\n    DBG_LOG(CRITICAL, \"%s (%s)\\n\", msg, PAPI_strerror(ret_val));\n}\n\nint pmc_init() {\n\tint ret_val;\n\n    if ((ret_val = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {\n        log_papi_critical(ret_val, \"PMC library init error\");\n        return -1;\n    }\n\n    if ((ret_val = PAPI_thread_init(pthread_self)) != PAPI_OK) {\n        log_papi_critical(ret_val, \"PMC thread support init error\");\n        return -1;\n    }\n\n//    if ((ret_val = PAPI_set_domain(PAPI_DOM_ALL)) != PAPI_OK) {\n//        log_papi_critical(ret_val, \"PMC set domain error\");\n//        return -1;\n//    }\n\n    return 0;\n}\n\nvoid pmc_shutdown() {\n    PAPI_shutdown();\n}\n\nint pmc_create_event_set_local_thread() 
{\n\tint ret_val;\n\n    if ((ret_val = PAPI_create_eventset(&tls_event_set)) != PAPI_OK) {\n        log_papi_critical(ret_val, \"PMC event set init error\");\n        return -1;\n    }\n\n//    if ((ret_val = PAPI_set_granularity(PAPI_GRN_SYS)) != PAPI_OK) {\n//        log_papi_critical(ret_val, \"PMC set granularity error\");\n//        return -1;\n//    }\n\n    return 0;\n}\n\nvoid pmc_destroy_event_set_local_thread() {\n    PAPI_cleanup_eventset(tls_event_set);\n    PAPI_destroy_eventset(&tls_event_set);\n}\n\nint pmc_register_thread() {\n\treturn PAPI_register_thread();\n}\n\nint pmc_unregister_thread() {\n\treturn PAPI_unregister_thread();\n}\n\nint pmc_register_event_local_thread(const char *event_name) {\n    int ret_val;\n    char msg[STR_MAX_SIZE];\n\n    // The pthread scope for each thread should be set to PTHREAD_SCOPE_SYSTEM.\n    // On linux, pthread supports only PTHREAD_SCOPE_SYSTEM.\n\n    assert(tls_event_set != PAPI_NULL);\n    assert(event_name);\n\n    if ((ret_val = PAPI_add_named_event(tls_event_set, (char *)event_name)) != PAPI_OK) {\n    \tsnprintf(msg, sizeof(msg), \"PMC event (%s) register error\", event_name);\n    \tlog_papi_critical(ret_val, msg);\n        return -1;\n    }\n\n    return 0;\n}\n\nint pmc_events_start_local_thread() {\n    int ret_val;\n\n    assert(tls_event_set != PAPI_NULL);\n\n    if ((ret_val = PAPI_start(tls_event_set)) != PAPI_OK) {\n    \tlog_papi_critical(ret_val, \"PMC events start error\");\n        return -1;\n    }\n\n    return 0;\n}\n\nvoid pmc_events_stop_local_thread() {\n\tlong long values[MAX_NUM_EVENTS];\n\n\tassert(tls_event_set != PAPI_NULL);\n\n    PAPI_stop(tls_event_set, values);\n}\n\nint pmc_events_read_local_thread(long long *values) {\n    int ret_val;\n//    int status = 0;\n\n    assert(values);\n\n//    PAPI_state(event_set, &status);\n//    if (status != PAPI_RUNNING) {\n//        DBG_LOG(CRITICAL, \"PMC event set not in running state\");\n//        return -1;\n//    }\n\n    if 
((ret_val = PAPI_read(tls_event_set, values)) != PAPI_OK) {\n    \tlog_papi_critical(ret_val, \"PMC events read error\");\n        return -1;\n    }\n\n    if ((ret_val = PAPI_reset(tls_event_set)) != PAPI_OK) {\n        log_papi_critical(ret_val, \"PMC events reset error\");\n        return -1;\n    }\n\n    return 0;\n}\n"
  },
  {
    "path": "src/lib/cpu/pmc-papi.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_PMC_H\r\n#define __CPU_PMC_H\r\n\r\n#include <stdint.h>\r\n\r\n\r\n// Usually the architectures support up to 4 counters enabled at the same\r\n// time per core when HT is enabled\r\n#define MAX_NUM_EVENTS 4\r\n\r\ntypedef uint64_t (*read_stalls_t)(void);\r\n\r\ntypedef struct {\r\n\tconst char **native_events;\r\n\tread_stalls_t read_stalls_events_local;\r\n\tread_stalls_t read_stalls_events_remote;\r\n} pmc_event_t;\r\n\r\nint pmc_init();\r\nvoid pmc_shutdown();\r\nint pmc_create_event_set_local_thread();\r\nvoid pmc_destroy_event_set_local_thread();\r\nint pmc_register_event_local_thread(const char *event_name);\r\nint pmc_events_start_local_thread();\r\nvoid pmc_events_stop_local_thread();\r\nint pmc_events_read_local_thread(long long *values);\r\n\r\nint pmc_register_thread();\r\nint pmc_unregister_thread();\r\n\r\n#endif /* __CPU_PMC_H */\r\n"
  },
  {
    "path": "src/lib/cpu/pmc.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <stdlib.h>\n#include \"cpu/pmc.h\"\n#include \"dev.h\"\n#include \"error.h\"\n#include \"thread.h\"\n#include \"topology.h\"\n\n#pragma GCC push_options\n#pragma GCC optimize (\"O0\")\n\n// The width of general purpose counters are 40bits.\n// https://www.felixcloutier.com/x86/RDPMC.html\n#define RDPMC_MAX_VALUE 0xFFFFFFFFFF  \n\nlong long rdpmc(int counter) \n{\n\n\tunsigned eax;\n\tunsigned edx;\n\tunsigned long long r;\n\n\t__asm__ __volatile__ (\"mov %2, %%ecx\\n\\t\"\n\t                      \"rdpmc\\n\\t\"\n\t                      \"mov %%eax, %0\\n\\t\"\n\t                      \"and $255, %%edx\\n\\t\"\n\t                      \"mov %%edx, %1\\n\\t\"\n\t                      : \"=m\" (eax), \"=m\" (edx), \"=m\" (counter)\n\t                      : /* no inputs */\n\t                      : \"eax\", \"ecx\", \"edx\"); /* eax, ecx, edx clobbered */\n\t                      r = ((unsigned long long) edx << 32) | eax;\n\treturn r;\n\n}\n\nint rdpmc32(int counter) {\n\n\tunsigned eax;\n\t\n\t__asm__ __volatile__ (\"mov %1, %%ecx\\n\\t\"\n\t           
           \"rdpmc\\n\\t\"\n\t                      \"mov %%eax, %0\\n\\t\"\n\t                      : \"=m\" (eax), \"=m\" (counter)\n\t                      : /* no inputs */\n\t                      : \"eax\", \"ecx\", \"edx\"); /* eax, ecx, edx clobbered */\n\treturn eax;\n\n}\n#pragma GCC pop_options\n\n\n/*int num_used_hw_cntrs(pmc_events_t* events)\n{\n    int i;\n    int used;\n    pmc_hw_event_t* event = 0;\n\n     // check if this a known registered hardware event\n    for (i=0, used=0; events->known_hw_events[i].name; i++) {\n        event = &events->known_hw_events[i];\n        used += event->active ? 0 : 1;\n    }\n    return used;    \n}*/\n\nint get_avail_hw_cntr_id(pmc_events_t* events)\n{\n    int i;\n    int used;\n    pmc_hw_event_t* event = 0;\n    int status = -1;\n\n    int* hw_cntr_id_status = calloc(events->num_avail_hw_cntrs, sizeof(int));\n    \n    for (i=0, used=0; events->known_hw_events[i].name; i++) {\n        event = &events->known_hw_events[i];\n        if (event->active) {\n            used++;\n            hw_cntr_id_status[event->hw_cntr_id] = 1;\n        }\n    }\n    \n    if (used == events->num_avail_hw_cntrs) {\n        goto done;\n    }\n\n    for (i=0; events->num_avail_hw_cntrs; i++) {\n        if (hw_cntr_id_status[i] == 0) {\n            status = i;\n            goto done;\n        }\n    }\n\ndone:\n\tfree(hw_cntr_id_status);\n\treturn status;\n}\n\npmc_hw_event_t* enable_pmc_hw_event(pmc_events_t* events, const char* name)\n{\n    int i;\n    pmc_hw_event_t* event = 0;\n    int found = 0;\n\n     // check if this a known registered hardware event\n    for (i=0; events->known_hw_events[i].name; i++) {\n        event = &events->known_hw_events[i];\n        if (strcasecmp(event->name, name) == 0) {\n        \tfound = 1;\n            if (event->active) {\n                return event;\n            }\n            break;\n        }\n    }\n\n    if (!found) {\n        DBG_LOG(WARNING, \"Unknown hardware performance 
monitoring event\\n\");\n        return NULL;\n    }\n\n    // enable it \n    // need to find an available performance counter to monitor this event\n    if ((event->hw_cntr_id = get_avail_hw_cntr_id(events)) < 0) {\n        DBG_LOG(ERROR, \"No available hardware performance counters\\n\");\n        return NULL;\n    }\n\n    // assign an array to keep per processor last read values (useful to calculate the diff since the last read)\n    int num_cpus = system_num_cpus();\n    if (!event->last_val) {\n        event->last_val = calloc(num_cpus, sizeof(*event->last_val));\n    }\n    for (i=0; i<num_cpus; i++) {\n        event->last_val[i] = 0;\n    }\n    // call into the kernel driver to enable the counter on all processors\n    if (set_counter(event->hw_cntr_id, event->encoding) != E_SUCCESS) {\n    \tDBG_LOG(ERROR, \"Can't enable counter on all processors\\n\");\n    \treturn NULL;\n    }\n\n    event->active = 1;\n    return event;\n}\n\nvoid disable_pmc_hw_event(pmc_events_t* events, const char* name)\n{\n    int i;\n    pmc_hw_event_t* event = 0;\n    int found = 0;\n\n    // check if this a known registered hardware event\n    for (i=0; events->known_hw_events[i].name; i++) {\n        event = &events->known_hw_events[i];\n        if (strcasecmp(event->name, name) == 0) {\n        \tfound = 1;\n            if (!event->active) {\n                return;\n            }\n            break;\n        }\n    }\n\n    if (!found) {\n        DBG_LOG(WARNING, \"Unknown hardware performance monitoring event\\n\");\n        return;\n    }\n\n    event->active = 0;\n}\n\nvoid clear_pmc_hw_event(pmc_hw_event_t* event)\n{\n    DBG_LOG(CRITICAL, \"Unimplemented functionality\\n\");\n}\n\nuint64_t read_pmc_hw_event_cur(pmc_hw_event_t* event)\n{\n    return rdpmc(event->hw_cntr_id);\n}\n\nuint64_t read_pmc_hw_event_diff(pmc_hw_event_t* event)\n{\n    int cpu_id = thread_self()->cpu_id;\n    uint64_t cur_val = read_pmc_hw_event_cur(event);\n    uint64_t last_val = 
event->last_val[cpu_id];\n    //if (cur_val < last_val && (event->hw_cntr_id == 0)) {\n    if (cur_val < last_val) {\n        event->last_val[cpu_id] = cur_val;\n        return (cur_val + (RDPMC_MAX_VALUE - last_val));\n    }\n    event->last_val[cpu_id] = cur_val;\n    return cur_val - last_val;\n}\n\n\npmc_event_t* enable_pmc_event(cpu_model_t* cpu, const char* name) \n{\n    int i;\n    pmc_event_t* event = 0;\n    int found = 0;\n\n    // check if this a known registered event\n    for (i=0; cpu->pmc_events->known_events[i].name; i++) {\n        event = &cpu->pmc_events->known_events[i];\n        if (strcasecmp(event->name, name) == 0) {\n        \tfound = 1;\n            if (event->active) {\n                return event;\n            }\n            break;\n        }\n    }\n\n    if (!found) {\n    \treturn NULL;\n    }\n\n    // enable it \n    event->hw_events = NULL;\n    event->num_hw_events = 0;\n    if (event->enable(cpu->pmc_events, event) != E_SUCCESS) {\n        assert(0 && \"DIE\");\n        return NULL;\n    }\n    event->active = 1;\n    return event;\n}\n\nint assign_pmc_hw_event_to_event(pmc_events_t* events, const char* name, pmc_event_t* event, int local_id)\n{\n    pmc_hw_event_t* hw_event;\n\n    if (!(hw_event = enable_pmc_hw_event(events, name))) {\n        return E_ERROR;\n    }\n    if (local_id != event->num_hw_events) {\n        DBG_LOG(CRITICAL, \"local_id does not match assign id\\n\")\n        // TODO: application should abort here, look for all DBG_LOG(CRITICAL)\n    }\n\n    event->hw_events = realloc(event->hw_events, (event->num_hw_events+1) * sizeof(*event->hw_events));\n    event->hw_events[event->num_hw_events] = hw_event;\n    event->num_hw_events++; \n    return E_SUCCESS;\n}\n\nvoid release_all_pmc_hw_events_of_event(pmc_event_t* event)\n{\n    int i;\n    if (event->num_hw_events > 0) {\n        for (i=0; i<event->num_hw_events; i++) {\n            event->hw_events[i]->active = 0;\n        }\n        
free(event->hw_events);\n        event->hw_events = NULL;\n        event->num_hw_events = 0;\n    }\n}\n\nvoid disable_pmc_event(cpu_model_t* cpu, const char* name) \n{\n    int i;\n    pmc_event_t* event;\n\n    for (i=0; cpu->pmc_events->known_events[i].name; i++) {\n        event = &cpu->pmc_events->known_events[i];\n        if (strcasecmp(event->name, name) == 0 && event->active) {\n            event->active = 0;\n        }\n    }\n}\n"
  },
  {
    "path": "src/lib/cpu/pmc.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_PMC_H\n#define __CPU_PMC_H\n\n#include \"cpu/cpu.h\"\n\n#define DECLARE_ENABLE_PMC(prefix, name) int prefix##_create_pmc_##name(struct pmc_events_s* events, struct pmc_event_s* event)\n#define DECLARE_CLEAR_PMC(prefix, name) void prefix##_clear_pmc_##name(struct pmc_event_s* event)\n#define DECLARE_READ_PMC(prefix, name) uint64_t prefix##_read_pmc_##name(struct pmc_event_s* event)\n#define ENABLE_PMC_FNAME(prefix, name) prefix##_create_pmc_##name\n#define CLEAR_PMC_FNAME(prefix, name) prefix##_clear_pmc_##name\n#define READ_PMC_FNAME(prefix, name) prefix##_read_pmc_##name\n\n#define PMC_HW_EVENT(name, os_name, encoding)  { name, os_name, encoding, 0, 0},\n#define PMC_EVENT(name, prefix)  { #name, NULL, 0, 0, ENABLE_PMC_FNAME(prefix, name), CLEAR_PMC_FNAME(prefix, name), READ_PMC_FNAME(prefix, name)},\n\n#define PMC_EVENTS_PTR(prefix) &prefix##_pmc_events\n\n#define PMC_EVENTS(prefix, num_hw_cntrs)          \\\n  pmc_hw_event_t prefix##_known_hw_event[] = {    \\\n    FOREACH_PMC_HW_EVENT(PMC_HW_EVENT)            \\\n    {NULL, NULL, 0, 0, 0}        
                 \\\n  };                                              \\\n  pmc_event_t prefix##_known_event[] = {          \\\n    FOREACH_PMC_EVENT(PMC_EVENT, prefix)          \\\n    {NULL, NULL, 0, 0, NULL, NULL, NULL}          \\\n  };                                              \\\n  pmc_events_t prefix##_pmc_events = {            \\\n    num_hw_cntrs,                                 \\\n    prefix##_known_hw_event,                      \\\n    prefix##_known_event                          \\\n  };\n\n#define ASSIGN_PMC_HW_EVENT_TO_ME(name, local_id)                                   \\\n  if (assign_pmc_hw_event_to_event(events, name, event, local_id) != E_SUCCESS) {   \\\n    release_all_pmc_hw_events_of_event(event);                                      \\\n  }\n\n#define READ_MY_HW_EVENT_DIFF(local_id) read_pmc_hw_event_diff(event->hw_events[local_id])\n#define READ_MY_HW_EVENT_CUR(local_id) read_pmc_hw_event_cur(event->hw_events[local_id])\n\ntypedef struct {\n    char* name;\n    char* os_name; // perf name if known\n    uint64_t encoding;\n    int active;\n    int hw_cntr_id;\n    uint64_t* last_val; // array holding the last read values per processor (useful to calculate the diff since the last read)\n} pmc_hw_event_t;\n\ntypedef struct pmc_event_s {\n    const char* name;\n    pmc_hw_event_t** hw_events;\n    int num_hw_events;\n    int active;\n    int (*enable)(struct pmc_events_s* events, struct pmc_event_s* event);\n    void (*clear)(struct pmc_event_s* event);\n    uint64_t (*read)(struct pmc_event_s* event);\n} pmc_event_t;\n\ntypedef struct pmc_events_s {\n    int num_avail_hw_cntrs; \n    pmc_hw_event_t* known_hw_events;\n    pmc_event_t* known_events;\n} pmc_events_t;\n\npmc_hw_event_t* enable_pmc_hw_event(pmc_events_t* events, const char* name);\nvoid disable_pmc_hw_event(pmc_events_t* events, const char* name);\nvoid clear_pmc_hw_event(pmc_hw_event_t* event);\nuint64_t read_pmc_hw_event_cur(pmc_hw_event_t* event);\nuint64_t 
read_pmc_hw_event_diff(pmc_hw_event_t* event);\nint assign_pmc_hw_event_to_event(pmc_events_t* events, const char* name, pmc_event_t* event, int local_id);\nvoid release_all_pmc_hw_events_of_event(pmc_event_t* event);\n\npmc_event_t* enable_pmc_event(cpu_model_t* cpu, const char* name);\nvoid disable_pmc_event(cpu_model_t* cpu, const char* name);\n\nstatic inline void clear_pmc_event(pmc_event_t* event)\n{\n    event->clear(event);\n}\n\n//#include \"debug.h\"\n\nstatic inline uint64_t read_pmc_event(pmc_event_t* event)\n{\n    uint64_t ret;\n    ret = event->read(event);\n    return ret;\n}\n\n#endif /* __CPU_PMC_H */\n"
  },
  {
    "path": "src/lib/cpu/sandybridge-papi.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_SANDYBRIDGE_H\n#define __CPU_SANDYBRIDGE_H\n\n#include <papi.h>\n#include <math.h>\n#include \"debug.h\"\n\n// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with\n// applications to list all available performance events with their architecture specific \n// detailed description and translate them to their respective event code. showevtinfo application can \n// be used to list all available performance event names with detailed description and check_events application\n// can be used to translate the performance event to the corresponding event code.  
\n\n// These events will be initialized and started.\n// Every event reading will return an array with the values for all these events.\n// The array index is the same index used to define the event in the *_native_events array below\nconst char *sandybridge_native_events[MAX_NUM_EVENTS] = {\n    \"CYCLE_ACTIVITY:STALLS_L2_PENDING\",\n    \"MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS\",\n    \"MEM_LOAD_UOPS_RETIRED:L3_HIT\",\n    NULL\n};\n\n\nvoid sandybridge_latency_calibration_local(int *hw_latency, int target_latency) {\n\tif ((*hw_latency + 10) < target_latency)\n\t\t*hw_latency += 10;\n}\n\nvoid sandybridge_latency_calibration_remote(int *hw_latency, int target_latency) {\n\tif ((*hw_latency + 30) < target_latency)\n\t\t*hw_latency += 30;\n}\n\nuint64_t sandybridge_read_stall_events_local() {\n    long long values[MAX_NUM_EVENTS];\n    uint64_t events = 0;\n\n    if (pmc_events_read_local_thread(values) == PAPI_OK) {\n        uint64_t cycle_activity_stalls_l2_pending_diff = values[0];\n        uint64_t mem_load_uops_misc_retired_llc_miss_diff = values[1];\n        uint64_t mem_load_uops_retired_l3_hit_diff = values[2];\n\n        DBG_LOG(DEBUG, \"read stall L2 cycles %lu, LLC miss %lu, L3 hit %lu\\n\",\n        \t\tcycle_activity_stalls_l2_pending_diff, mem_load_uops_misc_retired_llc_miss_diff,\n        \t\tmem_load_uops_retired_l3_hit_diff);\n\n    \tuint64_t uden = 7.0 * mem_load_uops_misc_retired_llc_miss_diff + mem_load_uops_retired_l3_hit_diff;\n        if (uden == 0) {\n            return 0;\n        }\n        double den = uden;\n        double num = 7.0 * mem_load_uops_misc_retired_llc_miss_diff;\n\n        events = (uint64_t) floorl(cycle_activity_stalls_l2_pending_diff*num/den);\n    } else {\n        DBG_LOG(DEBUG, \"read stall cycles failed\\n\");\n    }\n\n    return events;\n}\n\n#endif /* __CPU_SANDYBRIDGE_H */\n"
  },
  {
    "path": "src/lib/cpu/sandybridge.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __CPU_SANDYBRIDGE_H\n#define __CPU_SANDYBRIDGE_H\n\n#include <math.h>\n#include \"thread.h\"\n#include \"cpu/pmc.h\"\n#include \"debug.h\"\n\n// Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with\n// applications to list all available performance events with their architecture specific \n// detailed description and translate them to their respective event code. showevtinfo application can \n// be used to list all available performance event names with detailed description and check_events application\n// can be used to translate the performance event to the corresponding event code.  
\n\n#undef FOREACH_PMC_HW_EVENT\n#define FOREACH_PMC_HW_EVENT(ACTION)                                                                       \\\n  ACTION(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", NULL, 0x55305a3)                                              \\\n  ACTION(\"MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS\", NULL, 0x5302d4)                                            \\\n  ACTION(\"MEM_LOAD_UOPS_RETIRED:L3_HIT\", NULL, 0x5304d1)                                                   \\\n  ACTION(\"INSTRUCTION_RETIRED\", NULL, 0x5300c0)               \n\n#undef FOREACH_PMC_EVENT\n#define FOREACH_PMC_EVENT(ACTION, prefix)                                                                  \\\n  ACTION(ldm_stall_cycles, prefix)\n\n\nDECLARE_ENABLE_PMC(sandybridge, ldm_stall_cycles)\n{\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"CYCLE_ACTIVITY:STALLS_L2_PENDING\", 0);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS\", 1);\n    //ASSIGN_PMC_HW_EVENT_TO_ME(\"INSTRUCTION_RETIRED\", 2);\n    ASSIGN_PMC_HW_EVENT_TO_ME(\"MEM_LOAD_UOPS_RETIRED:L3_HIT\", 2);\n\n    return E_SUCCESS;\n}\n\nDECLARE_CLEAR_PMC(sandybridge, ldm_stall_cycles)\n{\n}\n\nDECLARE_READ_PMC(sandybridge, ldm_stall_cycles)\n{\n\t//return 0;\n   uint64_t cycle_activity_stalls_l2_pending_diff = READ_MY_HW_EVENT_DIFF(0);\n   uint64_t mem_load_uops_misc_retired_llc_miss_diff = READ_MY_HW_EVENT_DIFF(1);\n   uint64_t mem_load_uops_retired_l3_hit_diff = READ_MY_HW_EVENT_DIFF(2);\n\n   //return floor(cycle_activity_stalls_l2_pending_diff * (((double) (7*mem_load_uops_misc_retired_llc_miss_diff))/((double)(7*mem_load_uops_misc_retired_llc_miss_diff + mem_load_uops_retired_l3_hit_diff))));\n   uint64_t uden = 7.0 * mem_load_uops_misc_retired_llc_miss_diff + mem_load_uops_retired_l3_hit_diff;\n   if (uden == 0) {\n      return 0;  \n   }\n   double den = uden;\n   double num = 7.0 * mem_load_uops_misc_retired_llc_miss_diff;\n\n   return (uint64_t) 
floorl(cycle_activity_stalls_l2_pending_diff*num/den);\n}\n\n\nPMC_EVENTS(sandybridge, 4)\n#endif /* __CPU_SANDYBRIDGE_H */\n"
  },
  {
    "path": "src/lib/cpu/xeon-ex.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include \"dev.h\"\n\n#ifdef PAPI_SUPPORT\n#include \"sandybridge-papi.h\"\n#include \"ivybridge-papi.h\"\n#include \"haswell-papi.h\"\n#else\n#include \"sandybridge.h\"\n#include \"ivybridge.h\"\n#include \"haswell.h\"\n#endif\n\nint intel_xeon_ex_set_throttle_register(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t val)\n{\n    int offset;\n    int i;\n\n    switch(throttle_type) {\n        case THROTTLE_DDR_ACT:\n            offset = 0x190; break;\n        case THROTTLE_DDR_READ:\n            offset = 0x192; break;\n        case THROTTLE_DDR_WRITE:\n            offset = 0x194; break;\n        default:\n            offset = 0x190;\n    }\n\n    // write to all 4 channels\n\n    // first Activate throttling\n    /*set_pci(bus_id, 0x10, 0x0, 0x190, (uint16_t) val);\n    set_pci(bus_id, 0x10, 0x1, 0x190, (uint16_t) val);\n    set_pci(bus_id, 0x10, 0x4, 0x190, (uint16_t) val);\n    set_pci(bus_id, 0x10, 0x5, 0x190, (uint16_t) val);*/\n\n    // then the Read or Write throttling\n    for (i=0; i < regs->channels; ++i) {\n        
set_pci(regs->addr[i].bus_id, regs->addr[i].dev_id, regs->addr[i].funct, offset, (uint16_t) val);\n    }\n\n    return 0;\n}\n\nint intel_xeon_ex_get_throttle_register(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t* val)\n{\n    int offset;\n\n    switch(throttle_type) {\n        case THROTTLE_DDR_ACT:\n            offset = 0x190; break;\n        case THROTTLE_DDR_READ:\n            offset = 0x192; break;\n        case THROTTLE_DDR_WRITE:\n            offset = 0x194; break;\n        default:\n            offset = 0x190;\n    }\n\n    // read just channel 1\n    get_pci(regs->addr[0].bus_id, regs->addr[0].dev_id, regs->addr[0].funct, offset, val);\n    return 0;\n}\n\n\n// desc is fixed in cpu_model() if not Xeon\n\ncpu_model_t cpu_model_intel_xeon_ex = {\n    .microarch = SandyBridgeXeon,\n#ifdef PAPI_SUPPORT\n    .pmc_events = {sandybridge_native_events, sandybridge_read_stall_events_local, NULL},\n#else\n    .pmc_events = PMC_EVENTS_PTR(sandybridge),\n#endif\n    .set_throttle_register = intel_xeon_ex_set_throttle_register,\n    .get_throttle_register = intel_xeon_ex_get_throttle_register\n};\n\ncpu_model_t cpu_model_intel_xeon_ex_v2 = {\n    .microarch = IvyBridgeXeon,\n#ifdef PAPI_SUPPORT\n    .pmc_events = {ivybridge_native_events, ivybridge_read_stall_events_local, ivybridge_read_stall_events_remote},\n#else\n    .pmc_events = PMC_EVENTS_PTR(ivybridge),\n#endif\n    .set_throttle_register = intel_xeon_ex_set_throttle_register,\n    .get_throttle_register = intel_xeon_ex_get_throttle_register\n};\n\ncpu_model_t cpu_model_intel_xeon_ex_v3 = {\n    .microarch = HaswellXeon,\n#ifdef PAPI_SUPPORT\n    .pmc_events = {haswell_native_events, haswell_read_stall_events_local, haswell_read_stall_events_remote},\n#else\n    .pmc_events = PMC_EVENTS_PTR(haswell),\n#endif\n    .set_throttle_register = intel_xeon_ex_set_throttle_register,\n    .get_throttle_register = intel_xeon_ex_get_throttle_register\n};\n"
  },
  {
    "path": "src/lib/debug.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include \"debug.h\"\n#include <sys/types.h>\n#include <execinfo.h>\n#include <unistd.h>\n#include <stdio.h>\n#include \"config.h\"\n\n\nint         dbg_modules[dbg_module_count];\nint         dbg_level = 0;\nint         dbg_verbose = 0;\nconst char* dbg_identifier = \"\";\nstatic char dbg_identifier_buf[128];\n\nstatic int \nstrrep(char *target, char *source, char oldc, char newc)\n{\n\tint i;\n\n\tfor (i=0; source[i]; i++) {\n\t\tif (source[i] == oldc) {\n\t\t\ttarget[i] = newc;\n\t\t} else {\n\t\t\ttarget[i] = source[i];\n\t\t}\n\t}\n\ttarget[i] = '\\0';\n\treturn 0;\n}\n\n\nvoid\ndbg_set_level(int level)\n{\n\tdbg_level = level;\n}\n\n\nint \ndbg_init(config_t* dbg_cfg, int level, const char* identifier)\n{\n\t// if user hasn't provided a debugging level then get it from the \n\t// configuration env/file\n\tif (level < 0) {\n\t\t__cconfig_lookup_int(dbg_cfg, \"debug.level\", &dbg_level);\n\t} else {\n\t\tdbg_level = level;\n\t}\n\n\t__cconfig_lookup_int(dbg_cfg, \"debug.verbose\", &dbg_verbose);\n\n\t// if user hasn't provide an identifier then check whether 
the environment \n\t// provides one, othewise create one based on process' pid \n\tif (!identifier) {\n\t\tdbg_identifier = getenv(\"DEBUG_IDENTIFIER\");\n\t\tif (!dbg_identifier) {\n\t\t\tsprintf(dbg_identifier_buf, \"%d\", getpid()); \n\t\t\tdbg_identifier = dbg_identifier_buf;\n\t\t}\n\t} else {\n\t\tdbg_identifier = identifier;\n\t}\n\n\n\t// read per module debugging flags\n#define STR(name) #name\n#define ACTION(name)                                                           \\\n\tdo {                                                                       \\\n\t\tchar dotstr[128];                                                      \\\n\t\tstrrep(dotstr, STR(debug_module_##name), '_', '.');                    \\\n\t\t__cconfig_lookup_bool(dbg_cfg, dotstr,                                 \\\n\t\t                      &dbg_modules[dbg_module_##name]);                \\\n\t} while (0);\n\n\tFOREACH_DEBUG_MODULE(ACTION)\n#undef ACTION\n        DBG_LOG(DEBUG, \"\"); // prevent compiler warning\n\treturn 0;\n}\n\n\nvoid\ndbg_backtrace (void)\n{\n\tvoid *array[10];\n\tsize_t size;\n\tchar **strings;\n\tsize_t i;\n \n\tsize = backtrace (array, 10);\n\tstrings = backtrace_symbols (array, size);\n\t\t\t\t\t\t\t\t\t      \n\tprintf (\"Obtained %zd stack frames.\\n\", size);\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t      \n\tfor (i = 0; i < size; i++)\n\t\tprintf (\"%s\\n\", strings[i]);\n\tfree (strings);\n}\n"
  },
  {
    "path": "src/lib/debug.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __DEBUG_H\n#define __DEBUG_H\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <assert.h>\n#include <time.h>\n#include \"config.h\"\n\n#define FOREACH_DEBUG_MODULE(ACTION)                        \\\n\tACTION(all) /* special name that covers all modules */\n\n\n#define ACTION(name)                                        \\\n\tdbg_module_##name,\n\nenum {\n\tFOREACH_DEBUG_MODULE(ACTION)\n\tdbg_module_count\n};\n#undef ACTION\n\n#ifndef NDEBUG\n#define DBG_CODE(code) DBG_##code\n\nenum dbg_code {\n\tDBG_OFF = 0,\n\tDBG_CODE(CRITICAL) = 1, // Critical\n\tDBG_CODE(ERROR)    = 2, // Error\n\tDBG_CODE(WARNING)  = 3, // Warning\n\tDBG_CODE(INFO)     = 4, // Info\n\tDBG_CODE(DEBUG)    = 5, // Debugging\n};\n\nstatic const char* dbg_code2str[] = {\n\t(char*) \"OFF\",\n\t(char*) \"CRITICAL\",\n\t(char*) \"ERROR\",\n\t(char*) \"WARNING\",\n\t(char*) \"INFO\",\n\t(char*) \"DEBUG\",\n};\n\nstatic const int dbg_terminate_level = DBG_ERROR;\nstatic const int dbg_stderr_level = DBG_WARNING;\n\nextern int         dbg_modules[];\nextern int         dbg_level;\nextern 
int         dbg_verbose;\nextern const char* dbg_identifier;\n\n#define DBG_MODULE(name) dbg_module_##name\n\n#define DBG_LOG(level, format, ...)                                            \\\n  do {                                                                         \\\n    FILE* ferr = stdout;                                                       \\\n    time_t ctime;                                                              \\\n    if (DBG_CODE(level) && (DBG_CODE(level) <= dbg_level ||                    \\\n                  DBG_CODE(level) <= dbg_terminate_level))                     \\\n    {                                                                          \\\n      if (DBG_CODE(level) <= dbg_stderr_level) {                               \\\n        ferr=stderr;                                                           \\\n      }                                                                        \\\n      if (dbg_verbose) {                                                       \\\n        ctime = time(NULL);                                                    \\\n        fprintf(ferr, \"[%s] [%lu] %s in %s <%s,%d>: \" format,                  \\\n                dbg_identifier,                                                \\\n                ctime,                                                         \\\n                dbg_code2str[DBG_CODE(level)],                                 \\\n                __FUNCTION__, __FILE__, __LINE__, ##__VA_ARGS__);              \\\n      } else {                                                                 \\\n        fprintf(ferr, \"[%s] %s: \" format,                                      \\\n                dbg_identifier,                                                \\\n                dbg_code2str[DBG_CODE(level)],                                 \\\n                ##__VA_ARGS__);                                                \\\n      }                                                     
                   \\\n      if (DBG_CODE(level) <= dbg_terminate_level) {                            \\\n        exit(-1);                                                              \\\n      }\t                                                                       \\\n    }\t\t\t                                                               \\\n  } while(0);\n\n\n#define DBG_LOG2(level, module, format, ...)                                   \\\n  do {                                                                         \\\n    FILE* ferr = stdout;                                                       \\\n    if (DBG_CODE(level) &&                                                     \\\n\t    (dbg_modules[module] || dbg_modules[dbg_module_all] ||                 \\\n\t\t DBG_CODE(level) <= dbg_terminate_level) &&                            \\\n\t    (DBG_CODE(level) <= dbg_level ||                                       \\\n         DBG_CODE(level) <= dbg_terminate_level))                              \\\n    {                                                                          \\\n      if (DBG_CODE(level) <= dbg_stderr_level) {                               \\\n        ferr=stderr;                                                           \\\n      }                                                                        \\\n \t  fprintf(ferr, \"[%s] %s in %s <%s,%d>: \" format,                          \\\n              dbg_identifier,                                                  \\\n              dbg_code2str[DBG_CODE(level)],                                   \\\n              __FUNCTION__, __FILE__, __LINE__, ##__VA_ARGS__);                \\\n      if (DBG_CODE(level) <= dbg_terminate_level) {                            \\\n        exit(-1);                                                              \\\n      }\t                                                                       \\\n    }\t\t\t                                                
               \\\n  } while(0);\n\n#else /* NDEBUG */\n\n#define DBG_LOG(level, format, ...)\n#define DBG_LOG2(level, module, format, ...)\n\n#endif /* NDEBUG */\n\n\n#define VERIFY(condition)                                                      \\\n  do {                                                                         \\\n    if (!(condition)) {                                                        \\\n      fprintf(stderr, \"Assumption \\\"%s\\\"\\nFailed in file %s: at line:%i\\n\",    \\\n              #condition,__FILE__,__LINE__);                                   \\\n      DBG_LOG (DBG_CRITICAL, #condition);}                                     \\\n      fflush(stderr);                                                          \\\n  } while (0);\n\n\nint dbg_init(config_t* dbg_cfg, int level, const char* identifier);\nvoid dbg_backtrace (void);\nvoid dbg_set_level(int level);\n\n#endif // __DEBUG_H\n"
  },
  {
    "path": "src/lib/dev.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <sys/types.h>\n#include <sys/stat.h>\n#include <sys/ioctl.h>\n#include <fcntl.h>\n#include <stdint.h>\n#include <unistd.h>\n#include <errno.h>\n#include \"dev/ioctl_query.h\"\n#include \"error.h\"\n#include \"dev.h\"\n\n// TODO: get this value from the config file\n#define DEV_PATH \"/dev/nvmemul\"\n\nint set_counter(unsigned int counter_id, unsigned int event_id)\n{\n    int fd;\n    int ret;\n\n    ioctl_query_setcounter_t q;\n    fd = open(DEV_PATH, O_RDONLY);\n    if (fd < 0) {\n        DBG_LOG(ERROR, \"Can't open %s - Is the NVM emulator device driver installed?\\n\", DEV_PATH);\n        return E_ERROR;\n    }\n    q.counter_id = counter_id;\n    q.event_id = event_id;\n    if ((ret = ioctl(fd, IOCTL_SETCOUNTER, &q)) < 0) {\n    close(fd);\n        return E_ERROR;\n    }\n    close(fd);\n    return E_SUCCESS;\n}\n\n\nint set_pci(unsigned int bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t val)\n{\n\tint fd; \n    int ret;\n\n    ioctl_query_setgetpci_t q;\n\tfd = open(DEV_PATH, O_RDONLY);\n\tif (fd < 0) 
{\n\t\tDBG_LOG(ERROR, \"Can't open %s - Is the NVM emulator device driver installed?\\n\", DEV_PATH);\n\t\treturn E_ERROR;\n\t}\n    q.bus_id = bus_id;\n    q.device_id = device_id;\n    q.function_id = function_id;\n    q.offset = offset;\n    q.val = val;\n    if ((ret = ioctl(fd, IOCTL_SETPCI, &q)) < 0) {\n    \tclose(fd);\n        return E_ERROR;\n    }\n\tclose(fd);\n    return E_SUCCESS;\n}\n\nint get_pci(unsigned int bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t* val)\n{\n\tint fd; \n    int ret;\n\n    ioctl_query_setgetpci_t q;\n\tfd = open(DEV_PATH, O_RDWR);\n\tif (fd < 0) {\n\t\tDBG_LOG(ERROR, \"Can't open %s - Is the NVM emulator device driver installed?\\n\", DEV_PATH);\n\t\treturn E_ERROR;\n\t}\n    q.bus_id = bus_id;\n    q.device_id = device_id;\n    q.function_id = function_id;\n    q.offset = offset;\n    q.val = 0;\n    if ((ret = ioctl(fd, IOCTL_GETPCI, &q)) < 0) {\n    \tclose(fd);\n        return E_ERROR;\n    }\n    *val = q.val;\n\tclose(fd);\n    return E_SUCCESS;\n}\n\n\n"
  },
  {
    "path": "src/lib/dev.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __DEVICE_DRIVER_API_H\n#define __DEVICE_DRIVER_API_H\n\n#include <stdint.h>\n\n#define MAX_NUM_MC_PCI_BUS 16\n#define MAX_NUM_MC_CHANNELS 16\n\ntypedef struct {\n    unsigned int bus_id;\n    unsigned int dev_id;\n    unsigned int funct;\n} pci_addr;\n\ntypedef struct {\n    pci_addr addr[MAX_NUM_MC_CHANNELS];\n    unsigned int channels;\n} pci_regs_t;\n\nint set_counter(unsigned int counter_id, unsigned int event_id);\nint set_pci(unsigned bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t val);\nint get_pci(unsigned bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t* val);\n\n#endif /* __DEVICE_DRIVER_API_H */\n"
  },
  {
    "path": "src/lib/errno.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __ERRNO_H\n#define __ERRNO_H\n\n#ifdef __DEFINE_ERRNO\n# error \"__DEFINE_ERRNO previously defined\"\n#endif\n\n/*\n * Define error codes and error messages here\n */\n#define __DEFINE_ERRNO(ACTION)                                               \\\n\tACTION(E_SUCCESS, \"Success\")                                             \\\n\tACTION(E_ERROR, \"Generic error\")                                         \\\n\tACTION(E_NOMEM, \"No memory\")                                             \\\n    ACTION(E_EXIST, \"Name already exists\")                                   \\\n    ACTION(E_NOENT, \"Name does not exist\")                                   \\\n    ACTION(E_INVAL, \"Invalid argument\")                                      \\\n    ACTION(E_BUSY, \"Resource busy\")                                          \\\n    ACTION(E_NOTEMPTY, \"Not empty\")                                          \\\n    ACTION(E_ERRNO, \"Standard C library error; check errno for details\")\n\n\n#ifdef __ENUM_MEMBER\n# error \"__ENUM_MEMBER previously 
defined\"\n#endif\n\n#define __ENUM_MEMBER(name, str)  name,\n\nenum {\n\t__DEFINE_ERRNO(__ENUM_MEMBER)\n\tE_MAXERRNO\n};\n\n#undef __ENUM_MEMBER /* don't pollute the macro namespace */\n\n#ifdef __ERRNO_STRING\n# error \"__ERRNO_STRING previously defined\"\n#endif\n\n#define __ERRNO_STRING(name, str) str,\n\n/*\n    TODO: not used for now\nstatic const char* \nErrorToString(int err) {\n\tstatic const char* errstr[] = {\n\t\t__DEFINE_ERRNO(__ERRNO_STRING)\n\t\t\"Unknown error code\"\n\t};\n\tif (err >= 0 && err < E_MAXERRNO) {\n\t\treturn errstr[err];\n\t}\n\treturn errstr[E_MAXERRNO];\n}\n*/\n#undef __ERRNO_STRING /* don't pollute the macro namespace */\n#undef __DEFINE_ERRNO /* don't pollute the macro namespace */\n\n#endif /* __ERRNO_H */\n"
  },
  {
    "path": "src/lib/error.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __ERROR_H\n#define __ERROR_H\n\n#include \"errno.h\"\n#include \"debug.h\"\n\n#endif /* __ERROR_H */\n"
  },
  {
    "path": "src/lib/init.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <errno.h>\n#include \"cpu/cpu.h\"\n#include \"config.h\"\n#include \"error.h\"\n#include \"model.h\"\n#include \"measure.h\"\n#include \"thread.h\"\n#include \"topology.h\"\n#include \"interpose.h\"\n#include \"monotonic_timer.h\"\n#include \"pflush.h\"\n#include \"stat.h\"\n\nstatic void init() __attribute__((constructor));\nstatic void finalize() __attribute__((destructor));\n\nint set_process_local_rank();\nint unset_process_local_rank();\nint partition_cpus(virtual_topology_t* virtual_topology);\n\nstatic virtual_topology_t* virtual_topology = NULL;\n\nvoid finalize() {\n    int i;\n    if (latency_model.enabled) {\n        unregister_self();\n    }\n\n    if (read_bw_model.enabled) {\n        for (i=0; i < virtual_topology->num_virtual_nodes; i++) {\n            // FIXME: currently we keep a single bandwidth model and not per-node BW model\n            physical_node_t* phys_node = virtual_topology->virtual_nodes[i].nvram_node;\n            pci_regs_t *regs = phys_node->mc_pci_regs;\n\n            // reset throttling\n            
phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8FFF);\n        }\n    }\n#ifdef USE_STATISTICS\n    stats_report();\n#endif\n    // finalize libraries and release resources\n#ifdef PAPI_SUPPORT\n    pmc_shutdown();\n#endif\n\n    unset_process_local_rank();\n\n    //__cconfig_destroy(&cfg);\n}\n\nvoid init()\n{\n    config_t cfg;\n    cpu_model_t* cpu;\n    char* ld_preload_path;\n    double start_time, end_time;\n#ifdef CALIBRATION_SUPPORT\n    int i;\n#endif\n\n    // FIXME: do we need to register the main thread with our system?\n    // YES: for sure for single-threaded apps\n\n    start_time = monotonic_time_us();\n\n    // we reset LD_PRELOAD to ensure we don't get into recursive preloads when \n    // calling popen during initialization. before exiting we reactivate LD_PRELOAD \n    // to allow LD_PRELOADS on children\n    ld_preload_path = getenv(\"LD_PRELOAD\");\n    unsetenv(\"LD_PRELOAD\");\n\n    if (__cconfig_init(&cfg, \"nvmemul.ini\") == CONFIG_FALSE) {\n        goto error;\n    }\n\n    __cconfig_lookup_bool(&cfg, \"latency.enable\", &latency_model.enabled);\n    __cconfig_lookup_bool(&cfg, \"bandwidth.enable\", &read_bw_model.enabled);\n\n    if (dbg_init(&cfg, -1, NULL) != E_SUCCESS) {\n        goto error;\n    }\n\n    if (init_interposition() != E_SUCCESS) {\n        goto error;\n    }\n\n    if ((cpu = cpu_model()) == NULL) {\n        DBG_LOG(ERROR, \"No supported processor found\\n\");\n        goto error;\n    }\n\n    init_virtual_topology(&cfg, cpu, &virtual_topology);\n\n    if (init_bandwidth_model(&cfg, virtual_topology) != E_SUCCESS) {\n        goto error;\n    }\n\n    if (latency_model.enabled) {\n        if (init_latency_model(&cfg, cpu, virtual_topology) != E_SUCCESS) {\n   \t        goto error;\n        }\n\n        init_thread_manager(&cfg, virtual_topology);\n\n#ifdef USE_STATISTICS\n        // statistics makes use of the thread manager and is used by the register_self()\n        
stats_enable(&cfg);\n#endif\n\n        set_process_local_rank();\n\n        // thread manager must be initialized and local rank set\n        // CPU partitioning must be made before the first thread is registered\n        if (partition_cpus(virtual_topology) != E_SUCCESS) {\n            goto error;\n        }\n\n        if (register_self() != E_SUCCESS) {\n            goto error;\n        }\n\n#ifdef CALIBRATION_SUPPORT\n        // main thread is now tracked by the latency emulator\n        // first, calibrate the latency emulation\n        if (latency_model.calibration) {\n            for (i = 0; i < virtual_topology->num_virtual_nodes; ++i) {\n                latency_calibration(&virtual_topology->virtual_nodes[i]);\n            }\n        }\n#endif\n        int write_latency;\n        __cconfig_lookup_int(&cfg, \"latency.write\", &write_latency);\n        init_pflush(cpu_speed_mhz(), write_latency);\n    }\n\n    end_time = monotonic_time_us();\n\n#ifdef USE_STATISTICS\n    if (latency_model.enabled) {\n        stats_set_init_time(end_time - start_time);\n    }\n#endif\n\n    if (ld_preload_path)\n        setenv(\"LD_PRELOAD\", ld_preload_path, 1);\n\n    return;\n\nerror:\n    /* Cannot initialize library -- catastrophic error */\n    if (ld_preload_path)\n        setenv(\"LD_PRELOAD\", ld_preload_path, 1);\n\n    fprintf(stderr, \"ERROR: nvmemul: Initialization failed. Running without non-volatile memory emulation.\\n\");\n}\n"
  },
  {
    "path": "src/lib/interpose.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#define _GNU_SOURCE\n#include <stdio.h>\n#include <dlfcn.h>\n#include <pthread.h>\n#include <assert.h>\n#include <signal.h>\n#include \"error.h\"\n#include \"model.h\"\n#include \"thread.h\"\n#include \"cpu/cpu.h\"\n#ifdef PAPI_SUPPORT\n#include \"cpu/pmc-papi.h\"\n#else\n#include \"cpu/pmc.h\"\n#endif\n\n\n// WARNING: Our library MUST directly use the functions we interpose on by \n// calling __lib_X to avoid interposition on ourselves.\n\n\nint (*__lib_pthread_create)(pthread_t *thread, const pthread_attr_t *attr,\n                              void *(*start_routine) (void *), void *arg);\nint (*__lib_pthread_mutex_lock)(pthread_mutex_t *mutex);\nint (*__lib_pthread_mutex_trylock)(pthread_mutex_t *mutex);\nint (*__lib_pthread_mutex_unlock)(pthread_mutex_t *mutex);\nint (*__lib_pthread_detach)(pthread_t thread);\n\nextern inline hrtime_t hrtime_cycles(void);\nextern inline int cycles_to_us(cpu_model_t* cpu, hrtime_t cycles);\n\n\nint init_interposition()\n{\n\tchar *error;\n    // if no symbol is returned then no interposition needed\n    __lib_pthread_create = 
dlsym(RTLD_NEXT, \"pthread_create\");\n    __lib_pthread_mutex_lock = dlsym(RTLD_NEXT, \"pthread_mutex_lock\");\n    __lib_pthread_mutex_trylock = dlsym(RTLD_NEXT, \"pthread_mutex_trylock\");\n    __lib_pthread_mutex_unlock = dlsym(RTLD_NEXT, \"pthread_mutex_unlock\");\n    __lib_pthread_detach = dlsym(RTLD_NEXT, \"pthread_detach\");\n\n    if (__lib_pthread_mutex_lock == NULL || __lib_pthread_mutex_unlock == NULL ||\n    \t    __lib_pthread_create == NULL || __lib_pthread_mutex_trylock == NULL ||\n    \t    __lib_pthread_detach == NULL) {\n    \terror = dlerror();\n    \tDBG_LOG(ERROR, \"Interposition failed: %s\\n\", error != NULL ? error : \"unknown reason\");\n    \treturn E_ERROR;\n    }\n\n    return E_SUCCESS;\n}\n\n\n// Interposing on pthread_create requires interposing on the thread created as we \n// require the TID of that thread which we can only get by executing the gettid() \n// system call from that thread. So we interpose on the start_routine which is\n// called by the new thread\ntypedef struct {\n    void *(*start_routine) (void *);\n    void *arg;\n} pthread_create_functor_t;\n\nvoid* __interposed_start_routine(void* args)\n{\n    void* ret;\n    pthread_create_functor_t* f = (pthread_create_functor_t*) args;\n    if (register_self() != E_SUCCESS) {\n        free(args);\n        return NULL;\n    }\n    ret = f->start_routine(f->arg);\n    // FIXME: directly calling unregister may miss cases where the \n    // thread terminates prematurely (such as pthread_exit or cancel)\n    // consider using a key destructor function instead\n    //fprintf(stderr, \"stall cycles: %lu\\n\", thread_self()->stall_cycles);\n    //fprintf(stderr, \"signals_sent: %lu signals_recv: %lu\\n\", thread_self()->signals_sent, thread_self()->signals_recv);\n    unregister_self();\n    free(args);\n    return ret;\n}\n\nint pthread_create(pthread_t *thread, const pthread_attr_t *attr,\n                   void *(*start_routine) (void *), void *arg)\n{\n    int ret;\n\n    
//DBG_LOG(DEBUG, \"interposing pthread_create\\n\");\n\n    //assert(__lib_pthread_create);\n    if (__lib_pthread_create == NULL)\n        init_interposition();\n\n    if (latency_model.enabled) {\n        pthread_create_functor_t *functor = malloc(sizeof(pthread_create_functor_t));\n        functor->arg = arg;\n        functor->start_routine = start_routine;\n\n        if ((ret = __lib_pthread_create(thread, attr, __interposed_start_routine, (void*) functor)) != 0) {\n            DBG_LOG(ERROR, \"call to __lib_pthread_create failed\\n\");\n            return ret;\n        }\n    } else {\n        ret = __lib_pthread_create(thread, attr, start_routine, arg);\n    }\n\n    return ret;    \n}\n\nint pthread_mutex_lock(pthread_mutex_t *mutex)\n{\n    int err;\n\n    if (latency_model.enabled) {\n        if(reached_min_epoch_duration(thread_self())) {\n            // create new epoch here in order to propagate only the critical session delay to other threads\n            // the thread monitor will keep trying to create new epoch, unless the min duration has not been reached\n            create_latency_epoch();\n        }\n    }\n\n    //DBG_LOG(DEBUG, \"interposing pthread_mutex_lock\\n\");\n\n    //assert(__lib_pthread_mutex_lock);\n    if (__lib_pthread_mutex_lock == NULL)\n        init_interposition();\n    err =  __lib_pthread_mutex_lock(mutex);\n\n    return err;\n}\n\nint pthread_mutex_trylock(pthread_mutex_t *mutex)\n{\n    int err;\n\n    if (latency_model.enabled) {\n        if(reached_min_epoch_duration(thread_self())) {\n            create_latency_epoch();\n        }\n    }\n\n    //DBG_LOG(DEBUG, \"interposing pthread_mutex_trylock\\n\");\n\n    //assert(__lib_pthread_mutex_trylock);\n    if (__lib_pthread_mutex_trylock == NULL)\n        init_interposition();\n    err =  __lib_pthread_mutex_trylock(mutex);\n\n    return err;\n}\n\nint pthread_mutex_unlock(pthread_mutex_t *mutex)\n{\n    int err;\n\n    if (latency_model.enabled) {\n        if 
(reached_min_epoch_duration(thread_self())) {\n            create_latency_epoch();\n        }\n    }\n\n    //DBG_LOG(DEBUG, \"interposing pthread_mutex_unlock\\n\");\n\n    //assert(__lib_pthread_mutex_unlock);\n    if (__lib_pthread_mutex_unlock == NULL)\n        init_interposition();\n    err = __lib_pthread_mutex_unlock(mutex);\n\n    return err;\n}\n"
  },
  {
    "path": "src/lib/interpose.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __INTERPOSE_H\n#define __INTERPOSE_H\n\n\n/**\n * \n * \\page library_interposition Library interposition \n * \n * The emulator intercepts several events of interest. It achieves this\n * by interposing on corresponding functions. \n * Currently this includes thread creation and POSIX synchronization mechanisms.\n */\n\nextern int (*__lib_pthread_create)(pthread_t *thread, const pthread_attr_t *attr,\n                                   void *(*start_routine) (void *), void *arg);\nextern int (*__lib_pthread_mutex_lock)(pthread_mutex_t *mutex);\nextern int (*__lib_pthread_mutex_trylock)(pthread_mutex_t *mutex);\nextern int (*__lib_pthread_mutex_unlock)(pthread_mutex_t *mutex);\nextern int (*__lib_pthread_detach)(pthread_t thread);\n\nint init_interposition();\n\n#endif /* __INTERPOSE_H */\n"
  },
  {
    "path": "src/lib/measure.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __MEASURE_H\n#define __MEASURE_H\n\n/**\n * \\file \n * \n * Memory latency and bandwidth measurements\n */\n\n/**\n * \\brief Measure memory read bandwidth\n *\n * Measures memory read bandwidth from a local socket (cpu_node) \n * to the memory of a remote socket (mem_node). It does this \n * by firing a bunch of threads issuing streaming instructions\n * to saturate memory bandwidth. \n */\ndouble measure_read_bw(int cpu_node, int mem_node);\n\n/**\n * \\brief Measure memory write bandwidth\n *\n * Measures memory write bandwidth from a local socket (cpu_node) \n * to the memory of a remote socket (mem_node).\n * See measure_read_bw for how this is done.\n */\ndouble measure_write_bw(int cpu_node, int mem_node);\n\n\n/** \n * \\brief Measure memory latency \n * \n * Measures memory read latency from one local socket to the memory of a \n * remote socket. 
It does this using a pointer chasing microbenchmark.\n * The microbenchmark sets up an array where each element determines the\n * element to be read next.\n */ \nint measure_latency(cpu_model_t* cpu, int from_node_id, int to_node_id);\n\n/**\n * \\brief Calibrate memory latency\n *\n * Automatically tweaks the memory latency based on the detected hardware latency\n * on the target systems.\n */\nvoid latency_calibration();\n\n#endif\n"
  },
  {
    "path": "src/lib/measure_bw.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n\n\n// 2 BW measuring algorithms: one based on SSE4 instructions and the second based on \n// stream benchmark Copy kernel.\n\n\n//#define SSE4_VERSION\n\n#ifdef SSE4_VERSION\n\n#include <math.h>\n#include <assert.h>\n#include <stdint.h>\n#include <pthread.h>\n#include <string.h>\n#include <numa.h>\n#include \"monotonic_timer.h\"\n#include \"interpose.h\"\n\n\n\n#ifdef __SSE4_1__\n#include <smmintrin.h>\n#endif\n\n#define BYTES_PER_GB (1024*1024*1024LL)\n#define BYTES_PER_MB (1024*1024LL)\n\n// flag for terminating current test\nint g_done;\n\n// global current number of threads\nint g_nthreads = 0;\n\n// synchronization barrier for current thread counter\npthread_barrier_t g_barrier;\n\n// thread shared parameters for test function\nvoid* g_array;\nsize_t g_thrsize;\nint g_times;\nvoid (*g_func)(void*, size_t);\n\n// Compute bandwidth in MB/s.\nstatic inline double to_bw(size_t bytes, double secs) {\n  double size_bytes = (double) bytes;\n  double size_mb = size_bytes / ((double) BYTES_PER_MB);\n  return size_mb / secs;\n}\n\nvoid* thread_worker(void* arg)\n{\n 
   int j;\n    unsigned int thread_num = (uintptr_t) arg;\n\n    while (1)\n    {\n        // *** Barrier ****\n        pthread_barrier_wait(&g_barrier);\n\n        if (g_done) break;\n\n        for (j = 0; j < g_times; j++) {\n            g_func(&((char*) g_array)[g_thrsize * thread_num], g_thrsize);\n        }\n\n        // *** Barrier ****\n        pthread_barrier_wait(&g_barrier);\n    }\n\n    return NULL;\n}\n\n\nint timeitp(void (*function)(void*, size_t), int nthreads, void* array, size_t size, int samples, int times) {\n    double min = INFINITY;\n    double runtime;\n    size_t i, j, p;\n    int thread_num;\n\n    // globally set test function and thread number\n    g_func = function;\n    g_nthreads = nthreads;\n    g_array = array;\n    g_thrsize = size / nthreads;\n    g_times = times;\n\n    // create barrier and run threads\n    pthread_barrier_init(&g_barrier, NULL, nthreads);\n\n    pthread_t thr[nthreads];\n    //__lib_pthread_create(&thr[0], NULL, thread_master, new int(0));\n    for (p = 1; p < nthreads; ++p) {\n    \tassert(__lib_pthread_create);\n        __lib_pthread_create(&thr[p], NULL, thread_worker, (void *) p);\n    }\n\n    // use current thread as master thread;\n    g_done = 0;\n    thread_num = 0;\n    for (i = 0; i < samples; i++) \n    {\n        pthread_barrier_wait(&g_barrier);\n\n        assert(!g_done);\n\n        double ts1 = monotonic_time();\n\n        for (j = 0; j < times; j++) {\n            g_func(&((char*)g_array)[g_thrsize * thread_num], g_thrsize);\n        }\n\n        pthread_barrier_wait(&g_barrier);\n        double ts2 = monotonic_time();\n\n        runtime = ts2 - ts1;\n        if (runtime < min) {\n            min = runtime;\n        }\n    }\n    g_done = 1;\n\n    pthread_barrier_wait(&g_barrier);\n\n    for (p = 1; p < nthreads; ++p) {\n        pthread_join(thr[p], NULL);\n    }\n\n    pthread_barrier_destroy(&g_barrier);\n\n    return to_bw(size * times, min);\n}\n\n\nint timeit(void (*function)(void*, 
size_t), void* array, size_t size, int samples, int times) {\n    double min = INFINITY;\n    size_t i;\n\n    // force allocation of physical pages\n    memset(array, 0xff, size);\n\n    for (i = 0; i < samples; i++) {\n        double before, after, total;\n\n        before = monotonic_time();\n        int j;\n        for (j = 0; j < times; j++) {\n            function(array, size);\n        }\n        after = monotonic_time();\n\n        total = after - before;\n        if (total < min) {\n            min = total;\n        }\n    }\n\n    return to_bw(size * times, min);\n}\n\n\n#ifdef __SSE4_1__\nvoid write_memory_nontemporal_sse(void* array, size_t size) {\n  __m128i* varray = (__m128i*) array;\n\n  __m128i vals = _mm_set1_epi32(1);\n  size_t i;\n  for (i = 0; i < size / sizeof(__m128i); i++) {\n    _mm_stream_si128(&varray[i], vals);\n    vals = _mm_add_epi16(vals, vals);\n  }\n}\n\nvoid write_memory_sse(void* array, size_t size) {\n  __m128i* varray = (__m128i*) array;\n\n  __m128i vals = _mm_set1_epi32(1);\n  size_t i;\n  for (i = 0; i < size / sizeof(__m128i); i++) {\n    _mm_store_si128(&varray[i], vals);\n    vals = _mm_add_epi16(vals, vals);\n  }\n}\n\nvoid read_memory_sse(void* array, size_t size) {\n  __m128i* varray = (__m128i*) array;\n  __m128i accum = _mm_set1_epi32(0xDEADBEEF);\n  size_t i;\n  for (i = 0; i < size / sizeof(__m128i); i++) {\n    accum = _mm_add_epi16(varray[i], accum);\n  }\n\n  // This is unlikely, and we want to make sure the reads are not optimized\n  // away.\n  assert(!_mm_testz_si128(accum, accum));\n}\n#else\n# error \"No compiler support for SSE instructions\"\n#endif\n\n//static char array[1024*1024*1024];\n\ndouble measure_read_bw(int cpu_node, int mem_node)\n{\n    char* array;\n    size_t size = 1024*1024*1024;\n    double bw;\n    int nthreads = 16;\n\n    array = numa_alloc_onnode(size, mem_node);\n    assert(array);\n    numa_run_on_node(cpu_node);\n    // force allocation of physical pages\n    memset(array, 0xff, 
size);\n    bw = timeitp(read_memory_sse, nthreads, array, size, 5, 1);\n    numa_free(array, size);\n    return bw;\n}\n\ndouble measure_write_bw(int cpu_node, int mem_node)\n{\n    char* array;\n    size_t size = 1024*1024*1024;\n    double bw;\n    int nthreads = 16;\n\n    array = numa_alloc_onnode(size, mem_node);\n    assert(array);\n    numa_run_on_node(cpu_node);\n    // force allocation of physical pages\n    memset(array, 0xff, size);\n    bw = timeitp(write_memory_nontemporal_sse, nthreads, array, size, 5, 1);\n    numa_free(array, size);\n    return bw;\n}\n\n#else // SSE4_VERSION\n\n\n#include <stdio.h>\n#include <math.h>\n#include <float.h>\n#include <limits.h>\n#include <sys/time.h>\n#include <numa.h>\n#include <numaif.h>\n#include <omp.h>\n#include \"monotonic_timer.h\"\n#include \"debug.h\"\n\n\n# define N\t20000000\n# define NTIMES\t10\n# define OFFSET\t0\n\n# define HLINE \"-------------------------------------------------------------\\n\"\n\n# ifndef MIN\n# define MIN(x,y) ((x)<(y)?(x):(y))\n# endif\n# ifndef MAX\n# define MAX(x,y) ((x)>(y)?(x):(y))\n# endif\n\n\nstatic double\tmintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n\nstatic double\tbytes[4] = {\n    2 * sizeof(double) * N,\n    2 * sizeof(double) * N,\n    3 * sizeof(double) * N,\n    3 * sizeof(double) * N\n    };\n\n//extern double mysecond();\n\ndouble measure_read_bw(int cpu_node, int mem_node)\n    {\n    register int\tj, k;\n    double\t\tt, times[4][NTIMES];\n    double *a, *c;\n    //struct bitmask* membind;\n\n    /* --- SETUP --- determine precision and check timing --- */\n\n    //membind = numa_allocate_nodemask();\n    //numa_bitmask_setbit(membind, mem_node);\n    //numa_bind(membind);\n    //numa_free_nodemask(membind);\n    numa_run_on_node(cpu_node);\n\n    omp_set_num_threads(10);\n\n    // allocate memory dynamically to make sure the data is stored on the expected NUMA node\n    a = (double *)numa_alloc_onnode( (N+OFFSET) * sizeof(double), mem_node);\n    c = 
(double *)numa_alloc_onnode( (N+OFFSET) * sizeof(double), mem_node);\n\n    DBG_LOG(DEBUG, \"Measuring read BW on cpu node %d and mem node %d\\n\", cpu_node, mem_node);\n\n    /* Initialize the source and destination arrays. */\n#pragma omp parallel for\n    for (j=0; j<N; j++) {\n\ta[j] = (double)random(); //1.0;\n\tc[j] = 0.0;\n\t}\n\n    t = monotonic_time(); //mysecond();\n#pragma omp parallel for\n    for (j = 0; j < N; j++)\n\ta[j] = 2.0E0 * a[j];\n    t = 1.0E6 * (monotonic_time() - t);\n\n    /*\t--- MAIN LOOP --- repeat test cases NTIMES times --- */\n\n    for (k=0; k<NTIMES; k++)\n\t{\n\ttimes[0][k] = monotonic_time(); //mysecond();\n#pragma omp parallel for\n\tfor (j=0; j<N; j++)\n\t    c[j] = a[j];\n\ttimes[0][k] = monotonic_time() - times[0][k];\n\t}\n\n    /*\t--- SUMMARY --- */\n    \n    mintime[0] = FLT_MAX;\n    for (k=1; k<NTIMES; k++) \n\t{\n\t    mintime[0] = MIN(mintime[0], times[0][k]);\n\t}\n\n    numa_free(a, (N+OFFSET) * sizeof(double));\n    numa_free(c, (N+OFFSET) * sizeof(double));\n\n    // reset NUMA binding\n    //numa_run_on_node_mask(numa_all_nodes_ptr);\n    //numa_set_membind(numa_all_nodes_ptr);\n    //numa_bind(numa_all_nodes_ptr);\n    numa_run_on_node(-1);\n\n    return 1.0E-06 * bytes[0]/mintime[0]; // bytes per time unit scaled by 1e-6: MB/s (10^6 bytes), not MiB/s\n}\n\n\n\n#endif // SSE4_VERSION\n"
  },
  {
    "path": "src/lib/measure_lat.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n/*\n * Originally developed by Terence Kelly with contributions from Haris Volos\n */\n\n#include <string.h>\n#include <assert.h>\n#include <errno.h>\n#include <inttypes.h>\n#include <stdio.h>\n#include <unistd.h>\n#include <stdlib.h>\n#include <sys/time.h>\n#include <sys/mman.h>\n#include <numa.h>\n#include <numaif.h>\n#include <math.h>\n#include \"cpu/cpu.h\"\n#include \"error.h\"\n#include \"model.h\"\n\n#define P  (void)printf\n#define FP (void)fprintf\n\n#define PAGESZ 4096\n\n#define MAX_NUM_CHAINS 16\n\n#undef USE_HUGETLB\n\n#ifdef MEMLAT_SUPPORT\nextern __thread uint64_t tls_global_remote_dram;\nextern __thread uint64_t tls_global_local_dram;\n#endif\n\ntypedef struct {\n\tuint64_t val;\n\tchar padding[0];\n} element_t;\n\ntypedef struct {\n    uint64_t   N;\n    uint64_t   element_size;\n    element_t* head;\n} chain_t;\n\ninline uint64_t min(uint64_t a, uint64_t b)\n{\n    return a < b ? a : b;\n}\n\n/* G. Marsaglia, 2003.  \"Xorshift RNGs\", Journal of Statistical\n   Software v. 8 n. 14, pp. 1-6, discussed in _Numerical Recipes_\n   3rd ed. 
*/\nstatic uint64_t prng(uint64_t* seed) {\n    uint64_t x = *seed;\n    x ^= x >> 21;\n    x ^= x << 35;\n    x ^= x >>  4;\n    *seed = x;\n    return x;\n}\n\nstatic uint64_t T(void) {\n    struct timeval tv;\n\n#ifndef NDEBUG\n    int r =\n#endif\n        gettimeofday(&tv, NULL);\n\n    assert(0 == r);\n\n    return (uint64_t)(tv.tv_sec) * 1000000 + tv.tv_usec;\n}\n\nelement_t* element(chain_t* chain, uint64_t index) \n{\n    char* p = (char*) chain->head + index * chain->element_size;\n    return (element_t *) p;\n}\n\nvoid inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size)\n{\n    uint64_t i;\n    element_t *elem = element(chain, index);\n    buf_size = min(chain->element_size, buf_size);\n    \n    memcpy(buf, &elem->padding[0], buf_size - sizeof(elem->val));\n    for (i = buf_size; i <= chain->element_size - buf_size; i += buf_size) {\n        memcpy(buf, &elem->padding[i], buf_size);\n    }\n}\n\nchain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j)\n{\n    uint64_t sum, p, i;\n    element_t *B;\n    char *A, *Aaligned, *M;\n    uint64_t seed = seedin;\n    chain_t* chain;\n#ifndef NDEBUG\n    long mbind_result;\n#endif\n    /* fill B[] with random permutation of 1..N */\n    chain = (chain_t*) malloc(sizeof(chain_t));\n    chain->N = N;\n    chain->element_size = element_size;\n    Aaligned = A = (char *) malloc(2 * PAGESZ + N * sizeof(element_t));\n    assert(NULL != A);\n    while ( 0 != (Aaligned - (char *)0) % PAGESZ )\n        Aaligned++;\n    B = (element_t *) Aaligned;\n    for (i = 0; i < N; i++)\n        B[i].val = 1+i;\n    for (i = 0; i < N; i++) {\n        uint64_t r, t;\n        r = prng(&seed);\n        r = r % N;  /* should be okay for N << 2^64 */\n        t = B[i].val;\n        B[i].val = B[r].val;\n        B[r].val = t;\n    }\n\n    sum = 0;\n    for (i = 0; i < N; i++)\n      sum += B[i].val;\n    assert((N+1)*N/2 == sum);  /* Euler's formula */\n\n    
/* set up C[] such that \"chasing pointers\" through it visits\n       every element exactly once */\n#ifdef USE_HUGETLB\n    M = (char*) mmap(NULL, 2 * PAGESZ + (1+N) * element_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB, -1, 0);\n#else\n    M = (char*) mmap(NULL, 2 * PAGESZ + (1+N) * element_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);\n#endif\n    assert(NULL != M);\n    while ( 0 != (M - (char *)0) % PAGESZ )\n      M++;\n    numa_run_on_node(node_i);\n    uint64_t nodemask = 1 << node_j;\n#ifndef NDEBUG\n    mbind_result =\n#endif\n        mbind(M, N*element_size, MPOL_BIND, &nodemask, 64, MPOL_MF_MOVE);\n\n    assert(mbind_result == 0);\n\n    bzero(M, N*element_size); // force physical memory allocation\n    chain->head = (element_t *) M;\n    for (i = 0; i < N; i++) {\n        element(chain, i)->val = UINT64_MAX;\n    }\n    p = 0;\n    for (i = 0; i < N; i++) {\n        p = element(chain, p)->val = B[i].val;\n    }\n    element(chain, p)->val = 0;\n    for (i = 0; i <= N; i++) {\n        assert(N >= element(chain, i)->val);\n    }\n    free(A);\n    return chain;\n}\n\n\nuint64_t trash_cache(uint64_t N)\n{\n    uint64_t T1, i, sum;\n    char* A;\n    char* ptr;\n    element_t* B;\n    ptr = A = (char *) malloc(2 * PAGESZ + N * sizeof(element_t));\n    assert(NULL != A);\n    while ( 0 != (A - (char *)0) % PAGESZ ) {\n        A++;\n        __asm__(\"\"); /* prevent optimizer from removing loop */\n    }\n    B = (element_t *)A;\n\n    /* trash the CPU cache */\n    T1 = T() % 1000;\n    for (i = 0; i < N; i++) {\n        B[i].val = T1 * i + i % (T1+1);\n        __asm__(\"\"); /* prevent optimizer from removing loop */\n    }\n    sum = 0;\n    for (i = 0; i < N; i++) {\n        sum += B[i].val;\n        __asm__(\"\"); /* prevent optimizer from removing loop */\n    }\n    free(ptr);\n    return sum;\n}\n\n\nint __measure_latency(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int 
from_node_id, int to_node_id) \n{\n    uint64_t seed, j, i, T1, T2;\n    uint64_t sumv[MAX_NUM_CHAINS];\n    uint64_t nextp[MAX_NUM_CHAINS];\n    chain_t *C[MAX_NUM_CHAINS];\n    char *buf;\n    uint64_t buf_size = 16384;\n\n    assert(nelems < UINT64_MAX);\n    assert(nchains < MAX_NUM_CHAINS);\n\n    DBG_LOG(INFO, \"measuring latency: nchains %d, nelems %zu, elem_sz %d, access_sz %d, from_node_id %d, to_node_id %d\\n\", nchains, nelems, element_size, access_size, from_node_id, to_node_id);\n\n    for (j=0; j < nchains; j++) {\n        seed = seedin + j*j;\n        C[j] = alloc_chain(seed, nelems, element_size, from_node_id, to_node_id);\n    }\n\n    trash_cache(nelems);\n\n    buf = (char*) malloc(buf_size);\n    assert(buf != NULL);\n#ifdef MEMLAT_SUPPORT\n    tls_global_remote_dram = 0;\n    tls_global_local_dram = 0;\n#endif\n\n    /* chase the pointers */\n    if (nchains == 1) {\n        T1 = T();\n        sumv[0] = 0;\n        for (i = 0; 0 != element(C[0], i)->val; i = element(C[0], i)->val) {\n            sumv[0] += element(C[0], i)->val;\n            if (access_size > element_size) {\n                read_element(C[0], i, buf, buf_size);\n            }\n        }\n        T2 = T();\n    } else {\n        T1 = T();\n        for (j=0; j < nchains; j++) {\n            sumv[j] = 0;\n            nextp[j] = 0;\n        }\n        for (; 0 != element(C[0], nextp[0])->val; ) {\n            for (j=0; j < nchains; j++) {\n                sumv[j] += element(C[j], nextp[j])->val;\n                if (access_size > element_size) {\n                    read_element(C[j], nextp[j], buf, buf_size);\n                }\n                nextp[j] = element(C[j], nextp[j])->val;\n            }\n        }\n        T2 = T();\n    }\n    assert((nelems+1)*nelems/2 == sumv[0]);  /* Euler's formula */\n    uint64_t time_per_op_ns = ((T2-T1)*1000)/nelems;\n\n    DBG_LOG(INFO, \"measuring latency: latency is %lu ns\\n\", time_per_op_ns);\n\n    for (j=0; j < nchains; j++) {\n      
  free(C[j]);\n    }\n    free(buf);\n\n    return time_per_op_ns;\n}\n\nint measure_latency(cpu_model_t* cpu, int from_node_id, int to_node_id) \n{\n    size_t factor = 10; // this needs to be large enough to ensure we always miss in the LLC cache\n    size_t element_size = 64LLU;\n    size_t access_size = 8;\n    size_t nelems = factor * cpu->llc_size_bytes / element_size;\n    \n    return __measure_latency(1, 1, nelems, element_size, access_size, from_node_id, to_node_id);\n}\n\nint measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id) \n{\n    if (nelems*element_size < cpu_llc_size_bytes()) { \n        DBG_LOG(WARNING, \"warning:  #elements == %\" PRIu64 \" seems small!\\n\", nelems);\n    }\n\n    return __measure_latency(seedin, nchains, nelems, element_size, access_size, from_node_id, to_node_id);\n}\n\n#ifdef CALIBRATION_SUPPORT\n\n#define TOLERATED_DEVIATION_PERCENTAGE 5  // maximum deviation acceptable for the target latency\n#define CALIBRATION_STEP_SIZE 0.05        // max ns step size to calibrate the CPU stalls\n#define CALIBRATION_FINEST_STEP 0.01      // min (finest) ns step size to calibrate the CPU stalls\n#define MAX_TOLERATED_BAD_STEPS 2         // max number of bad steps in the calibration, before the calibration inverts the value to increment\n#define NELEMS 10000000\n#define SEED_IN 1\n#define NCHAINS 1\n#define ELEM_SIZE 64LLU\n#define ACCESS_SIZE 8\n#define FILE_CALIB_LOCAL \"/tmp/local_latency_calibration\"\n#define FILE_CALIB_REMOTE \"/tmp/remote_latency_calibration\"\n\nstatic int calibrate_load_from_file(virtual_node_t *virtual_node) {\n    FILE *fp = NULL;\n    char *file_name = NULL;\n    char *line = NULL;\n    size_t len;\n    double correction_factor;\n    int status = E_ERROR;\n\n    if (virtual_node->dram_node == virtual_node->nvram_node) {\n    \tfile_name = FILE_CALIB_LOCAL;\n    } else {\n    \tfile_name = FILE_CALIB_REMOTE;\n    }\n\n    if 
(access(file_name, R_OK | W_OK) == 0) {\n        // calibration file is available, check if the current target latency is mapped\n        if ((fp = fopen(file_name, \"r\"))) {\n            if (getline(&line, &len, fp) != -1) {\n                if (sscanf(line, \"%lf\", &correction_factor) == 1) {\n                    // set CPU stalls factor to the read value\n                    latency_model.stalls_calibration_factor = correction_factor;\n                    DBG_LOG(INFO, \"CALIBRATION: factor loaded from file (%s) (%f)\\n\",\n                            file_name, correction_factor);\n                    status = E_SUCCESS;\n                }\n            }\n\n            if (line) free(line);\n            fclose(fp);\n        }\n    }\n\n    return status;\n}\n\nstatic void calibrate_save_to_file(virtual_node_t *virtual_node, double correction_factor) {\n\tchar *file_name;\n\tFILE *fp;\n\n\tif (virtual_node->dram_node == virtual_node->nvram_node) {\n\t\tfile_name = FILE_CALIB_LOCAL;\n\t} else {\n\t\tfile_name = FILE_CALIB_REMOTE;\n\t}\n\n\t// calibration file is available, check if the current target latency is mapped\n\tif ((fp = fopen(file_name, \"a\"))) {\n\t\t// it is assumed this line is not yet present in the file\n\t\tfprintf(fp, \"%f\\n\", correction_factor);\n\t\tDBG_LOG(INFO, \"CALIBRATION: factor saved to file (%s) (%f)\\n\",\n                file_name, correction_factor);\n\t\tfclose(fp);\n\t}\n}\n\nstatic int diff_target_latencies(int measured_latency, int target_latency) {\n    int diff = target_latency - measured_latency;\n    return abs(diff);\n}\n\nstatic double calibrate(virtual_node_t *virtual_node, double step_value, int from_node, int to_node) {\n    int measured;\n    int best_diff_latency;\n    double best_factor = 0;\n    int diff;\n    int bad_step_count = 0;\n    int close_value;\n    int calib_done;\n\n    // force a change in correction factor and measure latency\n    // each step will increment the or decrement the factor\n    // at 
the end we have a calibrated correction factor for the CPU stalls\n\n    DBG_LOG(INFO, \"CALIBRATION: for nodes (dram %d, nvram %d)\\n\", from_node, to_node);\n    best_diff_latency = INT32_MAX;\n    close_value = 0;\n    calib_done = 0;\n\n    while(!calib_done) {\n        measured = measure_latency2(SEED_IN, NCHAINS, NELEMS, ELEM_SIZE, ACCESS_SIZE, from_node, to_node);\n        DBG_LOG(INFO, \"CALIBRATION: measured latency (%d)\\n\", measured);\n\n        diff = diff_target_latencies(measured, latency_model.read_latency);\n        if (diff < best_diff_latency) {\n        \t// best measured latency so far\n            bad_step_count = 0;\n            best_diff_latency = diff;\n            best_factor = latency_model.stalls_calibration_factor;\n            // check if the diff is less or equal than the configured percentage of the target latency\n            if (diff <= (latency_model.read_latency * TOLERATED_DEVIATION_PERCENTAGE / 100)) {\n                DBG_LOG(INFO, \"CALIBRATION: got a close latency value (factor %lf)\\n\", best_factor);\n                close_value = 1;\n            }\n        } else if (diff >= best_diff_latency) {\n        \t// measure latency is getting worse\n            if (close_value && bad_step_count == 0) {\n            \t// if we have a close_value, return it\n                calib_done = 1;\n            } else {\n            \t// otherwise let's give retries\n                ++bad_step_count;\n                if (bad_step_count >= MAX_TOLERATED_BAD_STEPS) {\n                    // this calibration method seem to be moving to the wrong direction\n                    // return invalid value and hopefully fall back to the second method\n                    return 0;\n                }\n            }\n        }\n\n        latency_model.stalls_calibration_factor += step_value;\n    } // while\n\n    return best_factor;\n}\n\nstatic double calibrate_with_size(virtual_node_t *virtual_node, double calib_size, int from_node, int to_node) 
{\n\tdouble best_factor;\n\n\t// first method decrements the factor with the provided step size\n    if (((best_factor = calibrate(virtual_node, (-calib_size), from_node, to_node)) == 0) ||\n            calib_size == CALIBRATION_FINEST_STEP) {\n        if (best_factor > 0.0) {\n        \t// recover last best factor\n            latency_model.stalls_calibration_factor = best_factor;\n        }\n        // second method increments the factor with the provided step size\n        // this method is always performed if the provided step size is the finest\n        best_factor = calibrate(virtual_node, calib_size, from_node, to_node);\n    }\n\n    return best_factor;\n}\n\nvoid latency_calibration(virtual_node_t *virtual_node) {\n    double best_factor;\n    int from_node = virtual_node->dram_node->node_id;\n    int to_node = virtual_node->nvram_node->node_id;\n\n    // if the calibration file exists, load the correction factor and return\n    if (calibrate_load_from_file(virtual_node) == E_SUCCESS) {\n        return;\n    }\n\n    if ((best_factor = calibrate_with_size(virtual_node, CALIBRATION_STEP_SIZE, from_node, to_node)) != 0) {\n    \tlatency_model.stalls_calibration_factor = best_factor + CALIBRATION_FINEST_STEP;\n    \tbest_factor = calibrate_with_size(virtual_node, CALIBRATION_FINEST_STEP, from_node, to_node);\n    }\n\n    if (best_factor == 0.0) {\n        best_factor = 1.0;\n    }\n\n    // set the CPU stalls calibration factor to the best-fit value\n    latency_model.stalls_calibration_factor = best_factor;\n    DBG_LOG(INFO, \"CALIBRATION: CPU stalls correction factor is %f (dram %d, nvram %d)\\n\",\n    \t\tbest_factor, from_node, to_node);\n\n    // save file for local or remote 'correction factor'\n    calibrate_save_to_file(virtual_node, best_factor);\n}\n\n#endif // CALIBRATION_SUPPORT\n"
  },
  {
    "path": "src/lib/misc.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <stddef.h>\n#include <stdlib.h>\n#include <string.h>\n#include <ctype.h>\n\n\n#include <stdio.h>\nsize_t string_to_size(char* str)\n{\n    size_t factor = 1;\n    size_t size;\n    long   val;\n    char*  endptr = 0;\n\n    val = strtoull(str, &endptr, 10);\n    while(endptr && (endptr - str) < strlen(str) && !isalpha(*endptr)) {endptr++;}\n\n    switch (endptr[0]) {\n        case 'K': case 'k':\n            factor = 1024LLU;\n            break;\n        case 'M': case 'm':\n            factor = 1024LLU*1024LLU;\n            break;\n        case 'G': case 'g':\n            factor = 1024LLU*1024LLU*1024LLU;\n            break;\n        default:\n            factor = 1;\n    }\n    size = factor * val;\n    return size;\n}\n"
  },
  {
    "path": "src/lib/misc.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __MISC_H\n#define __MISC_H\n\nsize_t string_to_size(char* str);\n\n#endif\n"
  },
  {
    "path": "src/lib/model.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __MODEL_H\n#define __MODEL_H\n\n#include \"config.h\"\n#include \"cpu/cpu.h\"\n#include \"thread.h\"\n#ifdef PAPI_SUPPORT\n#include \"cpu/pmc-papi.h\"\n#else\n#include \"cpu/pmc.h\"\n#endif\n\n#define MAX_EPOCH_DURATION_US 1000000\n#define MIN_EPOCH_DURATION_US 1\n\ntypedef struct {\n\tint enabled;\n    int read_latency;\n    int write_latency;\n    int inject_delay;\n#ifdef CALIBRATION_SUPPORT\n    int calibration;\n#endif\n#ifdef PAPI_SUPPORT\n    read_stalls_t pmc_stall_local;\n    read_stalls_t pmc_stall_remote;\n#else\n    pmc_event_t* pmc_stall_cycles;\n    pmc_event_t* pmc_remote_dram;\n    int process_local_rank;\n    int max_local_processe_ranks;\n#endif\n\n    double stalls_calibration_factor;\n} latency_model_t;\n\nextern latency_model_t latency_model;\n\ntypedef struct {\n    unsigned int throttle_reg_val[MAX_THROTTLE_VALUE]; \n    double bandwidth[MAX_THROTTLE_VALUE];\n    int npoints;\n    int enabled;\n} bw_model_t;\n\nextern bw_model_t read_bw_model;\nextern bw_model_t write_bw_model;\n\nint init_bandwidth_model(config_t* cfg, struct 
virtual_topology_s* topology);\nint init_latency_model(config_t* cfg, cpu_model_t* cpu, struct virtual_topology_s* virtual_topology);\nvoid init_thread_latency_model(thread_t *thread);\n\nvoid create_latency_epoch();\n\n#endif /* __MODEL_H */\n"
  },
  {
    "path": "src/lib/model_bw.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <stdio.h>\n#include <string.h>\n#include <inttypes.h>\n#include <math.h>\n#include \"cpu/cpu.h\"\n#include \"config.h\"\n#include \"error.h\"\n#include \"measure.h\"\n#include \"stat.h\"\n#include \"topology.h\"\n#include \"monotonic_timer.h\"\n#include \"model.h\"\n\n/**\n * \\file\n * \n * \\page latency_emulation Memory bandwidth emulation\n * \n * To emulate bandwidth, we rely on memory power throttling (supported by recent memory \n * controllers) to limit the effective bandwidth to the DRAM attached to a socket.\n * Memory power throttling is configured through the PCI configuration space. \n * We use a kernel-module to set the proper PCI registers. \n * \n * Initially, we perform a series of bandwidth measurements to find out the bandwidth \n * that corresponds to each register value. 
We incrementally try out each register value \n * starting from 0x800f until we saturate memory bandwidth.\n * \n */ \n\n\nbw_model_t read_bw_model;\nbw_model_t write_bw_model;\n\n\n#define THROTTLE_INCREMENT 15\n#define THROTTLE_INITIAL_VALUE 0x800f\n\nstatic int train_model(physical_node_t* phys_node, char model_type, bw_model_t* bw_model)\n{\n    double x[MAX_THROTTLE_VALUE];\n    double best_rate;\n    double m;\n    int    i;\n    uint16_t    throttle_reg_val;\n\n    int min_number_throttle_points = 10;\n    double stop_slope = 0.1;\n    int phys_node_id = phys_node->node_id;\n    pci_regs_t *regs = phys_node->mc_pci_regs;\n\n    // reset throttling\n    phys_node->cpu_model->get_throttle_register(regs, THROTTLE_DDR_ACT, &throttle_reg_val);\n    if (throttle_reg_val < 0x8fff)\n        phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8FFF);\n\n    DBG_LOG(INFO, \"throttle bus id %d, on physical node: %d\\n\", regs->addr[0].bus_id, phys_node_id);\n\n    // we run until our bandwidth curve flattens out which we find out using \n    // gradient (slope) analysis \n    for (i=0; i < MAX_THROTTLE_VALUE; i++) {\n        phys_node->cpu_model->get_throttle_register(regs, THROTTLE_DDR_ACT, &throttle_reg_val);\n        if (throttle_reg_val >= 0x8fff) throttle_reg_val = THROTTLE_INITIAL_VALUE;\n        else throttle_reg_val += THROTTLE_INCREMENT;\n        if (model_type == 'r') {\n            phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, throttle_reg_val);\n            best_rate = measure_read_bw(phys_node_id, phys_node_id);\n            // restore throttling register\n            //phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8fff);\n        } /*else if (model_type == 'w') {\n            phys_node->cpu_model->set_throttle_register(bus_id, THROTTLE_DDR_ACT, throttle_reg_val);\n            best_rate = measure_write_bw(phys_node_id, phys_node_id);\n            // restore throttling register\n            
phys_node->cpu_model->set_throttle_register(bus_id, THROTTLE_DDR_ACT, 0x8fff);\n        }*/\n        DBG_LOG(INFO, \"throttle reg: 0x%x, %c bandwidth: %f\\n\", throttle_reg_val, model_type, best_rate);\n        bw_model->throttle_reg_val[i] = throttle_reg_val;\n        bw_model->bandwidth[i] = best_rate;\n        x[i] = (double) throttle_reg_val; // slope calculation requires values of type double\n        if (i > min_number_throttle_points) {\n            m = slope(&x[i-min_number_throttle_points], \n                      &bw_model->bandwidth[i-min_number_throttle_points], \n                      min_number_throttle_points);\n            if (abs(m) < stop_slope) {\n                break;\n            }\n        }\n    }\n    bw_model->npoints = i;\n    return E_SUCCESS;\n}\n\nstatic int load_model(const char* path, const char* prefix, bw_model_t* bw_model)\n{\n    FILE *fp;\n    char *line = NULL;\n    char str[64];\n    size_t len = 0;\n    ssize_t read;\n    int x;\n    double y;\n    int found_points;\n\n    fp = fopen(path, \"r\");\n    if (fp == NULL) {\n        return E_ERROR;\n    }\n\n    DBG_LOG(INFO, \"Loading %s bandwidth model from %s\\n\", prefix, path);\n    for (found_points = 0; (read = getline(&line, &len, fp)) != -1; ) {\n        if (strstr(line, prefix)) {\n            sscanf(line, \"%s\\t%d\\t%lf\", str, &x, &y);\n            DBG_LOG(INFO, \"throttle reg: 0x%x, bandwidth: %f\\n\", x, y);\n            bw_model->throttle_reg_val[found_points] = x;\n            bw_model->bandwidth[found_points] = y;\n            found_points++;\n        }\n    }\n    free(line);\n    if (found_points) {\n        bw_model->npoints = found_points;\n    } else {\n        DBG_LOG(INFO, \"No %s bandwidth model found in %s\\n\", prefix, path);\n        return E_ERROR;\n    }\n    fclose(fp);\n    return E_SUCCESS;\n}\n\nstatic int save_model(const char* path, const char* prefix, bw_model_t* bw_model)\n{\n    int i;\n    FILE *fp;\n\n    fp = fopen(path, \"a\");\n    if 
(fp == NULL) {\n        return E_ERROR;\n    }\n\n    DBG_LOG(INFO, \"Saving %s bandwidth model into %s\\n\", prefix, path);\n    for (i=0; i<bw_model->npoints; i++) {\n        int x = bw_model->throttle_reg_val[i];\n        double y = bw_model->bandwidth[i];\n        //DBG_LOG(INFO, \"throttle reg: 0x%x, bandwidth: %f\\n\", x, y);\n        fprintf(fp, \"%s\\t%d\\t%f\\n\", prefix, x, y);\n    }\n    fclose(fp);\n    return E_SUCCESS;\n}\n\nstatic int find_data_point(bw_model_t* model, double target_bw, unsigned int* point)\n{\n    int i;\n    double error;\n\n    // go through all points as we are not sorted and pick the one closest\n    *point = 0;\n    error = target_bw;    \n    for (i=1; i<model->npoints; i++) {\n        if (fabs(model->bandwidth[i] - target_bw) < error) {\n            *point = i;\n            error = fabs(model->bandwidth[i] - target_bw);\n        }\n    }\n    return E_SUCCESS;\n}\n\nint __set_write_bw(physical_node_t* node, uint64_t target_bw)\n{\n    pci_regs_t *regs = node->mc_pci_regs;\n    int ret;\n    unsigned int point;\n\n    if (regs == NULL) {\n        return E_SUCCESS;\n    }\n\n    if (target_bw == (uint64_t) (-1)) {\n        node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8fff);\n        return E_SUCCESS;\n    }\n\n    if ((ret = find_data_point(&write_bw_model, (double) target_bw, &point)) != E_SUCCESS) {\n        return ret;\n    }\n    DBG_LOG(INFO, \"Setting throttle reg: %d (0x%x), target write bandwidth: %\" PRIu64 \", actual write bandwidth: %\" PRIu64 \"\\n\", write_bw_model.throttle_reg_val[point], write_bw_model.throttle_reg_val[point], target_bw, (uint64_t) write_bw_model.bandwidth[point]);\n    node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, write_bw_model.throttle_reg_val[point]);\n    \n    return E_SUCCESS;\n}\n\nint set_write_bw(config_t* cfg, physical_node_t* node)\n{\n    int target_bw;\n    __cconfig_lookup_int(cfg, \"bandwidth.write\", &target_bw);\n\n    return 
__set_write_bw(node, target_bw);\n}\n\nint __set_read_bw(physical_node_t* node, uint64_t target_bw)\n{\n    pci_regs_t *regs = node->mc_pci_regs;\n    int ret;\n    unsigned int point;\n\n    if (regs == NULL) {\n        return E_SUCCESS;\n    }\n\n    if (target_bw == (uint64_t) (-1)) {\n        node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8fff);\n        return E_SUCCESS;\n    }\n\n    if ((ret = find_data_point(&read_bw_model, (double) target_bw, &point)) != E_SUCCESS) {\n        return ret;\n    }\n    DBG_LOG(INFO, \"Setting throttle reg: %d (0x%x), target read bandwidth: %\" PRIu64 \", actual read bandwidth: %\" PRIu64 \"\\n\", read_bw_model.throttle_reg_val[point], read_bw_model.throttle_reg_val[point], target_bw, (uint64_t) read_bw_model.bandwidth[point]);\n    node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, read_bw_model.throttle_reg_val[point]);\n\n    return E_SUCCESS;\n}\n\nint set_read_bw(config_t* cfg, physical_node_t* node)\n{\n    int target_bw;\n    __cconfig_lookup_int(cfg, \"bandwidth.read\", &target_bw);\n\n    return __set_read_bw(node, target_bw);\n}\n\nint init_bandwidth_model(config_t* cfg, virtual_topology_t* topology)\n{\n    int i;\n    char* model_file;\n\n    srandom((int)monotonic_time());\n\n    if (read_bw_model.enabled) {\n        DBG_LOG(INFO, \"Initializing bandwidth model\\n\");\n        // initialize bandwidth model\n        for (i=0; i<topology->num_virtual_nodes; i++) {\n            // FIXME: currently we keep a single bandwidth model and not per-node bandwidth model\n            physical_node_t* phys_node = topology->virtual_nodes[i].nvram_node;\n            if (__cconfig_lookup_string(cfg, \"bandwidth.model\", &model_file) == CONFIG_TRUE) {\n                if (load_model(model_file, \"read\", &read_bw_model) != E_SUCCESS) {\n                    train_model(phys_node, 'r', &read_bw_model);\n                    save_model(model_file, \"read\", &read_bw_model);\n                }\n            
    /*if (load_model(model_file, \"write\", &write_bw_model) != E_SUCCESS) {\n                    train_model(phys_node, 'w', &write_bw_model);\n                    save_model(model_file, \"write\", &write_bw_model);\n                }*/\n            }\n        }\n\n        // set read and write memory bandwidth \n        for (i=0; i<topology->num_virtual_nodes; i++) {\n            physical_node_t* phys_node = topology->virtual_nodes[i].nvram_node;\n            set_read_bw(cfg, phys_node);\n            //set_write_bw(cfg, phys_node);\n        }\n    } else {\n        // reset throttle registers\n        for (i=0; i<topology->num_virtual_nodes; i++) {\n            // FIXME: currently we keep a single bandwidth model and not per-node bandwidth model\n            physical_node_t* phys_node = topology->virtual_nodes[i].dram_node;\n            __set_read_bw(phys_node, (uint64_t) (-1));\n            __set_write_bw(phys_node, (uint64_t) (-1));\n        }\n    }\n\n    return E_SUCCESS;\n}\n"
  },
  {
    "path": "src/lib/model_lat.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <string.h>\n#include \"cpu/cpu.h\"\n#include \"config.h\"\n#include \"error.h\"\n#include \"thread.h\"\n#include \"topology.h\"\n#include \"model.h\"\n#include \"monotonic_timer.h\"\n\n/**\n * \\file\n * \n * \\page latency_emulation Memory latency emulation\n * \n * To emulate latency, we construct epochs and inject software created delays \n * at the end of each epoch.\n * Epochs are created either at fixed intervals by periodically interrupting \n * threads or on demand when a synchronization method (lock, unlock) is called.\n *\n * Delays are calculated using a simple analytic model that takes input from \n * performance counters.\n */ \n\n\n\nlatency_model_t latency_model;\n\n#pragma GCC push_options\n#pragma GCC optimize (\"O0\")\ninline hrtime_t hrtime_cycles(void)\n{\n    unsigned hi, lo;\n    __asm__ __volatile__ (\"rdtscp\" : \"=a\"(lo), \"=d\"(hi));\n    return ( (hrtime_t)lo)|( ((hrtime_t)hi)<<32 );\n}\n#pragma GCC pop_options\n\n/*\nstatic inline hrtime_t ns_to_cycles(int cpu_speed_mhz, int ns)\n{\n    return (cpu_speed_mhz * ns) / 
1000;\n}\n*/\n\ninline hrtime_t cycles_to_us(int cpu_speed_mhz, hrtime_t cycles)\n{\n    return (cycles/cpu_speed_mhz);\n}\n\n#pragma GCC push_options\n#pragma GCC optimize (\"O0\")\nstatic inline void create_delay_cycles(hrtime_t cycles)\n{\n    hrtime_t start, stop;\n\n    start = hrtime_cycles();\n    do {\n        stop = hrtime_cycles();\n    } while (stop - start < cycles);\n}\n#pragma GCC pop_options\n\n/*\nstatic inline void create_delay_ns(cpu_model_t* cpu, int ns)\n{\n    hrtime_t cycles;\n    cycles = ns_to_cycles(cpu, ns);\n    create_delay_cycles(cycles);\n}\n*/\n\nstatic int check_target_latency_against_hw_latency(virtual_topology_t* virtual_topology) {\n    int status = 0;\n    int i;\n    int hw_latency_dram;\n    int hw_latency_nvram;\n\n    for (i = 0; i < virtual_topology->num_virtual_nodes; ++i) {\n        hw_latency_dram = virtual_topology->virtual_nodes[i].dram_node->latency;\n        hw_latency_nvram = virtual_topology->virtual_nodes[i].nvram_node->latency;\n        if (hw_latency_dram >= latency_model.read_latency ||\n            hw_latency_dram >= latency_model.write_latency ||\n            hw_latency_nvram >= latency_model.read_latency ||\n            hw_latency_nvram >= latency_model.write_latency) {\n            DBG_LOG(ERROR, \"Target read (%d) and write (%d) latency to be emulated must be greater than the \"\n            \t\t\"hardware latency dram (%d) and virtual nvram (%d) (virtual node %d)\\n\",\n            \t\tlatency_model.read_latency, latency_model.write_latency, hw_latency_dram, hw_latency_nvram, i);\n            status = -1;\n            break;\n        }\n    }\n\n    return status;\n}\n\nint init_latency_model(config_t* cfg, cpu_model_t* cpu, virtual_topology_t* virtual_topology)\n{\n\tint i;\n\n    DBG_LOG(INFO, \"Initializing latency model\\n\");\n\n    memset(&latency_model, 0, sizeof(latency_model_t));\n    latency_model.enabled = 1;\n\n    __cconfig_lookup_int(cfg, \"latency.read\", &latency_model.read_latency);\n    
__cconfig_lookup_int(cfg, \"latency.write\", &latency_model.write_latency);\n\n    if (check_target_latency_against_hw_latency(virtual_topology) < 0) {\n        return E_INVAL;\n    }\n\n    __cconfig_lookup_bool(cfg, \"latency.inject_delay\", &latency_model.inject_delay);\n    if (!latency_model.inject_delay) {\n        DBG_LOG(WARNING, \"Latency model is enabled, but delay injection is disabled\\n\");\n    }\n\n#ifdef PAPI_SUPPORT\n    if (pmc_init() != 0) {\n        return E_ERROR;\n    }\n\n    latency_model.pmc_stall_local = cpu->pmc_events.read_stalls_events_local;\n    latency_model.pmc_stall_remote = cpu->pmc_events.read_stalls_events_remote;\n#else\n    for (i=0; cpu->pmc_events->known_events[i].name; ++i) {\n        // LDM_STALL_CYCLES implementation for each processor is mandatory\n        if (strcasecmp(cpu->pmc_events->known_events[i].name, \"LDM_STALL_CYCLES\") == 0) {\n            if (!(latency_model.pmc_stall_cycles = enable_pmc_event(cpu, \"LDM_STALL_CYCLES\"))) {\n                return E_NOENT;\n            }\n        }\n        if (strcasecmp(cpu->pmc_events->known_events[i].name, \"REMOTE_DRAM\") == 0) {\n            if (!(latency_model.pmc_remote_dram = enable_pmc_event(cpu, \"REMOTE_DRAM\"))) {\n                return E_NOENT;\n            }\n        }\n    }\n\n    assert(latency_model.pmc_stall_cycles);\n#endif\n\n#ifdef CALIBRATION_SUPPORT\n    __cconfig_lookup_bool(cfg, \"latency.calibration\", &latency_model.calibration);\n    if (latency_model.calibration) {\n        latency_model.stalls_calibration_factor = 1.0;\n    }\n#endif\n\n    return E_SUCCESS;\n}\n\n__thread uint64_t tls_overhead = 0;\n__thread int tls_hw_local_latency = 0;\n__thread int tls_hw_remote_latency = 0;\n#ifdef MEMLAT_SUPPORT\n__thread uint64_t tls_global_remote_dram = 0;\n__thread uint64_t tls_global_local_dram = 0;\n#endif\n\nvoid init_thread_latency_model(thread_t *thread)\n{\n    tls_hw_local_latency = thread->virtual_node->dram_node->latency;\n    
tls_hw_remote_latency = thread->virtual_node->nvram_node->latency;\n}\n\nvoid create_latency_epoch()\n{\n    uint64_t stall_cycles = 0;\n    uint64_t delay_cycles = 0;\n    int hw_latency;\n    int target_latency;\n    hrtime_t start, stop;\n    double epoch_end;\n\n    start = hrtime_cycles();\n\n    // An epoch may be created by a critical section and the static epoch\n    // may interfere with the current epoch creation. Block the signal here\n    // and unblock it at the end of this function.\n    block_new_epoch();\n\n    // must always be thread_self since we call core specific data through hrtime_cycles\n    thread_t* thread = thread_self();\n\n    if (!reached_min_epoch_duration(thread)) {\n    \tif (!thread) thread = thread_self();\n    \tif (thread) thread->signaled = 0;\n    \tunblock_new_epoch();\n        return;\n    }\n\n    //DBG_LOG(INFO, \"new epoch for thread id [%i]\\n\", thread->tid);\n\n#ifdef USE_STATISTICS\n    if (thread->thread_manager->stats.enabled) {\n        thread->stats.epochs++;\n    }\n#endif\n\n    // this is the generic hardware latency for this thread (it takes into account the current virtual node latencies)\n    hw_latency = thread->virtual_node->nvram_node->latency;\n    target_latency = latency_model.read_latency;\n\n    // check if the thread_self is remote (virtual topology where dram != nvram) or local (dram == nvram)\n    // on this case, stall cycles will be a proportion of remote memory accesses\n    // TODO: the read pmc method used below must be changed to support PAPI\n    if (thread->virtual_node->dram_node != thread->virtual_node->nvram_node &&\n            latency_model.pmc_remote_dram) {\n        stall_cycles = read_pmc_event(latency_model.pmc_remote_dram);\n\t} else {\n\t\tstall_cycles = read_pmc_event(latency_model.pmc_stall_cycles);\n\t}\n\n#ifdef CALIBRATION_SUPPORT\n    if (latency_model.calibration) {\n        stall_cycles = (uint64_t)((double)stall_cycles * latency_model.stalls_calibration_factor);\n    
}\n#endif\n\n    delay_cycles = stall_cycles * ((double)(target_latency - hw_latency) / ((double) hw_latency));\n\n    stop = hrtime_cycles();\n    tls_overhead += stop - start;\n\n    DBG_LOG(DEBUG, \"overhead cycles: %lu; immediate overhead %lu; stall cycles: %lu; delay cycles: %lu\\n\", tls_overhead, stop - start, stall_cycles, delay_cycles);\n\n    if (delay_cycles > tls_overhead) {\n    \tdelay_cycles -= tls_overhead;\n        tls_overhead = 0;\n    }\n    else {\n    \ttls_overhead -= delay_cycles;\n    \tdelay_cycles = 0;\n    }\n\n#ifdef MEMLAT_SUPPORT\n    thread->stall_cycles += stall_cycles;\n#endif\n\n#ifdef USE_STATISTICS\n    if (thread->thread_manager->stats.enabled) {\n        thread->stats.stall_cycles += stall_cycles;\n        thread->stats.delay_cycles += delay_cycles;\n        thread->stats.overhead_cycles = tls_overhead;\n    }\n#endif\n\n    epoch_end = monotonic_time_us();\n\n    DBG_LOG(DEBUG, \"injecting delay of %lu cycles (%lu usec) - discounted overhead\\n\", delay_cycles,\n                    cycles_to_us(thread->cpu_speed_mhz, delay_cycles));\n    if (delay_cycles && latency_model.inject_delay) {\n        create_delay_cycles(delay_cycles);\n    }\n\n#ifdef USE_STATISTICS\n    if (thread->thread_manager->stats.enabled) {\n    \tuint64_t older_epoch_timestamp = thread->stats.last_epoch_timestamp;\n    \tuint64_t diff_epoch_timestamp = epoch_end - older_epoch_timestamp;\n\n    \tif (diff_epoch_timestamp < thread->stats.shortest_epoch_duration_us) {\n    \t    thread->stats.shortest_epoch_duration_us = diff_epoch_timestamp;\n    \t}\n\n    \tif (diff_epoch_timestamp > thread->stats.longest_epoch_duration_us) {\n\t\t    thread->stats.longest_epoch_duration_us = diff_epoch_timestamp;\n    \t}\n\n    \tthread->stats.overall_epoch_duration_us += diff_epoch_timestamp;\n    \tthread->stats.last_epoch_timestamp = monotonic_time_us();\n    } else {\n    \t// last epoch timestamp must always be updated\n        thread->stats.last_epoch_timestamp = 
monotonic_time_us();\n    }\n#else\n    thread->last_epoch_timestamp = monotonic_time_us();\n#endif\n    // this must be the last step, since this function is called also from the signal handler\n    // and the monitor thread sets this flag, we must make sure race conditions are prevented\n    thread->signaled = 0;\n\n    unblock_new_epoch();\n}\n"
  },
  {
    "path": "src/lib/monotonic_timer.c",
    "content": "// Copyright 2013 Alex Reece.\n//\n// A cross platform monotonic timer.\n\n#include <unistd.h>\n#include \"monotonic_timer.h\"\n\n#if _POSIX_TIMERS > 0 && defined(_POSIX_MONOTONIC_CLOCK)\n  // If we have it, use clock_gettime and CLOCK_MONOTONIC.\n\n  #include <time.h>\n\n  double monotonic_time() {\n    struct timespec time;\n    // Note: Make sure to link with -lrt to define clock_gettime.\n    clock_gettime(CLOCK_MONOTONIC, &time);\n    return ((double) time.tv_sec) + ((double) time.tv_nsec / (NANOS_PER_SECF));\n  }\n\n  double monotonic_time_us() {\n\t  struct timespec time;\n\t  // Note: Make sure to link with -lrt to define clock_gettime.\n\t  clock_gettime(CLOCK_MONOTONIC, &time);\n\t  return ((double) (time.tv_sec * USECS_PER_SEC)) + ((double) time.tv_nsec / NANOS_PER_USECF);\n  }\n\n#else\n  // Fall back to rdtsc. The reason we don't use clock() is this scary message\n  // from the man page:\n  //     \"On several other implementations, the value returned by clock() also\n  //      includes the times of any children whose status has been collected via\n  //      wait(2) (or another wait-type call).\"\n  //\n  // Also, clock() only has microsecond accuracy.\n  //\n  // This whitepaper offered excellent advice on how to use rdtscp for\n  // profiling: http://download.intel.com/embedded/software/IA/324264.pdf\n  //\n  // Unfortunately, we can't follow its advice exactly with our semantics,\n  // so we're just going to use rdtscp with cpuid.\n  //\n  // Note that rdtscp will only be available on new processors.\n\n  #include <stdint.h>\n\n  static inline uint64_t rdtsc() {\n    uint32_t hi, lo;\n    asm volatile(\"rdtscp\\n\"\n                 \"movl %%edx, %0\\n\"\n                 \"movl %%eax, %1\\n\"\n                 \"cpuid\"\n                 : \"=r\" (hi), \"=r\" (lo) : : \"%rax\", \"%rbx\", \"%rcx\", \"%rdx\");\n    return (((uint64_t)hi) << 32) | (uint64_t)lo;\n  }\n\n  static uint64_t rdtsc_per_sec = 0;\n  static uint64_t 
rdtsc_per_usec = 0;\n  static void __attribute__((constructor)) init_rdtsc_per_sec() {\n    uint64_t before, after;\n\n    before = rdtsc();\n    usleep(USECS_PER_SEC);\n    after = rdtsc();\n\n    rdtsc_per_sec = after - before;\n\n    before = rdtsc();\n    usleep(1);\n    after = rdtsc();\n\n    rdtsc_per_usec = after - before;\n  }\n\n  double monotonic_time() {\n    return (double) rdtsc() / (double) rdtsc_per_sec;\n  }\n\n  // TODO: not tested, it is core specific and callers must be aware\n  double monotonic_time_us() {\n    return ((double) rdtsc() / (double) rdtsc_per_usec);\n  }\n\n#endif\n"
  },
  {
    "path": "src/lib/monotonic_timer.h",
    "content": "// Copyright 2013 Alex Reece.\n//\n// A cross platform monotonic timer.\n\n#ifndef MONOTONIC_TIMER_H_\n#define MONOTONIC_TIMER_H_\n\n#define NANOS_PER_SECF 1000000000.0\n#define NANOS_PER_USECF 1000.0\n#define NANOS_PER_USEC 1000\n#define USECS_PER_SEC 1000000\n\n// Returns seconds since some unspecified start time (guaranteed to be\n// monotonically increasing).\ndouble monotonic_time();\ndouble monotonic_time_us();\n\n#endif  // MONOTONIC_TIMER_H_\n"
  },
  {
    "path": "src/lib/pflush.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include \"pflush.h\"\n\n#include <stdint.h>\n\ntypedef uint64_t hrtime_t;\n\n#if defined(__i386__)\n\nstatic inline unsigned long long asm_rdtsc(void)\n{\n    unsigned long long int x;\n    __asm__ volatile (\".byte 0x0f, 0x31\" : \"=A\" (x));\n    return x;\n}\n\nstatic inline unsigned long long asm_rdtscp(void)\n{\n        unsigned hi, lo;\n    __asm__ __volatile__ (\"rdtscp\" : \"=a\"(lo), \"=d\"(hi)::\"ecx\");\n    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );\n\n}\n#elif defined(__x86_64__)\n\nstatic inline unsigned long long asm_rdtsc(void)\n{\n    unsigned hi, lo;\n    __asm__ __volatile__ (\"rdtsc\" : \"=a\"(lo), \"=d\"(hi));\n    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );\n}\n\nstatic inline unsigned long long asm_rdtscp(void)\n{\n    unsigned hi, lo;\n    __asm__ __volatile__ (\"rdtscp\" : \"=a\"(lo), \"=d\"(hi)::\"rcx\");\n    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );\n}\n#else\n#error \"What architecture is this???\"\n#endif\n\n/* Flush cacheline */\n#define asm_clflush(addr)      
             \\\n({                              \\\n    __asm__ __volatile__ (\"clflush %0\" : : \"m\"(*addr)); \\\n})\n\n/* Memory fence */\n#define asm_mfence()                \\\n({                      \\\n    PM_FENCE();             \\\n    __asm__ __volatile__ (\"mfence\");    \\\n})\n\nstatic int global_cpu_speed_mhz = 0;\nstatic int global_write_latency_ns = 0;\n\nvoid init_pflush(int cpu_speed_mhz, int write_latency_ns)\n{\n    global_cpu_speed_mhz = cpu_speed_mhz;\n    global_write_latency_ns = write_latency_ns;\n}\n\ninline hrtime_t cycles_to_ns(int cpu_speed_mhz, hrtime_t cycles)\n{\n    return (cycles*1000/cpu_speed_mhz);\n}\n\ninline hrtime_t ns_to_cycles(int cpu_speed_mhz, hrtime_t ns)\n{\n    return (ns*cpu_speed_mhz/1000);\n}\n\nstatic inline\nvoid\nemulate_latency_ns(int ns)\n{\n    hrtime_t cycles;\n    hrtime_t start;\n    hrtime_t stop;\n    \n    start = asm_rdtsc();\n    cycles = ns_to_cycles(global_cpu_speed_mhz, ns);\n\n    do { \n        /* RDTSC doesn't necessarily wait for previous instructions to complete \n         * so a serializing instruction is usually used to ensure previous \n         * instructions have completed. However, in our case this is a desirable\n         * property since we want to overlap the latency we emulate with the\n         * actual latency of the emulated instruction. \n         */\n        stop = asm_rdtsc();\n    } while (stop - start < cycles);\n}\n\nvoid\npflush(uint64_t *addr)\n{\n    if (global_write_latency_ns == 0) {\n        return;\n    }\n\n    /* Measure the latency of a clflush and add an additional delay to\n     * meet the latency to write to NVM */\n    hrtime_t start;\n    hrtime_t stop;\n    start = asm_rdtscp();\n    asm_clflush(addr);  \n    stop = asm_rdtscp();\n    int to_insert_ns = global_write_latency_ns - cycles_to_ns(global_cpu_speed_mhz, stop-start);\n    if (to_insert_ns <= 0) {\n        return;\n    }\n    emulate_latency_ns(to_insert_ns);\n}\n"
  },
  {
    "path": "src/lib/pflush.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __PFLUSH_H\n#define __PFLUSH_H\n\n/**\n * \\file\n * \n * \\page pflush_api Persistent Memory API \n *\n * Method to be used by client to inject a write latency.\n */\n\n#include <stdint.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nvoid init_pflush(int cpu_speed_mhz, int write_latency_ns);\n\n/**\n * \\brief Flush the cacheline containing address addr.\n */\nvoid pflush(uint64_t *addr);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* __PFLUSH_H */\n"
  },
  {
    "path": "src/lib/pmalloc.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <numa.h>\n#include \"topology.h\"\n#include \"pmalloc.h\"\n#include \"thread.h\"\n#include \"debug.h\"\n\n// pmalloc should be implemented as a separate library\n\n// FIXME: pmalloc currently uses numa_alloc_onnode() which is slower than regular malloc.\n// Consider layering another malloc on top of a emulated nvram \n\n\nvoid* pmalloc(size_t size)\n{\n    thread_t* thread = thread_self();\n\n    if (thread == NULL) {\n    \t// FIXME: JVM for instance create threads using a mechanism not traced by this emulator\n    \t//        for now we make sure the current thread is registered right when it makes the\n    \t//        first explicit NVM allocation. 
A better solution is to trace the thread creation\n    \t//        done by JVM.\n        register_self();\n        thread = thread_self();\n    }\n\n    if (thread) {\n        return numa_alloc_onnode(size, thread->virtual_node->nvram_node->node_id);\n    } else {\n    \tDBG_LOG(ERROR, \"pmalloc called with NULL thread\\n\");\n    }\n    \n    return NULL;\n}\n\nvoid *prealloc(void *old_addr, size_t old_size, size_t new_size)\n{\n    return numa_realloc(old_addr, old_size, new_size);\n}\n\nvoid pfree(void* start, size_t size)\n{\n    numa_free(start, size);\n}\n"
  },
  {
    "path": "src/lib/pmalloc.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __PMALLOC_H\n#define __PMALLOC_H\n\n/**\n * \\file\n * \n * \\page pmalloc_api Persistent Memory API \n *\n * Methods to be used by clients to allocate and free emulated NVRAM.\n */\n\n#include <stddef.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nvoid *pmalloc(size_t size);\nvoid *prealloc(void *old_addr, size_t old_size, size_t new_size);\nvoid pfree(void *start, size_t size);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* __PMALLOC_H */\n"
  },
  {
    "path": "src/lib/process_rank.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n/*\n * process_rank.c\n *\n *  Created on: Jun 16, 2015\n *      Author: root\n */\n\n\n#include <unistd.h>\n#include \"model.h\"\n#include \"error.h\"\n\n#define EMUL_LOCAL_PROCESSES_VAR \"EMUL_LOCAL_PROCESSES\"\n\n#define EMUL_LOCK_FILE \"/tmp/emul_lock_file\"\n#define EMUL_PROCESS_LOCAL_RANK_FILE \"/tmp/emul_process_local_rank\"\n#define LOCKED_WAIT_US 1000\n#define MAX_LOCKED_RETRIES 50\n\nextern latency_model_t latency_model;\n\nint set_process_local_rank()\n{\n    FILE *flock = NULL;\n    FILE *fcounter = NULL;\n    int expired = 0;\n    int process_id = 0;\n    char *processes;\n    int ret = E_SUCCESS;\n#ifndef NDEBUG\n    char hname[64];\n#endif\n\n    processes = getenv(EMUL_LOCAL_PROCESSES_VAR);\n\n    if (!processes) {\n    \tDBG_LOG(WARNING, \"No %s variable set, skipping rank setting\\n\", EMUL_LOCAL_PROCESSES_VAR);\n    \treturn E_SUCCESS;\n    } else {\n    \tif (sscanf(processes, \"%d\", &latency_model.max_local_processe_ranks) != 1) {\n    \t\tDBG_LOG(WARNING, \"Ignoring EMUL_PROCESSES_PER_SYSTEM variable with invalid value '%s'\\n\", 
processes);\n    \t\treturn E_SUCCESS;\n    \t}\n    }\n\n    if (latency_model.max_local_processe_ranks < 2) {\n    \tDBG_LOG(WARNING, \"EMUL_PROCESSES_PER_SYSTEM value is %d, skipping rank setting\\n\",\n    \t\t\tlatency_model.max_local_processe_ranks);\n    \treturn E_SUCCESS;\n    }\n\n    DBG_LOG(DEBUG, \"setting process local rank for %d local processes\\n\",\n    \t\tlatency_model.max_local_processe_ranks);\n\n    while (expired < MAX_LOCKED_RETRIES) {\n    \t// open lock file on exclusive mode\n        flock = fopen(EMUL_LOCK_FILE, \"wx\");\n\n        if (flock == NULL) {\n//        \tDBG_LOG(DEBUG, \"failed to create lock file\\n\");\n            usleep(LOCKED_WAIT_US);\n            expired++;\n        }\n        if (flock) break;\n    }\n    if (expired >= MAX_LOCKED_RETRIES) {\n    \tDBG_LOG(ERROR, \"failed to set process local rank\\n\");\n    \treturn E_ERROR;\n    }\n\n    // lock acquired, read process counter file\n    if (access(EMUL_PROCESS_LOCAL_RANK_FILE, R_OK | W_OK) < 0) {\n    \t// rank file does not exist, create it and write \"1\" for next process\n    \t// this process rank id is 1\n    \tprocess_id = 1;\n    \tfcounter = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, \"w\");\n    \tfwrite(&process_id, sizeof(int), 1, fcounter);\n    \tfclose(fcounter);\n    } else {\n    \t// rank file exists, read the current rank max value and use it as this process local\n    \t// rank id and increment the value in the rank file for the next process\n    \tfcounter = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, \"r+\");\n    \tif (fread(&process_id, sizeof(int), 1, fcounter) == 0) {\n    \t    abort();\n    \t}\n    \tDBG_LOG(DEBUG, \"read from file current max rank %d\\n\", process_id);\n    \tlatency_model.process_local_rank = process_id;\n    \tprocess_id++;\n    \tif (process_id >= latency_model.max_local_processe_ranks) {\n    \t    DBG_LOG(ERROR, \"process rank %d exceeded limit of %d max emulated processes\\n\",\n    \t        process_id, 
latency_model.max_local_processe_ranks);\n    \t    fclose(fcounter);\n    \t    ret = E_ERROR;\n    \t} else {\n    \t    DBG_LOG(DEBUG, \"write to file new max rank %d\\n\", process_id);\n    \t    rewind(fcounter);\n            fwrite(&process_id, sizeof(int), 1, fcounter);\n            fclose(fcounter);\n        }\n    }\n\n    // close and delete lock file\n    fclose(flock);\n    remove(EMUL_LOCK_FILE);\n\n#ifndef NDEBUG\n    gethostname(hname, sizeof(hname));\n    DBG_LOG(DEBUG, \"process local rank is %d on system %s\\n\", latency_model.process_local_rank, hname);\n#endif\n\n    return ret;\n}\n\nint unset_process_local_rank()\n{\n    FILE *flock = NULL;\n    FILE *fcounter = NULL;\n    int expired = 0;\n    int process_id;\n\n    if (latency_model.max_local_processe_ranks < 2) {\n    \treturn E_SUCCESS;\n    }\n\n    DBG_LOG(DEBUG, \"Unsetting process local rank\\n\");\n\n    while (expired < MAX_LOCKED_RETRIES) {\n    \t// open lock file on Exclusive mode\n        flock = fopen(EMUL_LOCK_FILE, \"wx\");\n\n        if (flock == NULL) {\n//        \tDBG_LOG(DEBUG, \"failed to create lock file\\n\");\n            usleep(LOCKED_WAIT_US);\n            expired++;\n        }\n        if (flock) break;\n    }\n    if (expired >= MAX_LOCKED_RETRIES) {\n    \tDBG_LOG(ERROR, \"failed to unset process local rank\\n\");\n    \treturn E_ERROR;\n    }\n\n    // lock acquired, read process counter file\n    if (access(EMUL_PROCESS_LOCAL_RANK_FILE, R_OK | W_OK) == 0) {\n    \t// if rank file does not exist, nothing to be done\n    \t// file exists, read the current value and decrement it\n    \tfcounter = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, \"r+\");\n    \tif (fread(&process_id, sizeof(int), 1, fcounter) == 0) {\n    \t    abort();\n    \t}\n    \tDBG_LOG(DEBUG, \"Exiting process and reading current rank max %d\\n\", process_id);\n    \tif (process_id > 0) process_id--;\n    \t{\n    \tchar hname[64];\n    \tgethostname(hname, sizeof(hname));\n    \tDBG_LOG(DEBUG, 
\"Exiting process and writing new rank max %d on %s\\n\", process_id, hname);\n    \t}\n    \trewind(fcounter);\n\t\tfwrite(&process_id, sizeof(int), 1, fcounter);\n\t\tfclose(fcounter);\n    }\n\n    // close and delete lock file\n    fclose(flock);\n    remove(EMUL_LOCK_FILE);\n\n    return E_SUCCESS;\n}\n"
  },
  {
    "path": "src/lib/stat.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <stdio.h>\n#include <math.h>\n#include <sys/types.h>\n#include <unistd.h>\n\n#include \"utlist.h\"\n#include \"stat.h\"\n#include \"thread.h\"\n#include \"interpose.h\"\n#include \"model.h\"\n\nthread_manager_t* get_thread_manager();\nhrtime_t cycles_to_us(int cpu_speed_mhz, hrtime_t cycles);\n\n#ifdef USE_STATISTICS\nvoid stats_set_init_time(double init_time_us) {\n\tthread_manager_t* thread_manager = get_thread_manager();\n\n\t__lib_pthread_mutex_lock(&thread_manager->mutex);\n\tthread_manager->stats.init_time_us = init_time_us;\n\t__lib_pthread_mutex_unlock(&thread_manager->mutex);\n}\n\nvoid stats_enable(config_t *cfg) {\n\tthread_manager_t* thread_manager = get_thread_manager();\n\n    __cconfig_lookup_bool(cfg, \"statistics.enable\", &thread_manager->stats.enabled);\n    if (__cconfig_lookup_string(cfg, \"statistics.file\", &thread_manager->stats.output_file) == CONFIG_FALSE) {\n    \t__lib_pthread_mutex_lock(&thread_manager->mutex);\n    \tthread_manager->stats.output_file = NULL;\n    \t__lib_pthread_mutex_unlock(&thread_manager->mutex);\n    
}\n}\n\nstatic char *get_current_time() {\n    time_t curtime;\n    char *str_time;\n\n    time(&curtime);\n    str_time = ctime(&curtime);\n    str_time[strlen(str_time) - 1] = 0;\n\n    return str_time;\n}\n\nstatic inline hrtime_t ns_to_cycles(int cpu_speed_mhz, int ns)\n{\n    return (cpu_speed_mhz * ns) / 1000;\n}\n\nextern __thread int tls_hw_local_latency;\nextern __thread int tls_hw_remote_latency;\n\nstatic void show_thread_stats(thread_t *thread, FILE *out_file) {\n    uint64_t fixed_value;\n    uint64_t cycles;\n\n    fprintf(out_file, \"\\tThread id [%d]\\n\", thread->tid);\n    fprintf(out_file, \"\\t\\t: cpu id: %d\\n\", thread->cpu_id);\n    fprintf(out_file, \"\\t\\t: spawn timestamp: %lu\\n\", thread->stats.register_timestamp);\n    fprintf(out_file, \"\\t\\t: termination timestamp: %lu\\n\", thread->stats.unregister_timestamp);\n    fixed_value = thread->stats.unregister_timestamp > 0 ? (thread->stats.unregister_timestamp - thread->stats.register_timestamp) : 0;\n    fprintf(out_file, \"\\t\\t: execution time: %lu usecs\\n\", fixed_value);\n    fprintf(out_file, \"\\t\\t: stall cycles: %lu\\n\", thread->stats.stall_cycles);\n\n    if (thread->virtual_node->dram_node != thread->virtual_node->nvram_node &&\n                latency_model.pmc_remote_dram) {\n        cycles = ns_to_cycles(thread->cpu_speed_mhz, tls_hw_remote_latency);\n        fixed_value = cycles ? thread->stats.stall_cycles / cycles : 0;\n    }\n    else {\n        cycles = ns_to_cycles(thread->cpu_speed_mhz, tls_hw_local_latency);\n        fixed_value = cycles ? 
thread->stats.stall_cycles / cycles : 0;\n    }\n    fprintf(out_file, \"\\t\\t: NVM accesses: %lu\\n\", fixed_value);\n\n\n    fprintf(out_file, \"\\t\\t: latency calculation overhead cycles: %lu\\n\", thread->stats.overhead_cycles);\n    fprintf(out_file, \"\\t\\t: injected delay cycles: %lu\\n\", thread->stats.delay_cycles);\n    if (thread->cpu_speed_mhz) {\n        fprintf(out_file, \"\\t\\t: injected delay in usec: %lu\\n\", cycles_to_us(thread->cpu_speed_mhz, thread->stats.delay_cycles));\n    }\n    fprintf(out_file, \"\\t\\t: longest epoch duration: %lu usec\\n\", thread->stats.longest_epoch_duration_us);\n    fixed_value = (thread->stats.shortest_epoch_duration_us == UINT64_MAX) ? 0 : thread->stats.shortest_epoch_duration_us;\n    fprintf(out_file, \"\\t\\t: shortest epoch duration: %lu usec\\n\", fixed_value);\n    fixed_value = thread->stats.epochs ? (thread->stats.overall_epoch_duration_us / thread->stats.epochs) :\n    \t\tthread->stats.overall_epoch_duration_us;\n    fprintf(out_file, \"\\t\\t: average epoch duration: %lu usec\\n\", fixed_value);\n    fprintf(out_file, \"\\t\\t: number of epochs: %lu\\n\", thread->stats.epochs);\n    fprintf(out_file, \"\\t\\t: epochs which didn't reach min duration: %lu\\n\", thread->stats.min_epoch_not_reached);\n    fprintf(out_file, \"\\t\\t: static epochs requested: %lu\\n\", thread->stats.signals_sent);\n}\n\nvoid stats_report() {\n    thread_t *thread;\n    FILE *out_file;\n    uint64_t running_threads = 0;\n    thread_manager_t* thread_manager = get_thread_manager();\n    uint64_t terminated_threads;\n\n    if (!thread_manager) return;\n    if (!thread_manager->stats.enabled) return;\n\n    if (thread_manager->stats.output_file) {\n        out_file = fopen(thread_manager->stats.output_file, \"a\");\n        if (!out_file) {\n            fprintf(stderr, \"Failed to open statistics file for writing: %s\\n\", thread_manager->stats.output_file);\n            return;\n        }\n    } else {\n        out_file = 
stdout;\n    }\n\n    __lib_pthread_mutex_lock(&thread_manager->mutex);\n    LL_FOREACH(thread_manager->thread_list, thread) {\n        running_threads++;\n    }\n    __lib_pthread_mutex_unlock(&thread_manager->mutex);\n\n    fprintf(out_file, \"\\n\\n===== STATISTICS (%s) =====\\n\\n\", get_current_time());\n    if (!latency_model.inject_delay) {\n    \tfprintf(out_file, \"WARNING: delay injection is disabled\\n\");\n    }\n    fprintf(out_file, \"PID: %d\\n\", getpid());\n    fprintf(out_file, \"Initialization duration: %lu usec\\n\", thread_manager->stats.init_time_us);\n    fprintf(out_file, \"Running threads: %lu\\n\", running_threads);\n    terminated_threads = thread_manager->stats.n_threads > 0 ? (thread_manager->stats.n_threads - running_threads) : 0;\n    fprintf(out_file, \"Terminated threads: %lu\\n\", terminated_threads);\n    fprintf(out_file, \"\\n\");\n\n    fprintf(out_file, \"== Running threads == \\n\");\n\n    __lib_pthread_mutex_lock(&thread_manager->mutex);\n    LL_FOREACH(thread_manager->thread_list, thread) {\n    \tshow_thread_stats(thread, out_file);\n    }\n    __lib_pthread_mutex_unlock(&thread_manager->mutex);\n\n    fprintf(out_file, \"\\n== Terminated threads == \\n\");\n\n    __lib_pthread_mutex_lock(&thread_manager->mutex);\n    LL_FOREACH(thread_manager->stats.thread_list, thread) {\n    \tshow_thread_stats(thread, out_file);\n    }\n    __lib_pthread_mutex_unlock(&thread_manager->mutex);\n\n    if (out_file != stdout) {\n        fclose(out_file);\n    }\n}\n#endif\n\ndouble sum(double array[], int n)\n{\n    int i;\n    double s = 0;\n\n    for (i=0; i<n; i++) {\n        s += array[i];\n    }\n    return s;\n}\n\n// returns sum of x . 
y\ndouble sumxy(double x[], double y[], int n)\n{\n    int i;\n    double s = 0;\n\n    for (i=0; i<n; i++) {\n        s += x[i] * y[i];\n    }\n    return s;\n}\n\n\ndouble avg(double array[], int n)\n{\n    double s;\n\n    s = sum(array, n);\n    return s/n;\n}\n\ndouble slope(double x[], double y[], int n)\n{\n    double sumxy_;\n    double sumx2;\n    double sumx;\n    double sumy;\n    double m; \n\n    sumxy_ = sumxy(x, y, n);\n    sumx2 = sumxy(x, x, n);\n    sumx = sum(x, n);\n    sumy = sum(y, n);\n\n    m = (n * sumxy_ - sumx * sumy) / \n        (n * sumx2 - sumx*sumx);\n    return m;\n}\n"
  },
  {
    "path": "src/lib/stat.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __STATISTICS_H\n#define __STATISTICS_H\n\n//#include <sys/types.h>\n#include <stdint.h>\n#include \"config.h\"\n\n#ifdef USE_STATISTICS\nstruct thread_s;\n\ntypedef struct {\n    int enabled;\n    struct thread_s* thread_list;\n    uint64_t n_threads;\n    uint64_t init_time_us;\n    char *output_file;\n} stats_t;\n\ntypedef struct {\n    uint64_t stall_cycles;\n    uint64_t overhead_cycles;\n    uint64_t delay_cycles;\n    uint64_t signals_sent;\n    uint64_t epochs;\n    double last_epoch_timestamp;\n    uint64_t shortest_epoch_duration_us;\n    uint64_t longest_epoch_duration_us;\n    uint64_t overall_epoch_duration_us;\n    uint64_t min_epoch_not_reached;\n    uint64_t register_timestamp;\n    uint64_t unregister_timestamp;\n} thread_stats_t;\n\nvoid stats_enable(config_t *cfg);\nvoid stats_set_init_time(double init_time_us);\nvoid stats_report();\n#endif\n\ndouble sum(double array[], int n);\ndouble sumxy(double x[], double y[], int n);\ndouble avg(double array[], int n);\ndouble slope(double x[], double y[], int n);\n\n#endif /* __STATISTICS_H */\n"
  },
  {
    "path": "src/lib/thread.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <sys/syscall.h>\n#include <unistd.h>\n#include <pthread.h>\n#include <signal.h>\n#include <stdlib.h>\n#include \"cpu/cpu.h\"\n#include \"utlist.h\"\n#include \"error.h\"\n#include \"interpose.h\"\n#include \"model.h\"\n#include \"thread.h\"\n#include \"topology.h\"\n#include \"monotonic_timer.h\"\n\nstatic thread_manager_t* thread_manager = NULL;\n__thread thread_t* tls_thread = NULL;\n\nextern inline hrtime_t hrtime_cycles(void);\n\n// assign a virtual/physical node using a round-robin policy\nstatic void rr_next_cpu_id(thread_manager_t* thread_manager, int* next_virtual_node_idp, int* next_cpu_idp)\n{\n    int next_virtual_node_id;\n    virtual_node_t* virtual_node;\n    physical_node_t* physical_node;\n    virtual_topology_t* virtual_topology = thread_manager->virtual_topology;\n\n    *next_virtual_node_idp = thread_manager->next_virtual_node_id;\n    *next_cpu_idp = thread_manager->next_cpu_id;\n\n    // advance to the next virtual node and cpu id\n    next_virtual_node_id = thread_manager->next_virtual_node_id;\n    virtual_node = 
&virtual_topology->virtual_nodes[next_virtual_node_id];\n    physical_node = virtual_node->dram_node; // we run threads on the dram node\n    if ((thread_manager->next_cpu_id = next_cpu(physical_node->cpu_bitmask, thread_manager->next_cpu_id + 1)) < 0) {\n        next_virtual_node_id = (next_virtual_node_id + 1) % virtual_topology->num_virtual_nodes;\n        virtual_node = &virtual_topology->virtual_nodes[next_virtual_node_id];\n        physical_node = virtual_node->dram_node;\n        thread_manager->next_cpu_id = first_cpu(physical_node->cpu_bitmask);\n        thread_manager->next_virtual_node_id = next_virtual_node_id;\n    } \n}\n\nvoid rr_set_next_cpu_based_on_rank(int rank, int max_rank)\n{\n    int cpu_id;\n    int virtual_node_id;\n    int i;\n\n    // set the next CPU id based on this process rank id\n    thread_manager->next_virtual_node_id = 0;\n    thread_manager->next_cpu_id = 0;\n    for (i = 0; i <= rank; ++i) {\n        rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id);\n    }\n\n    DBG_LOG(DEBUG, \"no partitioning of CPUs, set next CPU \"\n                   \"to vnode %d and cpu %d\\n\", virtual_node_id, cpu_id);\n}\n\nvoid partition_cpus_based_on_rank(int rank, int max_rank, int num_cpus,\n                                  virtual_topology_t* virtual_topology)\n{\n    // assumes the number of cpus/2 is greater than or equal to max_rank\n    // the size of each partition is num_cpus/max_rank\n    int part_size = num_cpus/max_rank;\n    int start = rank * part_size;\n    int end = start + part_size -1;\n    int i;\n    int cpu_id = 0;\n    int virtual_node_id = 0;\n    virtual_node_t* virtual_node;\n    physical_node_t* physical_node;\n\n    DBG_LOG(DEBUG, \"partitioning CPUS, this process has CPUs from %d and %d\\n\",\n            start, end);\n\n    thread_manager->next_virtual_node_id = 0;\n    thread_manager->next_cpu_id = 0;\n    for (i = 0; i < num_cpus; ++i) {\n        rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id);\n        if (i < 
start || i > end) {\n            // this CPU is outside the partition of this process\n            // disable this CPU\n            virtual_node = &virtual_topology->virtual_nodes[virtual_node_id];\n            physical_node = virtual_node->dram_node;\n\n            DBG_LOG(DEBUG, \"disabling CPU %d\\n\", cpu_id);\n\n            if (numa_bitmask_isbitset(physical_node->cpu_bitmask, cpu_id)) {\n                numa_bitmask_clearbit(physical_node->cpu_bitmask, cpu_id);\n            }\n        }\n    }\n}\n\nint bind_thread_on_cpu(thread_manager_t* thread_manager, thread_t* thread, int virtual_node_id, int cpu_id)\n{\n    thread->virtual_node = &thread_manager->virtual_topology->virtual_nodes[virtual_node_id];\n    DBG_LOG(INFO, \"Binding thread tid [%d] pthread: 0x%lx on processor %d\\n\", thread->tid, thread->pthread, cpu_id);\n    struct bitmask* cpubind = numa_allocate_cpumask();\n    numa_bitmask_setbit(cpubind, cpu_id);\n    if (numa_sched_setaffinity(thread->tid, cpubind) != 0) {\n        DBG_LOG(ERROR, \"Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\\n\", thread->tid, thread->pthread, cpu_id);\n        numa_bitmask_free(cpubind);\n        return E_ERROR;\n    }\n    numa_bitmask_free(cpubind);\n    return E_SUCCESS;\n}\n\nint bind_thread_on_mem(thread_manager_t* thread_manager, thread_t* thread, int virtual_node_id, int cpu_id)\n{\n    int physical_node_id;\n    struct bitmask* membind = numa_allocate_nodemask();\n    physical_node_id = thread_manager->virtual_topology->virtual_nodes[virtual_node_id].dram_node->node_id;\n    numa_bitmask_setbit(membind, physical_node_id);\n    numa_set_membind(membind);\n    numa_free_nodemask(membind);\n\n    return E_SUCCESS;\n}\n\nthread_t* thread_self()\n{\n    return tls_thread;\n}\n\nvoid thread_interrupt_handler(int signum)\n{\n    DBG_LOG(DEBUG, \"Handling interrupt thread [%d] pthread: 0x%lx\\n\", thread_self()->tid, thread_self()->pthread);\n\n    create_latency_epoch();\n}\n\n#ifdef PAPI_SUPPORT\nstatic 
int setup_events_thread_self(thread_t *thread, const char **native_events) {\n    int i;\n\n    // create event set for this thread\n    if (pmc_create_event_set_local_thread() != 0) {\n       return -1;\n    }\n\n    // register events for this thread\n    for (i = 0; i < MAX_NUM_EVENTS; ++i) {\n   \t    if (native_events[i]) {\n            DBG_LOG(INFO, \"registering event %s, thread id [%d]\\n\", native_events[i], thread->tid);\n            if (pmc_register_event_local_thread(native_events[i]) != 0) {\n                return E_ERROR;\n            }\n        }\n    }\n\n    // start event counting for this thread\n    if (pmc_events_start_local_thread() != 0) {\n    \treturn E_ERROR;\n    }\n\n    pmc_register_thread();\n\n    return 0;\n}\n#endif\n\nint register_thread(thread_manager_t* thread_manager, pthread_t pthread, pid_t tid)\n{\n    int ret = 0;\n    int cpu_id;\n    int virtual_node_id;\n    thread_t* thread = malloc(sizeof(thread_t));\n\n    if (thread_manager == NULL) {\n        // this is possible if both BW and latency modeling are enabled and the BW model is not yet created.\n        // the BW modeling will spawn threads which will attempt to register with the thread manager if the\n        // latency modeling is enabled. However the thread manager is instantiated later.\n        //goto error;\n        return E_SUCCESS;\n    }\n\n    memset(thread, 0, sizeof(thread_t));\n\n    thread->pthread = pthread;\n    thread->tid = tid;\n    thread->thread_manager = thread_manager;\n\n#ifdef USE_STATISTICS\n    if (thread_manager->stats.enabled) {\n        thread->stats.last_epoch_timestamp = monotonic_time_us();\n        thread->stats.shortest_epoch_duration_us = UINT64_MAX;\n    }\n#endif\n\n\t/* install thread interrupt handler as the signal handler for SIGUSR1. 
*/\n    struct sigaction sa;\n    memset (&sa, 0, sizeof(sa));\n    sa.sa_handler = &thread_interrupt_handler;\n    sa.sa_flags = SA_RESTART;\n    sigaction (SIGUSR1, &sa, NULL);\n\n    // bind the thread on a cpu and memory node and\n    // link the thread to the list of threads\n    assert(__lib_pthread_mutex_lock);\n    __lib_pthread_mutex_lock(&thread_manager->mutex);\n    rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id);\n    if ((ret = bind_thread_on_cpu(thread_manager, thread, virtual_node_id, cpu_id)) != E_SUCCESS) {\n    \t__lib_pthread_mutex_unlock(&thread_manager->mutex);\n    \tDBG_LOG(ERROR, \"thread id [%d] failed to bind to CPU\\n\", thread->tid);\n        goto error;\n    }\n    if ((ret = bind_thread_on_mem(thread_manager, thread, virtual_node_id, cpu_id)) != E_SUCCESS) {\n    \t__lib_pthread_mutex_unlock(&thread_manager->mutex);\n    \tDBG_LOG(ERROR, \"thread id [%d] failed to bind to Memory\\n\", thread->tid);\n        goto error;\n    }\n    thread->cpu_id = cpu_id;\n    thread->cpu_speed_mhz = cpu_speed_mhz();\n#ifdef PAPI_SUPPORT\n    cpu_model_t *cpu = thread_manager->virtual_topology->virtual_nodes[virtual_node_id].dram_node->cpu_model;\n    if (setup_events_thread_self(thread, cpu->pmc_events.native_events) != 0) {\n        ret = E_ERROR;\n        __lib_pthread_mutex_unlock(&thread_manager->mutex);\n        goto error;\n    }\n#endif\n    LL_APPEND(thread_manager->thread_list, thread);\n#ifdef USE_STATISTICS\n    if (thread_manager->stats.enabled) {\n        thread_manager->stats.n_threads++;\n        thread->stats.register_timestamp = monotonic_time_us();\n    }\n#endif\n    __lib_pthread_mutex_unlock(&thread_manager->mutex);\n\n    init_thread_latency_model(thread);\n\n    tls_thread = thread;\n\n    return E_SUCCESS;\n\nerror:\n    free(thread);\n    DBG_LOG(ERROR, \"thread id [%d] failed to register with Monitor Thread\\n\", thread->tid);\n    return ret;\n}\n\n\nint unregister_thread(thread_manager_t* thread_manager, thread_t 
* thread)\n{\n    __lib_pthread_mutex_lock(&thread_manager->mutex);\n\n    if (thread_manager == NULL) {\n        return E_SUCCESS;\n    }\n\n    LL_DELETE(thread_manager->thread_list, thread);\n\n#ifdef USE_STATISTICS\n    if (thread_manager->stats.enabled) {\n        thread->stats.unregister_timestamp = monotonic_time_us();\n        LL_APPEND(thread_manager->stats.thread_list, thread);\n    }\n#endif\n\n    __lib_pthread_mutex_unlock(&thread_manager->mutex);\n\n#ifdef PAPI_SUPPORT\n    pmc_events_stop_local_thread();\n    pmc_destroy_event_set_local_thread();\n    pmc_unregister_thread();\n#endif\n\n    return E_SUCCESS;\n}\n\n\nint register_self()\n{\n\tint ret = E_SUCCESS;\n\n    if (thread_self() == NULL) {\n    \tpid_t tid = (pid_t) syscall(SYS_gettid);\n    \tDBG_LOG(INFO, \"Registering thread tid [%d]\\n\", tid);\n        ret = register_thread(thread_manager, pthread_self(), tid);\n    }\n\n    return ret;\n}\n\nint unregister_self()\n{\n\tif (tls_thread) {\n\t    unregister_thread(thread_manager, tls_thread);\n\n#ifdef USE_STATISTICS\n\t    if (!thread_manager->stats.enabled) {\n\t\t    // statistics makes use of the thread descriptor\n            free(tls_thread);\n\t    }\n#else\n\t    free(tls_thread);\n#endif\n        tls_thread = NULL;\n\t}\n\n    return E_SUCCESS;\n}\n\nstatic int reached_max_epoch_duration(thread_t* thread);\nvoid interrupt_threads(thread_manager_t* manager)\n{\n    thread_t* thread;\n\n    assert(__lib_pthread_mutex_lock);\n    __lib_pthread_mutex_lock(&manager->mutex);\n    LL_FOREACH(manager->thread_list, thread)\n    {\n    \tassert(thread);\n        if (thread->signaled == 0 && reached_max_epoch_duration(thread)) {\n            DBG_LOG(DEBUG, \"interrupting thread [%d]\\n\", thread->tid);\n#ifdef USE_STATISTICS\n            if (manager->stats.enabled) {\n                thread->stats.signals_sent++;\n            }\n#endif\n            // this flag must be set before the signal is sent to make sure\n            // there will be 
no race condition\n            thread->signaled = 1;\n            pthread_kill(thread->pthread, SIGUSR1);\n        }\n    }\n    assert(__lib_pthread_mutex_unlock);\n    __lib_pthread_mutex_unlock(&manager->mutex);\n}\n\nvoid* monitor_thread(void* arg)\n{\n    thread_manager_t* manager = (thread_manager_t*) arg;\n    struct timespec epoch_duration;\n//    time_t secs = thread_manager->max_epoch_duration_us / USECS_PER_SEC;\n//    long nanosecs = (thread_manager->max_epoch_duration_us % USECS_PER_SEC) * NANOS_PER_USEC;\n\n    epoch_duration.tv_sec = 0;\n    epoch_duration.tv_nsec = MIN_EPOCH_DURATION_US * 1000;\n    while(1) {\n        nanosleep(&epoch_duration, NULL);\n        interrupt_threads(manager);\n    }\n    return NULL;\n}\n\nstatic void set_epoch_duration(config_t* cfg, const char *config_str, int *epoch_us, int default_epoch_us) {\n    if (__cconfig_lookup_int(cfg, config_str, epoch_us) != CONFIG_TRUE) {\n    \t*epoch_us = default_epoch_us;\n    } else {\n        if (*epoch_us > MAX_EPOCH_DURATION_US ||\n                *epoch_us < MIN_EPOCH_DURATION_US) {\n            DBG_LOG(WARNING, \"%s is out of supported bounds [%i, %i], setting it to %i\\n\",\n            \t\tconfig_str,\n            \t\tMIN_EPOCH_DURATION_US,\n            \t\tMAX_EPOCH_DURATION_US,\n\t\t\t\t\tdefault_epoch_us);\n            *epoch_us = default_epoch_us;\n        }\n    }\n}\n\nint init_thread_manager(config_t* cfg, virtual_topology_t* virtual_topology)\n{\n    int ret;\n    pthread_t monitor_tid;\n    thread_manager_t* mgr;\n    virtual_node_t* virtual_node;\n    physical_node_t* physical_node;\n\n    if (!(mgr = malloc(sizeof(thread_manager_t)))) {\n        ret = E_ERROR;\n        goto done;    \n    }\n\n    memset(mgr, 0, sizeof(thread_manager_t));\n\n    mgr->thread_list = NULL;\n    mgr->virtual_topology = virtual_topology;\n    mgr->next_virtual_node_id = 0;\n\n    set_epoch_duration(cfg, \"latency.max_epoch_duration_us\", &mgr->max_epoch_duration_us, 
MAX_EPOCH_DURATION_US);\n    set_epoch_duration(cfg, \"latency.min_epoch_duration_us\", &mgr->min_epoch_duration_us, MIN_EPOCH_DURATION_US);\n\n    if (mgr->min_epoch_duration_us > mgr->max_epoch_duration_us) {\n        DBG_LOG(WARNING, \"latency.min_epoch_duration_us is greater than latency.max_epoch_duration_us, setting it to %i\\n\",\n                MIN_EPOCH_DURATION_US);\n        mgr->min_epoch_duration_us = MIN_EPOCH_DURATION_US;\n    }\n\n    virtual_node = &virtual_topology->virtual_nodes[mgr->next_virtual_node_id];\n    physical_node = virtual_node->dram_node;\n    mgr->next_cpu_id = first_cpu(physical_node->cpu_bitmask);\n    pthread_mutex_init(&mgr->mutex, NULL);\n\n    // fire a monitoring thread that periodically interrupts threads\n    assert(__lib_pthread_create);\n    assert(__lib_pthread_detach);\n    __lib_pthread_create(&monitor_tid, NULL, monitor_thread, (void*) mgr);\n    __lib_pthread_detach(monitor_tid);\n\n    thread_manager = mgr;\n    return E_SUCCESS;\n\ndone:\n    return ret;\n}\n\nint reached_min_epoch_duration(thread_t* thread) {\n\tdouble current_time;\n\tuint64_t diff_us;\n\tint result = 0;\n\n    if (thread == NULL) {\n    \t// FIXME: the JVM, for instance, creates threads using a mechanism not traced by this emulator;\n    \t//        for now we make sure the current thread is registered right when it makes the\n    \t//        first explicit NVM allocation or when interposed functions are called. 
A\n    \t//        better solution is to trace the thread creation done by JVM.\n        if (register_self() != E_SUCCESS)\n        \t// if the thread could not be registered, exit this function\n        \treturn 0;\n        thread = thread_self();\n    }\n\n\tcurrent_time = monotonic_time_us();\n\n#ifdef USE_STATISTICS\n    diff_us = (uint64_t) (current_time - thread->stats.last_epoch_timestamp);\n#else\n    diff_us = (uint64_t) (current_time - thread->last_epoch_timestamp);\n#endif\n\n    DBG_LOG(DEBUG, \"thread id [%d] last epoch was %lu usec ago\\n\", thread->tid, diff_us);\n\n    if(diff_us >= thread_manager->min_epoch_duration_us) {\n    \tDBG_LOG(DEBUG, \"thread id [%d] reached min epoch duration (%i usec)\\n\", thread->tid,\n    \t\t\tthread_manager->min_epoch_duration_us);\n        result = 1;\n    }\n#ifdef USE_STATISTICS\n    if (thread_manager->stats.enabled && ! result) {\n    \tthread->stats.min_epoch_not_reached++;\n    }\n#endif\n    return result;\n}\n\nstatic int reached_max_epoch_duration(thread_t* thread) {\n\tdouble current_time;\n\tuint64_t diff_us;\n\tint result = 0;\n\n\t// it compares this time with the last_epoch_timestamp, which is set by another thread\n\t// so, this time must be based on a system time and not on CPU cycles/time registers\n\tcurrent_time = monotonic_time_us();\n\n#ifdef USE_STATISTICS\n    diff_us = (uint64_t) (current_time - thread->stats.last_epoch_timestamp);\n#else\n    diff_us = (uint64_t) (current_time - thread->last_epoch_timestamp);\n#endif\n\n    DBG_LOG(DEBUG, \"thread id [%d] last epoch was %lu usec ago\\n\", thread->tid, diff_us);\n\n    if(diff_us >= thread_manager->max_epoch_duration_us) {\n    \tDBG_LOG(DEBUG, \"thread id [%d] reached max epoch duration (%i usec)\\n\", thread->tid,\n    \t\t\tthread_manager->max_epoch_duration_us);\n        result = 1;\n    }\n\n    return result;\n}\n\nvoid block_new_epoch() {\n    sigset_t set;\n    sigemptyset(&set);\n    sigaddset(&set, SIGUSR1);\n    
pthread_sigmask(SIG_BLOCK, &set, NULL);\n}\n\nvoid unblock_new_epoch() {\n    sigset_t set;\n    sigemptyset(&set);\n    sigaddset(&set, SIGUSR1);\n    pthread_sigmask(SIG_UNBLOCK, &set, NULL);\n}\n\nthread_manager_t* get_thread_manager() {\n\treturn thread_manager;\n}\n"
  },
  {
    "path": "src/lib/thread.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __THREAD_H\n#define __THREAD_H\n\n#include <sys/types.h>\n#include <stdint.h>\n#include <numa.h>\n#include <pthread.h>\n#include <libconfig.h>\n#include \"topology.h\"\n#include \"cpu/cpu.h\"\n#include \"stat.h\"\n\n\nstruct thread_manager_s; // opaque\n\ntypedef uint64_t hrtime_t;\n\n// TODO: Used by memlat benchmark, should be disabled on a release version\n#define MEMLAT_SUPPORT\n\ntypedef struct thread_s {\n    struct virtual_node_s* virtual_node;\n    pthread_t pthread;\n    pid_t tid;\n    int cpu_id; // the processor the thread is bound on\n    int cpu_speed_mhz;\n    struct thread_manager_s* thread_manager;\n    struct thread_s* next;\n    int signaled;\n#ifdef MEMLAT_SUPPORT\n\tuint64_t stall_cycles;\n#endif\n#ifdef USE_STATISTICS\n    thread_stats_t stats;\n#else\n    double last_epoch_timestamp;\n#endif\n} thread_t;\n\ntypedef struct thread_manager_s {\n    pthread_mutex_t mutex;\n    thread_t* thread_list;\n    int max_epoch_duration_us; // maximum epoch duration in microseconds\n    int min_epoch_duration_us; // minimum epoch duration in 
microseconds\n    int next_virtual_node_id; // used by the round-robin policy -- next virtual node to run on \n    int next_cpu_id; // used by the round-robin policy -- next cpu to run on\n    struct virtual_topology_s* virtual_topology;   \n#ifdef USE_STATISTICS\n    stats_t stats;\n#endif\n} thread_manager_t; \n\nint init_thread_manager(config_t* cfg, struct virtual_topology_s* virtual_topology);\nint register_self();\nint unregister_self();\nthread_t* thread_self();\nint reached_min_epoch_duration(thread_t* thread);\nvoid block_new_epoch();\nvoid unblock_new_epoch();\n\n#endif /* __THREAD_H */\n"
  },
  {
    "path": "src/lib/topology.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n/**\n *  \\file\n * \n *  Constructs a virtual topology\n */\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <unistd.h>\n#include <fcntl.h>\n#include <limits.h>\n#include <numa.h>\n#include \"cpu/cpu.h\"\n#include \"error.h\"\n#include \"measure.h\"\n#include \"topology.h\"\n#include \"model.h\"\n\n#define MAX_NUM_MC_PCI_BUS 16\n\nextern latency_model_t latency_model;\n\nvoid rr_set_next_cpu_based_on_rank(int rank, int max_rank);\nvoid partition_cpus_based_on_rank(int rank, int max_rank, int num_cpus,\n                                  virtual_topology_t* virtual_topology);\n\nint select_cpus_based_on_local_rank(virtual_topology_t* virtual_topology)\n{\n    int num_cpus = 0;\n    int vnode;\n    virtual_node_t* virtual_node;\n    physical_node_t* physical_node;\n    int n_procs = latency_model.max_local_processe_ranks;\n    int rank = latency_model.process_local_rank;\n\n    if (rank >= n_procs) {\n        DBG_LOG(ERROR, \"process rank %d exceeded limit of %d max emulated processes\\n\",\n                       rank, n_procs);\n        
return E_ERROR;\n    }\n\n    for (vnode = 0; vnode < virtual_topology->num_virtual_nodes; ++vnode) {\n        virtual_node = &virtual_topology->virtual_nodes[vnode];\n        physical_node = virtual_node->dram_node;\n        num_cpus += physical_node->num_cpus;\n    }\n\n    DBG_LOG(DEBUG, \"number of cpus is %d\\n\", num_cpus);\n\n    if (n_procs > (num_cpus/2)) {\n        // do not partition CPUs, but bind this process to the CPU\n        // indicated by our rank, after that, a new thread will be\n        // bound to next available CPU on a round robin policy from\n        // the max rank\n        rr_set_next_cpu_based_on_rank(rank, n_procs);\n    } else {\n        // partition the CPUs to each rank\n        // some CPUs may end up idle/without bound processes, if n_procs is not\n        // a multiple of 2\n        // TODO: warn or avoid idle CPUs\n        partition_cpus_based_on_rank(rank, n_procs, num_cpus, virtual_topology);\n    }\n\n    return E_SUCCESS;\n}\n\n/** \n *  \\brief Returns a list of memory-controller pci buses\n */\nint get_mc_pci_bus_list(pci_regs_t *bus_id_list[], int max_list_size, int* dev_countp)\n{\n    FILE* fp;\n    char  buf[2048];\n    int   bus_id, dev_id, funct;\n    int   last_bus_id = -1;\n    int   channel = 0;\n    char  dontcare[512];\n    int   dev_count = 0;\n\n    fp = popen(\"lspci\", \"r\");\n    if (fp == NULL) {\n        return E_ERROR;\n    }\n\n    for (dev_count=0; fgets(buf, sizeof(buf)-1, fp) != NULL; ) {\n        if (strstr(buf, \"Thermal Control\")) {\n            if (sscanf(buf, \"%x:%x.%x %s\", &bus_id, &dev_id, &funct, dontcare) == 4) {\n                if (bus_id != last_bus_id) {\n                    ++dev_count;\n                    last_bus_id = bus_id;\n\n                    if (dev_count > max_list_size) {\n                        pclose(fp);\n                        return E_ERROR;\n                    }\n                    channel = 0;\n                    bus_id_list[dev_count-1] = 
(pci_regs_t*)malloc(sizeof(pci_regs_t));\n                }\n\n                bus_id_list[dev_count-1]->addr[channel].bus_id = bus_id;\n                bus_id_list[dev_count-1]->addr[channel].dev_id = dev_id;\n                bus_id_list[dev_count-1]->addr[channel].funct = funct;\n                ++channel;\n                bus_id_list[dev_count-1]->channels = channel;\n            }\n        }\n    }\n    *dev_countp = dev_count;\n    pclose(fp);\n\n    return E_SUCCESS;\n}\n\n\n/**\n *  \\brief Discovers the physical memory-controller pci bus topology of the \n *  machine, which includes the socket each memory controller is attached to\n * \n *  To discover where a memory controller is connected to, we throttle the rest of \n *  the memory controllers and measure local bandwidth of each node. The unthrottled \n *  memory controller is attached to the node with the highest local bandwidth\n */\nint discover_mc_pci_topology(cpu_model_t* cpu_model, physical_node_t* physical_nodes[], int num_physical_nodes)\n{\n    pci_regs_t *regs_addr[16];\n    int dev_count;\n    physical_node_t* local_node = NULL;\n    int b, i;\n    double max_local_rbw;\n    double rbw;\n    int count = 0;\n    uint16_t throttle_reg_val;\n\n    get_mc_pci_bus_list(regs_addr, MAX_NUM_MC_PCI_BUS, &dev_count);\n\n    if (dev_count < num_physical_nodes) {\n        // TODO: application is terminated on error only if in DEBUG mode\n        DBG_LOG(WARNING, \"The number of physical nodes is greater than the number of memory-controller pci buses.\\n\");\n    }\n\n    for (b=0; b<dev_count; b++) {\n        // throttle all other buses except the one we are currently trying \n        // to figure out where it is attached\n        for (i=0; i<dev_count; i++) {\n            if (i == b) {\n                cpu_model->get_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, &throttle_reg_val);\n                if (throttle_reg_val < 0x8fff)\n                    cpu_model->set_throttle_register(regs_addr[i], 
THROTTLE_DDR_ACT, 0x8fff);\n            } else {\n                cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x800f);\n            }\n        }\n        // measure local bandwidth of each node\n        max_local_rbw = 0;\n        for (i=0; i<num_physical_nodes; i++) {\n            physical_node_t* node_i = physical_nodes[i];\n            rbw = measure_read_bw(node_i->node_id, node_i->node_id);\n            if (rbw > max_local_rbw) {\n                max_local_rbw = rbw;\n                local_node = node_i;\n            }\n        }\n        if (local_node) {\n            DBG_LOG(DEBUG, \"setting node_id %d to bus %X\\n\", local_node->node_id, regs_addr[b]->addr[0].bus_id);\n            local_node->mc_pci_regs = regs_addr[b];\n            if (++count == num_physical_nodes) break;\n        }\n    }\n\n    for (i=0; i<dev_count; i++) {\n        cpu_model->get_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, &throttle_reg_val);\n        if (throttle_reg_val < 0x8fff)\n            cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x8fff);\n    }\n\n    return E_SUCCESS;\n}\n\n/** \n * \\brief Loads the memory controller pci topology from a file\n */\nstatic int load_mc_pci_topology(const char* path, physical_node_t* physical_nodes[], int num_physical_nodes)\n{\n    FILE *fp;\n    char *line = NULL;\n    size_t len = 0;\n    ssize_t read;\n    int j;\n    int bus_id, dev_id, funct;\n    int node_id;\n    int dev_count;\n    pci_regs_t *regs = NULL;\n    int channel = 0;\n    int last_bus_id = -1;\n\n    fp = fopen(path, \"r\");\n    if (fp == NULL) {\n        return E_ERROR;\n    }\n\n    DBG_LOG(INFO, \"Loading memory-controller pci topology from %s\\n\", path);\n    for (dev_count = 0; (read = getline(&line, &len, fp)) != -1; ) {\n        sscanf(line, \"%d\\t%x:%x.%x\", &node_id, &bus_id, &dev_id, &funct);\n        DBG_LOG(INFO, \"node: %d, pci addr: %x:%x.%x\\n\", node_id, bus_id, dev_id, funct);\n        if (bus_id != 
last_bus_id) {\n            last_bus_id = bus_id;\n            regs = (pci_regs_t*) malloc(sizeof(pci_regs_t));\n            channel = 0;\n            dev_count++;\n\n            for (j=0; j<num_physical_nodes; j++) {\n                if (node_id == physical_nodes[j]->node_id) {\n                    physical_nodes[j]->mc_pci_regs = regs;\n                    DBG_LOG(INFO, \"node: %d, pci bus: 0x%x\\n\", physical_nodes[j]->node_id, bus_id);\n                }\n            }\n        }\n\n        regs->addr[channel].bus_id = bus_id; \n        regs->addr[channel].dev_id = dev_id; \n        regs->addr[channel].funct = funct;\n        ++channel;\n        regs->channels = channel;\n    }\n    free(line);\n    if (dev_count < num_physical_nodes) {\n        DBG_LOG(WARNING, \"No complete memory-controller pci topology found in %s\\n\", path);\n    }\n    fclose(fp);\n    return E_SUCCESS;\n}\n\n\n/** \n * \\brief Saves the memory controller pci topology in a file for later reuse\n */\nstatic int save_mc_pci_topology(const char* path, physical_node_t* physical_nodes[], int num_physical_nodes)\n{\n    int i, j;\n    FILE *fp;\n\n    fp = fopen(path, \"w\");\n    if (fp == NULL) {\n        return E_ERROR;\n    }\n\n    DBG_LOG(INFO, \"Saving memory-controller pci topology into %s\\n\", path);\n    for (i=0; i<num_physical_nodes; i++) {\n        pci_regs_t *regs = physical_nodes[i]->mc_pci_regs;\n        int node_id = physical_nodes[i]->node_id;\n        for (j=0; regs != NULL && j < regs->channels; ++j) {\n            DBG_LOG(INFO, \"node: %d, pci addr: %x:%x.%x\\n\", node_id, regs->addr[j].bus_id, regs->addr[j].dev_id, regs->addr[j].funct);\n            fprintf(fp, \"%d\\t%x:%x.%x\\n\", node_id, regs->addr[j].bus_id, regs->addr[j].dev_id, regs->addr[j].funct);\n        }\n    }\n    fclose(fp);\n    return E_SUCCESS;\n}\n\nint num_cpus(struct bitmask* bitmask) \n{\n    int i,n;\n    // if we had knowledge of the bitmask structure then we could\n    // count the bits faster 
but bitmask seems to be an opaque structure\n    for (i=0, n=0; i<numa_num_configured_cpus(); i++) {\n        if (numa_bitmask_isbitset(bitmask, i)) {\n            n++;\n        }\n    }\n    return n;\n}\n\n// number of cpus in the system\nint system_num_cpus()\n{\n    return sysconf( _SC_NPROCESSORS_ONLN );\n}\n\nvoid print_bitmask(struct bitmask* bitmask) {\n    int i;\n    for (i=0; i<numa_num_configured_cpus(); i++) {\n        if (numa_bitmask_isbitset(bitmask, i)) {\n            DBG_LOG(INFO, \"bit %d\\n\", i);\n        }\n    }\n    return;\n}\n\nint next_cpu(struct bitmask* bitmask, int cpu_id)\n{\n    int i;\n    // if we had knowledge of the bitmask structure then we could\n    // count the bits faster but bitmask seems to be an opaque structure\n    for (i=cpu_id; i<numa_num_configured_cpus(); i++) {\n        if (numa_bitmask_isbitset(bitmask, i)) {\n            return i;\n        }\n    }\n    return -1;\n}\n\nint first_cpu(struct bitmask* bitmask)\n{\n    return next_cpu( bitmask, 0);\n}\n\nint partition_cpus(virtual_topology_t* virtual_topology)\n{\n    int ret = E_SUCCESS;\n    // if there are more than one emulated process, then partition the available CPUs\n    // among all processes, based on the current local rank\n    if (latency_model.max_local_processe_ranks > 1) {\n        ret = select_cpus_based_on_local_rank(virtual_topology);\n    }\n\n    return ret;\n}\n\n/**\n * \\brief Construct a virtual topology\n *\n * Constructs a NUMA virtual topology where two physical sockets are fused into a \n * single virtual node\n */\nint init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp)\n{\n    char* mc_pci_file;\n    char* str;\n    char* saveptr = NULL;\n    char* token = \"NULL\";\n    int* physical_node_ids;\n    physical_node_t** physical_nodes = NULL;\n    int num_physical_nodes;\n    int n, v, i, j, sibling_idx;\n    int node_id;\n    physical_node_t* node_i, *node_j, *sibling_node;\n    int ret;\n  
  int min_distance;\n    int hyperthreading;\n    struct bitmask* mem_nodes;\n    virtual_topology_t* virtual_topology;\n\n    if (__cconfig_lookup_string(cfg, \"topology.physical_nodes\", &str) == CONFIG_FALSE) {\n        return E_ERROR;\n    }\n\n    DBG_LOG(DEBUG, \"Possible NUMA nodes are %d\\n\", numa_num_possible_nodes());\n    DBG_LOG(DEBUG, \"NUMA nodes allowed are %lu\\n\", numa_get_mems_allowed()->size);\n    DBG_LOG(DEBUG, \"NUMA configured CPUs are %d\\n\", numa_num_configured_cpus());\n\n    // parse the physical nodes string\n    physical_node_ids = calloc(numa_num_possible_nodes(), sizeof(*physical_node_ids));\n    num_physical_nodes = 0;\n\n    while ((token = strtok_r(str, \",\", &saveptr))) {\n        physical_node_ids[num_physical_nodes] = atoi(token);\n        str = NULL;\n        if (++num_physical_nodes > numa_num_possible_nodes()) {\n            // we re being asked to run on more nodes than available\n            free(physical_node_ids);\n            ret = E_ERROR;\n            goto done;\n        }\n    }\n    if (!(physical_nodes = calloc(num_physical_nodes, sizeof(*physical_nodes)))) {\n        DBG_LOG(ERROR, \"Failed physical nodes allocation\\n\");\n        abort();\n    }\n\n    // select those nodes we can run on (e.g. 
not constrained by any numactl)\n    mem_nodes = numa_get_mems_allowed();\n    for (i=0, n=0; i<num_physical_nodes; i++) {\n        node_id = physical_node_ids[i];\n        if (numa_bitmask_isbitset(mem_nodes, node_id)) {\n            physical_nodes[n] = malloc(sizeof(**physical_nodes));\n            memset(physical_nodes[n], 0, sizeof(**physical_nodes));\n            physical_nodes[n]->node_id = node_id;\n            physical_nodes[n]->cpu_bitmask = numa_allocate_cpumask();\n            physical_nodes[n]->cpu_model = cpu_model;\n            numa_node_to_cpus(node_id, physical_nodes[n]->cpu_bitmask);\n            __cconfig_lookup_bool(cfg, \"topology.hyperthreading\", &hyperthreading);\n            if (hyperthreading) {\n                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask);\n            } else {\n                DBG_LOG(INFO, \"Not using hyperthreading.\\n\");\n                // disable the upper half of the processors in the bitmask\n                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask) / 2;\n                int fc = first_cpu(physical_nodes[n]->cpu_bitmask);\n                for (j=fc+system_num_cpus()/2; j<fc+system_num_cpus()/2+physical_nodes[n]->num_cpus; j++) {\n                    if (numa_bitmask_isbitset(physical_nodes[n]->cpu_bitmask, j)) {\n                        numa_bitmask_clearbit(physical_nodes[n]->cpu_bitmask, j);\n                    }\n                }\n            }\n            DBG_LOG(INFO, \"%d CPUs on physical node %d\\n\", physical_nodes[n]->num_cpus, n);\n            n++;\n        }\n    }\n    free(physical_node_ids);\n    num_physical_nodes = n;\n\n    // If pci bus topology of each physical node is not provided then discover it.\n    // The bus topology must be always known even if BW model is disabled.\n    if (__cconfig_lookup_string(cfg, \"topology.mc_pci\", &mc_pci_file) == CONFIG_FALSE ||\n          (__cconfig_lookup_string(cfg, \"topology.mc_pci\", &mc_pci_file) 
== CONFIG_TRUE &&\n          load_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes) != E_SUCCESS))\n    {\n        discover_mc_pci_topology(cpu_model, physical_nodes, num_physical_nodes);\n        save_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes);\n        DBG_LOG(INFO, \"Topology MC PCI file saved, restart the process\\n\");\n        exit(0);\n    }\n\n    // form virtual nodes by grouping physical nodes that are close to each other\n    virtual_topology = malloc(sizeof(*virtual_topology));\n    virtual_topology->num_virtual_nodes = num_physical_nodes / 2 + num_physical_nodes % 2;\n    virtual_topology->virtual_nodes = calloc(virtual_topology->num_virtual_nodes, \n                                             sizeof(*(virtual_topology->virtual_nodes)));\n\n    DBG_LOG(INFO, \"Number of physical nodes %d\\n\", num_physical_nodes);\n    DBG_LOG(INFO, \"Number of virtual nodes %d\\n\", virtual_topology->num_virtual_nodes);\n\n    for (i=0, v=0; i<num_physical_nodes; i++) {\n        min_distance = INT_MAX;\n        sibling_node = NULL;\n        sibling_idx = -1;\n        if ((node_i = physical_nodes[i]) == NULL) {\n            continue;\n        }\n\n        for (j=i+1; j<num_physical_nodes; j++) {\n            if ((node_j = physical_nodes[j]) == NULL) {\n                continue;\n            }\n            // TODO: numa_distance() returns '0' on error\n            if (numa_distance(node_i->node_id,node_j->node_id) < min_distance) {\n                sibling_node = node_j;\n                sibling_idx = j;\n            }\n        }\n\n        if (sibling_node) {\n            physical_nodes[i] = physical_nodes[sibling_idx] = NULL;\n            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];\n            virtual_node->dram_node = node_i;\n            virtual_node->nvram_node = sibling_node;\n            virtual_node->dram_node->latency = measure_latency(cpu_model,\n                                                     
          virtual_node->dram_node->node_id,\n                                                               virtual_node->dram_node->node_id);\n            virtual_node->nvram_node->latency = measure_latency(cpu_model,\n                                                                virtual_node->dram_node->node_id,\n                                                                virtual_node->nvram_node->node_id);\n            virtual_node->node_id = v;\n            DBG_LOG(INFO, \"Fusing physical nodes %d %d into virtual node %d\\n\", \n                    node_i->node_id, sibling_node->node_id, virtual_node->node_id);\n            v++;\n        }\n    }\n\n    // any physical node that is not paired with another physical node is \n    // formed into a virtual node on its own\n    if (2*v < num_physical_nodes) {\n        for (i=0; i<num_physical_nodes; i++) {\n            node_i = physical_nodes[i];\n            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];\n            virtual_node->dram_node = virtual_node->nvram_node = node_i;\n            virtual_node->node_id = v;\n            virtual_node->dram_node->latency = measure_latency(cpu_model,\n                                                               virtual_node->dram_node->node_id,\n                                                               virtual_node->dram_node->node_id);\n            DBG_LOG(WARNING, \"Forming physical node %d into virtual node %d without a sibling node.\\n\",\n                    node_i->node_id, virtual_node->node_id);\n        }\n    }\n\n    *virtual_topologyp = virtual_topology;\n    ret = E_SUCCESS;\n\ndone:\n    free(physical_nodes);\n    return ret;\n}\n"
  },
  {
    "path": "src/lib/topology.h",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#ifndef __TOPOLOGY_H\n#define __TOPOLOGY_H\n\n#include <numa.h>\n#include \"config.h\"\n#include \"cpu/cpu.h\"\n#include \"dev.h\"\n\n/* DOXYGEN Documentation : */\n\n/**\n    \\page virtual_topology Virtual topology\n \n    The emulator constructs a topology of virtual nodes out of physical nodes\n    (i.e., NUMA sockets) that represents the arrangement of processors, DRAM, \n    and NVRAM of the virtual machine that the emulator emulates. \n\n    Currently, the emulator supports a NUMA virtual topology where essentially\n    two physical sockets are fused into a single virtual node. Each virtual \n    node comprises the processors from one socket only (active socket), and \n    DRAM from both two physical sockets. 
The DRAM attached to the active socket\n    is used as the virtual node's locally attached DRAM and the DRAM of the other \n    socket (passive) is used as the virtual node's locally attached NVRAM.\n    This topology allows us to emulate a machine that has both DRAM and NVRAM but\n    reduces the computation capacity of the machine to half.\n    \n    In the future we would like to support a topology that matches the shared NVRAM\n    storage of The Machine.\n\n */\n \n\n\ntypedef struct {\n    int node_id;\n    cpu_model_t* cpu_model;\n    pci_regs_t  *mc_pci_regs;\n    int num_cpus; // number of node's cpus\n    struct bitmask* cpu_bitmask; // a bitmask of the node's CPUs \n\n    // this is actual physical latency. the latency number though depends on \n    // whether the node corresponds to a dram node or a nvram node. \n    // if dram then latency is the measured local latency to dram.\n    // if nvram then latency is the measured remote latency to the sibling nvram node\n    int latency; \n} physical_node_t;\n\ntypedef struct virtual_node_s {\n    int node_id;\n    physical_node_t* dram_node;\n    physical_node_t* nvram_node;\n    //cpu_model_t* cpu_model;\n} virtual_node_t;\n\ntypedef struct virtual_topology_s {\n    virtual_node_t* virtual_nodes; // pointer to an array of virtual nodes\n    int num_virtual_nodes;\n} virtual_topology_t;\n\nint init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp);\nint system_num_cpus();\nint first_cpu(struct bitmask* bitmask);\nint next_cpu(struct bitmask* bitmask, int cpu_id);\n\n#endif /* __TOPOLOGY_H */\n"
  },
  {
    "path": "test/CMakeLists.txt",
    "content": "include_directories(${CMAKE_SOURCE_DIR}/third_party/gtest-1.7.0/include)\ninclude_directories(${CMAKE_SOURCE_DIR}/src/lib)\n\nadd_definitions(-g)\nadd_definitions(-Wall)\n#add_definitions(-DNDEBUG)\n\nadd_executable(test_interpose ${CMAKE_CURRENT_SOURCE_DIR}/test_interpose.cc)\ntarget_link_libraries(test_interpose pthread gtest)\n\nadd_executable(test_dev ${CMAKE_CURRENT_SOURCE_DIR}/test_dev.cc)\ntarget_link_libraries(test_dev pthread nvmemul)\n\nadd_executable(test_thread ${CMAKE_CURRENT_SOURCE_DIR}/test_thread.cc)\ntarget_link_libraries(test_thread nvmemul pthread)\n\nadd_executable(test_mutex ${CMAKE_CURRENT_SOURCE_DIR}/test_mutex.cc)\ntarget_link_libraries(test_mutex nvmemul pthread)\n\nadd_executable(test_nvm_remote_dram ${CMAKE_CURRENT_SOURCE_DIR}/test_nvm_remote_dram.c)\ntarget_link_libraries(test_nvm_remote_dram nvmemul)\n\nadd_executable(test_nvm ${CMAKE_CURRENT_SOURCE_DIR}/test_nvm.c)\ntarget_link_libraries(test_nvm nvmemul)\n\nadd_executable(test_multithread ${CMAKE_CURRENT_SOURCE_DIR}/test_multithread.c)\n#target_link_libraries(test_multithread rt)\ntarget_link_libraries(test_multithread nvmemul pthread)\n\nadd_test(NAME interpose COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_interpose)\n\nset(ENV_COMMON \"LD_PRELOAD=${CMAKE_BINARY_DIR}/src/emul/libnvmemul.so\")\n\nSET_PROPERTY(TEST interpose PROPERTY ENVIRONMENT ${ENV_COMMON} \"ENUM_INI=emul.ini\")\n"
  },
  {
    "path": "test/test_dev.cc",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <pthread.h>\n#include <stdlib.h>\n#include <stdio.h>\n#include \"gtest/gtest.h\"\n#include \"pmalloc.h\"\n\nint main(int argc, char** argv)\n{\n//    ::testing::InitGoogleTest(&argc, argv);\n//    return RUN_ALL_TESTS();\n    printf(\"PID: %d\\n\", getpid());\n    printf(\"malloc: %p\\n\", malloc(8));\n    printf(\"malloc: %p\\n\", malloc(8));\n    printf(\"pmalloc: %p\\n\", pmalloc(8));\n}\n\n"
  },
  {
    "path": "test/test_interpose.cc",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <pthread.h>\n#include <stdlib.h>\n#include <stdio.h>\n#include \"gtest/gtest.h\"\n\nstatic int interpose_pthread_create_success = 0;\n\n\n// Ugly hack: we want to test whether interposition works. To do this we \n// hook on the functions that the interposition code calls by redefining these\n// functions. 
As those functions are written in C, we need to make sure we force\n// the C++ compiler to use C linkage.\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n// this function is called when interposition of pthread_create is successful\nint register_thread(pthread_t thread)\n{\n    interpose_pthread_create_success = 1;\n    return 0;\n}\n\n#ifdef __cplusplus\n}\n#endif\n\nvoid* interpose_pthread_create_start_routine(void* args)\n{\n    return NULL;\n}\n\nvoid interpose_pthread_create()\n{\n    pthread_t thread;  \n    \n    pthread_create (&thread, NULL, &interpose_pthread_create_start_routine, NULL);\n\n    pthread_join(thread, NULL);\n              \n}\n\nvoid interpose_pthread_mutex_lock(pthread_mutex_t* lock)\n{\n    pthread_mutex_lock(lock);\n}\n\nvoid interpose_pthread_mutex_unlock(pthread_mutex_t* lock)\n{\n    pthread_mutex_unlock(lock);\n}\n\nTEST(Interpose, pthread_create)\n{\n    EXPECT_EQ(0, interpose_pthread_create_success);\n    interpose_pthread_create();\n    EXPECT_EQ(1, interpose_pthread_create_success);\n}\n\nTEST(Interpose, pthread_mutex_lock)\n{\n    //EXPECT_EQ(1, 0);\n}\n\n\nint main(int argc, char** argv)\n{\n    ::testing::InitGoogleTest(&argc, argv);\n    return RUN_ALL_TESTS();\n\n    pthread_mutex_t lock;\n    pthread_mutex_init(&lock, NULL);\n    interpose_pthread_mutex_lock(&lock);\n    interpose_pthread_mutex_unlock(&lock);\n}\n"
  },
  {
    "path": "test/test_multithread.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#define _GNU_SOURCE\n#include <pthread.h>\n#include <sched.h>\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <unistd.h>\n#include <pthread.h>\n\n#include \"thread.h\"\n#include <sys/time.h>\n#include \"pmalloc.h\"\n#include \"debug.h\"\n//#include \"stat.h\"\n\n\n#ifndef NDEBUG\n#include <sys/syscall.h>\n#endif\n\ntypedef struct {\n\tint cs_n;\n\tint cs_duration;\n\tint out_cs_duration;\n\tint from_node;\n\tint to_node;\n} arg_s;\n\n#define MAX_NUM_THREADS 50\npthread_t thread_desc[MAX_NUM_THREADS];\n\n\n\n#include <inttypes.h>\ntypedef struct {\n\tuint64_t val;\n\tchar padding[0];\n} element_t;\n\ntypedef struct {\n    uint64_t   N;\n    uint64_t   element_size;\n    element_t* head;\n} chain_t;\nuint64_t trash_cache(uint64_t N);\nchain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j);\nelement_t* element(chain_t* chain, uint64_t index);\nvoid inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size);\n\n\n// factor is 10 (could be more), to make sure we have a buffer much bigger 
than CPU cache\n// the memory buffer is NOT shared among threads\n// for now the cache size is hardcoded as 20 MB\n#define NELEMS (10 * 20480000 / 64LLU)\n#define PAGESZ 4096\n#define MAX_NUM_CHAINS 16\n//#undef USE_HUGETLB\n#define SEED_IN 1\n#define NCHAINS 1\n\npthread_mutex_t mutex;\n\nstatic int max_number_of_cpus(void)\n{\n    int n, cpus = 2048;\n    size_t setsize =  CPU_ALLOC_SIZE(cpus);\n    cpu_set_t *set = CPU_ALLOC(cpus);\n    if (!set)\n        goto err;\n\n\tfor (;;) {\n\t\tCPU_ZERO_S(setsize, set);\n\t\t/* the library version does not return size of cpumask_t */\n\t\tn = syscall(SYS_sched_getaffinity, 0, setsize, set);\n\t\tif (n < 0 && cpus < 1024 * 1024) {\n\t\t        CPU_FREE(set);\n\t\t\tcpus *= 2;\n\t\t\tset = CPU_ALLOC(cpus);\n\t\t\tif (!set)\n\t\t\t\tgoto err;\n\t\t\tcontinue;\n\t\t}\n\n\tCPU_FREE(set);\n\treturn n * 8;\n\t}\nerr:\n\tprintf(\"cannot determine NR_CPUS\");\n\treturn 0;\n}\n\nstatic int bind_cpu(thread_t *thread) {\n    size_t setsize;\n    cpu_set_t *cur_cpuset;\n    cpu_set_t *new_cpuset;\n\n    int ncpus = max_number_of_cpus();\n\n    if (thread == NULL) {\n        // if thread is NULL it means the emulator is disabled, return without setting CPU affinity\n        //printf(\"thread self is null\");\n        return 0;\n    }\n\n    if (ncpus == 0) {\n    \treturn 1;\n    }\n\n    setsize = CPU_ALLOC_SIZE(ncpus);\n    cur_cpuset = CPU_ALLOC(ncpus);\n    new_cpuset = CPU_ALLOC(ncpus);\n    CPU_ZERO_S(setsize, cur_cpuset);\n    CPU_ZERO_S(setsize, new_cpuset);\n    CPU_SET_S(thread->cpu_id, setsize, new_cpuset);\n\n    if (pthread_getaffinity_np(thread->pthread, setsize, cur_cpuset) != 0) {\n        DBG_LOG(ERROR, \"Cannot get thread tid [%d] affinity, pthread: 0x%lx on processor %d\\n\",\n        \t\tthread->tid, thread->pthread, thread->cpu_id);\n        return 1;\n    }\n\n    if (CPU_EQUAL(cur_cpuset, new_cpuset)) {\n        //printf(\"No need to bind CPU\\n\");\n    \treturn 0;\n    }\n\n    DBG_LOG(INFO, \"Binding thread 
tid [%d] pthread: 0x%lx on processor %d\\n\", thread->tid, thread->pthread, thread->cpu_id);\n\n    if (pthread_setaffinity_np(thread->pthread, setsize, new_cpuset) != 0) {\n        DBG_LOG(ERROR, \"Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\\n\", thread->tid, thread->pthread, thread->cpu_id);\n        return 1;\n    }\n\n    return 0;\n}\n\nuint64_t force_ldm_stalls(chain_t **C,\n                          int element_size,\n                          int access_size,\n                          int duration,             // number of pointers/elements to chase\n                          uint64_t nelems,          // max number of available elements/pointers\n                          int it_n) {               // seed to calculate the first pointer to chase, used to avoid repeating\n                                                    // pointers during consecutive calls\n    uint64_t j, i;\n    int nchains = SEED_IN;\n    uint64_t sumv[MAX_NUM_CHAINS];\n    uint64_t nextp[MAX_NUM_CHAINS];\n    char *buf;\n    uint64_t buf_size = 16384;\n    int count = 0;\n    uint64_t start;\n    uint64_t it_limit;\n\n    assert(nchains < MAX_NUM_CHAINS);\n\n    if (duration <= 0) return 0;\n\n    // TODO: ignore the use of buf?\n    // TODO: ignore more than one chain?\n    buf = (char*) malloc(buf_size);\n    assert(buf != NULL);\n\n    if (nelems > duration) {\n        it_limit = nelems / duration;\n    } else {\n    \tit_limit = 1;\n    }\n    it_n = it_n % it_limit;\n    start = it_n * duration;\n    if ((start + duration) > nelems) {\n    \tstart = 0;\n    }\n\n    /* chase the pointers */\n    if (nchains == 1) {\n        sumv[0] = 0;\n        // chase pointers until the 'duration' count, the pointer chasing will restart from beginning if duration\n        // is greater than 'nelems'\n        for (count = 0, i = start; count < duration; i = element(C[0], i)->val, ++count) {\n            __asm__(\"\");\n            sumv[0] += element(C[0], i)->val;\n            
if (access_size > element_size) {\n                read_element(C[0], i, buf, buf_size);\n            }\n        }\n    } else {\n        for (j=0; j < nchains; j++) {\n            sumv[j] = 0;\n            nextp[j] = 0;\n        }\n        for (; 0 != element(C[0], nextp[0])->val; ) {\n            for (j=0; j < nchains; j++) {\n                sumv[j] += element(C[j], nextp[j])->val;\n                if (access_size > element_size) {\n                    read_element(C[j], nextp[j], buf, buf_size);\n                }\n                nextp[j] = element(C[j], nextp[j])->val;\n            }\n        }\n    }\n\n    free(buf);\n    return sumv[0];\n}\n\nvoid iter(int cs_n, int cs_duration, int out_cs_duration, int from_node, int to_node) {\n\tlong it_n;\n\tstruct timespec time_start, time_end;\n\tunsigned long diff_us;\n\tuint64_t seed;\n\tuint64_t j;\n\tchain_t *C[MAX_NUM_CHAINS];\n#ifndef NDEBUG\n\tpid_t tid = (pid_t) syscall(SYS_gettid);\n#endif\n\n\tDBG_LOG(INFO, \"\\t: from node: %d to node: %d\\n\", from_node, to_node);\n\n\tassert(NELEMS < UINT64_MAX);\n\n    for (j=0; j < NCHAINS; j++) {\n        seed = SEED_IN + j*j;\n        C[j] = alloc_chain(seed, NELEMS, 64LLU, from_node, to_node);\n        __asm__(\"\");\n    }\n\n    bind_cpu(thread_self());\n\n    trash_cache(NELEMS);\n\n    for (it_n = 0; it_n < cs_n; ++it_n) {\n    \t__asm__(\"\");\n        pthread_mutex_lock(&mutex);\n#ifndef NDEBUG\n        clock_gettime(CLOCK_MONOTONIC, &time_start);\n#endif\n        // critical section\n        // make cs_duration random memory accesses and leave\n        force_ldm_stalls((chain_t **)&C, 64LLU, 8, cs_duration, NELEMS, it_n);\n#ifndef NDEBUG\n        clock_gettime(CLOCK_MONOTONIC, &time_end);\n#endif\n        pthread_mutex_unlock(&mutex);\n\n        // outside critical section\n        force_ldm_stalls((chain_t **)&C, 64LLU, 8, out_cs_duration, NELEMS, (it_n+1)*2);\n\n#ifndef NDEBUG\n        diff_us = ((time_end.tv_sec * 1000000) + (time_end.tv_nsec / 1000)) -\n  
                ((time_start.tv_sec * 1000000) + (time_start.tv_nsec / 1000));\n        DBG_LOG(INFO, \"\\tthread [%d] critical section took %lu usec\\n\", tid, diff_us);\n#endif\n//        if ((it_n + 1) % out_cs_duration == 0) {\n////            usleep(1);\n////            pthread_yield();\n//            sched_yield();\n//        }\n    }\n\n    for (j=0; j < NCHAINS; j++) {\n        free(C[j]);\n    }\n}\n\nvoid *thread_fn(void *arg) {\n\tint cs_n = ((arg_s *) arg)->cs_n;\n\tint cs_duration = ((arg_s *) arg)->cs_duration;\n\tint out_cs_duration = ((arg_s *) arg)->out_cs_duration;\n\tint from_node = ((arg_s *) arg)->from_node;\n\tint to_node = ((arg_s *) arg)->to_node;\n\n\titer(cs_n, cs_duration, out_cs_duration, from_node, to_node);\n\n\treturn 0;\n}\n\nvoid manage_threads(int n_threads, int cs_n, int cs_duration, int out_cs_duration, int from_node, int to_node)\n{\n\tpthread_attr_t attr;\n    int i;\n    arg_s args;\n\n    if ((n_threads > MAX_NUM_THREADS) || (n_threads <= 0)) {\n    \tprintf(\"INVALID RANGE:\\n\");\n    \tprintf(\"\\tMax number of threads is %d\\n\", MAX_NUM_THREADS);\n    \texit(-1);\n    }\n\n    if (cs_n <= 0 || cs_duration <= 0 || out_cs_duration < 0) {\n    \tprintf(\"INVALID RANGE:\\n\");\n    \tprintf(\"\\tcritical sections: %d, cs level: %d, out cs level: %d\\n\", cs_n, cs_duration, out_cs_duration);\n    \texit(-1);\n    }\n\n    pthread_mutex_init(&mutex, NULL);\n\n    if (pthread_attr_init(&attr) != 0) {\n\t\tprintf(\"pthread_attr_init failed\");\n\t\texit(-1);\n\t}\n\n    srand(time(NULL));\n\n    args.cs_duration = cs_duration;\n    args.cs_n = cs_n;\n    args.out_cs_duration = out_cs_duration;\n    args.from_node = from_node;\n    args.to_node = to_node;\n\n    for (i = 0; i < n_threads; ++i) {\n\t    pthread_create(&thread_desc[i], &attr, thread_fn, (void *)&args);\n\t}\n\n    pthread_attr_destroy(&attr);\n\n    for (i = 0; i < n_threads; ++i) {\n        pthread_join(thread_desc[i], NULL);\n    }\n\n    
pthread_mutex_destroy(&mutex);\n}\n\nint main(int argn, char **argv)\n{\n    int n_threads;\n    int cs_n;\n    int cs_duration;\n    //int cs_n_before_yield;\n    int out_cs_duration;\n    int from_node;\n    int to_node;\n\n    if (argn != 7) {\n        printf(\"INVALID ARGUMENTS:\\n\");\n        printf(\"\\t%s [# threads] [# critical sections per thread] [size of each critical section] \"\n        \t   \"[size of computation outside critical section] [from_node] [to_node]\\n\", argv[0]);\n        return -1;\n    }\n\n    n_threads = atoi(argv[1]);\n    cs_n = atoi(argv[2]);\n    cs_duration = atoi(argv[3]);\n    //cs_n_before_yield = atoi(argv[4]);\n    out_cs_duration = atoi(argv[4]);\n    from_node = atoi(argv[5]);\n    to_node = atoi(argv[6]);\n\n    manage_threads(n_threads, cs_n, cs_duration, out_cs_duration, from_node, to_node);\n\n//    stats_report();\n\n    return 0;\n}\n"
  },
  {
    "path": "test/test_mutex.cc",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <pthread.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <stdio.h>\n#include <stddef.h>\n#include \"gtest/gtest.h\"\n\n#define MAX_NUM_THREADS 128\n\npthread_mutex_t mutex;\n\nvoid* worker(void* args) \n{\n//    int i;\n//    char* array = (char*) malloc(1024*1024);\n\n    pthread_mutex_lock(&mutex);\n\n    pthread_mutex_unlock(&mutex);\n    return NULL;\n}\n\n\nint main(int argc, char** argv)\n{\n\tpthread_t thread[MAX_NUM_THREADS];\n\tint thread_count = 4;\n\tint i;\n//    int sum;\n\n    pthread_mutex_init(&mutex, NULL);\n    pthread_mutex_lock(&mutex);\n    pthread_mutex_unlock(&mutex);\n\tfor (i = 0; i< thread_count; i++)\t\n\t\tpthread_create(&thread[i], NULL, worker, NULL);\n\n\tfor(i = 0 ; i < thread_count ; i++)\n\t\tpthread_join(thread[i], NULL);\n}\n"
  },
  {
    "path": "test/test_nvm.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n\n#include <stdio.h>\n#include <unistd.h>\n\n\n#define BUF_SIZE (2048)\n\nunsigned long mem[BUF_SIZE][BUF_SIZE];\n\nvoid iter()\n{\n\tint i;\n\tint j;\n\tunsigned long k;\n\n\tfor (i=0; i < BUF_SIZE; ++i) {\n\t\tfor (j=0; j < BUF_SIZE; ++j) {\n\t\t\tmem[i][j] = i * j;\n\t\t}\n\t}\n\n\tk = 0;\n\twhile(1) {\n\t\tfor (i=0; i < BUF_SIZE; ++i) {\n\t\t\t__asm__ __volatile__(\"\");\n\t\t\tfor (j=0; j < BUF_SIZE; ++j) {\n\t\t        k += mem[j][i] + i*j;\n\t\t        mem[j][i] = k;\n\t\t\t}\n\t\t}\n//\t\tfprintf(stdout, \"k is %lu\\n\", (unsigned long)k);\n\t\tusleep(1000);\n\t}\n}\n\nint main()\n{\n    iter();\n    return 0;\n}\n"
  },
  {
    "path": "test/test_nvm_remote_dram.c",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n\n#include <stdio.h>\n#include <unistd.h>\n#include \"pmalloc.h\"\n\n\n#define BUF_SIZE (4 * 1024)\n\nunsigned long **mem;\n\nvoid iter()\n{\n\tint i;\n\tint j;\n\tunsigned long k;\n\n\tmem = (unsigned long **) pmalloc(BUF_SIZE * sizeof(unsigned long *));\n\tfor (i=0; i < BUF_SIZE; ++i) {\n\t\tmem[i] = (unsigned long *) pmalloc(BUF_SIZE * sizeof(unsigned long));\n\t\tfor (j=0; j < BUF_SIZE; ++j) {\n\t\t\tmem[i][j] = i * j;\n\t\t}\n\t}\n\n\tk = 0;\n\twhile(1) {\n\t\tfor (i=0; i < BUF_SIZE; ++i) {\n\t\t\t__asm__ __volatile__(\"\");\n\t\t\tfor (j=0; j < BUF_SIZE; ++j) {\n\t\t        k += mem[j][i] + i*j;\n\t\t        mem[j][i] = k;\n\t\t\t}\n\t\t}\n//\t\tusleep(1000);\n\t}\n\n\tfor (i=0; i < BUF_SIZE; ++i) {\n\t\tpfree(mem[i], BUF_SIZE * sizeof(unsigned long));\n\t}\n\tpfree(mem, BUF_SIZE * sizeof(unsigned long *));\n}\n\nint main()\n{\n    iter();\n    return 0;\n}\n"
  },
  {
    "path": "test/test_thread.cc",
    "content": "/***************************************************************************\nCopyright 2016 Hewlett Packard Enterprise Development LP.  \nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or (at\nyour option) any later version. This program is distributed in the\nhope that it will be useful, but WITHOUT ANY WARRANTY; without even\nthe implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\nPURPOSE. See the GNU General Public License for more details. You\nshould have received a copy of the GNU General Public License along\nwith this program; if not, write to the Free Software Foundation,\nInc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n***************************************************************************/\n#include <pthread.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <stdio.h>\n#include <stddef.h>\n#include \"gtest/gtest.h\"\n\n#define MAX_NUM_THREADS 128\n\nvoid* worker(void* args) \n{\n    int i;\n    char* array = (char*) malloc(1024*1024);\n\n    //while(1) {\n        for (i=0; i<1024*1024; i++) {\n            array[i] += 1;\n        }\n    //}\n    //pthread_exit(NULL);\n    printf(\"exiting\\n\");\n    return NULL;\n}\n\n\nint main(int argc, char** argv)\n{\n\tpthread_t thread[MAX_NUM_THREADS];\n\tint thread_count = 4;\n\tint i;\n//    int sum;\n\n\tfor (i = 0; i< thread_count; i++)\t\n\t\tpthread_create(&thread[i], NULL, worker, NULL);\n\n\tfor(i = 0 ; i < thread_count ; i++)\n\t\tpthread_join(thread[i], NULL);\n}\n"
  }
]