Repository: HewlettPackard/quartz Branch: master Commit: c22e1aa156a0 Files: 92 Total size: 375.4 KB Directory structure: gitextract_aunglxr9/ ├── AUTHORS ├── CMakeLists.txt ├── Doxyfile ├── README-BENCHMARKS-TESTING.md ├── README.md ├── TODO.dox ├── bench/ │ ├── CMakeLists.txt │ ├── memlat/ │ │ ├── CMakeLists.txt │ │ └── memlat.c │ ├── multilat/ │ │ ├── CMakeLists.txt │ │ └── multilat.c │ └── new_memlat/ │ ├── CMakeLists.txt │ ├── memlat.c │ └── memlat.sh ├── benchmark-tests/ │ ├── bandwidth-model-building.sh │ ├── memlat-bench-test-10M-single-socket.sh │ ├── memlat-bench-test-10M.sh │ ├── memlat-orig-lat-test-single-socket.sh │ ├── memlat-orig-lat-test.sh │ ├── nvmemul-bandwidth.ini │ ├── nvmemul-debug.ini │ ├── nvmemul-orig.ini │ └── nvmemul.ini ├── license.txt ├── nvmemul-orig.ini ├── nvmemul.dox ├── nvmemul.ini ├── scripts/ │ ├── install.sh │ ├── runenv.sh │ ├── setupdev.sh │ └── turboboost.sh ├── src/ │ ├── CMakeLists.txt │ ├── dev/ │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── ioctl_query.h │ │ └── pmc.c │ └── lib/ │ ├── CMakeLists.txt │ ├── config.c │ ├── config.h │ ├── cpu/ │ │ ├── CMakeLists.txt │ │ ├── cpu.c │ │ ├── cpu.h │ │ ├── haswell-papi.h │ │ ├── haswell.h │ │ ├── ivybridge-papi.h │ │ ├── ivybridge.h │ │ ├── known_cpus.h │ │ ├── pmc-papi.c │ │ ├── pmc-papi.h │ │ ├── pmc.c │ │ ├── pmc.h │ │ ├── sandybridge-papi.h │ │ ├── sandybridge.h │ │ └── xeon-ex.h │ ├── debug.c │ ├── debug.h │ ├── dev.c │ ├── dev.h │ ├── errno.h │ ├── error.h │ ├── init.c │ ├── interpose.c │ ├── interpose.h │ ├── measure.h │ ├── measure_bw.c │ ├── measure_lat.c │ ├── misc.c │ ├── misc.h │ ├── model.h │ ├── model_bw.c │ ├── model_lat.c │ ├── monotonic_timer.c │ ├── monotonic_timer.h │ ├── pflush.c │ ├── pflush.h │ ├── pmalloc.c │ ├── pmalloc.h │ ├── process_rank.c │ ├── stat.c │ ├── stat.h │ ├── thread.c │ ├── thread.h │ ├── topology.c │ └── topology.h └── test/ ├── CMakeLists.txt ├── test_dev.cc ├── test_interpose.cc ├── test_multithread.c ├── test_mutex.cc ├── test_nvm.c ├── 
test_nvm_remote_dram.c └── test_thread.cc ================================================ FILE CONTENTS ================================================ ================================================ FILE: AUTHORS ================================================ Haris Volos (haris.volos@hpe.com) Guilherme Magalhaes (guilherme.magalhaes@hpe.com) Lucy Cherkasova (lucy.cherkasova@gmail.com) ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.8) #add_subdirectory(third_party) add_subdirectory(src) add_subdirectory(bench) enable_testing() #add_subdirectory(test) ================================================ FILE: Doxyfile ================================================ # Doxyfile 1.4.7 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project # # All text after a hash (#) is considered a comment and will be ignored # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" ") #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # The PROJECT_NAME tag is a single word (or a sequence of words surrounded # by quotes) that should identify the project. PROJECT_NAME = "Quartz" # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. 
OUTPUT_DIRECTORY = ./doc # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, # Dutch, Finnish, French, German, Greek, Hungarian, Italian, Japanese, # Japanese-en (Japanese with English messages), Korean, Korean-en, Norwegian, # Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, # Swedish, and Ukrainian. OUTPUT_LANGUAGE = English # This tag can be used to specify the encoding used in the generated output. # The encoding is not always determined by the language that is chosen, # but also whether or not the output is meant for Windows or non-Windows users. # In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES # forces the Windows encoding (this is the default for the Windows binary), # whereas setting the tag to NO uses a Unix-style encoding (the default for # all platforms other than Windows). USE_WINDOWS_ENCODING = NO # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. 
BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = YES # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. 
# If left blank the directory from which doxygen is run is used as the # path to strip. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. # If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like the Qt-style comments (thus requiring an # explicit @brief command for a brief description). JAVADOC_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the DETAILS_AT_TOP tag is set to YES then Doxygen # will output the detailed description near the top, like JavaDoc. # If set to NO, the detailed description appears after the member # documentation. DETAILS_AT_TOP = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements.
INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 8 # This tag can be used to specify a number of aliases that act # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for Java. # For instance, namespaces will be presented as packages, qualified scopes # will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to # include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); v.s. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate.
BUILTIN_STL_SUPPORT = NO # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. 
EXTRACT_LOCAL_METHODS = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. 
If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = NO # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. 
This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if sectionname ... \endif. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or define consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. # The appearance of the initializer of individual variables and defines in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # If the sources in your project are distributed over multiple directories # then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy # in the documentation. The default is NO. SHOW_DIRECTORIES = NO # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from the # version control system). Doxygen will invoke the program by executing (via # popen()) the command <command> <input-file>, where <command> is the value of # the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples.
FILE_VERSION_FILTER = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written.
If left blank the output is written # to stderr. WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces. INPUT = nvmemul.dox TODO.dox src/ # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx # *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py FILE_PATTERNS = # The RECURSIVE tag can be used to specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix filesystem feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories.
Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain images that are included in the documentation (see # the \image command). IMAGE_PATH = ./doc/figures # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command <filter> <input-file>, where <filter> # is the value of the INPUT_FILTER tag, and <input-file> is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. If FILTER_PATTERNS is specified, this tag will be # ignored. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER # is applied to all files.
FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). FILTER_SOURCE_FILES = NO #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = YES # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C and C++ comments will always remain visible. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES (the default) # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = YES # If the REFERENCES_RELATION tag is set to YES (the default) # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = YES # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. Otherwise they will link to the documentation. REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser.
The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = YES #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = YES # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. 
HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If the tag is left blank doxygen # will generate a default style sheet. Note that doxygen will try to copy # the style sheet file to the HTML output directory, so don't put your own # stylesheet in the HTML output directory as well, or it will be erased! HTML_STYLESHEET = # If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, # files or namespaces will be aligned in HTML using tables. If set to # NO a bullet list will be used. HTML_ALIGN_MEMBERS = YES # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compressed HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = YES # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). 
If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = YES # The DISABLE_INDEX tag can be used to turn on/off the condensed index at # top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. DISABLE_INDEX = NO # This tag can be used to set the number of enum values (range [1..20]) # that doxygen will group on one line in the generated HTML documentation. ENUM_VALUES_PER_LINE = 4 # If the GENERATE_TREEVIEW tag is set to YES, a side panel will be # generated containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, # Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are # probably better off using the HTML help feature. GENERATE_TREEVIEW = YES # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. TREEVIEW_WIDTH = 250 #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. 
GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path. LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = NO # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, a4wide, letter, legal and # executive. If left blank a4wide will be used. PAPER_TYPE = a4wide # The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = NO # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. Set this option to YES to get a # higher quality PDF documentation. 
USE_PDFLATEX = NO # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load stylesheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. 
# Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. 
XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. This is useful # if you want to understand what is going on. On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. 
PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. MACRO_EXPANSION = NO # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the includes files # in the INCLUDE_PATH (see below) will be search if a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). 
The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition. EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all function-like macros that are alone # on a line, have an all uppercase name, and do not end with a semicolon. Such # function macros are typically used for boiler-plate code, and will confuse # the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. # Optionally an initial location of the external documentation # can be added for each tagfile. The format of a tag file without # this location is as follows: # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths or # URLs. If a location is present for each tag, the installdox tool # does not have to be run to correct the links. # Note that each tag file must have a unique name # (where the name does NOT include the path) # If a tag file is not located in the directory in which doxygen # is run, you must also specify the path to the tagfile here. 
TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option is superseded by the HAVE_DOT option below. This is only a # fallback. It is recommended to install and use dot, since it yields more # powerful graphs. CLASS_DIAGRAMS = YES # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = NO # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. 
Setting this tag to YES will force the # the CLASS_DIAGRAMS tag to NO. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will # generate a call dependency graph for every global function or class method. # Note that enabling this option will significantly increase the time of a run. # So in most cases it will be better to enable call graphs for selected # functions only using the \callgraph command. CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then doxygen will # generate a caller dependency graph for every global function or class method. 
# Note that enabling this option will significantly increase the time of a run. # So in most cases it will be better to enable caller graphs for selected # functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are png, jpg, or gif # If left blank png will be used. DOT_IMAGE_FORMAT = png # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width # (in pixels) of the graphs generated by dot. If a graph becomes larger than # this value, doxygen will try to truncate the graph, so that it fits within # the specified constraint. Beware that most browsers cannot cope with very # large images. MAX_DOT_GRAPH_WIDTH = 1024 # The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allows height # (in pixels) of the graphs generated by dot. If a graph becomes larger than # this value, doxygen will try to truncate the graph, so that it fits within # the specified constraint. Beware that most browsers cannot cope with very # large images. 
MAX_DOT_GRAPH_HEIGHT = 1024 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lay further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that a graph may be further truncated if the graph's # image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH # and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default), # the graph is not depth-constrained. MAX_DOT_GRAPH_DEPTH = 0 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, which results in a white background. # Warning: Depending on the platform used, enabling this option may lead to # badly anti-aliased labels on the edges of a graph (i.e. they become hard to # read). DOT_TRANSPARENT = NO # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. 
DOT_CLEANUP = YES #--------------------------------------------------------------------------- # Configuration::additions related to the search engine #--------------------------------------------------------------------------- # The SEARCHENGINE tag specifies whether or not a search engine should be # used. If set to NO the values of all tags below this one will be ignored. SEARCHENGINE = NO ================================================ FILE: README-BENCHMARKS-TESTING.md ================================================ **For testing whether your environment is configured correctly for running Quartz** (e.g., whether you set all the required environmental variables, etc.) **we have created a few scripts with benchmarks, which can be executed automatically** and which can provide you with feedback on Quartz performance in your environment. **The directory with these scripts is called: *benchmark-tests*. There are three scripts which you can run:** - **bandwidth-model-building.sh** This script will execute for approximately **10 min** and will build a memory bandwidth model that can be used in the experiments with memory bandwidth throttling. The configuration file uses a "debug" mode on purpose -- so that you can see the messages on the screen about the progress of the memory bandwidth model building, which can be found at */tmp/bandwidth_model* - **memlat-orig-lat-test.sh** This script will measure your server hardware *memory access latency* in nanoseconds: local and remote (for two-socket servers). It will execute the test 20 times, and write the results in directory *ORIG-lat-test*. You can find the summary of the results in the file *ORIG-lat-test/final-hw-latency.txt*. It will have measurements like: FORMAT: 1_min_local 2_aver_local 3_max_local 4_min_remote 5_aver_remote 6_max_remote 91 91.9 92 152 163.9 176 The first three numbers show: minimal, average and maximum measured local memory access latency (in ns, over 20 measurements).
The last three numbers show similar measurements for access latency of the remote memory, i.e., in the second socket. - **memlat-bench-test-10M.sh** This script will execute the memlat benchmark (pointer-chasing benchmark) with nine emulated memory access latencies: 200 ns, 300 ns,..., 1000 ns. It will run the benchmark with these emulated latencies in two settings: in the local socket (i.e., emulating a higher memory access latency in the local socket) and similarly, in the remote socket. Each test is repeated 10 times: this is used for assessing the variability of your environment. In some cases, we had issues with TurboBoost mode, which did impact the quality of the emulation... This test might take **approx. 30 min to finish** (since it executes 180 tests), and will create two output directories: *FULL-RESULTS-test* and *SUMMARY-RESULTS-test*. In the directory SUMMARY-RESULTS-test, you will find two files that summarize the outcome of the experiments in the local and remote sockets. The outcome should look like this: FORMAT: 1_emul_lat 2_min_meas_lat 3_aver_meas_lat 4_max_meas_lat 5_aver_error(%) 6_max_error(%) 200 177 197.9 204 1.05 11.5 300 259 289.5 300 3.5 13.6 400 354 382.6 395 4.3 11.5 500 468 485.8 490 2.8 6.4 600 554 575.3 585 4.1 7.6 700 640 666.6 681 4.7 8.5 800 749 766.4 776 4.2 6.3 900 851 866.2 871 3.7 5.4 1000 926 956.5 966 4.35 7.4 The format is the following: 1st column: emulated latency (in nanoseconds) 2nd column: minimum measured latency (across 10 tests, in ns) 3rd column: average measured latency (across 10 tests, in ns) 4th column: maximum measured latency (across 10 tests, in ns) 5th column: average error (between emulated and measured latencies, in %) 6th column: max error (between emulated and measured latencies, in %) One of the goals of the designed performance emulator is to provide a framework for application sensitivity studies under different latencies and memory bandwidths.
Even if you see a 15% deviation (error) from the targeted emulated latencies, consistent benchmark measurements are a good sign that you can perform a good sensitivity study. ================================================ FILE: README.md ================================================ Quartz: A DRAM-based performance emulator for NVM ---------------------- Quartz leverages features available in commodity hardware to emulate different latency and bandwidth characteristics of future byte-addressable NVM technologies. Quartz's design, implementation details, evaluation, and overhead can be found in the following research paper: - **H. Volos, G. Magalhaes, L. Cherkasova, J. Li: Quartz: A Lightweight Performance Emulator for Persistent Memory Software. In Proc. of the 16th ACM/IFIP/USENIX International Middleware Conference, (Middleware'2015), Vancouver, Canada, December 8-11, 2015. It can be downloaded from: http://www.jahrhundert.net/papers/middleware2015.pdf** While the emulator is designed to cover three processor families: *Sandy Bridge, Ivy Bridge*, and *Haswell* -- we have had the best results on the *Ivy Bridge* platform. The Haswell processor has a TurboBoost feature that causes higher variance and deviations when emulating higher range latencies (above 600 ns). Contributors ---------------------- For a list of contributors see [AUTHORS](https://github.hpe.com/labs/quartz/blob/master/AUTHORS). Extended documentation ---------------------- Extended documentation is available in Doxygen form. To build and view: doxygen xdg-open doc/html/index.html Dependencies ------------ This is the list of libraries and tools used by Quartz: On RPM based distributions: - cmake 2.8 - libconfig and libconfig-devel - numactl-devel - uthash-devel - kernel-devel On Debian based distributions: - cmake 2.8 - libconfig-dev - libnuma-dev - uthash-dev - linux-headers You can run 'sudo scripts/install.sh' in order to automatically install these dependencies.
Supported environment --------------------- Currently the latency emulator can be used on Linux with *Sandy Bridge, Ivy Bridge*, and *Haswell* Intel processors. For bandwidth emulation support, an Intel Thermal Memory Controller device is required. No specific Linux distribution or kernel version is required. Source code tree overview ------------------------- bench Benchmarks doc Documentation, including Doxygen generated documentation (doc/html) src/lib Emulator main library code src/dev Kernel-module for accessing performance counters and memory-controller PCI registers scripts Helper scripts to run a program using the emulator and install dependencies test Several tests and application code examples benchmark-tests Several automated tests with benchmark runs and output analysis for testing the correctness of the configured emulation environment and the accuracy of expected results For more details, please see the extended documentation generated using Doxygen. Building -------- After installing the dependencies, go to the emulator's source code root folder and execute the following steps: mkdir build cd build cmake .. make clean all In order to disable statistics support, replace the third step above with: cmake .. -DSTATISTICS=OFF See more details about statistics in the respective section below. The emulator library, benchmark and test binaries resulting from the build process will be available in the respective subfolder inside the 'build' folder. Usage ----- First, load the emulator's kernel module. From the emulator's source code root folder, execute: sudo scripts/setupdev.sh load Set your processor to run at maximum frequency to ensure a fixed cycle rate (as the cycle counter is used to project delay time). You can use the scaling governor: echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor Set the LD_PRELOAD and NVMEMUL_INI environment variables to point respectively to the emulator's library and the configuration file to be used.
The LD_PRELOAD is used for automatically loading the emulator's library when the user application is executed. Thus, there is no need to statically link the library to the user application. See below details about the configuration file in the respective section. Rather than configuring the scaling governor and the environment variables manually as indicated above, you can use the scripts/runenv.sh script. See below. An additional configuration step may be required depending on the Linux Kernel version. This emulator makes use of rdpmc x86 instruction to read CPU counters. Before kernel 4.0, when rdpmc support was enabled, any process (not just ones with an active perf event) could use the rdpmc instruction to access the counters. Starting with Linux 4.0 rdpmc support is only allowed if an event is currently enabled in a process's context. To restore the old behavior, write the value 2 to /sys/devices/cpu/rdpmc if kernel version is 4.0 or greater: echo 2 | sudo tee /sys/devices/cpu/rdpmc Run your application: scripts/runenv.sh The runenv.sh script runs an application in a new shell environment that properly sets LD_PRELOAD to the library available in the build folder. We do not modify the current shell environment to avoid getting other applications interposed by the emulator unexpectedly. Alternatively, you may directly link the library to your application but the nvmemul library must come first in the linking order to ensure we properly interpose on necessary functions. Additionally, this script sets the NVMEMUL_INI environment variable to point to the nvmemul.ini configuration file available in the emulator's source code root folder. Configuration file ------------------ Emulator runtime parameters can be defined in a configuration file. The default path is ./nvmemul.ini but you may change the path through the environment variable $NVMEMUL_INI (see scripts/runenv.sh). 
The main available parameters are: - Latency: enable True means the latency emulation is on, false, the latency emulation is disabled. inject_delay True means the delay injection is on, false, the emulator will skip the delay injection. read The target read latency in nanoseconds. It must be greater than the hardware latency. This value is automatically checked for consistency by the emulator. write The target write latency in nanoseconds. It must be greater than the hardware latency. This value is automatically checked for consistency by the emulator. max_epoch_duration_us This is the epoch duration in microseconds. Occasionally an epoch may be longer than this value depending on signal delivery managed by the kernel. min_epoch_duration_us The minimum epoch duration. - Bandwidth: enable True means the bandwidth emulation is on, false, it is disabled. model File path used by the emulator to cache the detected hardware bandwidth characteristics. read Target read bandwidth in MB/s. write Target write bandwidth in MB/s. - Topology: mc_pci File path used by the emulator to cache the PCI bus topology. It is not required if bandwidth emulation is disabled. physical_nodes List all CPU socket ids to be added to the known topology. An odd number of CPU sockets means it will not be possible to configure all CPUs in pairs and then a single CPU will be used as NVM only. See the Latency emulation modes section below. - Statistics: enable True means the statistics collection and report is enabled, false, it is disabled. See the Statistics section below. file File path used by the emulator to write the statistics report. If not provided, the emulator will use stdout. - Debug: level Shows debugging messages with level up to this value; the greater this value is, the more verbose the debug log will be. 0: off; 1: critical; 2: error; 3: warning; 4: info; 5: debugging. verbose If greater than zero, shows source code information along with the debugging message.
Latency emulation modes ----------------------- The emulator may run application threads in *NVM only* mode or *DRAM+NVM* mode. Which mode is used depends on whether the system has more than one CPU socket and whether the topology configuration enables multiple CPU sockets. For *NVM only* mode, the emulator will use a CPU socket with no sibling node and make use of the DRAM available in that socket to emulate NVM. Any DRAM memory access on this socket will trigger delay injection to emulate the target latency. For *DRAM+NVM* mode, the emulator will differentiate DRAM from virtual NVM latencies. It is supported only on IvyBridge, Haswell (and higher) Intel processor systems with 2 CPU sockets or more. A proper configuration as mentioned above and explicit calls to NVM memory allocation in the application’s source code are required. - The emulator will bind application threads to node 0 CPU and DRAM. The other CPU socket will not be used for application threads and the DRAM from this second socket will be used as virtual NVM; - The application must explicitly allocate virtual NVRAM memory using the pmalloc(size) and pfree(pointer, size) API provided by the emulator. See the NVM programming section below. NVM programming --------------- The emulator provides an API for allocating and deallocating memory from NVM space. It is possible to use this API in both NVM only and DRAM+NVM modes. However, it is required to use this API in the DRAM+NVM mode so the emulator can clearly differentiate DRAM from NVM memory access latencies. This is the API available for user applications: void *pmalloc(size_t size); void pfree(void *start, size_t size); The application can include the NVM_EMUL/src/lib/pmalloc.h header file to properly declare these functions. See test/test_nvm.c and test/test_nvm_remote_dram.c for examples of how to allocate memory on, respectively, local DRAM or virtual NVM in the DRAM+NVM emulation mode.
Statistics ---------- The emulator collects statistical data to help validate emulation accuracy. If enabled, by default the emulator will write the statistics report to the standard output when the user application terminates. Some applications suppress output to stdout; in that case you can still see the reports by defining a target file for the report in the configuration file. When using a file as output, the emulator appends the result to the file, so previous reports are not overwritten. The statistics source code can also be statically removed at compile time. See Building section. These are the reported statistics: - initialization duration Time in microseconds taken by the emulator to initialize. - running threads The number of threads still running. If the report was called automatically by the emulator, all user threads are already terminated. - terminated threads Number of terminated threads, including the main thread. For each application thread: - thread id Thread id. - cpu id Id of the CPU the user thread was bound to. - spawn timestamp Thread spawn timestamp as reported by the monotonic time. - termination timestamp Thread termination timestamp as reported by the monotonic time. - execution time - stall cycles Total number of CPU stalls caused by memory accesses made by this thread. - NVM accesses Number of effective NVM accesses performed by the application. - latency calculation overhead cycles Overhead cycles caused by the emulator that could not be amortized. Zero is expected. Otherwise, consider increasing the epoch duration. - injected delay cycles Total number of cycles injected by the emulator to emulate the target latency. - injected delay in usec Same value as above, but shown in microseconds. - longest epoch duration The effective longest epoch duration ever performed for this thread. - shortest epoch duration The effective shortest epoch duration ever performed for this thread. 
- average epoch duration The average epoch duration for this thread. - number of epochs Total number of epochs performed for this thread. - epochs which didn't reach min duration Number of epochs requested by either Thread Monitor or thread synchronizations, but that were not opened since the epoch durations didn't reach the minimum epoch duration. - static epochs requested Number of epochs requested by the Thread Monitor. Support to PAPI --------------- Performance API (PAPI) library may be used with the emulator and there are some hooks to switch the current CPU counters reading method to PAPI. Up to the time of this writing, there was no way to make PAPI CPU counter reading perform at the performance level required by the emulation. In the future, if it is desired to switch to PAPI, follow these steps: - Device pmc_ioctl_setcounter() and emulator lib set_counter() in dev/pmc.c calls can be deleted. - Define PAPI_SUPPORT for src/lib/* source code. - Compile with lib/cpu/pmc-papi.c rather than lib/cpu/pmc.c. - Link code with PAPI and add PAPI include directory. - Some extra tweaks may be required, check TODOs in the code. Multiple emulated processes and MPI programs -------------------------------------------- The emulator needs to bind user threads to specific CPU cores in order to optimize emulation results. It is required to export the EMUL_LOCAL_PROCESSES environment variable with the number of emulated processes on the host. The emulator will manage each emulated process to partition the available CPUs in a coordinated way. It is recommended to set EMUL_LOCAL_PROCESSES to at most half the number of available CPU cores (note DRAM+NVM mode already reserves half of the available CPU cores). If EMUL_LOCAL_PROCESSES is not set or is set to a value lower than 2, the emulator will not partition CPU cores per process. If some process crashes, the emulator might not have cleaned up the environment and the process rank ids will not be correctly managed. 
In this case, close all emulated processes and delete files /tmp/emul_lock_file and /tmp/emul_process_local_rank if they exist. Bandwidth emulation ------------------- Quartz supports an emulation mode with "throttled" memory bandwidth. The memory bandwidth emulation makes use of the copy kernel from the Stream benchmark, openMP version. When the bandwidth emulation is enabled for the first time, Quartz creates a memory bandwidth model by utilizing the available *Thermal Registers* in the Memory Controller and measuring the corresponding memory bandwidth. This initial step of building a model might take several minutes **(~10min)**. For the memory bandwidth emulation, *turn off the latency modeling* in the configuration file and select all available NUMA nodes in the configuration file in order to prepare the model for any combination of NUMA node selection. Modeling data will be cached to these files: /tmp/bandwidth_model /tmp/mc_pci_bus As a first step, the emulator will detect the Memory Controller Thermal Registers Control PCI addresses and cache them to /tmp/mc_pci_bus. After this step, the emulator will close the current execution to safely clear NUMA bindings. Rerun the process to resume the work. Quartz will create the file: **/tmp/bandwidth_model**. It reflects the relationship between Thermal Registers and achievable memory bandwidth (in a single socket). The line format in this file is: read This file should present ascending values of memory bandwidth ranging from hundreds of MiB/s to tens of GiB/s. These values (or their approximations) can be used for the experiments with memory bandwidth throttling. Note that the model is built once: it is cached and then used for all later experiments. (You can also run a specially prepared automated script *bandwidth-model-building.sh* in directory *benchmark-tests*. For details see [README-BENCHMARKS-TESTING.md](https://github.hpe.com/labs/quartz/blob/master/README-BENCHMARKS-TESTING.md).) 
For example, to enable memory bandwidth throttling at 2 GB/s, you should change the emulator configuration file "nvmemul.ini" using the following settings: bandwidth: { enable = true; model = "/tmp/bandwidth_model"; read = 2000; write = 2000; }; Both read and write bandwidth values must be set to the same value since the emulator does not model read/write independently in the current version. See the Limitations section. The pmalloc() family is not intended to be used with the bandwidth modeling. Use numactl, for instance, to bind the CPU and memory of the application to the intended NUMA node. The bandwidth emulator considers the virtual NVRAM node only (in the configuration with two sockets). The application is therefore required to keep processes/threads and data on the same NUMA node for bandwidth experiments. Automated Benchmark Runs ------------------------- We have created several automated tests with benchmark runs and output analysis for testing the correctness of the configured emulation environment and the accuracy of expected results. For details see [README-BENCHMARKS-TESTING.md](https://github.hpe.com/labs/quartz/blob/master/README-BENCHMARKS-TESTING.md). Limitations ----------- The emulator functionality may be affected by certain conditions in user applications: - application sets threads CPU and memory affinity. - application opens many more concurrent threads than available cores per socket. Note that in DRAM+NVM emulation mode, half of the available CPU cores are not used for user threads. - application sets handler for SIGUSR1. Other: - Write memory latency is not yet implemented. - Write/Read memory bandwidth emulation cannot be set independently. - The signal handler may cause syscalls in the application to fail. It is recommended to implement retries at the application level as a good practice for syscalls. - Child processes created by fork() calls are not tracked by the emulator. 
As a workaround, the emulator could make the library initialization function available in the external API. Applications then should call this function at the beginning of the child process. - OpenMP applications may use synchronization primitives not based on pthreads which are currently not supported. - See the Todo list section for details. Todo list --------- Please see the accompanying TODO.dox or extended documentation for an extensive list. #License This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #Copyright (c) Copyright 2016 Hewlett Packard Enterprise Development LP **NOTE**: This software depends on other packages that may be licensed under different open source licenses. ================================================ FILE: TODO.dox ================================================ /** \file \todo Improve performance counter API by making it more generic. For example, autogenerate pmc event_id using perf. \todo Currently we may interrupt a thread to form a new epoch while it is blocked. This might cause accumulation of overhead cycles. \todo Currently our bandwidth model cannot independently throttle read and write bandwidth as it relies on throttling DDR ACT transactions. We tried throttling DDR READ and DDR WRITE transactions but this didn't work. 
\todo Extend library to interpose on other synchronization events we care about: semaphores, barriers, context switches, openMP sync primitives, etc. \todo Currently our library does not support context switching. Extend the device driver to properly handle context switching: keep track of per-thread cpu counters, introduce proper delay at context switch points. \todo Support uncacheable and write-through memory. \todo Signal SIGUSR1 should be dedicated to the emulator. If the application makes use of this signal, the emulator will not work. Figure out a way to fix this limitation. \todo Interpose pthread_cancel() and pthread_exit() to make sure the thread is always deregistered internally to the emulator? \todo CPU counters overflow is not currently handled. \todo Multiple processes emulation must be reviewed: log file per process, statistics report by process, process id and thread id indications in the log messages. \todo See Limitations section in the README file. */ ================================================ FILE: bench/CMakeLists.txt ================================================ add_subdirectory(memlat) add_subdirectory(new_memlat) add_subdirectory(multilat) ================================================ FILE: bench/memlat/CMakeLists.txt ================================================ include_directories(${CMAKE_SOURCE_DIR}/src/lib) add_executable(memlat memlat.c) target_link_libraries(memlat nvmemul pthread) ================================================ FILE: bench/memlat/memlat.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include #include #include #include #define MAX_NUM_THREADS 512 uint64_t g_seed, g_nchains, g_nelems, g_from_node_id, g_to_node_id, g_element_size, g_access_size; extern int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id); static uint64_t safe_strtoull(const char *s) { char *ep; uint64_t r; assert(NULL != s && '\0' != *s); r = strtoull(s, &ep, 10); assert('\0' == *ep); return r; } void* worker(void* arg) { int latency_ns; latency_ns = measure_latency2(g_seed, g_nchains, g_nelems, g_element_size, g_access_size, g_from_node_id, g_to_node_id); printf("latency_ns: %d\n", latency_ns); return NULL; } int main(int argc, char *argv[]) { int i; uint64_t nthreads; pthread_t thread[MAX_NUM_THREADS]; if (9 != argc) { fprintf(stderr, "usage: %s PRNGseed Nthreads Nchains Nelems SZelem SZaccess from_node to_node\n", argv[0]); return 1; } g_seed = safe_strtoull(argv[1]); nthreads = safe_strtoull(argv[2]); g_nchains = safe_strtoull(argv[3]); g_nelems = safe_strtoull(argv[4]); g_element_size = safe_strtoull(argv[5]); g_access_size = safe_strtoull(argv[6]); g_from_node_id = safe_strtoull(argv[7]); g_to_node_id = safe_strtoull(argv[8]); for (i = 0; i< nthreads; i++) { pthread_create(&thread[i], NULL, worker, NULL); } for(i = 0 ; i < nthreads; i++) { pthread_join(thread[i], NULL); } return 0; } ================================================ FILE: bench/multilat/CMakeLists.txt 
================================================ include_directories(${CMAKE_SOURCE_DIR}/src/lib) add_executable(multilat multilat.c) target_link_libraries(multilat nvmemul pthread) ================================================ FILE: bench/multilat/multilat.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #define _GNU_SOURCE #include #include #include #include #include //#include #include "thread.h" #include #include "pmalloc.h" #include "debug.h" //#include "stat.h" #define NDEBUG //#ifndef NDEBUG #include //#endif // packs the arguments received from user typedef struct { int mem_refs_dram; int mem_refs_nvm; int interleave_dram; int interleave_nvm; //int from_node; //int to_node; } arg_s; // for multi thread management #define MAX_NUM_THREADS 50 pthread_t thread_desc[MAX_NUM_THREADS]; //pthread_mutex_t mutex; // for CPU cache trashing and pointer chasing #include typedef struct { uint64_t val; char padding[0]; } element_t; typedef struct { uint64_t N; uint64_t element_size; element_t* head; } chain_t; uint64_t trash_cache(uint64_t N); chain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j); element_t* element(chain_t* chain, uint64_t index); void inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size); // factor is 10 (could be more), to make sure we have a buffer much bigger than CPU cache // the memory buffer is NOT shared among threads // for now the cache size is hardcoded as 20 MB #define NELEMS (10 * 20480000 / 64LLU) #define PAGESZ 4096 #define MAX_NUM_CHAINS 16 //#undef USE_HUGETLB #define SEED_IN 1 #define NCHAINS 1 /*extern inline hrtime_t hrtime_cycles(void); static inline void delay_cycles(hrtime_t cycles) { hrtime_t start, stop; start = hrtime_cycles(); do { stop = hrtime_cycles(); } while (stop - start < cycles); }*/ // for fixing thread affinity to a single CPU after allocating memory chains and binding it to the local or remote nodes static int max_number_of_cpus(void) { int n, cpus = 2048; size_t setsize = CPU_ALLOC_SIZE(cpus); cpu_set_t *set = CPU_ALLOC(cpus); if (!set) goto err; for (;;) { CPU_ZERO_S(setsize, set); /* the library version does not return size of cpumask_t */ n = 
syscall(SYS_sched_getaffinity, 0, setsize, set); if (n < 0 && cpus < 1024 * 1024) { CPU_FREE(set); cpus *= 2; set = CPU_ALLOC(cpus); if (!set) goto err; continue; } CPU_FREE(set); return n * 8; } err: printf("cannot determine NR_CPUS"); return 0; } static int bind_cpu(thread_t *thread) { size_t setsize; cpu_set_t *cur_cpuset; cpu_set_t *new_cpuset; int ncpus = max_number_of_cpus(); if (thread == NULL) { // if thread is NULL it means the emulator is disabled, return without setting CPU affinity //printf("thread self is null"); return 0; } if (ncpus == 0) { return 1; } setsize = CPU_ALLOC_SIZE(ncpus); cur_cpuset = CPU_ALLOC(ncpus); new_cpuset = CPU_ALLOC(ncpus); CPU_ZERO_S(setsize, cur_cpuset); CPU_ZERO_S(setsize, new_cpuset); CPU_SET_S(thread->cpu_id, setsize, new_cpuset); if (pthread_getaffinity_np(thread->pthread, setsize, cur_cpuset) != 0) { DBG_LOG(ERROR, "Cannot get thread tid [%d] affinity, pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id); return 1; } if (CPU_EQUAL(cur_cpuset, new_cpuset)) { //printf("No need to bind CPU\n"); return 0; } DBG_LOG(INFO, "Binding thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id); if (pthread_setaffinity_np(thread->pthread, setsize, new_cpuset) != 0) { DBG_LOG(ERROR, "Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id); return 1; } return 0; } uint64_t force_ldm_stalls(chain_t **C, int element_size, int access_size, int mem_refs, // number of pointers/elements to chase uint64_t max_nelems, // max number of available elements/pointers int it_n, // seed to calculate the first pointer to chase, used to avoid repeating // pointers during consecutive calls unsigned long *time_diff_ns) { uint64_t j, i; int nchains = SEED_IN; uint64_t sumv[MAX_NUM_CHAINS]; uint64_t nextp[MAX_NUM_CHAINS]; char *buf; uint64_t buf_size = 16384; int count = 0; uint64_t start; uint64_t it_limit; struct timespec time_start, 
time_end; assert(nchains < MAX_NUM_CHAINS); if (mem_refs <= 0) return 0; buf = (char*) malloc(buf_size); assert(buf != NULL); if (max_nelems > mem_refs) { it_limit = max_nelems / mem_refs; } else { it_limit = 1; } it_n = it_n % it_limit; start = it_n * mem_refs; if ((start + mem_refs) > max_nelems) { start = 0; } /* chase the pointers */ if (nchains == 1) { clock_gettime(CLOCK_MONOTONIC, &time_start); sumv[0] = 0; // chase pointers until the 'mem_refs' count, the pointer chasing will restart from beginning if 'mem_refs' // is greater than 'nelems' for (count = 0, i = start; count < mem_refs; i = element(C[0], i)->val, ++count) { __asm__(""); sumv[0] += element(C[0], i)->val; if (access_size > element_size) { read_element(C[0], i, buf, buf_size); } } clock_gettime(CLOCK_MONOTONIC, &time_end); } // else { // for (j=0; j < nchains; j++) { // sumv[j] = 0; // nextp[j] = 0; // } // for (; 0 != element(C[0], nextp[0])->val; ) { // for (j=0; j < nchains; j++) { // sumv[j] += element(C[j], nextp[j])->val; // if (access_size > element_size) { // read_element(C[j], nextp[j], buf, buf_size); // } // nextp[j] = element(C[j], nextp[j])->val; // } // } // } *time_diff_ns = ((time_end.tv_sec * 1000000000) + time_end.tv_nsec) - ((time_start.tv_sec * 1000000000) + time_start.tv_nsec); free(buf); return sumv[0]; } void thread_iter(int dram_refs, int nvm_refs, int interleave_dram, int interleave_nvm) { long it_n; unsigned long time_dram, time_nvm, total_time_dram_ns, total_time_nvm_ns; uint64_t seed; uint64_t j; chain_t *C_dram[MAX_NUM_CHAINS]; chain_t *C_nvm[MAX_NUM_CHAINS]; int missing_dram_refs, missing_nvm_refs; int dram_stalls, nvm_stalls; struct timespec task_time_start, task_time_end; unsigned long task_time_diff_ns; #ifndef NDEBUG pid_t tid = (pid_t) syscall(SYS_gettid); #endif assert(NELEMS < UINT64_MAX); for (j=0; j < NCHAINS; j++) { seed = SEED_IN + j*j; C_dram[j] = alloc_chain(seed, NELEMS, 64LLU, 0, 0); C_nvm[j] = alloc_chain(seed, NELEMS, 64LLU, 0, 1); __asm__(""); } 
bind_cpu(thread_self()); // cache must be trashed after bind_cpu() call trash_cache(NELEMS); total_time_dram_ns = 0; total_time_nvm_ns = 0; missing_dram_refs = dram_refs; missing_nvm_refs = nvm_refs; #ifndef NDEBUG printf("DRAM accesses to be made: %ld\n", dram_refs); printf("NVM accesses to be made: %ld\n", nvm_refs); #endif //delay_cycles(8000000000); //printf("STARTING MEASURES\n"); clock_gettime(CLOCK_MONOTONIC, &task_time_start); for (it_n = 0; (missing_dram_refs > 0) || (missing_nvm_refs > 0); ++it_n) { __asm__(""); // calculate the number o memory accesses to be made on each memory type if (missing_dram_refs > interleave_dram) { missing_dram_refs -= interleave_dram; dram_stalls = interleave_dram; } else { dram_stalls = missing_dram_refs; missing_dram_refs = 0; } if (missing_nvm_refs > interleave_nvm) { missing_nvm_refs -= interleave_nvm; nvm_stalls = interleave_nvm; } else { nvm_stalls = missing_nvm_refs; missing_nvm_refs = 0; } time_dram = 0; time_nvm = 0; // do memory accesses interleaved by dividing the number of accesses in smaller amount // as configured by user force_ldm_stalls((chain_t **)&C_dram, 64LLU, 8, dram_stalls, NELEMS, it_n, &time_dram); force_ldm_stalls((chain_t **)&C_nvm, 64LLU, 8, nvm_stalls, NELEMS, it_n, &time_nvm); total_time_dram_ns += time_dram; total_time_nvm_ns += time_nvm; #ifndef NDEBUG printf("%ld DRAM accesses took: %ld ns\n", dram_stalls, time_dram); printf("%ld NVM accesses took: %ld ns\n", nvm_stalls, time_nvm); #endif } clock_gettime(CLOCK_MONOTONIC, &task_time_end); task_time_diff_ns = ((task_time_end.tv_sec * 1000000000) + task_time_end.tv_nsec) - ((task_time_start.tv_sec * 1000000000) + task_time_start.tv_nsec); // the memory latency is the total time divided by the number of accesses for each memory type if (dram_refs > 0) total_time_dram_ns /= dram_refs; else total_time_dram_ns = 0; if (nvm_refs > 0) total_time_nvm_ns /= nvm_refs; else total_time_nvm_ns = 0; printf("DRAM latency: %ld ns\n", total_time_dram_ns); 
printf("NVM latency: %ld ns\n", total_time_nvm_ns); printf("Measure time: %.3lf ms\n", (double)task_time_diff_ns/1000000.0); printf("Expected time: %.3ld ms\n", ((total_time_dram_ns * dram_refs) + (total_time_nvm_ns * nvm_refs)) / 1000000); for (j=0; j < NCHAINS; j++) { free(C_dram[j]); free(C_nvm[j]); } } void *thread_fn(void *arg) { int interleave_dram = ((arg_s *) arg)->interleave_dram; int interleave_nvm = ((arg_s *) arg)->interleave_nvm; int dram_refs = ((arg_s *) arg)->mem_refs_dram; int nvm_refs = ((arg_s *) arg)->mem_refs_nvm; thread_iter(dram_refs, nvm_refs, interleave_dram, interleave_nvm); return 0; } void run_threads(int n_threads, int dram_refs, int nvm_refs, int interleaved_dram, int interleaved_nvm) { pthread_attr_t attr; int i; arg_s args; if ((n_threads > MAX_NUM_THREADS) || (n_threads <= 0)) { printf("INVALID RANGE:\n"); printf("\tMax number of threads is %d\n", MAX_NUM_THREADS); exit(-1); } if (dram_refs < 0 || nvm_refs < 0 || interleaved_dram < 0 || interleaved_nvm < 0) { printf("INVALID RANGE:\n"); printf("\tdram refs: %d, nvm refs: %d, interleaved dram refs: %d, interleaved nvm refs: %d\n", dram_refs, nvm_refs, interleaved_dram, interleaved_nvm); exit(-1); } if ((dram_refs > 0 && interleaved_dram == 0) || (nvm_refs > 0 && interleaved_nvm == 0)) { printf("INVALID ARGUMENTS:\n"); printf("\tnumber of accesses in sequence cannot be zero if the number of accesses for the same memory type is greater than zero.\n"); exit(-1); } if (dram_refs < interleaved_dram) { printf("INVALID ARGUMENTS:\n"); printf("\tnumber of DRAM accesses cannot be lower than the number of DRAM accesses in sequence\n"); exit(-1); } if (nvm_refs < interleaved_nvm) { printf("INVALID ARGUMENTS:\n"); printf("\tnumber of NVM accesses cannot be lower than the number of NVM accesses in sequence\n"); exit(-1); } if (pthread_attr_init(&attr) != 0) { printf("pthread_attr_init failed"); exit(-1); } //srand(time(NULL)); args.interleave_dram = interleaved_dram; args.interleave_nvm = 
interleaved_nvm; args.mem_refs_dram = dram_refs; args.mem_refs_nvm = nvm_refs; for (i = 0; i < n_threads; ++i) { pthread_create(&thread_desc[i], &attr, thread_fn, (void *)&args); } pthread_attr_destroy(&attr); for (i = 0; i < n_threads; ++i) { pthread_join(thread_desc[i], NULL); } } int main(int argn, char **argv) { int dram_refs; int nvm_refs; int interleaved_dram; int interleaved_nvm; int n_threads; if (argn != 6) { printf("INVALID ARGUMENTS:\n"); printf("\t%s [# threads] [# total dram accesses] [# total nvm accesses] [# dram accesses in sequence] [# nvm accesses in sequence]\n", argv[0]); return -1; } n_threads = atoi(argv[1]); dram_refs = atoi(argv[2]); nvm_refs = atoi(argv[3]); interleaved_dram = atoi(argv[4]); interleaved_nvm = atoi(argv[5]); run_threads(n_threads, dram_refs, nvm_refs, interleaved_dram, interleaved_nvm); return 0; } ================================================ FILE: bench/new_memlat/CMakeLists.txt ================================================ include_directories(${CMAKE_SOURCE_DIR}/src/lib) add_executable(new_memlat memlat.c) target_link_libraries(new_memlat nvmemul pthread) ================================================ FILE: bench/new_memlat/memlat.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include #include #include #include #include "model.h" #include "thread.h" #define MAX_NUM_THREADS 512 uint64_t g_seed, g_nchains, g_nelems, g_from_node_id, g_to_node_id, g_element_size, g_access_size; extern int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id); static uint64_t safe_strtoull(const char *s) { char *ep; uint64_t r; assert(NULL != s && '\0' != *s); r = strtoull(s, &ep, 10); assert('\0' == *ep); return r; } extern latency_model_t latency_model; #ifdef MEMLAT_SUPPORT extern __thread int tls_hw_local_latency; extern __thread int tls_hw_remote_latency; extern __thread uint64_t tls_global_remote_dram; extern __thread uint64_t tls_global_local_dram; static inline uint64_t ns_to_cycles(int cpu_speed_mhz, int ns) { return (cpu_speed_mhz * ns) / 1000; } #endif void* worker(void* arg) { int latency_ns; #ifdef MEMLAT_SUPPORT uint64_t exp_stalls; uint64_t calc_nvm_accesses; uint64_t detected_hw_lat; uint64_t actual_lat = 0; uint64_t total_time; uint64_t fixed_latency_ns = 0; uint64_t nvm_accesses = 0; uint64_t nvm_hw_latency; #endif latency_ns = measure_latency2(g_seed, g_nchains, g_nelems, g_element_size, g_access_size, g_from_node_id, g_to_node_id); printf("latency_ns: %d ns\n", latency_ns); #ifdef MEMLAT_SUPPORT total_time = g_nelems * latency_ns; if (thread_self()->virtual_node->dram_node != thread_self()->virtual_node->nvram_node) { detected_hw_lat = ns_to_cycles(thread_self()->cpu_speed_mhz, tls_hw_remote_latency); if (tls_global_remote_dram > 0) { actual_lat = thread_self()->stall_cycles / tls_global_remote_dram; fixed_latency_ns = total_time / tls_global_remote_dram; nvm_accesses = 
tls_global_remote_dram; } nvm_hw_latency = tls_hw_remote_latency; } else { detected_hw_lat = ns_to_cycles(thread_self()->cpu_speed_mhz, tls_hw_local_latency); if (tls_global_local_dram > 0) { actual_lat = thread_self()->stall_cycles / tls_global_local_dram; fixed_latency_ns = total_time / tls_global_local_dram; nvm_accesses = tls_global_local_dram; } nvm_hw_latency = tls_hw_local_latency; } exp_stalls = g_nelems * detected_hw_lat; calc_nvm_accesses = thread_self()->stall_cycles / detected_hw_lat; printf("target latency: %d ns\n", latency_model.read_latency); printf("Error: %3.1f%%\n", (double)(abs(latency_model.read_latency - latency_ns)*100) / (double)latency_model.read_latency); printf("target NVM accesses: %ld\n", g_nelems); printf("detected HW latency: %ld ns\n", nvm_hw_latency); printf("detected HW latency: %ld cycles (detected_hw_lat making use of cpu_speed_mhz)\n", detected_hw_lat); printf("expected CPU stalls: %ld cycles (target_nvm_accesses * detected_hw_lat)\n", exp_stalls); printf("actual CPU stalls: %ld cycles\n", thread_self()->stall_cycles); printf("calculated NVM accesses: %ld (actual_cpu_stalls / detected_hw_lat)\n", calc_nvm_accesses); if (nvm_accesses != 0) { printf("actual NVM accesses: %ld\n", nvm_accesses); printf("actual latency: %ld cyles (actual_stalls / actual_nvm_accesses)\n", actual_lat); printf("fixed measured latency: %ld ns (total_chasing_time / actual_nvm_accesses)\n", fixed_latency_ns); printf("fixed latency error: %3.1f%%\n", (double)(abs(latency_model.read_latency - fixed_latency_ns)*100) / (double)latency_model.read_latency); } else { fixed_latency_ns = total_time / calc_nvm_accesses; printf("fixed measured latency: %ld ns (total_chasing_time / calculated_nvm_accesses)\n", fixed_latency_ns); printf("fixed latency error: %3.1f%%\n", (double)(abs(latency_model.read_latency - fixed_latency_ns)*100) / (double)latency_model.read_latency); } #endif return NULL; } int main(int argc, char *argv[]) { int i; uint64_t nthreads; pthread_t 
thread[MAX_NUM_THREADS]; if (9 != argc) { fprintf(stderr, "usage: %s PRNGseed Nthreads Nchains Nelems SZelem SZaccess from_node to_node\n", argv[0]); return 1; } g_seed = safe_strtoull(argv[1]); nthreads = safe_strtoull(argv[2]); g_nchains = safe_strtoull(argv[3]); g_nelems = safe_strtoull(argv[4]); g_element_size = safe_strtoull(argv[5]); g_access_size = safe_strtoull(argv[6]); g_from_node_id = safe_strtoull(argv[7]); g_to_node_id = safe_strtoull(argv[8]); for (i = 0; i< nthreads; i++) { pthread_create(&thread[i], NULL, worker, NULL); } for(i = 0 ; i < nthreads; i++) { pthread_join(thread[i], NULL); } return 0; } ================================================ FILE: bench/new_memlat/memlat.sh ================================================ ################################################################# #Copyright 2016 Hewlett Packard Enterprise Development LP. #This program is free software; you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation; either version 2 of the License, or (at #your option) any later version. This program is distributed in the #hope that it will be useful, but WITHOUT ANY WARRANTY; without even #the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR #PURPOSE. See the GNU General Public License for more details. You #should have received a copy of the GNU General Public License along #with this program; if not, write to the Free Software Foundation, #Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ################################################################# #!/bin/bash # percentage of error as threshold to discard outliers, anything above this percentage will be discarded MAX_ERROR_PERCENTAGE=10 # max number of tries to execute memlat MAX_TRIES=10 TEMP_FILE=/tmp/tmp_memlat.out NVM_EMUL_PATH="`dirname $0`/../.." 
NELEMS=$1
TARGET_DRAM=$2

# Print the command-line synopsis and terminate with a failure status.
function usage() {
    echo "$0 [number of elements] [0=local dram|1=remote dram]"
    exit 1
}

# Succeed (status 0) only when $1 is a non-empty string of decimal digits.
function validate_decimal() {
    local re='^[0-9]+$'
    [[ $1 =~ $re ]]
}

# Validate the script arguments; on any problem print a message and exit
# through usage().
function check_parameters() {
    if [ $# -ne 2 ]; then
        echo "Incorrect arguments"
        usage
    fi

    if ! validate_decimal "${NELEMS}"; then
        echo "Invalid number of elements"
        usage
    fi

    # Make sure the value is numeric before comparing it numerically.
    if ! validate_decimal "${TARGET_DRAM}" || [ "${TARGET_DRAM}" -ne 0 -a "${TARGET_DRAM}" -ne 1 ]; then
        echo "Incorrect dram target"
        usage
    fi
}

# Compare the measured latency reported in ${TEMP_FILE} with the target
# latency. Returns 0 when the relative error is within
# ${MAX_ERROR_PERCENTAGE}, 1 otherwise -- including when the run did not
# print both values (e.g. the benchmark failed to start).
function verify_run {
    local target measured delta error

    target=$(awk '/target latency/ { print $3; exit }' "${TEMP_FILE}")
    measured=$(awk '/measured latency/ { print $4; exit }' "${TEMP_FILE}")

    if ! validate_decimal "${target}" || ! validate_decimal "${measured}"; then
        return 1
    fi

    if [ "${measured}" -gt "${target}" ]; then
        delta=$(( measured - target ))
    else
        delta=$(( target - measured ))
    fi

    if [ "${target}" -gt 0 ]; then
        error=$(( delta * 100 / target ))
    else
        error=0
    fi

    if [ "${error}" -gt "${MAX_ERROR_PERCENTAGE}" ]; then
        return 1
    fi
    return 0
}

############ MAIN ######################
check_parameters "$@"

# stays a failure unless at least one run is verified
ret=1

# execute memlat in loop until the result is within the threshold or the max tries is reached
for (( c=0; c<${MAX_TRIES}; c++ )); do
    "${NVM_EMUL_PATH}/scripts/runenv.sh" "${NVM_EMUL_PATH}/build/bench/new_memlat/new_memlat" 1 1 1 "${NELEMS}" 64 8 0 "${TARGET_DRAM}" &> "${TEMP_FILE}"
    verify_run
    ret=$?
    if [ ${ret} -eq 0 ]; then
        grep "measured latency" "${TEMP_FILE}"
        break
    fi
done

if [ ${ret} -ne 0 ]; then
    echo "Could not produce a valid run"
fi

rm -f "${TEMP_FILE}"
exit ${ret}
================================================
FILE: benchmark-tests/bandwidth-model-building.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

# Select the "performance" frequency governor on every CPU.
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# Install the bandwidth configuration as the active emulator configuration.
cp nvmemul-bandwidth.ini nvmemul.ini

# Discard a previously generated model. -f keeps a first run, where the file
# does not exist yet, from printing an error.
rm -f /tmp/bandwidth_model

# memlat is launched twice with identical arguments.
# NOTE(review): presumably the first run produces /tmp/bandwidth_model and the
# second one runs with it in place -- confirm against the library.
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0
================================================
FILE: benchmark-tests/memlat-bench-test-10M-single-socket.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

#awk '($1~/physical_nodes/) {print;}' nvmemul.ini

# Latency sweep on a single node: for every emulated read latency the script
# generates an nvmemul.ini from nvmemul-orig.ini, runs memlat ten times and
# appends one summary row (min / average / max measured latency and errors).

echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

dir_name_res=FULL-RESULTS-test
dir_name_sum=SUMMARY-RESULTS-test

# Recreate the output directories and drop temporary files of earlier runs.
rm -rf $dir_name_sum
mkdir $dir_name_sum
rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res

# Record the governor that is actually in effect.
cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor >> $dir_name_res/foo-runs-test

# One run with the unmodified configuration; its output goes to "foo".
cp nvmemul-orig.ini nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0 >foo

for numchains in 1; do
    for epoch in 10000; do
        summary_file=$dir_name_sum/summary-nvm-lat-accuracy-epoch-$epoch-numchains-$numchains.txt
        echo "#FORMAT #1_emul_lat(ns) #2_min_meas_lat(ns) #3_aver_meas_lat(ns) #4_max_meas_lat(ns) #5_aver_error(%) #6_max_error(%)" > $summary_file

        for lat in 200 300 400 500 600 700 800 900 1000; do
            full_file=$dir_name_res/full_results-$lat-$epoch-$numchains.txt
            lat_file=$dir_name_res/results-$lat-$epoch-$numchains.txt

            # Derive the configuration: input line 7 gets the read latency,
            # lines 9 and 10 get the epoch value and the physical_nodes line
            # is forced to "0". The values reach awk through the v=... operands,
            # which the BEGIN block reads back from ARGV.
            awk 'BEGIN {read_lat = substr(ARGV[2],3); epoch_lat = substr(ARGV[3],3);} (!(NR==7 || NR==9 || NR==10 || $1~/physical_nodes/)){ print;} (NR==7){ print $1,$2, read_lat,";";} (NR==9){ print $1,$2, epoch_lat,";";} (NR==10){ print $1,$2, epoch_lat,";";} ($1~/physical_nodes/) {print $1,$2,"\"0\""";";} ' nvmemul-orig.ini v=$lat v=$epoch > foo-nvmemul-$lat-$epoch.ini
            mv foo-nvmemul-$lat-$epoch.ini nvmemul.ini

            echo "lat epoch chains" $lat $epoch $numchains >> $dir_name_res/foo-runs

            # Ten repetitions per configuration, all appended to one file.
            for run in 1 2 3 4 5 6 7 8 9 10; do
                ../build/bench/memlat/memlat 1 1 $numchains 10000000 64 8 0 0 >> $full_file
            done

            # Keep only the latency lines, then reduce them to one summary row.
            grep latency_ns $full_file > $lat_file
            awk 'BEGIN {max = 0; min = 1000000; sum = 0; aver=0.0; max_error=0.0; aver_error=0.0;read_lat = substr(ARGV[2],3);epoch_lat = substr(ARGV[3],3); MPL = substr(ARGV[4],3); } ($2 > max){max = $2;} ($2 < min){min = $2;} {sum=sum+$2; if ($2 < read_lat*1.0) {error=read_lat -$2} else {error=$2 - read_lat}; if (error > max_error) max_error=error;} END {aver=sum/NR; if (aver < read_lat*1.0) {aver_error = (read_lat - aver)*100.0/read_lat} else {aver_error = (aver - read_lat )*100.0/read_lat}; print read_lat, min,aver,max, aver_error,max_error*100.0/read_lat;} ' $lat_file v=$lat v=$epoch v=$numchains >> $summary_file
        done
    done
done

#FORMAT_summary-results: #1_nvm_lat(ns) #2_min_nvm_lat(ns) #3_aver_nvm_lat(ns) #4_max_nvm_lat(ns) #5_aver_error(%) #6_max_error(%)
#parameter is nvm_lat
================================================
FILE: benchmark-tests/memlat-bench-test-10M.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

#awk '($1~/physical_nodes/) {print;}' nvmemul.ini

# Latency sweep for local and remote memory. On a machine that reports a
# single physical id the work is delegated to the single-socket script.

num_sockets=$(grep "physical id" /proc/cpuinfo | sort -u | wc -l)
if [ $num_sockets -eq 1 ]; then
    echo "Single Socket"
    ./memlat-bench-test-10M-single-socket.sh
    exit 0
fi

echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

dir_name_res=FULL-RESULTS-test
dir_name_sum=SUMMARY-RESULTS-test

# Recreate the output directories and drop temporary files of earlier runs.
rm -rf $dir_name_sum
mkdir $dir_name_sum
rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res

# Record the governor that is actually in effect.
cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor >> $dir_name_res/foo-runs-test

# One run with the unmodified configuration; its output goes to "foo".
cp nvmemul-orig.ini nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1 >foo

for conf in local remote; do
    # Last memlat argument: 0 for the "local" configuration, 1 otherwise.
    case $conf in
        local) confpar=0 ;;
        *)     confpar=1 ;;
    esac

    for numchains in 1; do
        for epoch in 10000; do
            summary_file=$dir_name_sum/summary-nvm-lat-accuracy-$conf-epoch-$epoch-numchains-$numchains.txt
            echo "#FORMAT #1_emul_lat(ns) #2_min_meas_lat(ns) #3_aver_meas_lat(ns) #4_max_meas_lat(ns) #5_aver_error(%) #6_max_error(%)" > $summary_file

            for lat in 200 300 400 500 600 700 800 900 1000; do
                full_file=$dir_name_res/full_results-$conf-$lat-$epoch-$numchains.txt
                lat_file=$dir_name_res/results-$conf-$lat-$epoch-$numchains.txt

                # Derive the configuration: input line 7 gets the read latency,
                # lines 9 and 10 get the epoch value, and physical_nodes becomes
                # "0" (local) or "0,1" (remote). The values reach awk through
                # the v=... operands, read back from ARGV in the BEGIN block.
                awk 'BEGIN {read_lat = substr(ARGV[2],3); epoch_lat = substr(ARGV[3],3); config = substr(ARGV[4],3);} (!(NR==7 || NR==9 || NR==10 || $1~/physical_nodes/)){ print;} (NR==7){ print $1,$2, read_lat,";";} (NR==9){ print $1,$2, epoch_lat,";";} (NR==10){ print $1,$2, epoch_lat,";";} ($1~/physical_nodes/ && config ~ /local/) {print $1,$2,"\"0\""";";} ($1~/physical_nodes/ && config ~ /remote/) {print $1,$2,"\"0,1\""";";} ' nvmemul-orig.ini v=$lat v=$epoch v=$conf > foo-nvmemul-$lat-$epoch.ini
                mv foo-nvmemul-$lat-$epoch.ini nvmemul.ini

                echo "lat epoch chains" $lat $epoch $numchains >> $dir_name_res/foo-runs

                # Ten repetitions per configuration, all appended to one file.
                for run in 1 2 3 4 5 6 7 8 9 10; do
                    ../build/bench/memlat/memlat 1 1 $numchains 10000000 64 8 0 $confpar >> $full_file
                done

                # Keep only the latency lines, then reduce them to one summary row.
                grep latency_ns $full_file > $lat_file
                awk 'BEGIN {max = 0; min = 1000000; sum = 0; aver=0.0; max_error=0.0; aver_error=0.0;read_lat = substr(ARGV[2],3);epoch_lat = substr(ARGV[3],3); MPL = substr(ARGV[4],3); } ($2 > max){max = $2;} ($2 < min){min = $2;} {sum=sum+$2; if ($2 < read_lat*1.0) {error=read_lat -$2} else {error=$2 - read_lat}; if (error > max_error) max_error=error;} END {aver=sum/NR; if (aver < read_lat*1.0) {aver_error = (read_lat - aver)*100.0/read_lat} else {aver_error = (aver - read_lat )*100.0/read_lat}; print read_lat, min,aver,max, aver_error,max_error*100.0/read_lat;} ' $lat_file v=$lat v=$epoch v=$numchains >> $summary_file
            done
        done
    done
done

#FORMAT_summary-results: #1_nvm_lat(ns) #2_min_nvm_lat(ns) #3_aver_nvm_lat(ns) #4_max_nvm_lat(ns) #5_aver_error(%) #6_max_error(%)
#parameter is nvm_lat
================================================
FILE: benchmark-tests/memlat-orig-lat-test-single-socket.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

#awk '($1~/physical_nodes/) {print;}' nvmemul.ini

# Collects the latency that memlat reports in its "measuring latency" line
# over twenty runs and writes min / average / max to final-hw-latency.txt.

echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

dir_name_res=ORIG-lat-test

# Drop temporary files of earlier runs and recreate the result directory.
rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res

run_log=$dir_name_res/foo-hw-latency.txt
lat_lines=$dir_name_res/foo
lat_list=$dir_name_res/list-hw-latency.txt
lat_final=$dir_name_res/final-hw-latency.txt

# One run with the debug configuration whose output is not captured.
cp nvmemul-debug.ini nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0

for run in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do
    ../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 0 > $run_log
    grep "measuring latency: latency is" $run_log > $lat_lines
    # Field 7 of the first matching line is the value that gets recorded.
    awk 'NR==1 {local=$7;} END {print local}' $lat_lines >> $lat_list
done

echo "#FORMAT:#1_min #2_aver #3_max" > $lat_final
awk 'BEGIN {max1 = 0.0; min1 = 10000000.0; sum1 = 0.0;} ($1 > max1){max1 = $1;} ($1 < min1){min1 = $1;} {sum1=sum1+$1;sum2=sum2+$2;} END {print min1, sum1/NR, max1;}' $lat_list >> $lat_final

# Remove the intermediate files, keeping the list and the final summary.
rm $dir_name_res/foo*
================================================
FILE: benchmark-tests/memlat-orig-lat-test.sh
================================================
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details.
You #should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
#!/bin/bash

#awk '($1~/physical_nodes/) {print;}' nvmemul.ini

# Collects the two latencies that memlat reports in its "measuring latency"
# lines (recorded as local and remote) over twenty runs and writes
# min / average / max of each to final-hw-latency.txt. On a machine that
# reports a single physical id the single-socket script is used instead.

num_sockets=$(grep "physical id" /proc/cpuinfo | sort -u | wc -l)
if [ $num_sockets -eq 1 ]; then
    echo "Single Socket"
    ./memlat-orig-lat-test-single-socket.sh
    exit 0
fi

echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

dir_name_res=ORIG-lat-test

# Drop temporary files of earlier runs and recreate the result directory.
rm -f foo*
rm -rf $dir_name_res
mkdir $dir_name_res

run_log=$dir_name_res/foo-hw-latency.txt
lat_lines=$dir_name_res/foo
lat_list=$dir_name_res/list-hw-latency.txt
lat_final=$dir_name_res/final-hw-latency.txt

# One run with the debug configuration whose output is not captured.
cp nvmemul-debug.ini nvmemul.ini
../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1

#FORMAT: ns
#FORMAT: min_local #2_aver_local max_local min_remote #5_aver_remote max_remote
#FORMAT:

for run in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do
    ../build/bench/memlat/memlat 1 1 1 1000000 64 8 0 1 > $run_log
    grep "measuring latency: latency is" $run_log > $lat_lines
    # Field 7 of the first matching line is recorded as local, of the second as remote.
    awk 'NR==1 {local=$7;} NR==2 {remote=$7;} END {print local , remote}' $lat_lines >> $lat_list
done

echo "#FORMAT:#1_min_local #2_aver_local #3_max_local #4_min_remote #5_aver_remote #6_max_remote" > $lat_final
awk 'BEGIN {max1 = 0.0; min1 = 10000000.0; max2 = 0.0; min2 = 10000000.0; sum1 = 0.0; sum2 = 0.0;} ($1 > max1){max1 = $1;} ($1 < min1){min1 = $1;} ($2 > max2){max2 = $2;} ($2 < min2){min2 = $2;} {sum1=sum1+$1;sum2=sum2+$2;} END {print min1, sum1/NR, max1, min2, sum2/NR, max2 ;}' $lat_list >> $lat_final

# Remove the intermediate files, keeping the list and the final summary.
rm $dir_name_res/foo*

#FORMAT: ns
#FORMAT:#1_min_local #2_aver_local #3_max_local #4_min_remote #5_aver_remote #6_max_remote
================================================
FILE: benchmark-tests/nvmemul-bandwidth.ini
================================================
#
Configuration file latency: { enable = true; inject_delay = true; read = 1000; write = 1000; max_epoch_duration_us = 10000; min_epoch_duration_us = 10000; calibration = false; }; bandwidth: { enable = true; model = "/tmp/bandwidth_model"; read = 2000; write = 2000; }; topology: { mc_pci = "/tmp/mc_pci_bus"; physical_nodes = "0"; hyperthreading = true; # do not use multiple hardware threads per core }; statistics: { enable = true; #file = "/tmp/statistics"; }; debug: { # debugging level level = 5; verbose = 0; # modules set to True produce debugging output module: { all = False; }; }; ================================================ FILE: benchmark-tests/nvmemul-debug.ini ================================================ # Configuration file latency: { enable = true; inject_delay = true; read = 1000 ; write = 1000; max_epoch_duration_us = 10000 ; min_epoch_duration_us = 10000 ; calibration = false; }; bandwidth: { enable = false; model = "/tmp/bandwidth_model"; read = 2000; write = 2000; }; topology: { mc_pci = "/tmp/mc_pci_bus"; physical_nodes = "0,1"; hyperthreading = true; # do not use multiple hardware threads per core }; statistics: { enable = true; #file = "/tmp/statistics"; }; debug: { # debugging level level = 5; verbose = 0; # modules set to True produce debugging output module: { all = False; }; }; ================================================ FILE: benchmark-tests/nvmemul-orig.ini ================================================ # Configuration file latency: { enable = true; inject_delay = true; read = 1000 ; write = 1000; max_epoch_duration_us = 10000 ; min_epoch_duration_us = 10000 ; calibration = false; }; bandwidth: { enable = false; model = "/tmp/bandwidth_model"; read = 2000; write = 2000; }; topology: { mc_pci = "/tmp/mc_pci_bus"; physical_nodes = "0,1"; hyperthreading = true; # do not use multiple hardware threads per core }; statistics: { enable = true; #file = "/tmp/statistics"; }; debug: { # debugging level level = 3; verbose = 0; # modules 
set to True produce debugging output module: { all = False; }; }; ================================================ FILE: benchmark-tests/nvmemul.ini ================================================ # Configuration file latency: { enable = true; inject_delay = true; read = 300 ; write = 200; max_epoch_duration_us = 10000 ; min_epoch_duration_us = 10000 ; calibration = false; }; bandwidth: { enable = false; model = "/tmp/bandwidth_model"; read = 2000; write = 2000; }; topology: { mc_pci = "/tmp/mc_pci_bus"; physical_nodes = "0,1"; hyperthreading = true; # do not use multiple hardware threads per core }; statistics: { enable = true; #file = "/tmp/statistics"; }; debug: { # debugging level level = 5; verbose = 0; # modules set to True produce debugging output module: { all = False; }; }; ================================================ FILE: license.txt ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ ================================================ FILE: nvmemul-orig.ini ================================================ # Configuration file latency: { enable = true; inject_delay = true; read = 1000 ; write = 1000; max_epoch_duration_us = 10000 ; min_epoch_duration_us = 10000 ; calibration = false; }; bandwidth: { enable = false; model = "/tmp/bandwidth_model"; read = 2000; write = 2000; }; topology: { mc_pci = "/tmp/mc_pci_bus"; physical_nodes = "0,1"; hyperthreading = true; # do not use multiple hardware threads per core }; statistics: { enable = true; #file = "/tmp/statistics"; }; debug: { # debugging level level = 3; verbose = 0; # modules set to True produce debugging output module: { all = False; }; }; ================================================ FILE: nvmemul.dox ================================================ /** @mainpage Quartz: A Lightweight Performance Emulator for Persistent Memory Software. \section section-intro Introduction Quartz: A DRAM-based performance emulation platform that leverages features available in commodity hardware to emulate different latency and bandwidth characteristics of future byte-addressable NVM technologies. 
*/ ================================================ FILE: nvmemul.ini ================================================ # Configuration file latency: { enable = true; inject_delay = true; read = 1000 ; write = 1000; max_epoch_duration_us = 10000 ; min_epoch_duration_us = 10000 ; calibration = false; }; bandwidth: { enable = false; model = "/tmp/bandwidth_model"; read = 500; write = 500; }; topology: { mc_pci = "/tmp/mc_pci_bus"; physical_nodes = "0,1"; hyperthreading = true; # do not use multiple hardware threads per core }; statistics: { enable = true; #file = "/tmp/statistics"; }; debug: { # debugging level level = 1; verbose = 0; # modules set to True produce debugging output module: { all = False; }; }; ================================================ FILE: scripts/install.sh ================================================ #!/bin/bash ################################################################# #Copyright 2016 Hewlett Packard Enterprise Development LP. #This program is free software; you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation; either version 2 of the License, or (at #your option) any later version. This program is distributed in the #hope that it will be useful, but WITHOUT ANY WARRANTY; without even #the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR #PURPOSE. See the GNU General Public License for more details. You #should have received a copy of the GNU General Public License along #with this program; if not, write to the Free Software Foundation, #Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ################################################################# PAPI_MAJOR=5 PAPI_MINOR=1 PAPI_RELEASE=1 CMAKE_MAJOR=2 CMAKE_MINOR=8 function install_deps_rpm() { yum install -q -y numactl-devel libconfig libconfig-devel cmake kernel-devel-`uname -r` msr-tools uthash-devel if [ $? 
-ne 0 ]; then
        echo "Dependencies installation failed"
        exit -1
    fi
}

# Install the build dependencies with apt-get; terminates the script when the
# installation fails.
function install_deps_deb() {
    apt-get install -y libnuma-dev libconfig-dev cmake msr-tools uthash-dev
    if [ $? -ne 0 ]; then
        echo "Dependencies installation failed"
        exit -1
    fi
}

# Terminate the script unless the installed PAPI is exactly version
# ${PAPI_MAJOR}.${PAPI_MINOR}.${PAPI_RELEASE}. The version is taken from the
# third word of the papi_version output, up to the first '-'.
function check_supported_papi() {
    local version major minor release

    version=`papi_version | cut -d ' ' -f3 | cut -d '-' -f1`
    major=`echo "${version}" | cut -d '.' -f1`
    minor=`echo "${version}" | cut -d '.' -f2`
    release=`echo "${version}" | cut -d '.' -f3`

    if [ ${major} -ne ${PAPI_MAJOR} ] ||
       [ ${minor} -ne ${PAPI_MINOR} ] ||
       [ ${release} -ne ${PAPI_RELEASE} ]; then
        echo "PAPI version (${major}.${minor}.${release}) not supported (=${PAPI_MAJOR}.${PAPI_MINOR}.${PAPI_RELEASE})"
        exit -1
    fi
}

# Terminate the script when the installed CMake is older than
# ${CMAKE_MAJOR}.${CMAKE_MINOR}. The version is taken from the third word of
# the first line printed by "cmake -version", up to the first '-'.
function check_supported_cmake() {
    local version major minor

    version=`cmake -version | head -1 | cut -d ' ' -f3 | cut -d '-' -f1`
    major=`echo "${version}" | cut -d '.' -f1`
    minor=`echo "${version}" | cut -d '.' -f2`

    if [ ${major} -lt ${CMAKE_MAJOR} ]; then
        echo "CMake version (${major}.${minor}) not supported (>=${CMAKE_MAJOR}.${CMAKE_MINOR})"
        exit -1
    fi

    # Same major version: the minor version decides.
    if [ ${major} -eq ${CMAKE_MAJOR} ]; then
        if [ ${minor} -lt ${CMAKE_MINOR} ]; then
            echo "CMake version (${major}.${minor}) not supported (>=${CMAKE_MAJOR}.${CMAKE_MINOR})"
            exit -1
        fi
    fi
}

# Run the enabled tool version checks (the PAPI check is switched off).
function check_supported_versions() {
    check_supported_cmake
    # check_supported_papi
}

#################### MAIN ####################

if [ $(id -u) -ne 0 ]; then
    echo "You must be root to execute this script"
    exit -1
fi

# Pick the package manager from the distribution marker files.
if [ -f /etc/redhat-release ] || [ -f /etc/centos-release ]; then
    install_deps_rpm
elif [ -f /etc/debian_version ] || [ -f /etc/debian-release ]; then
    install_deps_deb
else
    echo "Linux distribution not supported"
    exit -1
fi

check_supported_versions
================================================
FILE: scripts/runenv.sh
================================================
#!/bin/bash
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
NVM_EMUL_PATH="`dirname $0`/.."
# Runs the command given as arguments with the emulator library preloaded.
# Before that it selects the "performance" governor, disables Turbo Boost and,
# on kernel major version 4 or later, writes 2 to the rdpmc sysfs file.
if [ -z "$1" ]; then
    echo "runenv.sh [cmd to run]"
    exit 1
fi

rootdir="$NVM_EMUL_PATH"
bindir=$rootdir"/build"

governor_file=/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
if [ -f ${governor_file} ]; then
    current_scaling=$(cat ${governor_file})
    # cpu0 is used as the indicator; when it is not in performance mode the
    # governor of every CPU is rewritten.
    if [ "${current_scaling}" != "performance" ]; then
        for cpu_file in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
            echo "performance" | sudo tee ${cpu_file} > /dev/null
        done
    fi
fi

$rootdir/scripts/turboboost.sh disable

v=$(uname -r | cut -d '.' -f1)
if [ "$v" -ge 4 ]; then
    echo "2" | sudo tee /sys/bus/event_source/devices/cpu/rdpmc
fi

# Check the library before exporting LD_PRELOAD so that the environment is
# only modified when the library can actually be preloaded.
preload_lib=$bindir"/src/lib/libnvmemul.so"
if [ ! -f "${preload_lib}" ]; then
    echo "Library not found. Compile the emulator's library first."
    exit -1
fi

export LD_PRELOAD=${preload_lib}
export NVMEMUL_INI=$rootdir"/nvmemul.ini"

echo $LD_PRELOAD
echo $NVMEMUL_INI

# execute the command passed as argument
# "$@" hands every argument over unchanged, including arguments that contain
# whitespace.
"$@"
================================================
FILE: scripts/setupdev.sh
================================================
#!/bin/bash
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or (at
#your option) any later version. This program is distributed in the
#hope that it will be useful, but WITHOUT ANY WARRANTY; without even
#the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#PURPOSE. See the GNU General Public License for more details. You
#should have received a copy of the GNU General Public License along
#with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#################################################################
NVM_EMUL_PATH="`dirname $0`/.."
device_name="nvmemul"
device_module_name=${device_name}".ko"
device_path="/dev/${device_name}"
device_module_path=`find ${NVM_EMUL_PATH}/build -name ${device_module_name}`

# Insert the kernel module and create its character device node at
# ${device_path}. Every failure prints a diagnostic and terminates the script.
function loaddev {
    if [ -z "${device_module_path}" ]; then
        echo "Module not found. Compile the emulator's source code first."
        exit -1
    fi

    /sbin/insmod ${device_module_path} 2> /dev/null
    if [ $? -ne 0 ]; then
        # Tell an already-loaded module apart from a real loading failure.
        lsmod | grep ${device_name} > /dev/null
        if [ $? -eq 0 ]; then
            echo "Kernel module already loaded, please reload it."
            exit 1
        fi
        echo "Kernel module loading failed"
        exit 1
    fi

    # Match the device name exactly so that another device whose name merely
    # contains ${device_name} cannot contribute a second major number.
    device_major=`awk -v name="${device_name}" '$2 == name { print $1 }' /proc/devices`
    if [ -z "${device_major}" ]; then
        echo "Failed to detect module major"
        exit 1
    fi

    rm -f ${device_path}
    if [ $? -ne 0 ]; then
        echo "Failed to delete kernel module device file"
        exit 1
    fi

    # Without the device node, or without access to it, the module is of no
    # use, so these failures are reported instead of being ignored.
    if ! mknod ${device_path} c ${device_major} 0; then
        echo "Failed to create kernel module device file"
        exit 1
    fi
    if ! chmod a+wr ${device_path}; then
        echo "Failed to set permissions on kernel module device file"
        exit 1
    fi

    lsmod | grep ${device_name} > /dev/null
    if [ $? -eq 0 ]; then
        echo "Kernel module loaded successfully"
    else
        echo "kernel module loading failed"
        exit 1
    fi
}

# Remove the kernel module and its device node. The rmmod result is not
# checked (the module may not be loaded); success is decided by the removal
# of the device node.
function unloaddev {
    /sbin/rmmod ${device_name} 2> /dev/null
    rm -f ${device_path}
    if [ $? -eq 0 ]; then
        echo "Kernel module unloaded successfully"
    else
        echo "Failed to delete kernel module device file"
        exit 1
    fi
}

# Print the command-line synopsis; the actions are the ones the dispatcher
# below accepts (also as l, u and r).
function help() {
    echo "$0 <load|unload|reload>"
}

### MAIN ###

if [ $(id -u) -ne 0 ]; then
    echo "You must be root to execute this script"
    exit -1
fi

if [ $# -eq 0 ]; then
    help
    exit 1
fi

if [ "$1" = "load" ] || [ "$1" = "l" ]; then
    loaddev
elif [ "$1" = "unload" ] || [ "$1" = "u" ]; then
    unloaddev
elif [ "$1" = "reload" ] || [ "$1" = "r" ]; then
    unloaddev
    loaddev
else
    help
    exit 1
fi

exit 0
================================================
FILE: scripts/turboboost.sh
================================================
#!/bin/bash
#################################################################
#Copyright 2016 Hewlett Packard Enterprise Development LP.
#This program is free software; you can redistribute it and/or modify #it under the terms of the GNU General Public License as published by #the Free Software Foundation; either version 2 of the License, or (at #your option) any later version. This program is distributed in the #hope that it will be useful, but WITHOUT ANY WARRANTY; without even #the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR #PURPOSE. See the GNU General Public License for more details. You #should have received a copy of the GNU General Public License along #with this program; if not, write to the Free Software Foundation, #Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ################################################################# function usage() { echo "$0 [target CPU id]" echo -e "\tfunctions:" echo -e "\t\t check: verifies if a given CPU id has Turbo Boost enabled" echo -e "\t\t disable: disables a given CPU id or all CPUs if not specified" echo -e "\t\t enabled: enables a given CPU id or all CPUs if not specified" } function verify_cpu_id() { re='^[0-9]+$' if ! [[ $1 =~ $re ]]; then echo "CPU id is not a number" exit 1 fi } function check_msr_module() { lsmod | grep msr > /dev/null if [ $? -ne 0 ]; then # some systems need this, others don't sudo modprobe msr &> /dev/null #if [ $? 
-ne 0 ]; then # echo "Failed to load MSR module" # exit 1 #fi fi } function check() { cpu=$1 if [ -z "${cpu}" ]; then usage exit 1 fi cpus=$(lscpu | sed -n 4p | awk '{ print $2 }') if [ ${cpu} -ge ${cpus} ]; then echo "CPU id out of range" exit 1 fi disabled=$(sudo rdmsr -p${cpu} 0x1a0 -f 38:38) if [ "${disabled}" == "1" ]; then echo "Turbo Boost for processor ${cpu} is disabled" else echo "Turbo Boost for processor ${cpu} is enabled" fi } function enable() { cpu=$1 cpus=$(lscpu | sed -n 4p | awk '{ print $2 }') if [ -z "${cpu}" ]; then for (( i=0; i<${cpus}; i++ )); do sudo wrmsr -p$i 0x1a0 0x850089 done echo "Turbo Boost enabled for all CPUs" else if [ ${cpu} -ge ${cpus} ]; then echo "CPU id out of range" exit 1 fi sudo wrmsr -p${cpu} 0x1a0 0x850089 echo "Turbo Boost enabled for CPU ${cpu}" fi } function disable() { cpu=$1 cpus=$(lscpu | sed -n 4p | awk '{ print $2 }') if [ -z "${cpu}" ]; then for (( i=0; i<${cpus}; i++ )); do sudo wrmsr -p$i 0x1a0 0x4000850089; done echo "Turbo Boost disabled for all CPUs" else if [ ${cpu} -ge ${cpus} ]; then echo "CPU id out of range" exit 1 fi sudo wrmsr -p${cpu} 0x1a0 0x4000850089; echo "Turbo Boost disabled for CPU ${cpu}" fi } ### MAIN ### if [ $# -eq 0 ]; then usage exit 1 fi funct=$1 target_cpu=$2 check_msr_module if [ ! 
-z "${target_cpu}" ]; then verify_cpu_id ${target_cpu} fi case ${funct} in "enable") enable ${target_cpu} ;; "disable") disable ${target_cpu} ;; "check") check ${target_cpu} ;; *) usage exit 1 esac exit 0 ================================================ FILE: src/CMakeLists.txt ================================================ add_subdirectory(lib) add_subdirectory(dev) ================================================ FILE: src/dev/CMakeLists.txt ================================================ # Build NVM Emulation device driver (using Kbuild Makefile) set(DEV_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(DEV_BIN_DIR "${CMAKE_CURRENT_BINARY_DIR}") set(DEV_KERNEL_MODULE "${DEV_BIN_DIR}/nvmemul.ko") mark_as_advanced(DEV_DIR DEV_BIN_DIR) # We invoke make in build folder to keep the glog's source folder clean. file(MAKE_DIRECTORY ${DEV_BIN_DIR}) add_custom_command(OUTPUT ${DEV_KERNEL_MODULE} COMMAND ${CMAKE_COMMAND} -E copy_directory ${DEV_DIR} ${DEV_BIN_DIR} COMMAND ${CMAKE_MAKE_PROGRAM} -j COMMENT [Build-NVM Emulation Device] WORKING_DIRECTORY "${DEV_BIN_DIR}" DEPENDS ${DEV_DIR}/pmc.c # just to see if it has been overwritten ) # we use add_custom_command for the build itself because otherwise we have to build it # every time. the following add_custom_target gives a name for the output. 
add_custom_target(dev_build ALL DEPENDS ${DEV_KERNEL_MODULE})
================================================
FILE: src/dev/Makefile
================================================
# build modules
obj-m = nvmemul.o
nvmemul-objs = pmc.o

# use the kernel build system
KERNEL_VERSION := `uname -r`
KERNEL_SOURCE := /lib/modules/$(KERNEL_VERSION)/build
SRCDIR=`pwd`
OBJDIR=`pwd`

# all and clean name actions, not files.
.PHONY: all clean

# $(MAKE) instead of a literal "make": the sub-make is then the same make
# program as the parent and takes part in its parallel job handling (the
# CMake build invokes this Makefile with -j).
all:
	$(MAKE) -C $(KERNEL_SOURCE) M=$(OBJDIR) modules

clean:
	$(MAKE) -C $(KERNEL_SOURCE) M=$(OBJDIR) clean
================================================
FILE: src/dev/ioctl_query.h
================================================
/*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
/* Magic ("type") byte shared by every nvmemul ioctl request code. */
#define MYDEV_MAGIC (0xAA)

/* Argument of IOCTL_SETCOUNTER. */
typedef struct {
    unsigned int counter_id; /* counter index; the driver (pmc.c) rejects values above 3 */
    unsigned int event_id;   /* value the driver writes to the counter's event-select MSR */
} ioctl_query_setcounter_t;

/* Argument of IOCTL_SETPCI and IOCTL_GETPCI: one word of PCI configuration space. */
typedef struct {
    unsigned int bus_id;      /* compared against the bus number of each PCI bus */
    unsigned int device_id;   /* device part of the devfn built with PCI_DEVFN() */
    unsigned int function_id; /* function part of the devfn built with PCI_DEVFN() */
    unsigned int offset;      /* offset of the word inside the configuration space */
    unsigned int val;         /* SETPCI: value to write (truncated to 16 bits by the driver);
                                 GETPCI: filled in by the driver with the word read */
} ioctl_query_setgetpci_t;

/*
 * Request codes understood by the driver's ioctl handler.
 * NOTE(review): the size argument is a pointer type rather than the struct
 * type, and the two "set" commands are declared with _IOR although the driver
 * copies their argument *from* user space. Both look unintended, but the
 * encoded numbers are shared between the module and its user-space clients,
 * so confirm every user before changing them.
 */
#define IOCTL_SETCOUNTER _IOR(MYDEV_MAGIC, 0, ioctl_query_setcounter_t *)
#define IOCTL_SETPCI _IOR(MYDEV_MAGIC, 1, ioctl_query_setgetpci_t *)
#define IOCTL_GETPCI _IOWR(MYDEV_MAGIC, 2, ioctl_query_setgetpci_t *)
***************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ioctl_query.h" static long pmc_ioctl(struct file *f, unsigned int cmd, unsigned long arg); //unsigned long read_cr4(void); //void write_cr4(unsigned long); #ifndef read_cr4 #define read_cr4 native_read_cr4 #endif #ifndef write_cr4 #define write_cr4 native_write_cr4 #endif struct file_operations pmc_fops = { .unlocked_ioctl = pmc_ioctl, .compat_ioctl = pmc_ioctl, }; static const char* module_name = "nvmemul"; static int mod_major = 0; static const int NVMEMUL_MAJOR = 0; const const int PERFCTR0 = 0xc1; const const int PERFEVENTSEL0 = 0x186; void pmc_set_pce_bit(void* arg) { unsigned long cr4reg; cr4reg = read_cr4(); cr4reg |= 0x100; // setting the PCE bit write_cr4(cr4reg); } int pmc_init_module(void) { printk(KERN_INFO "%s: Loading. Initializing...\n", module_name); if ((mod_major = register_chrdev(NVMEMUL_MAJOR, module_name, &pmc_fops)) == -EBUSY) { printk(KERN_INFO "%s: Unable to get major for %s device\n", module_name, module_name); return -EIO; } if (mod_major <= 0) { printk(KERN_INFO "%s: Unable to get major for %s device\n", module_name, module_name); return -EIO; } printk(KERN_INFO "%s: major is %d\n", module_name, mod_major); /* * In order to use the rdpmc instruction in user mode, we need to set the * PCE bit of CR4. PCE is 8th bit of cr4, and 256 is 2 << 8 */ pmc_set_pce_bit(NULL); smp_call_function(pmc_set_pce_bit, NULL, 1); return 0; } void pmc_exit_module(void) { printk(KERN_INFO "%s: Unloading. 
Cleaning up...\n", module_name); /* Freeing the major number */ unregister_chrdev(mod_major, module_name); } struct counter_s { int counter_id; unsigned long val; }; /* * pmc_clear clears the PMC specified by counter * counter = 0 => perfctr0 * counter = 1 => perfctr1 * it uses WRMSR to write the values in the counters */ static void __pmc_clear(int counter_id) { int counterRegister = PERFCTR0 + counter_id; /* clear the old register */ __asm__ __volatile__("mov %0, %%ecx\n\t" "xor %%edx, %%edx\n\t" "xor %%eax, %%eax\n\t" "wrmsr\n\t" : /* no outputs */ : "m" (counterRegister) : "eax", "ecx", "edx" /* all clobbered */); } static void pmc_clear(void* arg) { struct counter_s* counter = (struct counter_s*) arg; __pmc_clear(counter->counter_id); } void pmc_clear_all_cpu(int counter_id) { struct counter_s counter = { counter_id, 0}; pmc_clear((void*) &counter); smp_call_function(pmc_clear, (void*) &counter, 1); } /* * This function writes the value specified by the arg to the counter * indicated by counter */ static void __set_counter(int counter_id, unsigned long val) { int selectionRegister = PERFEVENTSEL0 + counter_id; __pmc_clear(counter_id); /* set the value */ __asm__ __volatile__("mov %0, %%ecx\n\t" /* ecx contains the number of the MSR to set */ "xor %%edx, %%edx\n\t"/* edx contains the high bits to set the MSR to */ "mov %1, %%eax\n\t" /* eax contains the low bits to set the MSR to */ "wrmsr\n\t" : /* no outputs */ : "m" (selectionRegister), "m" (val) : "eax", "ecx", "edx" /* clobbered */); } void set_counter(void* arg) { struct counter_s* counter = (struct counter_s*) arg; __set_counter(counter->counter_id, counter->val); } void set_counter_all_cpu(int counter_id, unsigned long arg) { struct counter_s counter = { counter_id, arg}; set_counter((void*) &counter); smp_call_function(set_counter, (void*) &counter, 1); } static long pmc_ioctl_setcounter(struct file* f, unsigned int cmd, unsigned long arg) { ioctl_query_setcounter_t q; if (copy_from_user(&q, 
(ioctl_query_setcounter_t*) arg, sizeof(ioctl_query_setcounter_t))) { return -EFAULT; } if ((q.counter_id < 0) || (q.counter_id > 3)) { printk(KERN_INFO "%s: set_counter illegal value 0x%x for counter\n", module_name, q.counter_id); return -ENXIO; } /* disable counter */ set_counter_all_cpu(q.counter_id, 0); pmc_clear_all_cpu(q.counter_id); /* set counter */ set_counter_all_cpu(q.counter_id, q.event_id); printk(KERN_INFO "%s: setcounter counter_id: 0x%x event_id=0x%x\n", module_name, q.counter_id, q.event_id); return 0; } static long pmc_ioctl_setpci(struct file* f, unsigned int cmd, unsigned long arg) { ioctl_query_setgetpci_t q; struct pci_bus *bus = NULL; if (copy_from_user(&q, (ioctl_query_setgetpci_t*) arg, sizeof(ioctl_query_setgetpci_t))) { return -EFAULT; } while ((bus = pci_find_next_bus(bus))) { if (q.bus_id == bus->number) { pci_bus_write_config_word(bus, PCI_DEVFN(q.device_id, q.function_id), q.offset, (u16) q.val); printk(KERN_INFO "%s: setpci bus_id=0x%x device_id=0x%x, function_id=0x%x, val=0x%x\n", module_name, q.bus_id, q.device_id, q.function_id, q.val); return 0; } } return -ENXIO; } static long pmc_ioctl_getpci(struct file* f, unsigned int cmd, unsigned long arg) { ioctl_query_setgetpci_t q; struct pci_bus *bus = NULL; if (copy_from_user(&q, (ioctl_query_setgetpci_t*) arg, sizeof(ioctl_query_setgetpci_t))) { return -EFAULT; } while ((bus = pci_find_next_bus(bus))) { if (q.bus_id == bus->number) { unsigned int val = 0; pci_bus_read_config_word(bus, PCI_DEVFN(q.device_id, q.function_id), q.offset, (u16*) &val); printk(KERN_INFO "%s: getpci bus_id 0x%x device_id 0x%x, function_id 0x%x, offset 0x%x, val 0x%x\n", module_name, q.bus_id, q.device_id, q.function_id, q.offset, val); q.val = val; if (copy_to_user((ioctl_query_setgetpci_t*) arg, &q, sizeof(ioctl_query_setgetpci_t))) { return -EFAULT; } return 0; } } return -ENXIO; } static long pmc_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { int ret = -1; printk(KERN_INFO "%s: ioctl 
project(nvmemul)

# Statistics reporting is compiled in unless configured with -DSTATISTICS=OFF.
option(STATISTICS "Enable statistics report" ON)
if(STATISTICS)
  message(STATUS "WITH STATISTICS")
  add_definitions(-DUSE_STATISTICS)
else()
  message(STATUS "WITHOUT STATISTICS")
endif()

# Sources of the emulator library itself; the cpu/ subdirectory is built
# separately as an OBJECT library and merged in below.
set(nvmemul_src
  config.c
  debug.c
  dev.c
  init.c
  interpose.c
  measure_bw.c
  measure_lat.c
  misc.c
  monotonic_timer.c
  model_bw.c
  model_lat.c
  pflush.c
  pmalloc.c
  stat.c
  thread.c
  topology.c
  process_rank.c
)

include_directories(${CMAKE_SOURCE_DIR}/third_party)
include_directories(${CMAKE_SOURCE_DIR}/src)
include_directories(${CMAKE_SOURCE_DIR}/src/lib)

add_definitions(-g)
add_definitions(-O2)
add_definitions(-fPIC)
add_definitions(-Wall)
add_definitions(-march=native)
add_definitions(-fopenmp)
add_definitions(-std=gnu89)
#add_definitions(-DNDEBUG)
#add_definitions(-std=c99)
add_definitions(-msse4)

add_subdirectory(cpu)

# The objects of the "cpu" OBJECT library are pulled in through the
# TARGET_OBJECTS generator expression; a bare "$" is not a valid source entry.
add_library(nvmemul SHARED ${nvmemul_src} $<TARGET_OBJECTS:cpu>)
target_link_libraries(nvmemul dl)
target_link_libraries(nvmemul config)
target_link_libraries(nvmemul numa)
target_link_libraries(nvmemul rt)
target_link_libraries(nvmemul m)
target_link_libraries(nvmemul gomp)
#define ENVVAR_MAX_LEN 128

/*
 * Looks up the environment variable named "<prefix>_<name>".
 *
 * Returns the value reported by getenv(), or NULL when the variable is not
 * set or when the composed name (including its terminating NUL) does not fit
 * into ENVVAR_MAX_LEN bytes.
 */
static char* __getenv(const char* prefix, const char* name)
{
    char normalized_name[ENVVAR_MAX_LEN];
    size_t prefix_len = strlen(prefix);
    size_t name_len = strlen(name);

    /* prefix + '_' + name + '\0' must fit. Counting only one extra byte
     * would let a 128-character name overflow the buffer by one. */
    if (prefix_len + name_len + 2 > sizeof(normalized_name)) {
        return NULL;
    }

    memcpy(normalized_name, prefix, prefix_len);
    normalized_name[prefix_len] = '_';
    memcpy(normalized_name + prefix_len + 1, name, name_len + 1); /* copies the NUL too */

    return getenv(normalized_name);
}
/*
 * Integer lookup with environment override: the environment variable derived
 * from `name` wins; otherwise the configuration file `cfg` is consulted.
 * Stores the result in *value and returns CONFIG_TRUE when either source has
 * the setting; returns CONFIG_FALSE and leaves *value untouched otherwise.
 */
int __cconfig_lookup_int(config_t *cfg, const char *name, int *value)
{
    int result;

    if (env_setting_lookup_int(name, &result) != CONFIG_TRUE) {
        // third parameter changed from libconfig 1.3 to 1.4, it was 'long' and now it is 'int'
        if (config_lookup_int(cfg, name, &result) != CONFIG_TRUE) {
            return CONFIG_FALSE;
        }
    }

    *value = result;
    return CONFIG_TRUE;
}
{ int min; int max; int list_length; int i; int val; int listval; va_list ap; if (__cconfig_lookup_int(cfg, name, &val) == CONFIG_TRUE) { switch (validity_check) { case CONFIG_NO_CHECK: *value = val; return CONFIG_TRUE; case CONFIG_RANGE_CHECK: va_start(ap, validity_check); min = va_arg(ap, int); max = va_arg(ap, int); va_end(ap); if (*value >= min && *value <= max) { *value = val; return CONFIG_TRUE; } break; case CONFIG_LIST_CHECK: va_start(ap, validity_check); list_length = va_arg(ap, int); for (i=0; i #include #define ENVVAR_PREFIX "NVMEMUL" #ifdef __cplusplus extern "C" { #endif /* Make sure we don't redefine a macro already defined in libconfig.h */ #ifdef CONFIG_NO_CHECK # error "ERROR: Redefining previously defined CONFIG_NO_CHECK" #else # define CONFIG_NO_CHECK 0 #endif #ifdef CONFIG_RANGE_CHECK # error "ERROR: Redefining previously defined CONFIG_RANGE_CHECK" #else # define CONFIG_RANGE_CHECK 1 #endif #ifdef CONFIG_LIST_CHECK # error "ERROR: Redefining previously defined CONFIG_LIST_CHECK" #else # define CONFIG_LIST_CHECK 2 #endif /** * The lookup functions return the value of a configuration variable based on * the following order: * 1) value of environment variable * 2) value in configuration file variable * * If the variable is not found then a lookup function does not set the value. 
/*
 * Plain lookups. config.c implements the bool and int variants as: try the
 * environment first, then `cfg`. On a hit the value is stored in *value and
 * CONFIG_TRUE is returned; otherwise CONFIG_FALSE is returned and *value is
 * left as it was.
 * NOTE(review): the string variant presumably follows the same contract;
 * confirm against its definition.
 */
int __cconfig_lookup_bool(config_t *cfg, const char *name, int *value);
int __cconfig_lookup_int(config_t *cfg, const char *name, int *value);
int __cconfig_lookup_string(config_t *cfg, const char *name, char **value);

/*
 * Lookups that take a validity check. `validity_check` is one of
 * CONFIG_NO_CHECK, CONFIG_RANGE_CHECK or CONFIG_LIST_CHECK; the variadic
 * arguments carry the check's parameters (the int variant reads two ints,
 * min and max, for a range check, and a list length first for a list check).
 * The bool variant currently ignores `validity_check` and behaves exactly
 * like __cconfig_lookup_bool().
 */
int __cconfig_lookup_valid_bool(config_t *cfg, const char *name, int *value, int validity_check, ...);
int __cconfig_lookup_valid_int(config_t *cfg, const char *name, int *value, int validity_check, ...);
int __cconfig_lookup_valid_string(config_t *cfg, const char *name, char **value, int validity_check, ...);

/* Initializes `cfg` from `config_file` (presumably reads and parses it; confirm in config.c). */
int __cconfig_init(config_t *cfg, const char *config_file);
/*
 * Bit-field helpers for the CPUID leaf 1 signature returned in EAX.
 *
 * MASK(msb, lsb) has bits lsb..msb (inclusive, 0 <= lsb <= msb <= 31) set.
 * It is built from unsigned constants because left-shifting the signed value
 * ~0 (i.e. -1) is undefined behaviour, and it stays valid for msb == 31.
 * Every macro argument is parenthesized so expressions can be passed safely.
 */
#define MASK(msb, lsb) ((~0u >> (31 - (msb))) & (~0u << (lsb)))
#define EXTRACT(val, msb, lsb) (((val) & MASK(msb, lsb)) >> (lsb))

/* Model: bits 7..4, extended model: bits 19..16, combined as (ext << 4) | model. */
#define MODEL(eax) EXTRACT(eax, 7, 4)
#define EXTENDED_MODEL(eax) EXTRACT(eax, 19, 16)
#define MODEL_NUMBER(eax) ((EXTENDED_MODEL(eax) << 4) | MODEL(eax))

/* Family: bits 11..8, extended family: bits 27..20, combined by addition. */
#define FAMILY(eax) EXTRACT(eax, 11, 8)
#define Extended_Family(eax) EXTRACT(eax, 27, 20)
#define Family_Number(eax) (FAMILY(eax) + Extended_Family(eax))
/*
 * Returns the value of the first /proc/cpuinfo line that contains `valname`.
 *
 * The value is the text after the ':' separator and the single character
 * that follows it, up to and including the end of the line (a trailing
 * newline is kept). Lines that contain `valname` but no ':' are skipped.
 *
 * Returns a newly allocated string, or NULL if `valname` is NULL, the file
 * cannot be opened, no line matches, or memory is exhausted.
 * The caller is responsible for freeing memory allocated by this function.
 */
char *cpuinfo(char *valname)
{
    FILE *fp;
    char *line = NULL;
    char *result = NULL;
    size_t capacity = 0;

    if (valname == NULL) {
        return NULL;
    }

    fp = fopen("/proc/cpuinfo", "r");
    if (fp == NULL) {
        return NULL;
    }

    while (getline(&line, &capacity, fp) != -1) {
        const char *value;
        size_t value_size;

        if (strstr(line, valname) == NULL) {
            continue;
        }

        /* Without a separator there is no value to return; keep looking
         * instead of dereferencing a NULL pointer. */
        value = strchr(line, ':');
        if (value == NULL) {
            continue;
        }

        /* Skip the ':' and the one character after it (normally a blank),
         * but never step past the terminating NUL. */
        value++;
        if (*value != '\0') {
            value++;
        }

        value_size = strlen(value) + 1;
        result = malloc(value_size);
        if (result != NULL) {
            memcpy(result, value, value_size);
        }
        break;
    }

    /* Single exit path: the line buffer and the stream are always released. */
    free(line);
    fclose(fp);
    return result;
}
&model); int isXeon = is_Xeon(); for (i = 0; known_cpus[i].microarch != Invalid; i++) { microarch_ID_t c = known_cpus[i]; if (c.family == family && c.model == model) { switch (c.microarch) { case SandyBridge: cpu_model = &cpu_model_intel_xeon_ex; break; case IvyBridge: cpu_model = &cpu_model_intel_xeon_ex_v2; break; case Haswell: cpu_model = &cpu_model_intel_xeon_ex_v3; break; default: return NULL; } if (!isXeon) cpu_model->microarch = (microarch_t)(cpu_model->microarch - 1); DBG_LOG(INFO, "Detected CPU model '%s'\n", microarch_strings[cpu_model->microarch]); break; } } if (!cpu_model) { return NULL; } // complete the model with some runtime information cpu_model->llc_size_bytes = cpu_llc_size_bytes(); // cpu_model->speed_mhz = cpu_speed_mhz(); return cpu_model; } ================================================ FILE: src/lib/cpu/cpu.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
// NOTE(review): presumably the largest value accepted by the throttle
// registers; it is not used in this header, so confirm at the call sites.
#define MAX_THROTTLE_VALUE 1023

int set_throttle_register(int node, uint64_t val);

// Size reported on the "cache size" line of /proc/cpuinfo, converted with
// string_to_size(); reads the file on every call.
size_t cpu_llc_size_bytes();

struct pmc_set_s;

// Selects which throttle register the {set,get}_throttle_register methods
// of cpu_model_t operate on.
typedef enum {
    THROTTLE_DDR_ACT = 0,
    THROTTLE_DDR_READ,
    THROTTLE_DDR_WRITE
} throttle_type_t;

// order matters. see cpu_model()
// cpu_model() turns the microarchitecture of the selected model into its
// non-Xeon counterpart by subtracting 1, so every plain entry must come
// immediately before its Xeon entry.
typedef enum {
    Invalid,
    SandyBridge,
    SandyBridgeXeon,
    IvyBridge,
    IvyBridgeXeon,
    Haswell,
    HaswellXeon
} microarch_t;

// One row of the known-CPU table: the CPUID family/model pair and the
// microarchitecture it maps to. cpu_model() stops scanning at an entry whose
// microarch is Invalid.
typedef struct {
    int family;
    int model;
    microarch_t microarch;
} microarch_ID_t;

/**
 * CPU object that encapsulates processor-specific methods for accessing
 * performance counters and memory controller PCI registers
 */
typedef struct cpu_model_s {
    microarch_t microarch; // processor description
    size_t llc_size_bytes; // last level cache size (filled in at runtime by cpu_model())
    // int speed_mhz; // cpu clock frequency
    struct pmc_events_s* pmc_events; // performance monitoring events supported by the processor
    // Write / read the throttle register selected by throttle_type through regs.
    int (*set_throttle_register)(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t val);
    int (*get_throttle_register)(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t* val);
} cpu_model_t;

// Detects the running processor. Returns NULL when the brand string does not
// contain "Intel" or the family/model pair is not a supported entry of the
// known-CPU table; otherwise returns the matching model with llc_size_bytes
// filled in.
cpu_model_t* cpu_model();

// Value of the "cpu MHz" line of /proc/cpuinfo, converted with
// string_to_size(); reads the file on every call.
int cpu_speed_mhz();
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __CPU_HASWELL_H #define __CPU_HASWELL_H #include #include "debug.h" // Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with // applications to list all available performance events with their architecture specific // detailed description and translate them to their respective event code. 'showevtinfo' application can // be used to list all available performance event names with detailed description and 'check_events' application // can be used to translate the performance event to the corresponding event code. // These events will be initialized and started. // Every event reading will return an array with the values for all these events. 
// The array index is the same index used to define the event in the *_native_events array below const char *haswell_native_events[MAX_NUM_EVENTS] = { "CYCLE_ACTIVITY:STALLS_L2_PENDING", "MEM_LOAD_UOPS_L3_HIT_RETIRED:XSNP_NONE", "MEM_LOAD_UOPS_L3_MISS_RETIRED:REMOTE_DRAM", "MEM_LOAD_UOPS_L3_MISS_RETIRED:LOCAL_DRAM" }; uint64_t haswell_read_stall_events_local() { long long values[MAX_NUM_EVENTS]; uint64_t events = 0; if (pmc_events_read_local_thread(values) == PAPI_OK) { uint64_t l2_pending = values[0]; uint64_t llc_hit = values[1]; uint64_t remote_dram = values[2]; uint64_t local_dram = values[3]; DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n", l2_pending, llc_hit, remote_dram, local_dram); double num = remote_dram + local_dram; double den = num + llc_hit; if (den == 0) return 0; events = (uint64_t)((double)l2_pending * ((double)num / den)); } else { DBG_LOG(ERROR, "read stall cycles failed\n"); } return events; } uint64_t haswell_read_stall_events_remote() { long long values[MAX_NUM_EVENTS]; uint64_t events = 0; if (pmc_events_read_local_thread(values) == PAPI_OK) { uint64_t l2_pending = values[0]; uint64_t llc_hit = values[1]; uint64_t remote_dram = values[2]; uint64_t local_dram = values[3]; DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n", l2_pending, llc_hit, remote_dram, local_dram); // calculate stalls based on l2 stalls and LLC miss/hit double num = remote_dram + local_dram; double den = num + llc_hit; if (den == 0) return 0; double stalls = (double)l2_pending * ((double)num / den); // calculate remote dram stalls based on total stalls and local/remote dram accesses den = remote_dram + local_dram; if (den == 0) return 0; events = (uint64_t) (stalls * ((double)remote_dram / den)); } else { DBG_LOG(ERROR, "read stall cycles failed\n"); } return events; } #endif /* __CPU_HASWELL_H */ ================================================ FILE: src/lib/cpu/haswell.h 
================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __CPU_HASWELL_H #define __CPU_HASWELL_H #include #include "thread.h" #include "cpu/pmc.h" #include "debug.h" // Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with // applications to list all available performance events with their architecture specific // detailed description and translate them to their respective event code. 'showevtinfo' application can // be used to list all available performance event names with detailed description and 'check_events' application // can be used to translate the performance event to the corresponding event code. 
// Read handler of the "ldm_stall_cycles" event on Haswell.
// Returns an estimate of the cycles stalled on loads served from DRAM: the
// L2-pending stall cycles scaled by the weighted share of DRAM loads among
// DRAM loads plus LLC hits. Returns 0 when no DRAM load was counted.
DECLARE_READ_PMC(haswell, ldm_stall_cycles)
{
    // Indices follow the ASSIGN_PMC_HW_EVENT_TO_ME() calls of the matching
    // enable function: 0 = STALLS_L2_PENDING, 1 = L3 hit, 2 = remote DRAM,
    // 3 = local DRAM.
    uint64_t l2_pending_diff = READ_MY_HW_EVENT_DIFF(0);
    uint64_t llc_hit_diff = READ_MY_HW_EVENT_DIFF(1);
    uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);
    uint64_t local_dram_diff = READ_MY_HW_EVENT_DIFF(3);

    DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n", l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);

    // No DRAM load counted: nothing to attribute.
    if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;

#ifdef MEMLAT_SUPPORT
    // Running per-thread total of local DRAM loads.
    tls_global_local_dram += local_dram_diff;
#endif

    // calculate stalls based on L2 stalls and LLC miss/hit
    // DRAM loads are weighted by L3_FACTOR (7.0) against LLC hits;
    // NOTE(review): presumably an empirical relative-cost factor, confirm its origin.
    double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);
    double den = num + llc_hit_diff;
    // Defensive only: num is positive after the early return above.
    if (den == 0) return 0;
    return (uint64_t) ((double)l2_pending_diff * (num / den));
}
// Read handler of the "remote_dram" event on Haswell.
// Returns an estimate of the cycles stalled on loads served from remote DRAM:
// the DRAM stall estimate (L2-pending stalls scaled by the weighted DRAM share)
// multiplied by the latency-weighted share of remote loads among all DRAM
// loads. Returns 0 when no DRAM load was counted or a denominator is zero.
DECLARE_READ_PMC(haswell, remote_dram)
{
    // Indices follow the ASSIGN_PMC_HW_EVENT_TO_ME() calls of the matching
    // enable function: 0 = STALLS_L2_PENDING, 1 = L3 hit, 2 = remote DRAM,
    // 3 = local DRAM.
    uint64_t l2_pending_diff = READ_MY_HW_EVENT_DIFF(0);
    uint64_t llc_hit_diff = READ_MY_HW_EVENT_DIFF(1);
    uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2);
    uint64_t local_dram_diff = READ_MY_HW_EVENT_DIFF(3);

    DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n", l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff);

    // No DRAM load counted: nothing to attribute.
    if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0;

#ifdef MEMLAT_SUPPORT
    // Running per-thread total of remote DRAM loads.
    tls_global_remote_dram += remote_dram_diff;
#endif

    // calculate stalls based on L2 stalls and LLC miss/hit
    // DRAM loads are weighted by L3_FACTOR (7.0) against LLC hits.
    double num = L3_FACTOR * (remote_dram_diff + local_dram_diff);
    double den = num + llc_hit_diff;
    if (den == 0) return 0;
    double stalls = (double)l2_pending_diff * (num / den);

    // calculate remote dram stalls based on total stalls and local/remote dram accesses
    // also consider the weight of remote memory access against local memory access
    // (each load count is multiplied by the per-thread hardware latency of its memory)
    den = (remote_dram_diff * tls_hw_remote_latency) + (local_dram_diff * tls_hw_local_latency);
    // Guards the division, e.g. when both latencies are still 0.
    if (den == 0) return 0;
    return (uint64_t) (stalls * ((double)(remote_dram_diff * tls_hw_remote_latency) / den));
}
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __CPU_IVYBRIDGE_H #define __CPU_IVYBRIDGE_H #include #include "debug.h" // Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with // applications to list all available performance events with their architecture specific // detailed description and translate them to their respective event code. 'showevtinfo' application can // be used to list all available performance event names with detailed description and 'check_events' application // can be used to translate the performance event to the corresponding event code. // These events will be initialized and started. // Every event reading will return an array with the values for all these events. 
// The array index is the same index used to define the event in the *_native_events array below const char *ivybridge_native_events[MAX_NUM_EVENTS] = { "CYCLE_ACTIVITY:STALLS_L2_PENDING", "MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE", "MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM", "MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM" }; uint64_t ivybridge_read_stall_events_local() { long long values[MAX_NUM_EVENTS]; uint64_t events = 0; if (pmc_events_read_local_thread(values) == PAPI_OK) { uint64_t l2_pending = values[0]; uint64_t llc_hit = values[1]; uint64_t remote_dram = values[2]; uint64_t local_dram = values[3]; DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n", l2_pending, llc_hit, remote_dram, local_dram); double num = remote_dram + local_dram; double den = num + llc_hit; if (den == 0) return 0; events = (uint64_t)((double)l2_pending * ((double)num / den)); } else { DBG_LOG(ERROR, "read stall cycles failed\n"); } return events; } uint64_t ivybridge_read_stall_events_remote() { long long values[MAX_NUM_EVENTS]; uint64_t events = 0; if (pmc_events_read_local_thread(values) == PAPI_OK) { uint64_t l2_pending = values[0]; uint64_t llc_hit = values[1]; uint64_t remote_dram = values[2]; uint64_t local_dram = values[3]; DBG_LOG(DEBUG, "read stall L2 cycles %lu; llc_hit %lu; remote_dram %lu; local_dram %lu\n", l2_pending, llc_hit, remote_dram, local_dram); // calculate stalls based on l2 stalls and LLC miss/hit double num = remote_dram + local_dram; double den = num + llc_hit; if (den == 0) return 0; double stalls = (double)l2_pending * ((double)num / den); // calculate remote dram stalls based on total stalls and local/remote dram accesses den = remote_dram + local_dram; if (den == 0) return 0; events = (uint64_t) (stalls * ((double)remote_dram / den)); } else { DBG_LOG(ERROR, "read stall cycles failed\n"); } return events; } #endif /* __CPU_IVYBRIDGE_H */ ================================================ FILE: src/lib/cpu/ivybridge.h 
================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __CPU_IVYBRIDGE_H #define __CPU_IVYBRIDGE_H #include #include "thread.h" #include "cpu/pmc.h" #include "debug.h" // Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with // applications to list all available performance events with their architecture specific // detailed description and translate them to their respective event code. 'showevtinfo' application can // be used to list all available performance event names with detailed description and 'check_events' application // can be used to translate the performance event to the corresponding event code. 
extern __thread int tls_hw_local_latency; extern __thread int tls_hw_remote_latency; #ifdef MEMLAT_SUPPORT extern __thread uint64_t tls_global_remote_dram; extern __thread uint64_t tls_global_local_dram; #endif #undef FOREACH_PMC_HW_EVENT #define FOREACH_PMC_HW_EVENT(ACTION) \ ACTION("CYCLE_ACTIVITY:STALLS_L2_PENDING", NULL, 0x55305a3) \ ACTION("MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE", NULL, 0x5308d2) \ ACTION("MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM", NULL, 0x530cd3) \ ACTION("MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM", NULL, 0x5303d3) #undef FOREACH_PMC_EVENT #define FOREACH_PMC_EVENT(ACTION, prefix) \ ACTION(ldm_stall_cycles, prefix) \ ACTION(remote_dram, prefix) #define L3_FACTOR 7.0 DECLARE_ENABLE_PMC(ivybridge, ldm_stall_cycles) { ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE", 1); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM", 2); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM", 3); return E_SUCCESS; } DECLARE_CLEAR_PMC(ivybridge, ldm_stall_cycles) { } DECLARE_READ_PMC(ivybridge, ldm_stall_cycles) { uint64_t l2_pending_diff = READ_MY_HW_EVENT_DIFF(0); uint64_t llc_hit_diff = READ_MY_HW_EVENT_DIFF(1); uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2); uint64_t local_dram_diff = READ_MY_HW_EVENT_DIFF(3); DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n", l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff); if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0; #ifdef MEMLAT_SUPPORT tls_global_local_dram += local_dram_diff; #endif // calculate stalls based on L2 stalls and LLC miss/hit double num = L3_FACTOR * (remote_dram_diff + local_dram_diff); double den = num + llc_hit_diff; if (den == 0) return 0; return (uint64_t) ((double)l2_pending_diff * (num / den)); } DECLARE_ENABLE_PMC(ivybridge, remote_dram) { 
ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_HIT_RETIRED:XSNP_NONE", 1); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:REMOTE_DRAM", 2); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_LLC_MISS_RETIRED:LOCAL_DRAM", 3); return E_SUCCESS; } DECLARE_CLEAR_PMC(ivybridge, remote_dram) { } DECLARE_READ_PMC(ivybridge, remote_dram) { uint64_t l2_pending_diff = READ_MY_HW_EVENT_DIFF(0); uint64_t llc_hit_diff = READ_MY_HW_EVENT_DIFF(1); uint64_t remote_dram_diff = READ_MY_HW_EVENT_DIFF(2); uint64_t local_dram_diff = READ_MY_HW_EVENT_DIFF(3); DBG_LOG(DEBUG, "read stall L2 cycles diff %lu; llc_hit %lu; cycles diff remote_dram %lu; local_dram %lu\n", l2_pending_diff, llc_hit_diff, remote_dram_diff, local_dram_diff); if ((remote_dram_diff == 0) && (local_dram_diff == 0)) return 0; #ifdef MEMLAT_SUPPORT tls_global_remote_dram += remote_dram_diff; #endif // calculate stalls based on L2 stalls and LLC miss/hit double num = L3_FACTOR * (remote_dram_diff + local_dram_diff); double den = num + llc_hit_diff; if (den == 0) return 0; double stalls = (double)l2_pending_diff * (num / den); // calculate remote dram stalls based on total stalls and local/remote dram accesses // also consider the weight of remote memory access against local memory access den = (remote_dram_diff * tls_hw_remote_latency) + (local_dram_diff * tls_hw_local_latency); if (den == 0) return 0; return (uint64_t) (stalls * ((double)(remote_dram_diff * tls_hw_remote_latency) / den)); } PMC_EVENTS(ivybridge, 4) #endif /* __CPU_IVYBRIDGE_H */ ================================================ FILE: src/lib/cpu/known_cpus.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. 
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#ifndef __KNOWN_CPUS_H
#define __KNOWN_CPUS_H

#include "cpu.h"

// later, cpu_model_name() is used to distinguish between
// Xeon and non-Xeon processors. It's much easier here
// to consider all processors non-Xeon.
// references:
// 1- http://a4lg.com/tech/x86/database/x86-families-and-models.en.html
// 2- Intel® Xeon® Processor E7-8800/4800 v3 Product Family Specification
// 3- https://software.intel.com/en-us/articles/intel-architecture-and-processor-identification-with-cpuid-model-and-family-numbers

// Table of (family, model) pairs and the microarchitecture each one maps to.
// Every entry uses a non-Xeon microarch value (see the note above); the table
// ends with an Invalid sentinel entry.
microarch_ID_t known_cpus[] =
{
    // order does not matter
    {.family = 0x06, .model = 0x2A, .microarch = SandyBridge},
    {.family = 0x06, .model = 0x2D, .microarch = SandyBridge},
    {.family = 0x06, .model = 0x3A, .microarch = IvyBridge},
    {.family = 0x06, .model = 0x3E, .microarch = IvyBridge},
    {.family = 0x06, .model = 0x3C, .microarch = Haswell},
    {.family = 0x06, .model = 0x3F, .microarch = Haswell},
    {.family = 0x06, .model = 0x45, .microarch = Haswell},
    {.family = 0x06, .model = 0x46, .microarch = Haswell},
    // must be the last element
    {.family = 0x0, .model = 0x0, .microarch = Invalid}};

// Human-readable names, one string per microarchitecture value.
// order must correspond to microarch_t
char *microarch_strings[] =
{
    "Invalid",
    "Sandy Bridge",
    "Sandy Bridge Xeon",
    "Ivy Bridge",
    "Ivy Bridge Xeon",
    "Haswell",
    "Haswell Xeon"};

#endif /* __KNOWN_CPUS_H */

================================================
FILE: src/lib/cpu/pmc-papi.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/ #include #include #include #include "cpu/pmc-papi.h" #include "debug.h" __thread int tls_event_set = PAPI_NULL; #define STR_MAX_SIZE 256 static void log_papi_critical(int ret_val, const char *msg) { //char papi_str[STR_MAX_SIZE]; //PAPI_perror(ret_val, (char *)papi_str, sizeof(papi_str)); DBG_LOG(CRITICAL, "%s (%s)\n", msg, PAPI_strerror(ret_val)); } int pmc_init() { int ret_val; if ((ret_val = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) { log_papi_critical(ret_val, "PMC library init error"); return -1; } if ((ret_val = PAPI_thread_init(pthread_self)) != PAPI_OK) { log_papi_critical(ret_val, "PMC thread support init error"); return -1; } // if ((ret_val = PAPI_set_domain(PAPI_DOM_ALL)) != PAPI_OK) { // log_papi_critical(ret_val, "PMC set domain error"); // return -1; // } return 0; } void pmc_shutdown() { PAPI_shutdown(); } int pmc_create_event_set_local_thread() { int ret_val; if ((ret_val = PAPI_create_eventset(&tls_event_set)) != PAPI_OK) { log_papi_critical(ret_val, "PMC event set init error"); return -1; } // if ((ret_val = PAPI_set_granularity(PAPI_GRN_SYS)) != PAPI_OK) { // log_papi_critical(ret_val, "PMC set granularity error"); // return -1; // } return 0; } void pmc_destroy_event_set_local_thread() { PAPI_cleanup_eventset(tls_event_set); PAPI_destroy_eventset(&tls_event_set); } int pmc_register_thread() { return PAPI_register_thread(); } int pmc_unregister_thread() { return PAPI_unregister_thread(); } int pmc_register_event_local_thread(const char *event_name) { int ret_val; char msg[STR_MAX_SIZE]; // The pthread scope for each thread should be set to PTHREAD_SCOPE_SYSTEM. // On linux, pthread supports only PTHREAD_SCOPE_SYSTEM. 
assert(tls_event_set != PAPI_NULL); assert(event_name); if ((ret_val = PAPI_add_named_event(tls_event_set, (char *)event_name)) != PAPI_OK) { snprintf(msg, sizeof(msg), "PMC event (%s) register error", event_name); log_papi_critical(ret_val, msg); return -1; } return 0; } int pmc_events_start_local_thread() { int ret_val; assert(tls_event_set != PAPI_NULL); if ((ret_val = PAPI_start(tls_event_set)) != PAPI_OK) { log_papi_critical(ret_val, "PMC events start error"); return -1; } return 0; } void pmc_events_stop_local_thread() { long long values[MAX_NUM_EVENTS]; assert(tls_event_set != PAPI_NULL); PAPI_stop(tls_event_set, values); } int pmc_events_read_local_thread(long long *values) { int ret_val; // int status = 0; assert(values); // PAPI_state(event_set, &status); // if (status != PAPI_RUNNING) { // DBG_LOG(CRITICAL, "PMC event set not in running state"); // return -1; // } if ((ret_val = PAPI_read(tls_event_set, values)) != PAPI_OK) { log_papi_critical(ret_val, "PMC events read error"); return -1; } if ((ret_val = PAPI_reset(tls_event_set)) != PAPI_OK) { log_papi_critical(ret_val, "PMC events reset error"); return -1; } return 0; } ================================================ FILE: src/lib/cpu/pmc-papi.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __CPU_PMC_H #define __CPU_PMC_H #include // Usually the architectures support up to 4 counters enabled at the same // time per core when HT is enabled #define MAX_NUM_EVENTS 4 typedef uint64_t (*read_stalls_t)(void); typedef struct { const char **native_events; read_stalls_t read_stalls_events_local; read_stalls_t read_stalls_events_remote; } pmc_event_t; int pmc_init(); void pmc_shutdown(); int pmc_create_event_set_local_thread(); void pmc_destroy_event_set_local_thread(); int pmc_register_event_local_thread(const char *event_name); int pmc_events_start_local_thread(); void pmc_events_stop_local_thread(); int pmc_events_read_local_thread(long long *values); int pmc_register_thread(); int pmc_unregister_thread(); #endif /* __CPU_PMC_H */ ================================================ FILE: src/lib/cpu/pmc.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #include #include "cpu/pmc.h" #include "dev.h" #include "error.h" #include "thread.h" #include "topology.h" #pragma GCC push_options #pragma GCC optimize ("O0") // The width of general purpose counters are 40bits. // https://www.felixcloutier.com/x86/RDPMC.html #define RDPMC_MAX_VALUE 0xFFFFFFFFFF long long rdpmc(int counter) { unsigned eax; unsigned edx; unsigned long long r; __asm__ __volatile__ ("mov %2, %%ecx\n\t" "rdpmc\n\t" "mov %%eax, %0\n\t" "and $255, %%edx\n\t" "mov %%edx, %1\n\t" : "=m" (eax), "=m" (edx), "=m" (counter) : /* no inputs */ : "eax", "ecx", "edx"); /* eax, ecx, edx clobbered */ r = ((unsigned long long) edx << 32) | eax; return r; } int rdpmc32(int counter) { unsigned eax; __asm__ __volatile__ ("mov %1, %%ecx\n\t" "rdpmc\n\t" "mov %%eax, %0\n\t" : "=m" (eax), "=m" (counter) : /* no inputs */ : "eax", "ecx", "edx"); /* eax, ecx, edx clobbered */ return eax; } #pragma GCC pop_options /*int num_used_hw_cntrs(pmc_events_t* events) { int i; int used; pmc_hw_event_t* event = 0; // check if this a known registered hardware event for (i=0, used=0; events->known_hw_events[i].name; i++) { event = &events->known_hw_events[i]; used += event->active ? 
0 : 1; } return used; }*/ int get_avail_hw_cntr_id(pmc_events_t* events) { int i; int used; pmc_hw_event_t* event = 0; int status = -1; int* hw_cntr_id_status = calloc(events->num_avail_hw_cntrs, sizeof(int)); for (i=0, used=0; events->known_hw_events[i].name; i++) { event = &events->known_hw_events[i]; if (event->active) { used++; hw_cntr_id_status[event->hw_cntr_id] = 1; } } if (used == events->num_avail_hw_cntrs) { goto done; } for (i=0; events->num_avail_hw_cntrs; i++) { if (hw_cntr_id_status[i] == 0) { status = i; goto done; } } done: free(hw_cntr_id_status); return status; } pmc_hw_event_t* enable_pmc_hw_event(pmc_events_t* events, const char* name) { int i; pmc_hw_event_t* event = 0; int found = 0; // check if this a known registered hardware event for (i=0; events->known_hw_events[i].name; i++) { event = &events->known_hw_events[i]; if (strcasecmp(event->name, name) == 0) { found = 1; if (event->active) { return event; } break; } } if (!found) { DBG_LOG(WARNING, "Unknown hardware performance monitoring event\n"); return NULL; } // enable it // need to find an available performance counter to monitor this event if ((event->hw_cntr_id = get_avail_hw_cntr_id(events)) < 0) { DBG_LOG(ERROR, "No available hardware performance counters\n"); return NULL; } // assign an array to keep per processor last read values (useful to calculate the diff since the last read) int num_cpus = system_num_cpus(); if (!event->last_val) { event->last_val = calloc(num_cpus, sizeof(*event->last_val)); } for (i=0; ilast_val[i] = 0; } // call into the kernel driver to enable the counter on all processors if (set_counter(event->hw_cntr_id, event->encoding) != E_SUCCESS) { DBG_LOG(ERROR, "Can't enable counter on all processors\n"); return NULL; } event->active = 1; return event; } void disable_pmc_hw_event(pmc_events_t* events, const char* name) { int i; pmc_hw_event_t* event = 0; int found = 0; // check if this a known registered hardware event for (i=0; events->known_hw_events[i].name; 
i++) { event = &events->known_hw_events[i]; if (strcasecmp(event->name, name) == 0) { found = 1; if (!event->active) { return; } break; } } if (!found) { DBG_LOG(WARNING, "Unknown hardware performance monitoring event\n"); return; } event->active = 0; } void clear_pmc_hw_event(pmc_hw_event_t* event) { DBG_LOG(CRITICAL, "Unimplemented functionality\n"); } uint64_t read_pmc_hw_event_cur(pmc_hw_event_t* event) { return rdpmc(event->hw_cntr_id); } uint64_t read_pmc_hw_event_diff(pmc_hw_event_t* event) { int cpu_id = thread_self()->cpu_id; uint64_t cur_val = read_pmc_hw_event_cur(event); uint64_t last_val = event->last_val[cpu_id]; //if (cur_val < last_val && (event->hw_cntr_id == 0)) { if (cur_val < last_val) { event->last_val[cpu_id] = cur_val; return (cur_val + (RDPMC_MAX_VALUE - last_val)); } event->last_val[cpu_id] = cur_val; return cur_val - last_val; } pmc_event_t* enable_pmc_event(cpu_model_t* cpu, const char* name) { int i; pmc_event_t* event = 0; int found = 0; // check if this a known registered event for (i=0; cpu->pmc_events->known_events[i].name; i++) { event = &cpu->pmc_events->known_events[i]; if (strcasecmp(event->name, name) == 0) { found = 1; if (event->active) { return event; } break; } } if (!found) { return NULL; } // enable it event->hw_events = NULL; event->num_hw_events = 0; if (event->enable(cpu->pmc_events, event) != E_SUCCESS) { assert(0 && "DIE"); return NULL; } event->active = 1; return event; } int assign_pmc_hw_event_to_event(pmc_events_t* events, const char* name, pmc_event_t* event, int local_id) { pmc_hw_event_t* hw_event; if (!(hw_event = enable_pmc_hw_event(events, name))) { return E_ERROR; } if (local_id != event->num_hw_events) { DBG_LOG(CRITICAL, "local_id does not match assign id\n") // TODO: application should abort here, look for all DBG_LOG(CRITICAL) } event->hw_events = realloc(event->hw_events, (event->num_hw_events+1) * sizeof(*event->hw_events)); event->hw_events[event->num_hw_events] = hw_event; event->num_hw_events++; 
return E_SUCCESS; } void release_all_pmc_hw_events_of_event(pmc_event_t* event) { int i; if (event->num_hw_events > 0) { for (i=0; inum_hw_events; i++) { event->hw_events[i]->active = 0; } free(event->hw_events); event->hw_events = NULL; event->num_hw_events = 0; } } void disable_pmc_event(cpu_model_t* cpu, const char* name) { int i; pmc_event_t* event; for (i=0; cpu->pmc_events->known_events[i].name; i++) { event = &cpu->pmc_events->known_events[i]; if (strcasecmp(event->name, name) == 0 && event->active) { event->active = 0; } } } ================================================ FILE: src/lib/cpu/pmc.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #ifndef __CPU_PMC_H #define __CPU_PMC_H #include "cpu/cpu.h" #define DECLARE_ENABLE_PMC(prefix, name) int prefix##_create_pmc_##name(struct pmc_events_s* events, struct pmc_event_s* event) #define DECLARE_CLEAR_PMC(prefix, name) void prefix##_clear_pmc_##name(struct pmc_event_s* event) #define DECLARE_READ_PMC(prefix, name) uint64_t prefix##_read_pmc_##name(struct pmc_event_s* event) #define ENABLE_PMC_FNAME(prefix, name) prefix##_create_pmc_##name #define CLEAR_PMC_FNAME(prefix, name) prefix##_clear_pmc_##name #define READ_PMC_FNAME(prefix, name) prefix##_read_pmc_##name #define PMC_HW_EVENT(name, os_name, encoding) { name, os_name, encoding, 0, 0}, #define PMC_EVENT(name, prefix) { #name, NULL, 0, 0, ENABLE_PMC_FNAME(prefix, name), CLEAR_PMC_FNAME(prefix, name), READ_PMC_FNAME(prefix, name)}, #define PMC_EVENTS_PTR(prefix) &prefix##_pmc_events #define PMC_EVENTS(prefix, num_hw_cntrs) \ pmc_hw_event_t prefix##_known_hw_event[] = { \ FOREACH_PMC_HW_EVENT(PMC_HW_EVENT) \ {NULL, NULL, 0, 0, 0} \ }; \ pmc_event_t prefix##_known_event[] = { \ FOREACH_PMC_EVENT(PMC_EVENT, prefix) \ {NULL, NULL, 0, 0, NULL, NULL, NULL} \ }; \ pmc_events_t prefix##_pmc_events = { \ num_hw_cntrs, \ prefix##_known_hw_event, \ prefix##_known_event \ }; #define ASSIGN_PMC_HW_EVENT_TO_ME(name, local_id) \ if (assign_pmc_hw_event_to_event(events, name, event, local_id) != E_SUCCESS) { \ release_all_pmc_hw_events_of_event(event); \ } #define READ_MY_HW_EVENT_DIFF(local_id) read_pmc_hw_event_diff(event->hw_events[local_id]) #define READ_MY_HW_EVENT_CUR(local_id) read_pmc_hw_event_cur(event->hw_events[local_id]) typedef struct { char* name; char* os_name; // perf name if known uint64_t encoding; int active; int hw_cntr_id; uint64_t* last_val; // array holding the last read values per processor (useful to calculate the diff since the last read) } pmc_hw_event_t; typedef struct pmc_event_s { const char* name; 
pmc_hw_event_t** hw_events; int num_hw_events; int active; int (*enable)(struct pmc_events_s* events, struct pmc_event_s* event); void (*clear)(struct pmc_event_s* event); uint64_t (*read)(struct pmc_event_s* event); } pmc_event_t; typedef struct pmc_events_s { int num_avail_hw_cntrs; pmc_hw_event_t* known_hw_events; pmc_event_t* known_events; } pmc_events_t; pmc_hw_event_t* enable_pmc_hw_event(pmc_events_t* events, const char* name); void disable_pmc_hw_event(pmc_events_t* events, const char* name); void clear_pmc_hw_event(pmc_hw_event_t* event); uint64_t read_pmc_hw_event_cur(pmc_hw_event_t* event); uint64_t read_pmc_hw_event_diff(pmc_hw_event_t* event); int assign_pmc_hw_event_to_event(pmc_events_t* events, const char* name, pmc_event_t* event, int local_id); void release_all_pmc_hw_events_of_event(pmc_event_t* event); pmc_event_t* enable_pmc_event(cpu_model_t* cpu, const char* name); void disable_pmc_event(cpu_model_t* cpu, const char* name); static inline void clear_pmc_event(pmc_event_t* event) { event->clear(event); } //#include "debug.h" static inline uint64_t read_pmc_event(pmc_event_t* event) { uint64_t ret; ret = event->read(event); return ret; } #endif /* __CPU_PMC_H */ ================================================ FILE: src/lib/cpu/sandybridge-papi.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __CPU_SANDYBRIDGE_H #define __CPU_SANDYBRIDGE_H #include #include #include "debug.h" // Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with // applications to list all available performance events with their architecutre specific // detailed description and translate them to their respective event code. showevtinfo application can // be used to list all available performance event names with detailed desciption and check_events application // can be used to translate the performance event to the corresponding event code. // These events will be initialized and started. // Every event reading will return an array with the values for all these events. // The array index is the same index used to define the event in the *_native_events array below const char *sandybridge_native_events[MAX_NUM_EVENTS] = { "CYCLE_ACTIVITY:STALLS_L2_PENDING", "MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS", "MEM_LOAD_UOPS_RETIRED:L3_HIT", NULL }; void sandybridge_latency_calibration_local(int *hw_latency, int target_latency) { if ((*hw_latency + 10) < target_latency) *hw_latency += 10; } void sandybridge_latency_calibration_remote(int *hw_latency, int target_latency) { if ((*hw_latency + 30) < target_latency) *hw_latency += 30; } uint64_t sandybridge_read_stall_events_local() { long long values[MAX_NUM_EVENTS]; uint64_t events = 0; if (pmc_events_read_local_thread(values) == PAPI_OK) { uint64_t cycle_activity_stalls_l2_pending_diff = values[0]; uint64_t mem_load_uops_misc_retired_llc_miss_diff = values[1]; uint64_t mem_load_uops_retired_l3_hit_diff = values[2]; DBG_LOG(DEBUG, "read stall L2 cycles %lu, LLC miss %lu, L3 hit %lu\n", cycle_activity_stalls_l2_pending_diff, 
mem_load_uops_misc_retired_llc_miss_diff, mem_load_uops_retired_l3_hit_diff); uint64_t uden = 7.0 * mem_load_uops_misc_retired_llc_miss_diff + mem_load_uops_retired_l3_hit_diff; if (uden == 0) { return 0; } double den = uden; double num = 7.0 * mem_load_uops_misc_retired_llc_miss_diff; events = (uint64_t) floorl(cycle_activity_stalls_l2_pending_diff*num/den); } else { DBG_LOG(DEBUG, "read stall cycles failed\n"); } return events; } #endif /* __CPU_SANDYBRIDGE_H */ ================================================ FILE: src/lib/cpu/sandybridge.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __CPU_SANDYBRIDGE_H #define __CPU_SANDYBRIDGE_H #include #include "thread.h" #include "cpu/pmc.h" #include "debug.h" // Perfmon2 is a library that provides a generic interface to access the PMU. It also comes with // applications to list all available performance events with their architecutre specific // detailed description and translate them to their respective event code. 
showevtinfo application can // be used to list all available performance event names with detailed desciption and check_events application // can be used to translate the performance event to the corresponding event code. #undef FOREACH_PMC_HW_EVENT #define FOREACH_PMC_HW_EVENT(ACTION) \ ACTION("CYCLE_ACTIVITY:STALLS_L2_PENDING", NULL, 0x55305a3) \ ACTION("MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS", NULL, 0x5302d4) \ ACTION("MEM_LOAD_UOPS_RETIRED:L3_HIT", NULL, 0x5304d1) \ ACTION("INSTRUCTION_RETIRED", NULL, 0x5300c0) #undef FOREACH_PMC_EVENT #define FOREACH_PMC_EVENT(ACTION, prefix) \ ACTION(ldm_stall_cycles, prefix) DECLARE_ENABLE_PMC(sandybridge, ldm_stall_cycles) { ASSIGN_PMC_HW_EVENT_TO_ME("CYCLE_ACTIVITY:STALLS_L2_PENDING", 0); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_MISC_RETIRED:LLC_MISS", 1); //ASSIGN_PMC_HW_EVENT_TO_ME("INSTRUCTION_RETIRED", 2); ASSIGN_PMC_HW_EVENT_TO_ME("MEM_LOAD_UOPS_RETIRED:L3_HIT", 2); return E_SUCCESS; } DECLARE_CLEAR_PMC(sandybridge, ldm_stall_cycles) { } DECLARE_READ_PMC(sandybridge, ldm_stall_cycles) { //return 0; uint64_t cycle_activity_stalls_l2_pending_diff = READ_MY_HW_EVENT_DIFF(0); uint64_t mem_load_uops_misc_retired_llc_miss_diff = READ_MY_HW_EVENT_DIFF(1); uint64_t mem_load_uops_retired_l3_hit_diff = READ_MY_HW_EVENT_DIFF(2); //return floor(cycle_activity_stalls_l2_pending_diff * (((double) (7*mem_load_uops_misc_retired_llc_miss_diff))/((double)(7*mem_load_uops_misc_retired_llc_miss_diff + mem_load_uops_retired_l3_hit_diff)))); uint64_t uden = 7.0 * mem_load_uops_misc_retired_llc_miss_diff + mem_load_uops_retired_l3_hit_diff; if (uden == 0) { return 0; } double den = uden; double num = 7.0 * mem_load_uops_misc_retired_llc_miss_diff; return (uint64_t) floorl(cycle_activity_stalls_l2_pending_diff*num/den); } PMC_EVENTS(sandybridge, 4) #endif /* __CPU_SANDYBRIDGE_H */ ================================================ FILE: src/lib/cpu/xeon-ex.h ================================================ 
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
#include "dev.h"

#ifdef PAPI_SUPPORT
#include "sandybridge-papi.h"
#include "ivybridge-papi.h"
#include "haswell-papi.h"
#else
#include "sandybridge.h"
#include "ivybridge.h"
#include "haswell.h"
#endif

/*
 * Map a throttle type to the PCI config-space offset used for it.
 * Unknown types fall back to the activate-throttling offset (0x190).
 */
static int intel_xeon_ex_throttle_offset(throttle_type_t throttle_type)
{
    switch (throttle_type) {
        case THROTTLE_DDR_READ:
            return 0x192;
        case THROTTLE_DDR_WRITE:
            return 0x194;
        case THROTTLE_DDR_ACT:
        default:
            return 0x190;
    }
}

/*
 * Write val to the selected throttle register of every channel in regs.
 *
 * All channels are always attempted. Returns 0 when every write succeeded,
 * otherwise the code returned by the first failing set_pci() call.
 */
int intel_xeon_ex_set_throttle_register(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t val)
{
    int offset = intel_xeon_ex_throttle_offset(throttle_type);
    int status = 0;
    unsigned int i;

    for (i = 0; i < regs->channels; ++i) {
        int rc = set_pci(regs->addr[i].bus_id, regs->addr[i].dev_id, regs->addr[i].funct, offset, val);
        // remember the first failure but keep programming the remaining channels
        if (rc != 0 && status == 0) {
            status = rc;
        }
    }

    return status;
}

/*
 * Read the selected throttle register of the first channel (regs->addr[0])
 * into *val.
 *
 * Returns 0 on success, otherwise the code returned by get_pci(); *val is not
 * written by get_pci() on failure, so callers must check the result.
 */
int intel_xeon_ex_get_throttle_register(pci_regs_t *regs, throttle_type_t throttle_type, uint16_t* val)
{
    int offset = intel_xeon_ex_throttle_offset(throttle_type);

    return get_pci(regs->addr[0].bus_id, regs->addr[0].dev_id, regs->addr[0].funct, offset, val);
}

// desc is fixed in cpu_model() if not Xeon
cpu_model_t cpu_model_intel_xeon_ex = {
    .microarch = SandyBridgeXeon,
#ifdef PAPI_SUPPORT
    .pmc_events = {sandybridge_native_events, sandybridge_read_stall_events_local, NULL},
#else
    .pmc_events = PMC_EVENTS_PTR(sandybridge),
#endif
    .set_throttle_register = intel_xeon_ex_set_throttle_register,
    .get_throttle_register = intel_xeon_ex_get_throttle_register
};

cpu_model_t cpu_model_intel_xeon_ex_v2 = {
    .microarch = IvyBridgeXeon,
#ifdef PAPI_SUPPORT
    .pmc_events = {ivybridge_native_events, ivybridge_read_stall_events_local, ivybridge_read_stall_events_remote},
#else
    .pmc_events = PMC_EVENTS_PTR(ivybridge),
#endif
    .set_throttle_register = intel_xeon_ex_set_throttle_register,
    .get_throttle_register = intel_xeon_ex_get_throttle_register
};

cpu_model_t cpu_model_intel_xeon_ex_v3 = {
    .microarch = HaswellXeon,
#ifdef PAPI_SUPPORT
    .pmc_events = {haswell_native_events, haswell_read_stall_events_local, haswell_read_stall_events_remote},
#else
    .pmc_events = PMC_EVENTS_PTR(haswell),
#endif
    .set_throttle_register = intel_xeon_ex_set_throttle_register,
    .get_throttle_register = intel_xeon_ex_get_throttle_register
};

================================================
FILE: src/lib/debug.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include "debug.h" #include #include #include #include #include "config.h" int dbg_modules[dbg_module_count]; int dbg_level = 0; int dbg_verbose = 0; const char* dbg_identifier = ""; static char dbg_identifier_buf[128]; static int strrep(char *target, char *source, char oldc, char newc) { int i; for (i=0; source[i]; i++) { if (source[i] == oldc) { target[i] = newc; } else { target[i] = source[i]; } } target[i] = '\0'; return 0; } void dbg_set_level(int level) { dbg_level = level; } int dbg_init(config_t* dbg_cfg, int level, const char* identifier) { // if user hasn't provided a debugging level then get it from the // configuration env/file if (level < 0) { __cconfig_lookup_int(dbg_cfg, "debug.level", &dbg_level); } else { dbg_level = level; } __cconfig_lookup_int(dbg_cfg, "debug.verbose", &dbg_verbose); // if user hasn't provide an identifier then check whether the environment // provides one, othewise create one based on process' pid if (!identifier) { dbg_identifier = getenv("DEBUG_IDENTIFIER"); if (!dbg_identifier) { sprintf(dbg_identifier_buf, "%d", getpid()); dbg_identifier = dbg_identifier_buf; } } else { dbg_identifier = identifier; } // read per module debugging flags #define STR(name) #name #define ACTION(name) \ do { \ char dotstr[128]; \ strrep(dotstr, STR(debug_module_##name), '_', '.'); \ __cconfig_lookup_bool(dbg_cfg, dotstr, \ &dbg_modules[dbg_module_##name]); \ } while (0); FOREACH_DEBUG_MODULE(ACTION) #undef ACTION DBG_LOG(DEBUG, ""); // 
prevent compiler warning return 0; } void dbg_backtrace (void) { void *array[10]; size_t size; char **strings; size_t i; size = backtrace (array, 10); strings = backtrace_symbols (array, size); printf ("Obtained %zd stack frames.\n", size); for (i = 0; i < size; i++) printf ("%s\n", strings[i]); free (strings); } ================================================ FILE: src/lib/debug.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #ifndef __DEBUG_H #define __DEBUG_H #include #include #include #include #include "config.h" #define FOREACH_DEBUG_MODULE(ACTION) \ ACTION(all) /* special name that covers all modules */ #define ACTION(name) \ dbg_module_##name, enum { FOREACH_DEBUG_MODULE(ACTION) dbg_module_count }; #undef ACTION #ifndef NDEBUG #define DBG_CODE(code) DBG_##code enum dbg_code { DBG_OFF = 0, DBG_CODE(CRITICAL) = 1, // Critical DBG_CODE(ERROR) = 2, // Error DBG_CODE(WARNING) = 3, // Warning DBG_CODE(INFO) = 4, // Info DBG_CODE(DEBUG) = 5, // Debugging }; static const char* dbg_code2str[] = { (char*) "OFF", (char*) "CRITICAL", (char*) "ERROR", (char*) "WARNING", (char*) "INFO", (char*) "DEBUG", }; static const int dbg_terminate_level = DBG_ERROR; static const int dbg_stderr_level = DBG_WARNING; extern int dbg_modules[]; extern int dbg_level; extern int dbg_verbose; extern const char* dbg_identifier; #define DBG_MODULE(name) dbg_module_##name #define DBG_LOG(level, format, ...) \ do { \ FILE* ferr = stdout; \ time_t ctime; \ if (DBG_CODE(level) && (DBG_CODE(level) <= dbg_level || \ DBG_CODE(level) <= dbg_terminate_level)) \ { \ if (DBG_CODE(level) <= dbg_stderr_level) { \ ferr=stderr; \ } \ if (dbg_verbose) { \ ctime = time(NULL); \ fprintf(ferr, "[%s] [%lu] %s in %s <%s,%d>: " format, \ dbg_identifier, \ ctime, \ dbg_code2str[DBG_CODE(level)], \ __FUNCTION__, __FILE__, __LINE__, ##__VA_ARGS__); \ } else { \ fprintf(ferr, "[%s] %s: " format, \ dbg_identifier, \ dbg_code2str[DBG_CODE(level)], \ ##__VA_ARGS__); \ } \ if (DBG_CODE(level) <= dbg_terminate_level) { \ exit(-1); \ } \ } \ } while(0); #define DBG_LOG2(level, module, format, ...) 
\ do { \ FILE* ferr = stdout; \ if (DBG_CODE(level) && \ (dbg_modules[module] || dbg_modules[dbg_module_all] || \ DBG_CODE(level) <= dbg_terminate_level) && \ (DBG_CODE(level) <= dbg_level || \ DBG_CODE(level) <= dbg_terminate_level)) \ { \ if (DBG_CODE(level) <= dbg_stderr_level) { \ ferr=stderr; \ } \ fprintf(ferr, "[%s] %s in %s <%s,%d>: " format, \ dbg_identifier, \ dbg_code2str[DBG_CODE(level)], \ __FUNCTION__, __FILE__, __LINE__, ##__VA_ARGS__); \ if (DBG_CODE(level) <= dbg_terminate_level) { \ exit(-1); \ } \ } \ } while(0); #else /* NDEBUG */ #define DBG_LOG(level, format, ...) #define DBG_LOG2(level, module, format, ...) #endif /* NDEBUG */ #define VERIFY(condition) \ do { \ if (!(condition)) { \ fprintf(stderr, "Assumption \"%s\"\nFailed in file %s: at line:%i\n", \ #condition,__FILE__,__LINE__); \ DBG_LOG (DBG_CRITICAL, #condition);} \ fflush(stderr); \ } while (0); int dbg_init(config_t* dbg_cfg, int level, const char* identifier); void dbg_backtrace (void); void dbg_set_level(int level); #endif // __DEBUG_H ================================================ FILE: src/lib/dev.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #include #include #include #include #include #include #include #include "dev/ioctl_query.h" #include "error.h" #include "dev.h" // TODO: get this value from the config file #define DEV_PATH "/dev/nvmemul" int set_counter(unsigned int counter_id, unsigned int event_id) { int fd; int ret; ioctl_query_setcounter_t q; fd = open(DEV_PATH, O_RDONLY); if (fd < 0) { DBG_LOG(ERROR, "Can't open %s - Is the NVM emulator device driver installed?\n", DEV_PATH); return E_ERROR; } q.counter_id = counter_id; q.event_id = event_id; if ((ret = ioctl(fd, IOCTL_SETCOUNTER, &q)) < 0) { close(fd); return E_ERROR; } close(fd); return E_SUCCESS; } int set_pci(unsigned int bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t val) { int fd; int ret; ioctl_query_setgetpci_t q; fd = open(DEV_PATH, O_RDONLY); if (fd < 0) { DBG_LOG(ERROR, "Can't open %s - Is the NVM emulator device driver installed?\n", DEV_PATH); return E_ERROR; } q.bus_id = bus_id; q.device_id = device_id; q.function_id = function_id; q.offset = offset; q.val = val; if ((ret = ioctl(fd, IOCTL_SETPCI, &q)) < 0) { close(fd); return E_ERROR; } close(fd); return E_SUCCESS; } int get_pci(unsigned int bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t* val) { int fd; int ret; ioctl_query_setgetpci_t q; fd = open(DEV_PATH, O_RDWR); if (fd < 0) { DBG_LOG(ERROR, "Can't open %s - Is the NVM emulator device driver installed?\n", DEV_PATH); return E_ERROR; } q.bus_id = bus_id; q.device_id = device_id; q.function_id = function_id; q.offset = offset; q.val = 0; if ((ret = ioctl(fd, IOCTL_GETPCI, &q)) < 0) { close(fd); return E_ERROR; } *val = q.val; close(fd); return E_SUCCESS; } ================================================ FILE: src/lib/dev.h ================================================ /*************************************************************************** Copyright 2016 Hewlett 
Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __DEVICE_DRIVER_API_H #define __DEVICE_DRIVER_API_H #include #define MAX_NUM_MC_PCI_BUS 16 #define MAX_NUM_MC_CHANNELS 16 typedef struct { unsigned int bus_id; unsigned int dev_id; unsigned int funct; } pci_addr; typedef struct { pci_addr addr[MAX_NUM_MC_CHANNELS]; unsigned int channels; } pci_regs_t; int set_counter(unsigned int counter_id, unsigned int event_id); int set_pci(unsigned bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t val); int get_pci(unsigned bus_id, unsigned int device_id, unsigned int function_id, unsigned int offset, uint16_t* val); #endif /* __DEVICE_DRIVER_API_H */ ================================================ FILE: src/lib/errno.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details. You
should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************************/
/* NOTE(review): this project header is named errno.h, like the C library's
 * <errno.h>; check that the include paths cannot make one shadow the other. */
#ifndef __ERRNO_H
#define __ERRNO_H

#ifdef __DEFINE_ERRNO
# error "__DEFINE_ERRNO previously defined"
#endif

/*
 * Define error codes and error messages here.
 *
 * The table is an X-macro: the order of the entries fixes the numeric value of
 * each code (E_SUCCESS is 0), so new codes must be appended at the end.
 */
#define __DEFINE_ERRNO(ACTION)                                               \
    ACTION(E_SUCCESS, "Success")                                             \
    ACTION(E_ERROR, "Generic error")                                         \
    ACTION(E_NOMEM, "No memory")                                             \
    ACTION(E_EXIST, "Name already exists")                                   \
    ACTION(E_NOENT, "Name does not exist")                                   \
    ACTION(E_INVAL, "Invalid argument")                                      \
    ACTION(E_BUSY, "Resource busy")                                          \
    ACTION(E_NOTEMPTY, "Not empty")                                          \
    ACTION(E_ERRNO, "Standard C library error; check errno for details")

#ifdef __ENUM_MEMBER
# error "__ENUM_MEMBER previously defined"
#endif

/* Expands one table entry to its enumerator. */
#define __ENUM_MEMBER(name, str)  name,

/* Error codes; E_MAXERRNO is the number of codes, not a code itself. */
enum {
    __DEFINE_ERRNO(__ENUM_MEMBER)
    E_MAXERRNO
};

#undef __ENUM_MEMBER /* don't pollute the macro namespace */

#ifdef __ERRNO_STRING
# error "__ERRNO_STRING previously defined"
#endif

/* Expands one table entry to its message string. */
#define __ERRNO_STRING(name, str) str,

/* TODO: not used for now
static const char*
ErrorToString(int err) {
    static const char* errstr[] = {
        __DEFINE_ERRNO(__ERRNO_STRING)
        "Unknown error code"
    };
    if (err >= 0 && err < E_MAXERRNO) {
        return errstr[err];
    }
    return errstr[E_MAXERRNO];
}
*/

#undef __ERRNO_STRING /* don't pollute the macro namespace */
#undef __DEFINE_ERRNO /* don't pollute the macro namespace */

#endif /* __ERRNO_H */

================================================
FILE: src/lib/error.h
================================================
/***************************************************************************
Copyright 2016
Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __ERROR_H #define __ERROR_H #include "errno.h" #include "debug.h" #endif /* __ERROR_H */ ================================================ FILE: src/lib/init.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #include #include "cpu/cpu.h" #include "config.h" #include "error.h" #include "model.h" #include "measure.h" #include "thread.h" #include "topology.h" #include "interpose.h" #include "monotonic_timer.h" #include "pflush.h" #include "stat.h" static void init() __attribute__((constructor)); static void finalize() __attribute__((destructor)); int set_process_local_rank(); int unset_process_local_rank(); int partition_cpus(virtual_topology_t* virtual_topology); static virtual_topology_t* virtual_topology = NULL; void finalize() { int i; if (latency_model.enabled) { unregister_self(); } if (read_bw_model.enabled) { for (i=0; i < virtual_topology->num_virtual_nodes; i++) { // FIXME: currently we keep a single bandwidth model and not per-node BW model physical_node_t* phys_node = virtual_topology->virtual_nodes[i].nvram_node; pci_regs_t *regs = phys_node->mc_pci_regs; // reset throttling phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8FFF); } } #ifdef USE_STATISTICS stats_report(); #endif // finalize libraries and release resources #ifdef PAPI_SUPPORT pmc_shutdown(); #endif unset_process_local_rank(); //__cconfig_destroy(&cfg); } void init() { config_t cfg; cpu_model_t* cpu; char* ld_preload_path; double start_time, end_time; #ifdef CALIBRATION_SUPPORT int i; #endif // FIXME: do we need to register the main thread with our system? // YES: for sure for single-threaded apps start_time = monotonic_time_us(); // we reset LD_PRELOAD to ensure we don't get into recursive preloads when // calling popen during initialization. 
before exiting we reactivate LD_PRELOAD // to allow LD_PRELOADS on children ld_preload_path = getenv("LD_PRELOAD"); unsetenv("LD_PRELOAD"); if (__cconfig_init(&cfg, "nvmemul.ini") == CONFIG_FALSE) { goto error; } __cconfig_lookup_bool(&cfg, "latency.enable", &latency_model.enabled); __cconfig_lookup_bool(&cfg, "bandwidth.enable", &read_bw_model.enabled); if (dbg_init(&cfg, -1, NULL) != E_SUCCESS) { goto error; } if (init_interposition() != E_SUCCESS) { goto error; } if ((cpu = cpu_model()) == NULL) { DBG_LOG(ERROR, "No supported processor found\n"); goto error; } init_virtual_topology(&cfg, cpu, &virtual_topology); if (init_bandwidth_model(&cfg, virtual_topology) != E_SUCCESS) { goto error; } if (latency_model.enabled) { if (init_latency_model(&cfg, cpu, virtual_topology) != E_SUCCESS) { goto error; } init_thread_manager(&cfg, virtual_topology); #ifdef USE_STATISTICS // statistics makes use of the thread manager and is used by the register_self() stats_enable(&cfg); #endif set_process_local_rank(); // thread manager must be initialized and local rank set // CPU partitioning must be made before the first thread is registered if (partition_cpus(virtual_topology) != E_SUCCESS) { goto error; } if (register_self() != E_SUCCESS) { goto error; } #ifdef CALIBRATION_SUPPORT // main thread is now tracked by the latency emulator // first, calibrate the latency emulation if (latency_model.calibration) { for (i = 0; i < virtual_topology->num_virtual_nodes; ++i) { latency_calibration(&virtual_topology->virtual_nodes[i]); } } #endif int write_latency; __cconfig_lookup_int(&cfg, "latency.write", &write_latency); init_pflush(cpu_speed_mhz(), write_latency); } end_time = monotonic_time_us(); #ifdef USE_STATISTICS if (latency_model.enabled) { stats_set_init_time(end_time - start_time); } #endif if (ld_preload_path) setenv("LD_PRELOAD", ld_preload_path, 1); return; error: /* Cannot initialize library -- catastrophic error */ if (ld_preload_path) setenv("LD_PRELOAD", ld_preload_path, 
1); fprintf(stderr, "ERROR: nvmemul: Initialization failed. Running without non-volatile memory emulation.\n"); } ================================================ FILE: src/lib/interpose.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #define _GNU_SOURCE #include #include #include #include #include #include "error.h" #include "model.h" #include "thread.h" #include "cpu/cpu.h" #ifdef PAPI_SUPPORT #include "cpu/pmc-papi.h" #else #include "cpu/pmc.h" #endif // WARNING: Our library MUST directly use the functions we interpose on by // calling __lib_X to avoid interposition on ourselves. 
int (*__lib_pthread_create)(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg); int (*__lib_pthread_mutex_lock)(pthread_mutex_t *mutex); int (*__lib_pthread_mutex_trylock)(pthread_mutex_t *mutex); int (*__lib_pthread_mutex_unlock)(pthread_mutex_t *mutex); int (*__lib_pthread_detach)(pthread_t thread); extern inline hrtime_t hrtime_cycles(void); extern inline int cycles_to_us(cpu_model_t* cpu, hrtime_t cycles); int init_interposition() { char *error; // if no symbol is returned then no interposition needed __lib_pthread_create = dlsym(RTLD_NEXT, "pthread_create"); __lib_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock"); __lib_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock"); __lib_pthread_mutex_unlock = dlsym(RTLD_NEXT, "pthread_mutex_unlock"); __lib_pthread_detach = dlsym(RTLD_NEXT, "pthread_detach"); if (__lib_pthread_mutex_lock == NULL || __lib_pthread_mutex_unlock == NULL || __lib_pthread_create == NULL || __lib_pthread_mutex_trylock == NULL || __lib_pthread_detach == NULL) { error = dlerror(); DBG_LOG(ERROR, "Interposition failed: %s\n", error != NULL ? error : "unknown reason"); return E_ERROR; } return E_SUCCESS; } // Interposing on pthread_create requires interposing on the thread created as we // require the TID of that thread which we can only get by executing the gettid() // system call from that thread. 
So we interpose on the start_routine which is // called by the new thread typedef struct { void *(*start_routine) (void *); void *arg; } pthread_create_functor_t; void* __interposed_start_routine(void* args) { void* ret; pthread_create_functor_t* f = (pthread_create_functor_t*) args; if (register_self() != E_SUCCESS) { free(args); return NULL; } ret = f->start_routine(f->arg); // FIXME: directly calling unregister may miss cases where the // thread terminates prematurely (such as pthread_exit or cancel) // consider using a key destructor function instead //fprintf(stderr, "stall cycles: %lu\n", thread_self()->stall_cycles); //fprintf(stderr, "signals_sent: %lu signals_recv: %lu\n", thread_self()->signals_sent, thread_self()->signals_recv); unregister_self(); free(args); return ret; } int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg) { int ret; //DBG_LOG(DEBUG, "interposing pthread_create\n"); //assert(__lib_pthread_create); if (__lib_pthread_create == NULL) init_interposition(); if (latency_model.enabled) { pthread_create_functor_t *functor = malloc(sizeof(pthread_create_functor_t)); functor->arg = arg; functor->start_routine = start_routine; if ((ret = __lib_pthread_create(thread, attr, __interposed_start_routine, (void*) functor)) != 0) { DBG_LOG(ERROR, "call to __lib_pthread_create failed\n"); return ret; } } else { ret = __lib_pthread_create(thread, attr, start_routine, arg); } return ret; } int pthread_mutex_lock(pthread_mutex_t *mutex) { int err; if (latency_model.enabled) { if(reached_min_epoch_duration(thread_self())) { // create new epoch here in order to propagate only the critical session delay to other threads // the thread monitor will keep trying to create new epoch, unless the min duration has not been reached create_latency_epoch(); } } //DBG_LOG(DEBUG, "interposing pthread_mutex_lock\n"); //assert(__lib_pthread_mutex_lock); if (__lib_pthread_mutex_lock == NULL) init_interposition(); err = 
__lib_pthread_mutex_lock(mutex); return err; } int pthread_mutex_trylock(pthread_mutex_t *mutex) { int err; if (latency_model.enabled) { if(reached_min_epoch_duration(thread_self())) { create_latency_epoch(); } } //DBG_LOG(DEBUG, "interposing pthread_mutex_trylock\n"); //assert(__lib_pthread_mutex_trylock); if (__lib_pthread_mutex_trylock == NULL) init_interposition(); err = __lib_pthread_mutex_trylock(mutex); return err; } int pthread_mutex_unlock(pthread_mutex_t *mutex) { int err; if (latency_model.enabled) { if (reached_min_epoch_duration(thread_self())) { create_latency_epoch(); } } //DBG_LOG(DEBUG, "interposing pthread_mutex_unlock\n"); //assert(__lib_pthread_mutex_unlock); if (__lib_pthread_mutex_unlock == NULL) init_interposition(); err = __lib_pthread_mutex_unlock(mutex); return err; } ================================================ FILE: src/lib/interpose.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __INTERPOSE_H #define __INTERPOSE_H /** * * \page library_interposition Library interposition * * The emulator intercepts several events of interest. 
It achieves this * by interposing on corresponding functions. * Currently this includes thread creation and POSIX synchronization mechanisms. */ extern int (*__lib_pthread_create)(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg); extern int (*__lib_pthread_mutex_lock)(pthread_mutex_t *mutex); extern int (*__lib_pthread_mutex_trylock)(pthread_mutex_t *mutex); extern int (*__lib_pthread_mutex_unlock)(pthread_mutex_t *mutex); extern int (*__lib_pthread_detach)(pthread_t thread); int init_interposition(); #endif /* __INTERPOSE_H */ ================================================ FILE: src/lib/measure.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __MEASURE_H #define __MEASURE_H /** * \file * * Memory latency and bandwidth measurements */ /** * \brief Measure memory read bandwidth * * Measures memory read bandwidth from a local socket (cpu_node) * to the memory of a remote socket (mem_node). It does this * by firing a bunch of threads issuing streaming instructions * to saturate memory bandwidth. 
*/ double measure_read_bw(int cpu_node, int mem_node); /** * \brief Measure memory write bandwidth * * Measures memory write bandwidth from a local socket (cpu_node) * to the memory of a remote socket (mem_node). * See measure_read_bw for how this is done. */ double measure_write_bw(int cpu_node, int mem_node); /** * \brief Measure memory latency * * Measures memory read latency from one local socket to the memory of a * remote socket. It does this using a pointer chasing microbenchmark. * The microbenchmark setups an array where each element determines the * element to be read next. */ int measure_latency(cpu_model_t* cpu, int from_node_id, int to_node_id); /** * \brief Calibrate memory latency * * Automatically tweaks the memory latency based on the detected hardware latency * on the target systems. */ void latency_calibration(); #endif ================================================ FILE: src/lib/measure_bw.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ // 2 BW measuring algorithms: one based on SSE4 instructions and the second based on // stream benchmark Copy kernel. 
//#define SSE4_VERSION

#ifdef SSE4_VERSION

/* NOTE(review): the header names of the bare #include directives below were
 * lost when this dump was extracted; restore them from the upstream file. */
#include
#include
#include
#include
#include
#include
#include "monotonic_timer.h"
#include "interpose.h"

#ifdef __SSE4_1__
#include
#endif

#define BYTES_PER_GB (1024*1024*1024LL)
#define BYTES_PER_MB (1024*1024LL)

// flag for terminating current test
int g_done;

// global current number of threads
int g_nthreads = 0;

// synchronization barrier for current thread counter
pthread_barrier_t g_barrier;

// thread shared parameters for test function
void* g_array;
size_t g_thrsize;
int g_times;
void (*g_func)(void*, size_t);

// Compute bandwidth in MB/s.
static inline double to_bw(size_t bytes, double secs) {
    double size_bytes = (double) bytes;
    double size_mb = size_bytes / ((double) BYTES_PER_MB);
    return size_mb / secs;
}

/**
 * Worker thread body used by timeitp().
 *
 * \param arg thread index, passed by value inside the pointer.
 *
 * Each round is bracketed by two waits on g_barrier: the first releases the
 * round (or ends the thread when g_done is set), the second marks its end.
 * In between the worker applies g_func g_times times to its own slice of
 * g_array, which starts at byte offset g_thrsize * thread_num.
 */
void* thread_worker(void* arg) {
    int j;
    unsigned int thread_num = (uintptr_t) arg;

    while (1) {
        // *** Barrier ****
        pthread_barrier_wait(&g_barrier);

        // g_done is checked only after the barrier, so the master must set
        // it before it enters the barrier for the last time
        if (g_done) break;

        for (j = 0; j < g_times; j++) {
            g_func(&((char*) g_array)[g_thrsize * thread_num], g_thrsize);
        }

        // *** Barrier ****
        pthread_barrier_wait(&g_barrier);
    }

    return NULL;
}

/**
 * Time `function` over `array` using `nthreads` threads.
 *
 * The calling thread acts as thread 0; nthreads-1 workers are created through
 * __lib_pthread_create. Each thread processes size / nthreads bytes per call.
 * `samples` timed rounds are run and the shortest one is kept.
 *
 * \return to_bw(size * times, min), i.e. MB/s for the fastest round.
 *
 * NOTE(review): the return type is int while to_bw() yields a double, so the
 * fractional part of the bandwidth is dropped here.
 */
int timeitp(void (*function)(void*, size_t), int nthreads, void* array, size_t size, int samples, int times) {
    double min = INFINITY;
    double runtime;
    size_t i, j, p;
    int thread_num;

    // globally set test function and thread number
    g_func = function;
    g_nthreads = nthreads;
    g_array = array;
    g_thrsize = size / nthreads;
    g_times = times;

    // create barrier and run threads
    pthread_barrier_init(&g_barrier, NULL, nthreads);

    // slot 0 is never used: the current thread plays the role of thread 0
    pthread_t thr[nthreads];
    //__lib_pthread_create(&thr[0], NULL, thread_master, new int(0));
    for (p = 1; p < nthreads; ++p) {
        assert(__lib_pthread_create);
        __lib_pthread_create(&thr[p], NULL, thread_worker, (void *) p);
    }

    // use current thread as master thread;
    g_done = 0;
    thread_num = 0;
    for (i = 0; i < samples; i++) {
        pthread_barrier_wait(&g_barrier);
        assert(!g_done);
        double ts1 = monotonic_time();
        for (j = 0; j < times; j++) {
            g_func(&((char*)g_array)[g_thrsize * thread_num], g_thrsize);
        }
        // the round ends only once every thread has reached this barrier
        pthread_barrier_wait(&g_barrier);
        double ts2 = monotonic_time();
        runtime = ts2 - ts1;
        if (runtime < min) {
            min = runtime;
        }
    }

    // release the workers one last time so that they observe g_done and exit
    g_done = 1;
    pthread_barrier_wait(&g_barrier);
    for (p = 1; p < nthreads; ++p) {
        pthread_join(thr[p], NULL);
    }
    pthread_barrier_destroy(&g_barrier);

    return to_bw(size * times, min);
}

/**
 * Single-threaded counterpart of timeitp(): calls `function` `times` times per
 * sample over the whole array and keeps the fastest of `samples` rounds.
 *
 * NOTE(review): as in timeitp(), the double computed by to_bw() is truncated
 * by the int return type.
 */
int timeit(void (*function)(void*, size_t), void* array, size_t size, int samples, int times) {
    double min = INFINITY;
    size_t i;

    // force allocation of physical pages
    memset(array, 0xff, size);

    for (i = 0; i < samples; i++) {
        double before, after, total;

        before = monotonic_time();
        int j;
        for (j = 0; j < times; j++) {
            function(array, size);
        }
        after = monotonic_time();

        total = after - before;
        if (total < min) {
            min = total;
        }
    }

    return to_bw(size * times, min);
}

#ifdef __SSE4_1__

/** Fill `array` with 128-bit non-temporal (streaming) stores. */
void write_memory_nontemporal_sse(void* array, size_t size) {
    __m128i* varray = (__m128i*) array;

    __m128i vals = _mm_set1_epi32(1);
    size_t i;
    for (i = 0; i < size / sizeof(__m128i); i++) {
        _mm_stream_si128(&varray[i], vals);
        vals = _mm_add_epi16(vals, vals);
    }
}

/** Fill `array` with regular 128-bit stores (_mm_store_si128). */
void write_memory_sse(void* array, size_t size) {
    __m128i* varray = (__m128i*) array;

    __m128i vals = _mm_set1_epi32(1);
    size_t i;
    for (i = 0; i < size / sizeof(__m128i); i++) {
        _mm_store_si128(&varray[i], vals);
        vals = _mm_add_epi16(vals, vals);
    }
}

/** Read `array` in 128-bit chunks, folding every chunk into an accumulator. */
void read_memory_sse(void* array, size_t size) {
    __m128i* varray = (__m128i*) array;
    __m128i accum = _mm_set1_epi32(0xDEADBEEF);
    size_t i;
    for (i = 0; i < size / sizeof(__m128i); i++) {
        accum = _mm_add_epi16(varray[i], accum);
    }

    // This is unlikely, and we want to make sure the reads are not optimized
    // away.
    assert(!_mm_testz_si128(accum, accum));
}

#else
# error "No compiler support for SSE instructions"
#endif

//static char array[1024*1024*1024];

/**
 * SSE4 variant of measure_read_bw(): allocates a 1 GiB buffer on mem_node,
 * runs on cpu_node and times read_memory_sse with 16 threads (5 samples,
 * 1 pass per sample).
 */
double measure_read_bw(int cpu_node, int mem_node)
{
    char* array;
    size_t size = 1024*1024*1024;
    double bw;
    int nthreads = 16;

    array = numa_alloc_onnode(size, mem_node);
    assert(array);

    numa_run_on_node(cpu_node);

    // force allocation of physical pages
    memset(array, 0xff, size);

    bw = timeitp(read_memory_sse, nthreads, array, size, 5, 1);

    numa_free(array, size);

    return bw;
}

/**
 * SSE4 variant of measure_write_bw(): same setup as measure_read_bw() above,
 * but times write_memory_nontemporal_sse.
 */
double measure_write_bw(int cpu_node, int mem_node)
{
    char* array;
    size_t size = 1024*1024*1024;
    double bw;
    int nthreads = 16;

    array = numa_alloc_onnode(size, mem_node);
    assert(array);

    numa_run_on_node(cpu_node);

    // force allocation of physical pages
    memset(array, 0xff, size);

    bw = timeitp(write_memory_nontemporal_sse, nthreads, array, size, 5, 1);

    numa_free(array, size);

    return bw;
}

#else // SSE4_VERSION

/* NOTE(review): header names lost in extraction, see the note at the top. */
#include
#include
#include
#include
#include
#include
#include
#include
#include "monotonic_timer.h"
#include "debug.h"

# define N 20000000
# define NTIMES 10
# define OFFSET 0

# define HLINE "-------------------------------------------------------------\n"

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static double bytes[4] = {
    2 * sizeof(double) * N,
    2 * sizeof(double) * N,
    3 * sizeof(double) * N,
    3 * sizeof(double) * N
};

//extern double mysecond();

double measure_read_bw(int cpu_node, int mem_node) { register int j, k; double t, times[4][NTIMES]; double *a, *c; //struct bitmask* membind; /* --- SETUP --- determine precision and check timing --- */ //membind = numa_allocate_nodemask(); //numa_bitmask_setbit(membind, mem_node); //numa_bind(membind); //numa_free_nodemask(membind); numa_run_on_node(cpu_node); omp_set_num_threads(10); // allocate memory dynamically to make sure the data is stored on the expected NUMA node a =
(double *)numa_alloc_onnode( (N+OFFSET) * sizeof(double), mem_node); c = (double *)numa_alloc_onnode( (N+OFFSET) * sizeof(double), mem_node); DBG_LOG(DEBUG, "Measuring read BW on cpu node %d and mem node %d\n", cpu_node, mem_node); /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j #include #include #include #include #include #include #include #include #include #include #include #include "cpu/cpu.h" #include "error.h" #include "model.h" #define P (void)printf #define FP (void)fprintf #define PAGESZ 4096 #define MAX_NUM_CHAINS 16 #undef USE_HUGETLB #ifdef MEMLAT_SUPPORT extern __thread uint64_t tls_global_remote_dram; extern __thread uint64_t tls_global_local_dram; #endif typedef struct { uint64_t val; char padding[0]; } element_t; typedef struct { uint64_t N; uint64_t element_size; element_t* head; } chain_t; inline uint64_t min(uint64_t a, uint64_t b) { return a < b ? a : b; } /* G. Marsaglia, 2003. "Xorshift RNGs", Journal of Statistical Software v. 8 n. 14, pp. 1-6, discussed in _Numerical Recipes_ 3rd ed. 
*/ static uint64_t prng(uint64_t* seed) { uint64_t x = *seed; x ^= x >> 21; x ^= x << 35; x ^= x >> 4; *seed = x; return x; } static uint64_t T(void) { struct timeval tv; #ifndef NDEBUG int r = #endif gettimeofday(&tv, NULL); assert(0 == r); return (uint64_t)(tv.tv_sec) * 1000000 + tv.tv_usec; } element_t* element(chain_t* chain, uint64_t index) { char* p = (char*) chain->head + index * chain->element_size; return (element_t *) p; } void inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size) { uint64_t i; element_t *elem = element(chain, index); buf_size = min(chain->element_size, buf_size); memcpy(buf, &elem->padding[0], buf_size - sizeof(elem->val)); for (i = buf_size; i <= chain->element_size - buf_size; i += buf_size) { memcpy(buf, &elem->padding[i], buf_size); } } chain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j) { uint64_t sum, p, i; element_t *B; char *A, *Aaligned, *M; uint64_t seed = seedin; chain_t* chain; #ifndef NDEBUG long mbind_result; #endif /* fill B[] with random permutation of 1..N */ chain = (chain_t*) malloc(sizeof(chain_t)); chain->N = N; chain->element_size = element_size; Aaligned = A = (char *) malloc(2 * PAGESZ + N * sizeof(element_t)); assert(NULL != A); while ( 0 != (Aaligned - (char *)0) % PAGESZ ) Aaligned++; B = (element_t *) Aaligned; for (i = 0; i < N; i++) B[i].val = 1+i; for (i = 0; i < N; i++) { uint64_t r, t; r = prng(&seed); r = r % N; /* should be okay for N << 2^64 */ t = B[i].val; B[i].val = B[r].val; B[r].val = t; } sum = 0; for (i = 0; i < N; i++) sum += B[i].val; assert((N+1)*N/2 == sum); /* Euler's formula */ /* set up C[] such that "chasing pointers" through it visits every element exactly once */ #ifdef USE_HUGETLB M = (char*) mmap(NULL, 2 * PAGESZ + (1+N) * element_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB, -1, 0); #else M = (char*) mmap(NULL, 2 * PAGESZ + (1+N) * element_size, PROT_READ|PROT_WRITE, 
MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); #endif assert(NULL != M); while ( 0 != (M - (char *)0) % PAGESZ ) M++; numa_run_on_node(node_i); uint64_t nodemask = 1 << node_j; #ifndef NDEBUG mbind_result = #endif mbind(M, N*element_size, MPOL_BIND, &nodemask, 64, MPOL_MF_MOVE); assert(mbind_result == 0); bzero(M, N*element_size); // force physical memory allocation chain->head = (element_t *) M; for (i = 0; i < N; i++) { element(chain, i)->val = UINT64_MAX; } p = 0; for (i = 0; i < N; i++) { p = element(chain, p)->val = B[i].val; } element(chain, p)->val = 0; for (i = 0; i <= N; i++) { assert(N >= element(chain, i)->val); } free(A); return chain; } uint64_t trash_cache(uint64_t N) { uint64_t T1, i, sum; char* A; char* ptr; element_t* B; ptr = A = (char *) malloc(2 * PAGESZ + N * sizeof(element_t)); assert(NULL != A); while ( 0 != (A - (char *)0) % PAGESZ ) { A++; __asm__(""); /* prevent optimizer from removing loop */ } B = (element_t *)A; /* trash the CPU cache */ T1 = T() % 1000; for (i = 0; i < N; i++) { B[i].val = T1 * i + i % (T1+1); __asm__(""); /* prevent optimizer from removing loop */ } sum = 0; for (i = 0; i < N; i++) { sum += B[i].val; __asm__(""); /* prevent optimizer from removing loop */ } free(ptr); return sum; } int __measure_latency(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id) { uint64_t seed, j, i, T1, T2; uint64_t sumv[MAX_NUM_CHAINS]; uint64_t nextp[MAX_NUM_CHAINS]; chain_t *C[MAX_NUM_CHAINS]; char *buf; uint64_t buf_size = 16384; assert(nelems < UINT64_MAX); assert(nchains < MAX_NUM_CHAINS); DBG_LOG(INFO, "measuring latency: nchains %d, nelems %zu, elem_sz %d, access_sz %d, from_node_id %d, to_node_id %d\n", nchains, nelems, element_size, access_size, from_node_id, to_node_id); for (j=0; j < nchains; j++) { seed = seedin + j*j; C[j] = alloc_chain(seed, nelems, element_size, from_node_id, to_node_id); } trash_cache(nelems); buf = (char*) malloc(buf_size); assert(buf != NULL); #ifdef 
MEMLAT_SUPPORT tls_global_remote_dram = 0; tls_global_local_dram = 0; #endif /* chase the pointers */ if (nchains == 1) { T1 = T(); sumv[0] = 0; for (i = 0; 0 != element(C[0], i)->val; i = element(C[0], i)->val) { sumv[0] += element(C[0], i)->val; if (access_size > element_size) { read_element(C[0], i, buf, buf_size); } } T2 = T(); } else { T1 = T(); for (j=0; j < nchains; j++) { sumv[j] = 0; nextp[j] = 0; } for (; 0 != element(C[0], nextp[0])->val; ) { for (j=0; j < nchains; j++) { sumv[j] += element(C[j], nextp[j])->val; if (access_size > element_size) { read_element(C[j], nextp[j], buf, buf_size); } nextp[j] = element(C[j], nextp[j])->val; } } T2 = T(); } assert((nelems+1)*nelems/2 == sumv[0]); /* Euler's formula */ uint64_t time_per_op_ns = ((T2-T1)*1000)/nelems; DBG_LOG(INFO, "measuring latency: latency is %lu ns\n", time_per_op_ns); for (j=0; j < nchains; j++) { free(C[j]); } free(buf); return time_per_op_ns; } int measure_latency(cpu_model_t* cpu, int from_node_id, int to_node_id) { size_t factor = 10; // this needs to be large enough to ensure we always miss in the LLC cache size_t element_size = 64LLU; size_t access_size = 8; size_t nelems = factor * cpu->llc_size_bytes / element_size; return __measure_latency(1, 1, nelems, element_size, access_size, from_node_id, to_node_id); } int measure_latency2(uint64_t seedin, int nchains, size_t nelems, int element_size, int access_size, int from_node_id, int to_node_id) { if (nelems*element_size < cpu_llc_size_bytes()) { DBG_LOG(WARNING, "warning: #elements == %" PRIu64 " seems small!\n", nelems); } return __measure_latency(seedin, nchains, nelems, element_size, access_size, from_node_id, to_node_id); } #ifdef CALIBRATION_SUPPORT #define TOLERATED_DEVIATION_PERCENTAGE 5 // maximum deviation acceptable for the target latency #define CALIBRATION_STEP_SIZE 0.05 // max ns step size to calibrate the CPU stalls #define CALIBRATION_FINEST_STEP 0.01 // min (finest) ns step size to calibrate the CPU stalls #define 
MAX_TOLERATED_BAD_STEPS 2 // max number of bad steps in the calibration, before the calibration inverts the value to increment #define NELEMS 10000000 #define SEED_IN 1 #define NCHAINS 1 #define ELEM_SIZE 64LLU #define ACCESS_SIZE 8 #define FILE_CALIB_LOCAL "/tmp/local_latency_calibration" #define FILE_CALIB_REMOTE "/tmp/remote_latency_calibration" static int calibrate_load_from_file(virtual_node_t *virtual_node) { FILE *fp = NULL; char *file_name = NULL; char *line = NULL; size_t len; double correction_factor; int status = E_ERROR; if (virtual_node->dram_node == virtual_node->nvram_node) { file_name = FILE_CALIB_LOCAL; } else { file_name = FILE_CALIB_REMOTE; } if (access(file_name, R_OK | W_OK) == 0) { // calibration file is available, check if the current target latency is mapped if ((fp = fopen(file_name, "r"))) { if (getline(&line, &len, fp) != -1) { if (sscanf(line, "%lf", &correction_factor) == 1) { // set CPU stalls factor to the read value latency_model.stalls_calibration_factor = correction_factor; DBG_LOG(INFO, "CALIBRATION: factor loaded from file (%s) (%f)\n", file_name, correction_factor); status = E_SUCCESS; } } if (line) free(line); fclose(fp); } } return status; } static void calibrate_save_to_file(virtual_node_t *virtual_node, double correction_factor) { char *file_name; FILE *fp; if (virtual_node->dram_node == virtual_node->nvram_node) { file_name = FILE_CALIB_LOCAL; } else { file_name = FILE_CALIB_REMOTE; } // calibration file is available, check if the current target latency is mapped if ((fp = fopen(file_name, "a"))) { // it is assumed this line is not yet present in the file fprintf(fp, "%f\n", correction_factor); DBG_LOG(INFO, "CALIBRATION: factor saved to file (%s) (%f)\n", file_name, correction_factor); fclose(fp); } } static int diff_target_latencies(int measured_latency, int target_latency) { int diff = target_latency - measured_latency; return abs(diff); } static double calibrate(virtual_node_t *virtual_node, double step_value, int 
from_node, int to_node) { int measured; int best_diff_latency; double best_factor = 0; int diff; int bad_step_count = 0; int close_value; int calib_done; // force a change in correction factor and measure latency // each step will increment the or decrement the factor // at the end we have a calibrated correction factor for the CPU stalls DBG_LOG(INFO, "CALIBRATION: for nodes (dram %d, nvram %d)\n", from_node, to_node); best_diff_latency = INT32_MAX; close_value = 0; calib_done = 0; while(!calib_done) { measured = measure_latency2(SEED_IN, NCHAINS, NELEMS, ELEM_SIZE, ACCESS_SIZE, from_node, to_node); DBG_LOG(INFO, "CALIBRATION: measured latency (%d)\n", measured); diff = diff_target_latencies(measured, latency_model.read_latency); if (diff < best_diff_latency) { // best measured latency so far bad_step_count = 0; best_diff_latency = diff; best_factor = latency_model.stalls_calibration_factor; // check if the diff is less or equal than the configured percentage of the target latency if (diff <= (latency_model.read_latency * TOLERATED_DEVIATION_PERCENTAGE / 100)) { DBG_LOG(INFO, "CALIBRATION: got a close latency value (factor %lf)\n", best_factor); close_value = 1; } } else if (diff >= best_diff_latency) { // measure latency is getting worse if (close_value && bad_step_count == 0) { // if we have a close_value, return it calib_done = 1; } else { // otherwise let's give retries ++bad_step_count; if (bad_step_count >= MAX_TOLERATED_BAD_STEPS) { // this calibration method seem to be moving to the wrong direction // return invalid value and hopefully fall back to the second method return 0; } } } latency_model.stalls_calibration_factor += step_value; } // while return best_factor; } static double calibrate_with_size(virtual_node_t *virtual_node, double calib_size, int from_node, int to_node) { double best_factor; // first method decrements the factor with the provided step size if (((best_factor = calibrate(virtual_node, (-calib_size), from_node, to_node)) == 0) || 
calib_size == CALIBRATION_FINEST_STEP) { if (best_factor > 0.0) { // recover last best factor latency_model.stalls_calibration_factor = best_factor; } // second method increments the factor with the provided step size // this method will be always performed if the provided step size is the finest best_factor = calibrate(virtual_node, calib_size, from_node, to_node); } return best_factor; } void latency_calibration(virtual_node_t *virtual_node) { double best_factor; int from_node = virtual_node->dram_node->node_id; int to_node = virtual_node->nvram_node->node_id; // if calibration file exist, load the correction factor and exit if (calibrate_load_from_file(virtual_node) == E_SUCCESS) { return; } if ((best_factor = calibrate_with_size(virtual_node, CALIBRATION_STEP_SIZE, from_node, to_node)) != 0) { latency_model.stalls_calibration_factor = best_factor + CALIBRATION_FINEST_STEP; best_factor = calibrate_with_size(virtual_node, CALIBRATION_FINEST_STEP, from_node, to_node); } if (best_factor == 0.0) { best_factor = 1.0; } // set the hardware latency to the best fit value latency_model.stalls_calibration_factor = best_factor; DBG_LOG(INFO, "CALIBRATION: CPU stalls correction factor is %f (dram %d, nvram %d)\n", best_factor, from_node, to_node); // save file for local or remote 'correction factor' calibrate_save_to_file(virtual_node, best_factor); } #endif // CALIBRATION SUPPORT ================================================ FILE: src/lib/misc.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include #include #include #include size_t string_to_size(char* str) { size_t factor = 1; size_t size; long val; char* endptr = 0; val = strtoull(str, &endptr, 10); while(endptr && (endptr - str) < strlen(str) && !isalpha(*endptr)) {endptr++;} switch (endptr[0]) { case 'K': case 'k': factor = 1024LLU; break; case 'M': case 'm': factor = 1024LLU*1024LLU; break; case 'G': case 'g': factor = 1024LLU*1024LLU*1024LLU; break; default: factor = 1; } size = factor * val; return size; } ================================================ FILE: src/lib/misc.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #ifndef __MISC_H #define __MISC_H size_t string_to_size(char* str); #endif ================================================ FILE: src/lib/model.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/

#ifndef __MODEL_H
#define __MODEL_H

#include "config.h"
#include "cpu/cpu.h"
#include "thread.h"
#ifdef PAPI_SUPPORT
#include "cpu/pmc-papi.h"
#else
#include "cpu/pmc.h"
#endif

#define MAX_EPOCH_DURATION_US 1000000
#define MIN_EPOCH_DURATION_US 1

/**
 * Global state of the latency emulation model.
 */
typedef struct {
    int enabled;            /* set to 1 by init_latency_model() */
    int read_latency;       /* target read latency, from config key "latency.read" */
    int write_latency;      /* target write latency, from config key "latency.write" */
    int inject_delay;       /* from config key "latency.inject_delay" */
#ifdef CALIBRATION_SUPPORT
    int calibration;
#endif
#ifdef PAPI_SUPPORT
    read_stalls_t pmc_stall_local;
    read_stalls_t pmc_stall_remote;
#else
    pmc_event_t* pmc_stall_cycles;  /* the "LDM_STALL_CYCLES" event */
    pmc_event_t* pmc_remote_dram;   /* the "REMOTE_DRAM" event */
    int process_local_rank;
    int max_local_processe_ranks;
#endif
    double stalls_calibration_factor; /* CPU stalls correction factor tuned by the calibration code */
} latency_model_t;

extern latency_model_t latency_model;

/**
 * Bandwidth model: a table that pairs throttle register values with the
 * bandwidth measured for each of them.
 */
typedef struct {
    unsigned int throttle_reg_val[MAX_THROTTLE_VALUE]; /* register value of data point i */
    double bandwidth[MAX_THROTTLE_VALUE];              /* bandwidth measured at data point i */
    int npoints;                                       /* number of valid data points */
    int enabled;                                       /* non-zero when bandwidth emulation is on */
} bw_model_t;

extern bw_model_t read_bw_model;
extern bw_model_t write_bw_model;

int init_bandwidth_model(config_t* cfg, struct virtual_topology_s* topology);
int init_latency_model(config_t* cfg, cpu_model_t* cpu, struct virtual_topology_s* virtual_topology);
void init_thread_latency_model(thread_t *thread);
void create_latency_epoch();

#endif /* __MODEL_H */

================================================
FILE: src/lib/model_bw.c
================================================
/***************************************************************************
Copyright 2016 Hewlett Packard Enterprise Development LP.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. This program is distributed in the
hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include #include #include #include "cpu/cpu.h" #include "config.h" #include "error.h" #include "measure.h" #include "stat.h" #include "topology.h" #include "monotonic_timer.h" #include "model.h" /** * \file * * \page latency_emulation Memory bandwidth emulation * * To emulate bandwidth, we rely on memory power throttling (supported by recent memory * controllers) to limit the effective bandwidth to the DRAM attached to a socket. * Memory power throttling is configured through the PCI configuration space. * We use a kernel-module to set the proper PCI registers. * * Initially, we perform a series of bandwidth measurements to find out the bandwidth * that corresponds to each register value. We incrementally try out each register value * starting from 0x800f until we saturate memory bandwidth. 
* */ bw_model_t read_bw_model; bw_model_t write_bw_model; #define THROTTLE_INCREMENT 15 #define THROTTLE_INITIAL_VALUE 0x800f static int train_model(physical_node_t* phys_node, char model_type, bw_model_t* bw_model) { double x[MAX_THROTTLE_VALUE]; double best_rate; double m; int i; uint16_t throttle_reg_val; int min_number_throttle_points = 10; double stop_slope = 0.1; int phys_node_id = phys_node->node_id; pci_regs_t *regs = phys_node->mc_pci_regs; // reset throttling phys_node->cpu_model->get_throttle_register(regs, THROTTLE_DDR_ACT, &throttle_reg_val); if (throttle_reg_val < 0x8fff) phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8FFF); DBG_LOG(INFO, "throttle bus id %d, on physical node: %d\n", regs->addr[0].bus_id, phys_node_id); // we run until our bandwidth curve flattens out which we find out using // gradient (slope) analysis for (i=0; i < MAX_THROTTLE_VALUE; i++) { phys_node->cpu_model->get_throttle_register(regs, THROTTLE_DDR_ACT, &throttle_reg_val); if (throttle_reg_val >= 0x8fff) throttle_reg_val = THROTTLE_INITIAL_VALUE; else throttle_reg_val += THROTTLE_INCREMENT; if (model_type == 'r') { phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, throttle_reg_val); best_rate = measure_read_bw(phys_node_id, phys_node_id); // restore throttling register //phys_node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8fff); } /*else if (model_type == 'w') { phys_node->cpu_model->set_throttle_register(bus_id, THROTTLE_DDR_ACT, throttle_reg_val); best_rate = measure_write_bw(phys_node_id, phys_node_id); // restore throttling register phys_node->cpu_model->set_throttle_register(bus_id, THROTTLE_DDR_ACT, 0x8fff); }*/ DBG_LOG(INFO, "throttle reg: 0x%x, %c bandwidth: %f\n", throttle_reg_val, model_type, best_rate); bw_model->throttle_reg_val[i] = throttle_reg_val; bw_model->bandwidth[i] = best_rate; x[i] = (double) throttle_reg_val; // slope calculation requires values of type double if (i > min_number_throttle_points) { 
m = slope(&x[i-min_number_throttle_points], &bw_model->bandwidth[i-min_number_throttle_points], min_number_throttle_points); if (abs(m) < stop_slope) { break; } } } bw_model->npoints = i; return E_SUCCESS; } static int load_model(const char* path, const char* prefix, bw_model_t* bw_model) { FILE *fp; char *line = NULL; char str[64]; size_t len = 0; ssize_t read; int x; double y; int found_points; fp = fopen(path, "r"); if (fp == NULL) { return E_ERROR; } DBG_LOG(INFO, "Loading %s bandwidth model from %s\n", prefix, path); for (found_points = 0; (read = getline(&line, &len, fp)) != -1; ) { if (strstr(line, prefix)) { sscanf(line, "%s\t%d\t%lf", str, &x, &y); DBG_LOG(INFO, "throttle reg: 0x%x, bandwidth: %f\n", x, y); bw_model->throttle_reg_val[found_points] = x; bw_model->bandwidth[found_points] = y; found_points++; } } free(line); if (found_points) { bw_model->npoints = found_points; } else { DBG_LOG(INFO, "No %s bandwidth model found in %s\n", prefix, path); return E_ERROR; } fclose(fp); return E_SUCCESS; } static int save_model(const char* path, const char* prefix, bw_model_t* bw_model) { int i; FILE *fp; fp = fopen(path, "a"); if (fp == NULL) { return E_ERROR; } DBG_LOG(INFO, "Saving %s bandwidth model into %s\n", prefix, path); for (i=0; inpoints; i++) { int x = bw_model->throttle_reg_val[i]; double y = bw_model->bandwidth[i]; //DBG_LOG(INFO, "throttle reg: 0x%x, bandwidth: %f\n", x, y); fprintf(fp, "%s\t%d\t%f\n", prefix, x, y); } fclose(fp); return E_SUCCESS; } static int find_data_point(bw_model_t* model, double target_bw, unsigned int* point) { int i; double error; // go through all points as we are not sorted and pick the one closest *point = 0; error = target_bw; for (i=1; inpoints; i++) { if (fabs(model->bandwidth[i] - target_bw) < error) { *point = i; error = fabs(model->bandwidth[i] - target_bw); } } return E_SUCCESS; } int __set_write_bw(physical_node_t* node, uint64_t target_bw) { pci_regs_t *regs = node->mc_pci_regs; int ret; unsigned int point; if 
(regs == NULL) { return E_SUCCESS; } if (target_bw == (uint64_t) (-1)) { node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8fff); return E_SUCCESS; } if ((ret = find_data_point(&write_bw_model, (double) target_bw, &point)) != E_SUCCESS) { return ret; } DBG_LOG(INFO, "Setting throttle reg: %d (0x%x), target write bandwidth: %" PRIu64 ", actual write bandwidth: %" PRIu64 "\n", write_bw_model.throttle_reg_val[point], write_bw_model.throttle_reg_val[point], target_bw, (uint64_t) write_bw_model.bandwidth[point]); node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, write_bw_model.throttle_reg_val[point]); return E_SUCCESS; } int set_write_bw(config_t* cfg, physical_node_t* node) { int target_bw; __cconfig_lookup_int(cfg, "bandwidth.write", &target_bw); return __set_write_bw(node, target_bw); } int __set_read_bw(physical_node_t* node, uint64_t target_bw) { pci_regs_t *regs = node->mc_pci_regs; int ret; unsigned int point; if (regs == NULL) { return E_SUCCESS; } if (target_bw == (uint64_t) (-1)) { node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, 0x8fff); return E_SUCCESS; } if ((ret = find_data_point(&read_bw_model, (double) target_bw, &point)) != E_SUCCESS) { return ret; } DBG_LOG(INFO, "Setting throttle reg: %d (0x%x), target read bandwidth: %" PRIu64 ", actual read bandwidth: %" PRIu64 "\n", read_bw_model.throttle_reg_val[point], read_bw_model.throttle_reg_val[point], target_bw, (uint64_t) read_bw_model.bandwidth[point]); node->cpu_model->set_throttle_register(regs, THROTTLE_DDR_ACT, read_bw_model.throttle_reg_val[point]); return E_SUCCESS; } int set_read_bw(config_t* cfg, physical_node_t* node) { int target_bw; __cconfig_lookup_int(cfg, "bandwidth.read", &target_bw); return __set_read_bw(node, target_bw); } int init_bandwidth_model(config_t* cfg, virtual_topology_t* topology) { int i; char* model_file; srandom((int)monotonic_time()); if (read_bw_model.enabled) { DBG_LOG(INFO, "Initializing bandwidth model\n"); // initialize 
bandwidth model for (i=0; inum_virtual_nodes; i++) { // FIXME: currently we keep a single bandwidth model and not per-node bandwidth model physical_node_t* phys_node = topology->virtual_nodes[i].nvram_node; if (__cconfig_lookup_string(cfg, "bandwidth.model", &model_file) == CONFIG_TRUE) { if (load_model(model_file, "read", &read_bw_model) != E_SUCCESS) { train_model(phys_node, 'r', &read_bw_model); save_model(model_file, "read", &read_bw_model); } /*if (load_model(model_file, "write", &write_bw_model) != E_SUCCESS) { train_model(phys_node, 'w', &write_bw_model); save_model(model_file, "write", &write_bw_model); }*/ } } // set read and write memory bandwidth for (i=0; inum_virtual_nodes; i++) { physical_node_t* phys_node = topology->virtual_nodes[i].nvram_node; set_read_bw(cfg, phys_node); //set_write_bw(cfg, phys_node); } } else { // reset throttle registers for (i=0; inum_virtual_nodes; i++) { // FIXME: currently we keep a single bandwidth model and not per-node bandwidth model physical_node_t* phys_node = topology->virtual_nodes[i].dram_node; __set_read_bw(phys_node, (uint64_t) (-1)); __set_write_bw(phys_node, (uint64_t) (-1)); } } return E_SUCCESS; } ================================================ FILE: src/lib/model_lat.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include "cpu/cpu.h" #include "config.h" #include "error.h" #include "thread.h" #include "topology.h" #include "model.h" #include "monotonic_timer.h" /** * \file * * \page latency_emulation Memory latency emulation * * To emulate latency, we construct epochs and inject software created delays * at the end of each epoch. * Epochs are created either at fixed intervals by periodically interrupting * threads or on demand when a synchronization method (lock, unlock) is called. * * Delays are calculated using a simple analytic model that takes input from * performance counters. */ latency_model_t latency_model; #pragma GCC push_options #pragma GCC optimize ("O0") inline hrtime_t hrtime_cycles(void) { unsigned hi, lo; __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)); return ( (hrtime_t)lo)|( ((hrtime_t)hi)<<32 ); } #pragma GCC pop_options /* static inline hrtime_t ns_to_cycles(int cpu_speed_mhz, int ns) { return (cpu_speed_mhz * ns) / 1000; } */ inline hrtime_t cycles_to_us(int cpu_speed_mhz, hrtime_t cycles) { return (cycles/cpu_speed_mhz); } #pragma GCC push_options #pragma GCC optimize ("O0") static inline void create_delay_cycles(hrtime_t cycles) { hrtime_t start, stop; start = hrtime_cycles(); do { stop = hrtime_cycles(); } while (stop - start < cycles); } #pragma GCC pop_options /* static inline void create_delay_ns(cpu_model_t* cpu, int ns) { hrtime_t cycles; cycles = ns_to_cycles(cpu, ns); create_delay_cycles(cycles); } */ static int check_target_latency_against_hw_latency(virtual_topology_t* virtual_topology) { int status = 0; int i; int hw_latency_dram; int hw_latency_nvram; for (i = 0; i < virtual_topology->num_virtual_nodes; ++i) { hw_latency_dram = 
virtual_topology->virtual_nodes[i].dram_node->latency; hw_latency_nvram = virtual_topology->virtual_nodes[i].nvram_node->latency; if (hw_latency_dram >= latency_model.read_latency || hw_latency_dram >= latency_model.write_latency || hw_latency_nvram >= latency_model.read_latency || hw_latency_nvram >= latency_model.write_latency) { DBG_LOG(ERROR, "Target read (%d) and write (%d) latency to be emulated must be greater than the " "hardware latency dram (%d) and virtual nvram (%d) (virtual node %d)\n", latency_model.read_latency, latency_model.write_latency, hw_latency_dram, hw_latency_nvram, i); status = -1; break; } } return status; } int init_latency_model(config_t* cfg, cpu_model_t* cpu, virtual_topology_t* virtual_topology) { int i; DBG_LOG(INFO, "Initializing latency model\n"); memset(&latency_model, 0, sizeof(latency_model_t)); latency_model.enabled = 1; __cconfig_lookup_int(cfg, "latency.read", &latency_model.read_latency); __cconfig_lookup_int(cfg, "latency.write", &latency_model.write_latency); if (check_target_latency_against_hw_latency(virtual_topology) < 0) { return E_INVAL; } __cconfig_lookup_bool(cfg, "latency.inject_delay", &latency_model.inject_delay); if (!latency_model.inject_delay) { DBG_LOG(WARNING, "Latency model is enabled, but delay injection is disabled\n"); } #ifdef PAPI_SUPPORT if (pmc_init() != 0) { return E_ERROR; } latency_model.pmc_stall_local = cpu->pmc_events.read_stalls_events_local; latency_model.pmc_stall_remote = cpu->pmc_events.read_stalls_events_remote; #else for (i=0; cpu->pmc_events->known_events[i].name; ++i) { // LDM_STALL_CYCLES implementation for each processor is mandatory if (strcasecmp(cpu->pmc_events->known_events[i].name, "LDM_STALL_CYCLES") == 0) { if (!(latency_model.pmc_stall_cycles = enable_pmc_event(cpu, "LDM_STALL_CYCLES"))) { return E_NOENT; } } if (strcasecmp(cpu->pmc_events->known_events[i].name, "REMOTE_DRAM") == 0) { if (!(latency_model.pmc_remote_dram = enable_pmc_event(cpu, "REMOTE_DRAM"))) { return 
E_NOENT; } } } assert(latency_model.pmc_stall_cycles); #endif #ifdef CALIBRATION_SUPPORT __cconfig_lookup_bool(cfg, "latency.calibration", &latency_model.calibration); if (latency_model.calibration) { latency_model.stalls_calibration_factor = 1.0; } #endif return E_SUCCESS; } __thread uint64_t tls_overhead = 0; __thread int tls_hw_local_latency = 0; __thread int tls_hw_remote_latency = 0; #ifdef MEMLAT_SUPPORT __thread uint64_t tls_global_remote_dram = 0; __thread uint64_t tls_global_local_dram = 0; #endif void init_thread_latency_model(thread_t *thread) { tls_hw_local_latency = thread->virtual_node->dram_node->latency; tls_hw_remote_latency = thread->virtual_node->nvram_node->latency; } void create_latency_epoch() { uint64_t stall_cycles = 0; uint64_t delay_cycles = 0; int hw_latency; int target_latency; hrtime_t start, stop; double epoch_end; start = hrtime_cycles(); // An epoch may be created by a critical section and the static epoch // may interfere with the current epoch creation. Block the signal here // and unblock it at the end of this function. 
block_new_epoch(); // must always be thread_self since we call core specific data through hrtime_cycles thread_t* thread = thread_self(); if (!reached_min_epoch_duration(thread)) { if (!thread) thread = thread_self(); if (thread) thread->signaled = 0; unblock_new_epoch(); return; } //DBG_LOG(INFO, "new epoch for thread id [%i]\n", thread->tid); #ifdef USE_STATISTICS if (thread->thread_manager->stats.enabled) { thread->stats.epochs++; } #endif // this is the generic hardware latency for this thread (it takes into account the current virtual node latencies) hw_latency = thread->virtual_node->nvram_node->latency; target_latency = latency_model.read_latency; // check if the thread_self is remote (virtual topology where dram != nvram) or local (dram == nvram) // on this case, stall cycles will be a proportion of remote memory accesses // TODO: the read pmc method used below must be changed to support PAPI if (thread->virtual_node->dram_node != thread->virtual_node->nvram_node && latency_model.pmc_remote_dram) { stall_cycles = read_pmc_event(latency_model.pmc_remote_dram); } else { stall_cycles = read_pmc_event(latency_model.pmc_stall_cycles); } #ifdef CALIBRATION_SUPPORT if (latency_model.calibration) { stall_cycles = (uint64_t)((double)stall_cycles * latency_model.stalls_calibration_factor); } #endif delay_cycles = stall_cycles * ((double)(target_latency - hw_latency) / ((double) hw_latency)); stop = hrtime_cycles(); tls_overhead += stop - start; DBG_LOG(DEBUG, "overhead cycles: %lu; immediate overhead %lu; stall cycles: %lu; delay cycles: %lu\n", tls_overhead, stop - start, stall_cycles, delay_cycles); if (delay_cycles > tls_overhead) { delay_cycles -= tls_overhead; tls_overhead = 0; } else { tls_overhead -= delay_cycles; delay_cycles = 0; } #ifdef MEMLAT_SUPPORT thread->stall_cycles += stall_cycles; #endif #ifdef USE_STATISTICS if (thread->thread_manager->stats.enabled) { thread->stats.stall_cycles += stall_cycles; thread->stats.delay_cycles += delay_cycles; 
thread->stats.overhead_cycles = tls_overhead; } #endif epoch_end = monotonic_time_us(); DBG_LOG(DEBUG, "injecting delay of %lu cycles (%lu usec) - discounted overhead\n", delay_cycles, cycles_to_us(thread->cpu_speed_mhz, delay_cycles)); if (delay_cycles && latency_model.inject_delay) { create_delay_cycles(delay_cycles); } #ifdef USE_STATISTICS if (thread->thread_manager->stats.enabled) { uint64_t older_epoch_timestamp = thread->stats.last_epoch_timestamp; uint64_t diff_epoch_timestamp = epoch_end - older_epoch_timestamp; if (diff_epoch_timestamp < thread->stats.shortest_epoch_duration_us) { thread->stats.shortest_epoch_duration_us = diff_epoch_timestamp; } if (diff_epoch_timestamp > thread->stats.longest_epoch_duration_us) { thread->stats.longest_epoch_duration_us = diff_epoch_timestamp; } thread->stats.overall_epoch_duration_us += diff_epoch_timestamp; thread->stats.last_epoch_timestamp = monotonic_time_us(); } else { // last epoch timestamp must always be updated thread->stats.last_epoch_timestamp = monotonic_time_us(); } #else thread->last_epoch_timestamp = monotonic_time_us(); #endif // this must be the last step, since this function is called also from the signal handler // and the monitor thread sets this flag, we must make sure race conditions are prevented thread->signaled = 0; unblock_new_epoch(); } ================================================ FILE: src/lib/monotonic_timer.c ================================================ // Copyright 2013 Alex Reece. // // A cross platform monotonic timer. #include #include "monotonic_timer.h" #if _POSIX_TIMERS > 0 && defined(_POSIX_MONOTONIC_CLOCK) // If we have it, use clock_gettime and CLOCK_MONOTONIC. #include double monotonic_time() { struct timespec time; // Note: Make sure to link with -lrt to define clock_gettime. 
clock_gettime(CLOCK_MONOTONIC, &time); return ((double) time.tv_sec) + ((double) time.tv_nsec / (NANOS_PER_SECF)); } double monotonic_time_us() { struct timespec time; // Note: Make sure to link with -lrt to define clock_gettime. clock_gettime(CLOCK_MONOTONIC, &time); return ((double) (time.tv_sec * USECS_PER_SEC)) + ((double) time.tv_nsec / NANOS_PER_USECF); } #else // Fall back to rdtsc. The reason we don't use clock() is this scary message // from the man page: // "On several other implementations, the value returned by clock() also // includes the times of any children whose status has been collected via // wait(2) (or another wait-type call)." // // Also, clock() only has microsecond accuracy. // // This whitepaper offered excellent advice on how to use rdtscp for // profiling: http://download.intel.com/embedded/software/IA/324264.pdf // // Unfortunately, we can't follow its advice exactly with our semantics, // so we're just going to use rdtscp with cpuid. // // Note that rdtscp will only be available on new processors. 
#include static inline uint64_t rdtsc() { uint32_t hi, lo; asm volatile("rdtscp\n" "movl %%edx, %0\n" "movl %%eax, %1\n" "cpuid" : "=r" (hi), "=r" (lo) : : "%rax", "%rbx", "%rcx", "%rdx"); return (((uint64_t)hi) << 32) | (uint64_t)lo; } static uint64_t rdtsc_per_sec = 0; static uint64_t rdtsc_per_usec = 0; static void __attribute__((constructor)) init_rdtsc_per_sec() { uint64_t before, after; before = rdtsc(); usleep(USECS_PER_SEC); after = rdtsc(); rdtsc_per_sec = after - before; before = rdtsc(); usleep(1); after = rdtsc(); rdtsc_per_usec = after - before; } double monotonic_time() { return (double) rdtsc() / (double) rdtsc_per_sec; } // TODO: not tested, it is core specific and callers must be aware double monotonic_time_us() { return ((double) rdtsc() / (double) rdtsc_per_usec); } #endif ================================================ FILE: src/lib/monotonic_timer.h ================================================ // Copyright 2013 Alex Reece. // // A cross platform monotonic timer. #ifndef MONOTONIC_TIMER_H_ #define MONOTONIC_TIMER_H_ #define NANOS_PER_SECF 1000000000.0 #define NANOS_PER_USECF 1000.0 #define NANOS_PER_USEC 1000 #define USECS_PER_SEC 1000000 // Returns seconds since some unspecified start time (guaranteed to be // monotonically increasing). double monotonic_time(); double monotonic_time_us(); #endif // MONOTONIC_TIMER_H_ ================================================ FILE: src/lib/pflush.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include "pflush.h" #include typedef uint64_t hrtime_t; #if defined(__i386__) static inline unsigned long long asm_rdtsc(void) { unsigned long long int x; __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); return x; } static inline unsigned long long asm_rdtscp(void) { unsigned hi, lo; __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"ecx"); return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); } #elif defined(__x86_64__) static inline unsigned long long asm_rdtsc(void) { unsigned hi, lo; __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); } static inline unsigned long long asm_rdtscp(void) { unsigned hi, lo; __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"rcx"); return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); } #else #error "What architecture is this???" 
#endif /* Flush cacheline */ #define asm_clflush(addr) \ ({ \ __asm__ __volatile__ ("clflush %0" : : "m"(*addr)); \ }) /* Memory fence */ #define asm_mfence() \ ({ \ PM_FENCE(); \ __asm__ __volatile__ ("mfence"); \ }) static int global_cpu_speed_mhz = 0; static int global_write_latency_ns = 0; void init_pflush(int cpu_speed_mhz, int write_latency_ns) { global_cpu_speed_mhz = cpu_speed_mhz; global_write_latency_ns = write_latency_ns; } inline hrtime_t cycles_to_ns(int cpu_speed_mhz, hrtime_t cycles) { return (cycles*1000/cpu_speed_mhz); } inline hrtime_t ns_to_cycles(int cpu_speed_mhz, hrtime_t ns) { return (ns*cpu_speed_mhz/1000); } static inline void emulate_latency_ns(int ns) { hrtime_t cycles; hrtime_t start; hrtime_t stop; start = asm_rdtsc(); cycles = ns_to_cycles(global_cpu_speed_mhz, ns); do { /* RDTSC doesn't necessarily wait for previous instructions to complete * so a serializing instruction is usually used to ensure previous * instructions have completed. However, in our case this is a desirable * property since we want to overlap the latency we emulate with the * actual latency of the emulated instruction. */ stop = asm_rdtsc(); } while (stop - start < cycles); } void pflush(uint64_t *addr) { if (global_write_latency_ns == 0) { return; } /* Measure the latency of a clflush and add an additional delay to * meet the latency to write to NVM */ hrtime_t start; hrtime_t stop; start = asm_rdtscp(); asm_clflush(addr); stop = asm_rdtscp(); int to_insert_ns = global_write_latency_ns - cycles_to_ns(global_cpu_speed_mhz, stop-start); if (to_insert_ns <= 0) { return; } emulate_latency_ns(to_insert_ns); } ================================================ FILE: src/lib/pflush.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __PFLUSH_H #define __PFLUSH_H /** * \file * * \page pflush_api Persistent Memory API * * Method to be used by client to inject a write latency. */ #include #ifdef __cplusplus extern "C" { #endif void init_pflush(int cpu_speed_mhz, int write_latency_ns); /** * \brief Flush the cacheline containing address addr. */ void pflush(uint64_t *addr); #ifdef __cplusplus } #endif #endif /* __PFLUSH_H */ ================================================ FILE: src/lib/pmalloc.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include "topology.h" #include "pmalloc.h" #include "thread.h" #include "debug.h" // pmalloc should be implemented as a separate library // FIXME: pmalloc currently uses numa_alloc_onnode() which is slower than regular malloc. // Consider layering another malloc on top of a emulated nvram void* pmalloc(size_t size) { thread_t* thread = thread_self(); if (thread == NULL) { // FIXME: JVM for instance create threads using a mechanism not traced by this emulator // for now we make sure the current thread is registered right when it makes the // first explicit NVM allocation. A better solution is to trace the thread creation // done by JVM. register_self(); thread = thread_self(); } if (thread) { return numa_alloc_onnode(size, thread->virtual_node->nvram_node->node_id); } else { DBG_LOG(ERROR, "pmalloc called with NULL thread\n"); } return NULL; } void *prealloc(void *old_addr, size_t old_size, size_t new_size) { return numa_realloc(old_addr, old_size, new_size); } void pfree(void* start, size_t size) { numa_free(start, size); } ================================================ FILE: src/lib/pmalloc.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __PMALLOC_H #define __PMALLOC_H /** * \file * * \page pmalloc_api Persistent Memory API * * Methods to be used by clients to allocate and free emulated NVRAM. */ #include #ifdef __cplusplus extern "C" { #endif void *pmalloc(size_t size); void *prealloc(void *old_addr, size_t old_size, size_t new_size); void pfree(void *start, size_t size); #ifdef __cplusplus } #endif #endif /* __PMALLOC_H */ ================================================ FILE: src/lib/process_rank.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ /* * process_rank.c * * Created on: Jun 16, 2015 * Author: root */ #include #include "model.h" #include "error.h" #define EMUL_LOCAL_PROCESSES_VAR "EMUL_LOCAL_PROCESSES" #define EMUL_LOCK_FILE "/tmp/emul_lock_file" #define EMUL_PROCESS_LOCAL_RANK_FILE "/tmp/emul_process_local_rank" #define LOCKED_WAIT_US 1000 #define MAX_LOCKED_RETRIES 50 extern latency_model_t latency_model; int set_process_local_rank() { FILE *flock = NULL; FILE *fcounter = NULL; int expired = 0; int process_id = 0; char *processes; int ret = E_SUCCESS; #ifndef NDEBUG char hname[64]; #endif processes = getenv(EMUL_LOCAL_PROCESSES_VAR); if (!processes) { DBG_LOG(WARNING, "No %s variable set, skipping rank setting\n", EMUL_LOCAL_PROCESSES_VAR); return E_SUCCESS; } else { if (sscanf(processes, "%d", &latency_model.max_local_processe_ranks) != 1) { DBG_LOG(WARNING, "Ignoring EMUL_PROCESSES_PER_SYSTEM variable with invalid value '%s'\n", processes); return E_SUCCESS; } } if (latency_model.max_local_processe_ranks < 2) { DBG_LOG(WARNING, "EMUL_PROCESSES_PER_SYSTEM value is %d, skipping rank setting\n", latency_model.max_local_processe_ranks); return E_SUCCESS; } DBG_LOG(DEBUG, "setting process local rank for %d local processes\n", latency_model.max_local_processe_ranks); while (expired < MAX_LOCKED_RETRIES) { // open lock file on exclusive mode flock = fopen(EMUL_LOCK_FILE, "wx"); if (flock == NULL) { // DBG_LOG(DEBUG, "failed to create lock file\n"); usleep(LOCKED_WAIT_US); expired++; } if (flock) break; } if (expired >= MAX_LOCKED_RETRIES) { DBG_LOG(ERROR, "failed to set process local rank\n"); return E_ERROR; } // lock acquired, read process counter file if (access(EMUL_PROCESS_LOCAL_RANK_FILE, R_OK | W_OK) < 0) { // rank file does not exist, create it and write "1" for next process // this process rank id is 1 process_id = 1; fcounter = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, "w"); fwrite(&process_id, sizeof(int), 1, 
fcounter); fclose(fcounter); } else { // rank file exists, read the current rank max value and use it as this process local // rank id and increment the value in the rank file for the next process fcounter = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, "r+"); if (fread(&process_id, sizeof(int), 1, fcounter) == 0) { abort(); } DBG_LOG(DEBUG, "read from file current max rank %d\n", process_id); latency_model.process_local_rank = process_id; process_id++; if (process_id >= latency_model.max_local_processe_ranks) { DBG_LOG(ERROR, "process rank %d exceeded limit of %d max emulated processes\n", process_id, latency_model.max_local_processe_ranks); fclose(fcounter); ret = E_ERROR; } else { DBG_LOG(DEBUG, "write to file new max rank %d\n", process_id); rewind(fcounter); fwrite(&process_id, sizeof(int), 1, fcounter); fclose(fcounter); } } // close and delete lock file fclose(flock); remove(EMUL_LOCK_FILE); #ifndef NDEBUG gethostname(hname, sizeof(hname)); DBG_LOG(DEBUG, "process local rank is %d on system %s\n", latency_model.process_local_rank, hname); #endif return ret; } int unset_process_local_rank() { FILE *flock = NULL; FILE *fcounter = NULL; int expired = 0; int process_id; if (latency_model.max_local_processe_ranks < 2) { return E_SUCCESS; } DBG_LOG(DEBUG, "Unsetting process local rank\n"); while (expired < MAX_LOCKED_RETRIES) { // open lock file on Exclusive mode flock = fopen(EMUL_LOCK_FILE, "wx"); if (flock == NULL) { // DBG_LOG(DEBUG, "failed to create lock file\n"); usleep(LOCKED_WAIT_US); expired++; } if (flock) break; } if (expired >= MAX_LOCKED_RETRIES) { DBG_LOG(ERROR, "failed to unset process local rank\n"); return E_ERROR; } // lock acquired, read process counter file if (access(EMUL_PROCESS_LOCAL_RANK_FILE, R_OK | W_OK) == 0) { // if rank file does not exist, nothing to be done // file exists, read the current value and decrement it fcounter = fopen(EMUL_PROCESS_LOCAL_RANK_FILE, "r+"); if (fread(&process_id, sizeof(int), 1, fcounter) == 0) { abort(); } 
DBG_LOG(DEBUG, "Exiting process and reading current rank max %d\n", process_id); if (process_id > 0) process_id--; { char hname[64]; gethostname(hname, sizeof(hname)); DBG_LOG(DEBUG, "Exiting process and writing new rank max %d on %s\n", process_id, hname); } rewind(fcounter); fwrite(&process_id, sizeof(int), 1, fcounter); fclose(fcounter); } // close and delete lock file fclose(flock); remove(EMUL_LOCK_FILE); return E_SUCCESS; } ================================================ FILE: src/lib/stat.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
***************************************************************************/ #include #include #include #include #include "utlist.h" #include "stat.h" #include "thread.h" #include "interpose.h" #include "model.h" thread_manager_t* get_thread_manager(); hrtime_t cycles_to_us(int cpu_speed_mhz, hrtime_t cycles); #ifdef USE_STATISTICS void stats_set_init_time(double init_time_us) { thread_manager_t* thread_manager = get_thread_manager(); __lib_pthread_mutex_lock(&thread_manager->mutex); thread_manager->stats.init_time_us = init_time_us; __lib_pthread_mutex_unlock(&thread_manager->mutex); } void stats_enable(config_t *cfg) { thread_manager_t* thread_manager = get_thread_manager(); __cconfig_lookup_bool(cfg, "statistics.enable", &thread_manager->stats.enabled); if (__cconfig_lookup_string(cfg, "statistics.file", &thread_manager->stats.output_file) == CONFIG_FALSE) { __lib_pthread_mutex_lock(&thread_manager->mutex); thread_manager->stats.output_file = NULL; __lib_pthread_mutex_unlock(&thread_manager->mutex); } } static char *get_current_time() { time_t curtime; char *str_time; time(&curtime); str_time = ctime(&curtime); str_time[strlen(str_time) - 1] = 0; return str_time; } static inline hrtime_t ns_to_cycles(int cpu_speed_mhz, int ns) { return (cpu_speed_mhz * ns) / 1000; } extern __thread int tls_hw_local_latency; extern __thread int tls_hw_remote_latency; static void show_thread_stats(thread_t *thread, FILE *out_file) { uint64_t fixed_value; uint64_t cycles; fprintf(out_file, "\tThread id [%d]\n", thread->tid); fprintf(out_file, "\t\t: cpu id: %d\n", thread->cpu_id); fprintf(out_file, "\t\t: spawn timestamp: %lu\n", thread->stats.register_timestamp); fprintf(out_file, "\t\t: termination timestamp: %lu\n", thread->stats.unregister_timestamp); fixed_value = thread->stats.unregister_timestamp > 0 ? 
(thread->stats.unregister_timestamp - thread->stats.register_timestamp) : 0; fprintf(out_file, "\t\t: execution time: %lu usecs\n", fixed_value); fprintf(out_file, "\t\t: stall cycles: %lu\n", thread->stats.stall_cycles); if (thread->virtual_node->dram_node != thread->virtual_node->nvram_node && latency_model.pmc_remote_dram) { cycles = ns_to_cycles(thread->cpu_speed_mhz, tls_hw_remote_latency); fixed_value = cycles ? thread->stats.stall_cycles / cycles : 0; } else { cycles = ns_to_cycles(thread->cpu_speed_mhz, tls_hw_local_latency); fixed_value = cycles ? thread->stats.stall_cycles / cycles : 0; } fprintf(out_file, "\t\t: NVM accesses: %lu\n", fixed_value); fprintf(out_file, "\t\t: latency calculation overhead cycles: %lu\n", thread->stats.overhead_cycles); fprintf(out_file, "\t\t: injected delay cycles: %lu\n", thread->stats.delay_cycles); if (thread->cpu_speed_mhz) { fprintf(out_file, "\t\t: injected delay in usec: %lu\n", cycles_to_us(thread->cpu_speed_mhz, thread->stats.delay_cycles)); } fprintf(out_file, "\t\t: longest epoch duration: %lu usec\n", thread->stats.longest_epoch_duration_us); fixed_value = (thread->stats.shortest_epoch_duration_us == UINT64_MAX) ? 0 : thread->stats.shortest_epoch_duration_us; fprintf(out_file, "\t\t: shortest epoch duration: %lu usec\n", fixed_value); fixed_value = thread->stats.epochs ? 
(thread->stats.overall_epoch_duration_us / thread->stats.epochs) : thread->stats.overall_epoch_duration_us; fprintf(out_file, "\t\t: average epoch duration: %lu usec\n", fixed_value); fprintf(out_file, "\t\t: number of epochs: %lu\n", thread->stats.epochs); fprintf(out_file, "\t\t: epochs which didn't reach min duration: %lu\n", thread->stats.min_epoch_not_reached); fprintf(out_file, "\t\t: static epochs requested: %lu\n", thread->stats.signals_sent); } void stats_report() { thread_t *thread; FILE *out_file; uint64_t running_threads = 0; thread_manager_t* thread_manager = get_thread_manager(); uint64_t terminated_threads; if (!thread_manager) return; if (!thread_manager->stats.enabled) return; if (thread_manager->stats.output_file) { out_file = fopen(thread_manager->stats.output_file, "a"); if (!out_file) { fprintf(stderr, "Failed to open statistics file for writing: %s\n", thread_manager->stats.output_file); return; } } else { out_file = stdout; } __lib_pthread_mutex_lock(&thread_manager->mutex); LL_FOREACH(thread_manager->thread_list, thread) { running_threads++; } __lib_pthread_mutex_unlock(&thread_manager->mutex); fprintf(out_file, "\n\n===== STATISTICS (%s) =====\n\n", get_current_time()); if (!latency_model.inject_delay) { fprintf(out_file, "WARNING: delay injection is disabled\n"); } fprintf(out_file, "PID: %d\n", getpid()); fprintf(out_file, "Initialization duration: %lu usec\n", thread_manager->stats.init_time_us); fprintf(out_file, "Running threads: %lu\n", running_threads); terminated_threads = thread_manager->stats.n_threads > 0 ? 
(thread_manager->stats.n_threads - running_threads) : 0; fprintf(out_file, "Terminated threads: %lu\n", terminated_threads); fprintf(out_file, "\n"); fprintf(out_file, "== Running threads == \n"); __lib_pthread_mutex_lock(&thread_manager->mutex); LL_FOREACH(thread_manager->thread_list, thread) { show_thread_stats(thread, out_file); } __lib_pthread_mutex_unlock(&thread_manager->mutex); fprintf(out_file, "\n== Terminated threads == \n"); __lib_pthread_mutex_lock(&thread_manager->mutex); LL_FOREACH(thread_manager->stats.thread_list, thread) { show_thread_stats(thread, out_file); } __lib_pthread_mutex_unlock(&thread_manager->mutex); if (out_file != stdout) { fclose(out_file); } } #endif double sum(double array[], int n) { int i; double s = 0; for (i=0; i #include #include "config.h" #ifdef USE_STATISTICS struct thread_s; typedef struct { int enabled; struct thread_s* thread_list; uint64_t n_threads; uint64_t init_time_us; char *output_file; } stats_t; typedef struct { uint64_t stall_cycles; uint64_t overhead_cycles; uint64_t delay_cycles; uint64_t signals_sent; uint64_t epochs; double last_epoch_timestamp; uint64_t shortest_epoch_duration_us; uint64_t longest_epoch_duration_us; uint64_t overall_epoch_duration_us; uint64_t min_epoch_not_reached; uint64_t register_timestamp; uint64_t unregister_timestamp; } thread_stats_t; void stats_enable(config_t *cfg); void stats_set_init_time(double init_time_us); void stats_report(); #endif double sum(double array[], int n); double sumxy(double x[], double y[], int n); double avg(double array[], int n); double slope(double x[], double y[], int n); #endif /* __STATISTICS_H */ ================================================ FILE: src/lib/thread.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include #include #include #include #include "cpu/cpu.h" #include "utlist.h" #include "error.h" #include "interpose.h" #include "model.h" #include "thread.h" #include "topology.h" #include "monotonic_timer.h" static thread_manager_t* thread_manager = NULL; __thread thread_t* tls_thread = NULL; extern inline hrtime_t hrtime_cycles(void); // assign a virtual/physical node using a round-robin policy static void rr_next_cpu_id(thread_manager_t* thread_manager, int* next_virtual_node_idp, int* next_cpu_idp) { int next_virtual_node_id; virtual_node_t* virtual_node; physical_node_t* physical_node; virtual_topology_t* virtual_topology = thread_manager->virtual_topology; *next_virtual_node_idp = thread_manager->next_virtual_node_id; *next_cpu_idp = thread_manager->next_cpu_id; // advance to the next virtual node and cpu id next_virtual_node_id = thread_manager->next_virtual_node_id; virtual_node = &virtual_topology->virtual_nodes[next_virtual_node_id]; physical_node = virtual_node->dram_node; // we run threads on the dram node if ((thread_manager->next_cpu_id = next_cpu(physical_node->cpu_bitmask, thread_manager->next_cpu_id + 1)) < 0) { next_virtual_node_id = (next_virtual_node_id + 1) % virtual_topology->num_virtual_nodes; 
virtual_node = &virtual_topology->virtual_nodes[next_virtual_node_id]; physical_node = virtual_node->dram_node; thread_manager->next_cpu_id = first_cpu(physical_node->cpu_bitmask); thread_manager->next_virtual_node_id = next_virtual_node_id; } } void rr_set_next_cpu_based_on_rank(int rank, int max_rank) { int cpu_id; int virtual_node_id; int i; // set the next CPU id based on this process rank id thread_manager->next_virtual_node_id = 0; thread_manager->next_cpu_id = 0; for (i = 0; i <= rank; ++i) { rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id); } DBG_LOG(DEBUG, "no partitioning of CPUs, set next CPU " "to vnode %d and cpu %d\n", virtual_node_id, cpu_id); } void partition_cpus_based_on_rank(int rank, int max_rank, int num_cpus, virtual_topology_t* virtual_topology) { // assumed the number of cpus/2 is greater or equal to max_rank // this partition is num_cpus/max_rank int part_size = num_cpus/max_rank; int start = rank * part_size; int end = start + part_size -1; int i; int cpu_id = 0; int virtual_node_id = 0; virtual_node_t* virtual_node; physical_node_t* physical_node; DBG_LOG(DEBUG, "partitioning CPUS, this process has CPUs from %d and %d\n", start, end); thread_manager->next_virtual_node_id = 0; thread_manager->next_cpu_id = 0; for (i = 0; i < num_cpus; ++i) { rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id); if (i < start || i > end) { // this CPU is outside the partition of this process // disable this CPU virtual_node = &virtual_topology->virtual_nodes[virtual_node_id]; physical_node = virtual_node->dram_node; DBG_LOG(DEBUG, "disabling CPU %d\n", cpu_id); if (numa_bitmask_isbitset(physical_node->cpu_bitmask, cpu_id)) { numa_bitmask_clearbit(physical_node->cpu_bitmask, cpu_id); } } } } int bind_thread_on_cpu(thread_manager_t* thread_manager, thread_t* thread, int virtual_node_id, int cpu_id) { thread->virtual_node = &thread_manager->virtual_topology->virtual_nodes[virtual_node_id]; DBG_LOG(INFO, "Binding thread tid [%d] pthread: 0x%lx on 
processor %d\n", thread->tid, thread->pthread, cpu_id); struct bitmask* cpubind = numa_allocate_cpumask(); numa_bitmask_setbit(cpubind, cpu_id); if (numa_sched_setaffinity(thread->tid, cpubind) != 0) { DBG_LOG(ERROR, "Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, cpu_id); numa_bitmask_free(cpubind); return E_ERROR; } numa_bitmask_free(cpubind); return E_SUCCESS; } int bind_thread_on_mem(thread_manager_t* thread_manager, thread_t* thread, int virtual_node_id, int cpu_id) { int physical_node_id; struct bitmask* membind = numa_allocate_nodemask(); physical_node_id = thread_manager->virtual_topology->virtual_nodes[virtual_node_id].dram_node->node_id; numa_bitmask_setbit(membind, physical_node_id); numa_set_membind(membind); numa_free_nodemask(membind); return E_SUCCESS; } thread_t* thread_self() { return tls_thread; } void thread_interrupt_handler(int signum) { DBG_LOG(DEBUG, "Handling interrupt thread [%d] pthread: 0x%lx\n", thread_self()->tid, thread_self()->pthread); create_latency_epoch(); } #ifdef PAPI_SUPPORT static int setup_events_thread_self(thread_t *thread, const char **native_events) { int i; // create event set for this thread if (pmc_create_event_set_local_thread() != 0) { return -1; } // register events for this thread for (i = 0; i < MAX_NUM_EVENTS; ++i) { if (native_events[i]) { DBG_LOG(INFO, "registering event %s, thread id [%d]\n", native_events[i], thread->tid); if (pmc_register_event_local_thread(native_events[i]) != 0) { return E_ERROR; } } } // start event counting for this thread if (pmc_events_start_local_thread() != 0) { return E_ERROR; } pmc_register_thread(); return 0; } #endif int register_thread(thread_manager_t* thread_manager, pthread_t pthread, pid_t tid) { int ret = 0; int cpu_id; int virtual_node_id; thread_t* thread = malloc(sizeof(thread_t)); if (thread_manager == NULL) { // this is possible if both BW and latency modeling are enabled and the BW model is not yet created. 
// the BW modeling will spawn threads which will attempt to register with the thread manager if the // latency modeling is enabled. However the thread manager is instantiated later. //goto error; return E_SUCCESS; } memset(thread, 0, sizeof(thread_t)); thread->pthread = pthread; thread->tid = tid; thread->thread_manager = thread_manager; #ifdef USE_STATISTICS if (thread_manager->stats.enabled) { thread->stats.last_epoch_timestamp = monotonic_time_us(); thread->stats.shortest_epoch_duration_us = UINT64_MAX; } #endif /* install thread interrupt handler as the signal handler for SIGUSR1. */ struct sigaction sa; memset (&sa, 0, sizeof(sa)); sa.sa_handler = &thread_interrupt_handler; sa.sa_flags = SA_RESTART; sigaction (SIGUSR1, &sa, NULL); // bind the thread on a cpu and memory node and // link the thread to the list of threads assert(__lib_pthread_mutex_lock); __lib_pthread_mutex_lock(&thread_manager->mutex); rr_next_cpu_id(thread_manager, &virtual_node_id, &cpu_id); if ((ret = bind_thread_on_cpu(thread_manager, thread, virtual_node_id, cpu_id)) != E_SUCCESS) { __lib_pthread_mutex_unlock(&thread_manager->mutex); DBG_LOG(ERROR, "thread id [%d] failed to bind to CPU\n", thread->tid); goto error; } if ((ret = bind_thread_on_mem(thread_manager, thread, virtual_node_id, cpu_id)) != E_SUCCESS) { __lib_pthread_mutex_unlock(&thread_manager->mutex); DBG_LOG(ERROR, "thread id [%d] failed to bind to Memory\n", thread->tid); goto error; } thread->cpu_id = cpu_id; thread->cpu_speed_mhz = cpu_speed_mhz(); #ifdef PAPI_SUPPORT cpu_model_t *cpu = thread_manager->virtual_topology->virtual_nodes[virtual_node_id].dram_node->cpu_model; if (setup_events_thread_self(thread, cpu->pmc_events.native_events) != 0) { ret = E_ERROR; __lib_pthread_mutex_unlock(&thread_manager->mutex); goto error; } #endif LL_APPEND(thread_manager->thread_list, thread); #ifdef USE_STATISTICS if (thread_manager->stats.enabled) { thread_manager->stats.n_threads++; thread->stats.register_timestamp = 
monotonic_time_us(); } #endif __lib_pthread_mutex_unlock(&thread_manager->mutex); init_thread_latency_model(thread); tls_thread = thread; return E_SUCCESS; error: free(thread); DBG_LOG(ERROR, "thread id [%d] failed to register with Monitor Thread\n", thread->tid); return ret; } int unregister_thread(thread_manager_t* thread_manager, thread_t * thread) { __lib_pthread_mutex_lock(&thread_manager->mutex); if (thread_manager == NULL) { return E_SUCCESS; } LL_DELETE(thread_manager->thread_list, thread); #ifdef USE_STATISTICS if (thread_manager->stats.enabled) { thread->stats.unregister_timestamp = monotonic_time_us(); LL_APPEND(thread_manager->stats.thread_list, thread); } #endif __lib_pthread_mutex_unlock(&thread_manager->mutex); #ifdef PAPI_SUPPORT pmc_events_stop_local_thread(); pmc_destroy_event_set_local_thread(); pmc_unregister_thread(); #endif return E_SUCCESS; } int register_self() { int ret = E_SUCCESS; if (thread_self() == NULL) { pid_t tid = (pid_t) syscall(SYS_gettid); DBG_LOG(INFO, "Registering thread tid [%d]\n", tid); ret = register_thread(thread_manager, pthread_self(), tid); } return ret; } int unregister_self() { if (tls_thread) { unregister_thread(thread_manager, tls_thread); #ifdef USE_STATISTICS if (!thread_manager->stats.enabled) { // statistics makes use of the thread descriptor free(tls_thread); } #else free(tls_thread); #endif tls_thread = NULL; } return E_SUCCESS; } static int reached_max_epoch_duration(thread_t* thread); void interrupt_threads(thread_manager_t* manager) { thread_t* thread; assert(__lib_pthread_mutex_lock); __lib_pthread_mutex_lock(&manager->mutex); LL_FOREACH(manager->thread_list, thread) { assert(thread); if (thread->signaled == 0 && reached_max_epoch_duration(thread)) { DBG_LOG(DEBUG, "interrupting thread [%d]\n", thread->tid); #ifdef USE_STATISTICS if (manager->stats.enabled) { thread->stats.signals_sent++; } #endif // this flag must be set before the signal is sent to make sure // there will be no race condition 
thread->signaled = 1; pthread_kill(thread->pthread, SIGUSR1); } } assert(__lib_pthread_mutex_unlock); __lib_pthread_mutex_unlock(&manager->mutex); } void* monitor_thread(void* arg) { thread_manager_t* manager = (thread_manager_t*) arg; struct timespec epoch_duration; // time_t secs = thread_manager->max_epoch_duration_us / USECS_PER_SEC; // long nanosecs = (thread_manager->max_epoch_duration_us % USECS_PER_SEC) * NANOS_PER_USEC; epoch_duration.tv_sec = 0; epoch_duration.tv_nsec = MIN_EPOCH_DURATION_US * 1000; while(1) { nanosleep(&epoch_duration, NULL); interrupt_threads(manager); } return NULL; } static void set_epoch_duration(config_t* cfg, const char *config_str, int *epoch_us, int default_epoch_us) { if (__cconfig_lookup_int(cfg, config_str, epoch_us) != CONFIG_TRUE) { *epoch_us = default_epoch_us; } else { if (*epoch_us > MAX_EPOCH_DURATION_US || *epoch_us < MIN_EPOCH_DURATION_US) { DBG_LOG(WARNING, "%s is out of supported bounds [%i, %i], setting it to %i\n", config_str, MIN_EPOCH_DURATION_US, MAX_EPOCH_DURATION_US, default_epoch_us); *epoch_us = default_epoch_us; } } } int init_thread_manager(config_t* cfg, virtual_topology_t* virtual_topology) { int ret; pthread_t monitor_tid; thread_manager_t* mgr; virtual_node_t* virtual_node; physical_node_t* physical_node; if (!(mgr = malloc(sizeof(thread_manager_t)))) { ret = E_ERROR; goto done; } memset(mgr, 0, sizeof(thread_manager_t)); mgr->thread_list = NULL; mgr->virtual_topology = virtual_topology; mgr->next_virtual_node_id = 0; set_epoch_duration(cfg, "latency.max_epoch_duration_us", &mgr->max_epoch_duration_us, MAX_EPOCH_DURATION_US); set_epoch_duration(cfg, "latency.min_epoch_duration_us", &mgr->min_epoch_duration_us, MIN_EPOCH_DURATION_US); if (mgr->min_epoch_duration_us > mgr->max_epoch_duration_us) { DBG_LOG(WARNING, "latency.min_epoch_duration_us is greater than latency.max_epoch_duration_us, setting it to %i\n", MIN_EPOCH_DURATION_US); mgr->min_epoch_duration_us = MIN_EPOCH_DURATION_US; } virtual_node = 
&virtual_topology->virtual_nodes[mgr->next_virtual_node_id]; physical_node = virtual_node->dram_node; mgr->next_cpu_id = first_cpu(physical_node->cpu_bitmask); pthread_mutex_init(&mgr->mutex, NULL); // fire a monitoring thread that periodically interrupts threads assert(__lib_pthread_create); assert(__lib_pthread_detach); __lib_pthread_create(&monitor_tid, NULL, monitor_thread, (void*) mgr); __lib_pthread_detach(monitor_tid); thread_manager = mgr; return E_SUCCESS; done: return ret; } int reached_min_epoch_duration(thread_t* thread) { double current_time; uint64_t diff_us; int result = 0; if (thread == NULL) { // FIXME: JVM for instance create threads using a mechanism not traced by this emulator // for now we make sure the current thread is registered right when it makes the // first explicit NVM allocation or when interposed functions are called. A // better solution is to trace the thread creation done by JVM. if (register_self() != E_SUCCESS) // if the thread could not be registered, exit this function return 0; thread = thread_self(); } current_time = monotonic_time_us(); #ifdef USE_STATISTICS diff_us = (uint64_t) (current_time - thread->stats.last_epoch_timestamp); #else diff_us = (uint64_t) (current_time - thread->last_epoch_timestamp); #endif DBG_LOG(DEBUG, "thread id [%d] last epoch was %lu usec ago\n", thread->tid, diff_us); if(diff_us >= thread_manager->min_epoch_duration_us) { DBG_LOG(DEBUG, "thread id [%d] reached min epoch duration (%i usec)\n", thread->tid, thread_manager->min_epoch_duration_us); result = 1; } #ifdef USE_STATISTICS if (thread_manager->stats.enabled && ! 
result) { thread->stats.min_epoch_not_reached++; } #endif return result; } static int reached_max_epoch_duration(thread_t* thread) { double current_time; uint64_t diff_us; int result = 0; // it compares this time with the last_epoch_timestamp, which is set by another thread // so, this time must be based on a system time and not on CPU cycles/time registers current_time = monotonic_time_us(); #ifdef USE_STATISTICS diff_us = (uint64_t) (current_time - thread->stats.last_epoch_timestamp); #else diff_us = (uint64_t) (current_time - thread->last_epoch_timestamp); #endif DBG_LOG(DEBUG, "thread id [%d] last epoch was %lu usec ago\n", thread->tid, diff_us); if(diff_us >= thread_manager->max_epoch_duration_us) { DBG_LOG(DEBUG, "thread id [%d] reached max epoch duration (%i usec)\n", thread->tid, thread_manager->max_epoch_duration_us); result = 1; } return result; } void block_new_epoch() { sigset_t set; sigemptyset(&set); sigaddset(&set, SIGUSR1); pthread_sigmask(SIG_BLOCK, &set, NULL); } void unblock_new_epoch() { sigset_t set; sigemptyset(&set); sigaddset(&set, SIGUSR1); pthread_sigmask(SIG_UNBLOCK, &set, NULL); } thread_manager_t* get_thread_manager() { return thread_manager; } ================================================ FILE: src/lib/thread.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __THREAD_H #define __THREAD_H #include #include #include #include #include #include "topology.h" #include "cpu/cpu.h" #include "stat.h" struct thread_manager_s; // opaque typedef uint64_t hrtime_t; // TODO: Used by memlat benchmark, should be disabled on a release version #define MEMLAT_SUPPORT typedef struct thread_s { struct virtual_node_s* virtual_node; pthread_t pthread; pid_t tid; int cpu_id; // the processor the thread is bound on int cpu_speed_mhz; struct thread_manager_s* thread_manager; struct thread_s* next; int signaled; #ifdef MEMLAT_SUPPORT uint64_t stall_cycles; #endif #ifdef USE_STATISTICS thread_stats_t stats; #else double last_epoch_timestamp; #endif } thread_t; typedef struct thread_manager_s { pthread_mutex_t mutex; thread_t* thread_list; int max_epoch_duration_us; // maximum epoch duration in microseconds int min_epoch_duration_us; // minimum epoch duration in microseconds int next_virtual_node_id; // used by the round-robin policy -- next virtual node to run on int next_cpu_id; // used by the round-robin policy -- next cpu to run on struct virtual_topology_s* virtual_topology; #ifdef USE_STATISTICS stats_t stats; #endif } thread_manager_t; int init_thread_manager(config_t* cfg, struct virtual_topology_s* virtual_topology); int register_self(); int unregister_self(); thread_t* thread_self(); int reached_min_epoch_duration(thread_t* thread); void block_new_epoch(); void unblock_new_epoch(); #endif /* __THREAD_H */ ================================================ FILE: src/lib/topology.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise 
Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ /** * \file * * Constructs a virtual topology */ #include #include #include #include #include #include #include #include "cpu/cpu.h" #include "error.h" #include "measure.h" #include "topology.h" #include "model.h" #define MAX_NUM_MC_PCI_BUS 16 extern latency_model_t latency_model; void rr_set_next_cpu_based_on_rank(int rank, int max_rank); void partition_cpus_based_on_rank(int rank, int max_rank, int num_cpus, virtual_topology_t* virtual_topology); int select_cpus_based_on_local_rank(virtual_topology_t* virtual_topology) { int num_cpus = 0; int vnode; virtual_node_t* virtual_node; physical_node_t* physical_node; int n_procs = latency_model.max_local_processe_ranks; int rank = latency_model.process_local_rank; if (rank >= n_procs) { DBG_LOG(ERROR, "process rank %d exceeded limit of %d max emulated processes\n", rank, n_procs); return E_ERROR; } for (vnode = 0; vnode < virtual_topology->num_virtual_nodes; ++vnode) { virtual_node = &virtual_topology->virtual_nodes[vnode]; physical_node = virtual_node->dram_node; num_cpus += physical_node->num_cpus; } DBG_LOG(DEBUG, "number of cpus is %d\n", num_cpus); if (n_procs > (num_cpus/2)) { // do not partition CPUs, but bind this process to the CPU // indicated by our 
rank, after that, a new thread will be // bound to next available CPU on a round robin policy from // the max rank rr_set_next_cpu_based_on_rank(rank, n_procs); } else { // partition the CPUs to each rank // some CPUs may end up idle/without bound processes, if n_procs is not // multiple of 2 // TODO: warn or avoid idle CPUs partition_cpus_based_on_rank(rank, n_procs, num_cpus, virtual_topology); } return E_SUCCESS; } /** * \brief Returns a list of memory-controller pci buses */ int get_mc_pci_bus_list(pci_regs_t *bus_id_list[], int max_list_size, int* dev_countp) { FILE* fp; char buf[2048]; int bus_id, dev_id, funct; int last_bus_id = -1; int channel = 0; char dontcare[512]; int dev_count = 0; fp = popen("lspci", "r"); if (fp == NULL) { return E_ERROR; } for (dev_count=0; fgets(buf, sizeof(buf)-1, fp) != NULL; ) { if (strstr(buf, "Thermal Control")) { if (sscanf(buf, "%x:%x.%x %s", &bus_id, &dev_id, &funct, dontcare) == 4) { if (bus_id != last_bus_id) { ++dev_count; last_bus_id = bus_id; if (dev_count > max_list_size) { pclose(fp); return E_ERROR; } channel = 0; bus_id_list[dev_count-1] = (pci_regs_t*)malloc(sizeof(pci_regs_t)); } bus_id_list[dev_count-1]->addr[channel].bus_id = bus_id; bus_id_list[dev_count-1]->addr[channel].dev_id = dev_id; bus_id_list[dev_count-1]->addr[channel].funct = funct; ++channel; bus_id_list[dev_count-1]->channels = channel; } } } *dev_countp = dev_count; pclose(fp); return E_SUCCESS; } /** * \brief Discovers the physical memory-controller pci bus topology of the * machine, which includes the socket each memory controller is attached to * * To discover where a memory controller is connected to, we throttle the rest of * the memory controllers and measure local bandwidth of each node. 
The unthrottled * memory controller is attached to the node with the highest local bandwidth */ int discover_mc_pci_topology(cpu_model_t* cpu_model, physical_node_t* physical_nodes[], int num_physical_nodes) { pci_regs_t *regs_addr[16]; int dev_count; physical_node_t* local_node = NULL; int b, i; double max_local_rbw; double rbw; int count = 0; uint16_t throttle_reg_val; get_mc_pci_bus_list(regs_addr, MAX_NUM_MC_PCI_BUS, &dev_count); if (dev_count < num_physical_nodes) { // TODO: application is terminated on error only if in DEBUG mode DBG_LOG(WARNING, "The number of physical nodes is greater than the number of memory-controller pci buses.\n"); } for (b=0; bget_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, &throttle_reg_val); if (throttle_reg_val < 0x8fff) cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x8fff); } else { cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x800f); } } // measure local bandwidth of each node max_local_rbw = 0; for (i=0; inode_id, node_i->node_id); if (rbw > max_local_rbw) { max_local_rbw = rbw; local_node = node_i; } } if (local_node) { DBG_LOG(DEBUG, "setting node_id %d to bus %X\n", local_node->node_id, regs_addr[b]->addr[0].bus_id); local_node->mc_pci_regs = regs_addr[b]; if (++count == num_physical_nodes) break; } } for (i=0; iget_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, &throttle_reg_val); if (throttle_reg_val < 0x8fff) cpu_model->set_throttle_register(regs_addr[i], THROTTLE_DDR_ACT, 0x8fff); } return E_SUCCESS; } /** * \brief Loads the memory controller pci topology from a file */ static int load_mc_pci_topology(const char* path, physical_node_t* physical_nodes[], int num_physical_nodes) { FILE *fp; char *line = NULL; size_t len = 0; ssize_t read; int j; int bus_id, dev_id, funct; int node_id; int dev_count; pci_regs_t *regs = NULL; int channel = 0; int last_bus_id = -1; fp = fopen(path, "r"); if (fp == NULL) { return E_ERROR; } DBG_LOG(INFO, "Loading memory-controller pci topology 
from %s\n", path); for (dev_count = 0; (read = getline(&line, &len, fp)) != -1; ) { sscanf(line, "%d\t%x:%x.%x", &node_id, &bus_id, &dev_id, &funct); DBG_LOG(INFO, "node: %d, pci addr: %x:%x.%x\n", node_id, bus_id, dev_id, funct); if (bus_id != last_bus_id) { last_bus_id = bus_id; regs = (pci_regs_t*) malloc(sizeof(pci_regs_t)); channel = 0; dev_count++; for (j=0; jnode_id) { physical_nodes[j]->mc_pci_regs = regs; DBG_LOG(INFO, "node: %d, pci bus: 0x%x\n", physical_nodes[j]->node_id, bus_id); } } } regs->addr[channel].bus_id = bus_id; regs->addr[channel].dev_id = dev_id; regs->addr[channel].funct = funct; ++channel; regs->channels = channel; } free(line); if (dev_count < num_physical_nodes) { DBG_LOG(WARNING, "No complete memory-controller pci topology found in %s\n", path); } fclose(fp); return E_SUCCESS; } /** * \brief Saves the memory controller pci topology in a file for later reuse */ static int save_mc_pci_topology(const char* path, physical_node_t* physical_nodes[], int num_physical_nodes) { int i, j; FILE *fp; fp = fopen(path, "w"); if (fp == NULL) { return E_ERROR; } DBG_LOG(INFO, "Saving memory-controller pci topology into %s\n", path); for (i=0; imc_pci_regs; int node_id = physical_nodes[i]->node_id; for (j=0; regs != NULL && j < regs->channels; ++j) { DBG_LOG(INFO, "node: %d, pci addr: %x:%x.%x\n", node_id, regs->addr[j].bus_id, regs->addr[j].dev_id, regs->addr[j].funct); fprintf(fp, "%d\t%x:%x.%x\n", node_id, regs->addr[j].bus_id, regs->addr[j].dev_id, regs->addr[j].funct); } } fclose(fp); return E_SUCCESS; } int num_cpus(struct bitmask* bitmask) { int i,n; // if we had knowledge of the bitmask structure then we could // count the bits faster but bitmask seems to be an opaque structure for (i=0, n=0; i 1) { ret = select_cpus_based_on_local_rank(virtual_topology); } return ret; } /** * \brief Construct a virtual topology * * Constructs a NUMA virtual topology where two physical sockets are fused into a * single virtual node */ int 
init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp) { char* mc_pci_file; char* str; char* saveptr = NULL; char* token = "NULL"; int* physical_node_ids; physical_node_t** physical_nodes = NULL; int num_physical_nodes; int n, v, i, j, sibling_idx; int node_id; physical_node_t* node_i, *node_j, *sibling_node; int ret; int min_distance; int hyperthreading; struct bitmask* mem_nodes; virtual_topology_t* virtual_topology; if (__cconfig_lookup_string(cfg, "topology.physical_nodes", &str) == CONFIG_FALSE) { return E_ERROR; } DBG_LOG(DEBUG, "Possible NUMA nodes are %d\n", numa_num_possible_nodes()); DBG_LOG(DEBUG, "NUMA nodes allowed are %lu\n", numa_get_mems_allowed()->size); DBG_LOG(DEBUG, "NUMA configured CPUs are %d\n", numa_num_configured_cpus()); // parse the physical nodes string physical_node_ids = calloc(numa_num_possible_nodes(), sizeof(*physical_node_ids)); num_physical_nodes = 0; while ((token = strtok_r(str, ",", &saveptr))) { physical_node_ids[num_physical_nodes] = atoi(token); str = NULL; if (++num_physical_nodes > numa_num_possible_nodes()) { // we re being asked to run on more nodes than available free(physical_node_ids); ret = E_ERROR; goto done; } } if (!(physical_nodes = calloc(num_physical_nodes, sizeof(*physical_nodes)))) { DBG_LOG(ERROR, "Failed physical nodes allocation\n"); abort(); } // select those nodes we can run on (e.g. 
not constrained by any numactl) mem_nodes = numa_get_mems_allowed(); for (i=0, n=0; inode_id = node_id; physical_nodes[n]->cpu_bitmask = numa_allocate_cpumask(); physical_nodes[n]->cpu_model = cpu_model; numa_node_to_cpus(node_id, physical_nodes[n]->cpu_bitmask); __cconfig_lookup_bool(cfg, "topology.hyperthreading", &hyperthreading); if (hyperthreading) { physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask); } else { DBG_LOG(INFO, "Not using hyperthreading.\n"); // disable the upper half of the processors in the bitmask physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask) / 2; int fc = first_cpu(physical_nodes[n]->cpu_bitmask); for (j=fc+system_num_cpus()/2; jnum_cpus; j++) { if (numa_bitmask_isbitset(physical_nodes[n]->cpu_bitmask, j)) { numa_bitmask_clearbit(physical_nodes[n]->cpu_bitmask, j); } } } DBG_LOG(INFO, "%d CPUs on physical node %d\n", physical_nodes[n]->num_cpus, n); n++; } } free(physical_node_ids); num_physical_nodes = n; // If pci bus topology of each physical node is not provided then discover it. // The bus topology must be always known even if BW model is disabled. 
if (__cconfig_lookup_string(cfg, "topology.mc_pci", &mc_pci_file) == CONFIG_FALSE || (__cconfig_lookup_string(cfg, "topology.mc_pci", &mc_pci_file) == CONFIG_TRUE && load_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes) != E_SUCCESS)) { discover_mc_pci_topology(cpu_model, physical_nodes, num_physical_nodes); save_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes); DBG_LOG(INFO, "Topology MC PCI file saved, restart the process\n"); exit(0); } // form virtual nodes by grouping physical nodes that are close to each other virtual_topology = malloc(sizeof(*virtual_topology)); virtual_topology->num_virtual_nodes = num_physical_nodes / 2 + num_physical_nodes % 2; virtual_topology->virtual_nodes = calloc(virtual_topology->num_virtual_nodes, sizeof(*(virtual_topology->virtual_nodes))); DBG_LOG(INFO, "Number of physical nodes %d\n", num_physical_nodes); DBG_LOG(INFO, "Number of virtual nodes %d\n", virtual_topology->num_virtual_nodes); for (i=0, v=0; inode_id,node_j->node_id) < min_distance) { sibling_node = node_j; sibling_idx = j; } } if (sibling_node) { physical_nodes[i] = physical_nodes[sibling_idx] = NULL; virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v]; virtual_node->dram_node = node_i; virtual_node->nvram_node = sibling_node; virtual_node->dram_node->latency = measure_latency(cpu_model, virtual_node->dram_node->node_id, virtual_node->dram_node->node_id); virtual_node->nvram_node->latency = measure_latency(cpu_model, virtual_node->dram_node->node_id, virtual_node->nvram_node->node_id); virtual_node->node_id = v; DBG_LOG(INFO, "Fusing physical nodes %d %d into virtual node %d\n", node_i->node_id, sibling_node->node_id, virtual_node->node_id); v++; } } // any physical node that is not paired with another physical node is // formed into a virtual node on its own if (2*v < num_physical_nodes) { for (i=0; ivirtual_nodes[v]; virtual_node->dram_node = virtual_node->nvram_node = node_i; virtual_node->node_id = v; 
virtual_node->dram_node->latency = measure_latency(cpu_model, virtual_node->dram_node->node_id, virtual_node->dram_node->node_id); DBG_LOG(WARNING, "Forming physical node %d into virtual node %d without a sibling node.\n", node_i->node_id, virtual_node->node_id); } } *virtual_topologyp = virtual_topology; ret = E_SUCCESS; done: free(physical_nodes); return ret; } ================================================ FILE: src/lib/topology.h ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #ifndef __TOPOLOGY_H #define __TOPOLOGY_H #include #include "config.h" #include "cpu/cpu.h" #include "dev.h" /* DOXYGEN Documentation : */ /** \page virtual_topology Virtual topology The emulator constructs a topology of virtual nodes out of physical nodes (i.e., NUMA sockets) that represents the arrangement of processors, DRAM, and NVRAM of the virtual machine that the emulator emulates. Currently, the emulator supports a NUMA virtual topology where essentially two physical sockets are fused into a single virtual node. 
Each virtual node comprises the processors from one socket only (active socket), and DRAM from both two physical sockets. The DRAM attached to the active socket is used as the virtual node's locally attached DRAM and the DRAM of the other socket (passive) is used as the virtual node's locally attached NVRAM. This topology allows us to emulate a machine that has both DRAM and NVRAM but reduces the computation capacity of the machine to half. In the future we would like to support a topology that matches the shared NVRAM storage of The Machine. */ typedef struct { int node_id; cpu_model_t* cpu_model; pci_regs_t *mc_pci_regs; int num_cpus; // number of node's cpus struct bitmask* cpu_bitmask; // a bitmask of the node's CPUs // this is actual physical latency. the latency number though depends on // whether the node corresponds to a dram node or a nvram node. // if dram then latency is the measured local latency to dram. // if nvram then latency is the measured remote latency to the sibling nvram node int latency; } physical_node_t; typedef struct virtual_node_s { int node_id; physical_node_t* dram_node; physical_node_t* nvram_node; //cpu_model_t* cpu_model; } virtual_node_t; typedef struct virtual_topology_s { virtual_node_t* virtual_nodes; // pointer to an array of virtual nodes int num_virtual_nodes; } virtual_topology_t; int init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp); int system_num_cpus(); int first_cpu(struct bitmask* bitmask); int next_cpu(struct bitmask* bitmask, int cpu_id); #endif /* __TOPOLOGY_H */ ================================================ FILE: test/CMakeLists.txt ================================================ include_directories(${CMAKE_SOURCE_DIR}/third_party/gtest-1.7.0/include) include_directories(${CMAKE_SOURCE_DIR}/src/lib) add_definitions(-g) add_definitions(-Wall) #add_definitions(-DNDEBUG) add_executable(test_interpose ${CMAKE_CURRENT_SOURCE_DIR}/test_interpose.cc) 
# test_interpose: gtest-based binary; it links gtest and pthread only.
target_link_libraries(test_interpose pthread gtest)

# Stand-alone test programs that link against the emulator library.
add_executable(test_dev ${CMAKE_CURRENT_SOURCE_DIR}/test_dev.cc)
target_link_libraries(test_dev pthread nvmemul)

add_executable(test_thread ${CMAKE_CURRENT_SOURCE_DIR}/test_thread.cc)
target_link_libraries(test_thread nvmemul pthread)

add_executable(test_mutex ${CMAKE_CURRENT_SOURCE_DIR}/test_mutex.cc)
target_link_libraries(test_mutex nvmemul pthread)

add_executable(test_nvm_remote_dram ${CMAKE_CURRENT_SOURCE_DIR}/test_nvm_remote_dram.c)
target_link_libraries(test_nvm_remote_dram nvmemul)

add_executable(test_nvm ${CMAKE_CURRENT_SOURCE_DIR}/test_nvm.c)
target_link_libraries(test_nvm nvmemul)

add_executable(test_multithread ${CMAKE_CURRENT_SOURCE_DIR}/test_multithread.c)
#target_link_libraries(test_multithread rt)
target_link_libraries(test_multithread nvmemul pthread)

# Only test_interpose is registered with CTest; it runs with the emulator
# library preloaded.
# NOTE(review): the preload path points at src/emul/ -- confirm that this is
# where libnvmemul.so is actually built.
add_test(NAME interpose COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_interpose)
set(ENV_COMMON "LD_PRELOAD=${CMAKE_BINARY_DIR}/src/emul/libnvmemul.so")
set_property(TEST interpose PROPERTY ENVIRONMENT ${ENV_COMMON} "ENUM_INI=emul.ini")
***************************************************************************/ #include #include #include #include "gtest/gtest.h" #include "pmalloc.h" int main(int argc, char** argv) { // ::testing::InitGoogleTest(&argc, argv); // return RUN_ALL_TESTS(); printf("PID: %d\n", getpid()); printf("malloc: %p\n", malloc(8)); printf("malloc: %p\n", malloc(8)); printf("pmalloc: %p\n", pmalloc(8)); } ================================================ FILE: test/test_interpose.cc ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #include #include #include #include "gtest/gtest.h" static int interpose_pthread_create_success = 0; // Ugly hack: we want to test whether interposition works. To do this we // hook on the functions that the interposition code calls by redefining these // functions. As those functions are written in C, we need to make sure we force // the C++ compiler use C linkage. 
#ifdef __cplusplus extern "C" { #endif // this function is called when interposition of pthread_create is successful int register_thread(pthread_t thread) { interpose_pthread_create_success = 1; return 0; } #ifdef __cplusplus } #endif void* interpose_pthread_create_start_routine(void* args) { return NULL; } void interpose_pthread_create() { pthread_t thread; pthread_create (&thread, NULL, &interpose_pthread_create_start_routine, NULL); pthread_join(thread, NULL); } void interpose_pthread_mutex_lock(pthread_mutex_t* lock) { pthread_mutex_lock(lock); } void interpose_pthread_mutex_unlock(pthread_mutex_t* lock) { pthread_mutex_unlock(lock); } TEST(Interpose, pthread_create) { EXPECT_EQ(0, interpose_pthread_create_success); interpose_pthread_create(); EXPECT_EQ(1, interpose_pthread_create_success); } TEST(Interpose, pthread_mutex_lock) { //EXPECT_EQ(1, 0); } int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); pthread_mutex_t lock; pthread_mutex_init(&lock, NULL); interpose_pthread_mutex_lock(&lock); interpose_pthread_mutex_unlock(&lock); } ================================================ FILE: test/test_multithread.c ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ***************************************************************************/ #define _GNU_SOURCE #include #include #include #include #include #include #include "thread.h" #include #include "pmalloc.h" #include "debug.h" //#include "stat.h" #ifndef NDEBUG #include #endif typedef struct { int cs_n; int cs_duration; int out_cs_duration; int from_node; int to_node; } arg_s; #define MAX_NUM_THREADS 50 pthread_t thread_desc[MAX_NUM_THREADS]; #include typedef struct { uint64_t val; char padding[0]; } element_t; typedef struct { uint64_t N; uint64_t element_size; element_t* head; } chain_t; uint64_t trash_cache(uint64_t N); chain_t* alloc_chain(uint64_t seedin, uint64_t N, uint64_t element_size, uint64_t node_i, uint64_t node_j); element_t* element(chain_t* chain, uint64_t index); void inline read_element(chain_t* chain, uint64_t index, char* buf, uint64_t buf_size); // factor is 10 (could be more), to make sure we have a buffer much bigger than CPU cache // the memory buffer is NOT shared among threads // for now the cache size is hardcoded as 20 MB #define NELEMS (10 * 20480000 / 64LLU) #define PAGESZ 4096 #define MAX_NUM_CHAINS 16 //#undef USE_HUGETLB #define SEED_IN 1 #define NCHAINS 1 pthread_mutex_t mutex; static int max_number_of_cpus(void) { int n, cpus = 2048; size_t setsize = CPU_ALLOC_SIZE(cpus); cpu_set_t *set = CPU_ALLOC(cpus); if (!set) goto err; for (;;) { CPU_ZERO_S(setsize, set); /* the library version does not return size of cpumask_t */ n = syscall(SYS_sched_getaffinity, 0, setsize, set); if (n < 0 && cpus < 1024 * 1024) { CPU_FREE(set); cpus *= 2; set = CPU_ALLOC(cpus); if (!set) goto err; continue; } CPU_FREE(set); return n * 8; } err: printf("cannot determine NR_CPUS"); return 0; } static int bind_cpu(thread_t *thread) { size_t setsize; cpu_set_t 
*cur_cpuset; cpu_set_t *new_cpuset; int ncpus = max_number_of_cpus(); if (thread == NULL) { // if thread is NULL it means the emulator is disabled, return without setting CPU affinity //printf("thread self is null"); return 0; } if (ncpus == 0) { return 1; } setsize = CPU_ALLOC_SIZE(ncpus); cur_cpuset = CPU_ALLOC(ncpus); new_cpuset = CPU_ALLOC(ncpus); CPU_ZERO_S(setsize, cur_cpuset); CPU_ZERO_S(setsize, new_cpuset); CPU_SET_S(thread->cpu_id, setsize, new_cpuset); if (pthread_getaffinity_np(thread->pthread, setsize, cur_cpuset) != 0) { DBG_LOG(ERROR, "Cannot get thread tid [%d] affinity, pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id); return 1; } if (CPU_EQUAL(cur_cpuset, new_cpuset)) { //printf("No need to bind CPU\n"); return 0; } DBG_LOG(INFO, "Binding thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id); if (pthread_setaffinity_np(thread->pthread, setsize, new_cpuset) != 0) { DBG_LOG(ERROR, "Cannot bind thread tid [%d] pthread: 0x%lx on processor %d\n", thread->tid, thread->pthread, thread->cpu_id); return 1; } return 0; } uint64_t force_ldm_stalls(chain_t **C, int element_size, int access_size, int duration, // number of pointers/elements to chase uint64_t nelems, // max number of available elements/pointers int it_n) { // seed to calculate the first pointer to chase, used to avoid repeating // pointers during consecutive calls uint64_t j, i; int nchains = SEED_IN; uint64_t sumv[MAX_NUM_CHAINS]; uint64_t nextp[MAX_NUM_CHAINS]; char *buf; uint64_t buf_size = 16384; int count = 0; uint64_t start; uint64_t it_limit; assert(nchains < MAX_NUM_CHAINS); if (duration <= 0) return 0; // TODO: ignore the use of buf? // TODO: ignore more than one chain? 
buf = (char*) malloc(buf_size); assert(buf != NULL); if (nelems > duration) { it_limit = nelems / duration; } else { it_limit = 1; } it_n = it_n % it_limit; start = it_n * duration; if ((start + duration) > nelems) { start = 0; } /* chase the pointers */ if (nchains == 1) { sumv[0] = 0; // chase pointers until the 'duration' count, the pointer chasing will restart from beginning if duration // is greater than 'nelems' for (count = 0, i = start; count < duration; i = element(C[0], i)->val, ++count) { __asm__(""); sumv[0] += element(C[0], i)->val; if (access_size > element_size) { read_element(C[0], i, buf, buf_size); } } } else { for (j=0; j < nchains; j++) { sumv[j] = 0; nextp[j] = 0; } for (; 0 != element(C[0], nextp[0])->val; ) { for (j=0; j < nchains; j++) { sumv[j] += element(C[j], nextp[j])->val; if (access_size > element_size) { read_element(C[j], nextp[j], buf, buf_size); } nextp[j] = element(C[j], nextp[j])->val; } } } free(buf); return sumv[0]; } void iter(int cs_n, int cs_duration, int out_cs_duration, int from_node, int to_node) { long it_n; struct timespec time_start, time_end; unsigned long diff_us; uint64_t seed; uint64_t j; chain_t *C[MAX_NUM_CHAINS]; #ifndef NDEBUG pid_t tid = (pid_t) syscall(SYS_gettid); #endif DBG_LOG(INFO, "\t: from node: %d to node: %d\n", from_node, to_node); assert(NELEMS < UINT64_MAX); for (j=0; j < NCHAINS; j++) { seed = SEED_IN + j*j; C[j] = alloc_chain(seed, NELEMS, 64LLU, from_node, to_node); __asm__(""); } bind_cpu(thread_self()); trash_cache(NELEMS); for (it_n = 0; it_n < cs_n; ++it_n) { __asm__(""); pthread_mutex_lock(&mutex); #ifndef NDEBUG clock_gettime(CLOCK_MONOTONIC, &time_start); #endif // critical section // make cs_duration random memory accesses and leave force_ldm_stalls((chain_t **)&C, 64LLU, 8, cs_duration, NELEMS, it_n); #ifndef NDEBUG clock_gettime(CLOCK_MONOTONIC, &time_end); #endif pthread_mutex_unlock(&mutex); // outside critical section force_ldm_stalls((chain_t **)&C, 64LLU, 8, out_cs_duration, 
NELEMS, (it_n+1)*2); #ifndef NDEBUG diff_us = ((time_end.tv_sec * 1000000) + (time_end.tv_nsec / 1000)) - ((time_start.tv_sec * 1000000) + (time_start.tv_nsec / 1000)); DBG_LOG(INFO, "\tthread [%d] critical section took %lu usec\n", tid, diff_us); #endif // if ((it_n + 1) % out_cs_duration == 0) { //// usleep(1); //// pthread_yield(); // sched_yield(); // } } for (j=0; j < NCHAINS; j++) { free(C[j]); } } void *thread_fn(void *arg) { int cs_n = ((arg_s *) arg)->cs_n; int cs_duration = ((arg_s *) arg)->cs_duration; int out_cs_duration = ((arg_s *) arg)->out_cs_duration; int from_node = ((arg_s *) arg)->from_node; int to_node = ((arg_s *) arg)->to_node; iter(cs_n, cs_duration, out_cs_duration, from_node, to_node); return 0; } void manage_threads(int n_threads, int cs_n, int cs_duration, int out_cs_duration, int from_node, int to_node) { pthread_attr_t attr; int i; arg_s args; if ((n_threads > MAX_NUM_THREADS) || (n_threads <= 0)) { printf("INVALID RANGE:\n"); printf("\tMax number of threads is %d\n", MAX_NUM_THREADS); exit(-1); } if (cs_n <= 0 || cs_duration <= 0 || out_cs_duration < 0) { printf("INVALID RANGE:\n"); printf("\tcritical sections: %d, cs level: %d, out cs level: %d\n", cs_n, cs_duration, out_cs_duration); exit(-1); } pthread_mutex_init(&mutex, NULL); if (pthread_attr_init(&attr) != 0) { printf("pthread_attr_init failed"); exit(-1); } srand(time(NULL)); args.cs_duration = cs_duration; args.cs_n = cs_n; args.out_cs_duration = out_cs_duration; args.from_node = from_node; args.to_node = to_node; for (i = 0; i < n_threads; ++i) { pthread_create(&thread_desc[i], &attr, thread_fn, (void *)&args); } pthread_attr_destroy(&attr); for (i = 0; i < n_threads; ++i) { pthread_join(thread_desc[i], NULL); } pthread_mutex_destroy(&mutex); } int main(int argn, char **argv) { int n_threads; int cs_n; int cs_duration; //int cs_n_before_yield; int out_cs_duration; int from_node; int to_node; if (argn != 7) { printf("INVALID ARGUMENTS:\n"); printf("\t%s [# threads] [# critical 
sections per thread] [size of each critical section] " "[size of computation outside critical section] [from_node] [to_node]\n", argv[0]); return -1; } n_threads = atoi(argv[1]); cs_n = atoi(argv[2]); cs_duration = atoi(argv[3]); //cs_n_before_yield = atoi(argv[4]); out_cs_duration = atoi(argv[4]); from_node = atoi(argv[5]); to_node = atoi(argv[6]); manage_threads(n_threads, cs_n, cs_duration, out_cs_duration, from_node, to_node); // stats_report(); return 0; } ================================================ FILE: test/test_mutex.cc ================================================ /*************************************************************************** Copyright 2016 Hewlett Packard Enterprise Development LP. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
#define MAX_NUM_THREADS 128

/* Mutex that every worker (and main) locks and immediately unlocks. */
pthread_mutex_t mutex;

/* Thread body: takes and releases the global mutex once. */
void* worker(void* args)
{
    pthread_mutex_lock(&mutex);
    pthread_mutex_unlock(&mutex);
    return NULL;
}

/*
 * Locks/unlocks the mutex once from the main thread, then from four worker
 * threads. Returns 0 when all workers were started, 1 otherwise. Only
 * threads that were actually created are joined, and the mutex is destroyed
 * before exiting.
 */
int main(int argc, char** argv)
{
    pthread_t thread[MAX_NUM_THREADS];
    int thread_count = 4;
    int started = 0;
    int i;

    pthread_mutex_init(&mutex, NULL);

    pthread_mutex_lock(&mutex);
    pthread_mutex_unlock(&mutex);

    for (i = 0; i < thread_count; i++) {
        if (pthread_create(&thread[i], NULL, worker, NULL) != 0) {
            break;
        }
        started++;
    }

    for (i = 0; i < started; i++) {
        pthread_join(thread[i], NULL);
    }

    pthread_mutex_destroy(&mutex);

    return (started == thread_count) ? 0 : 1;
}
#define BUF_SIZE (2048)

/* Statically allocated BUF_SIZE x BUF_SIZE working set. */
unsigned long mem[BUF_SIZE][BUF_SIZE];

/*
 * Fills mem with row*column products and then sweeps it forever in
 * column-major order, folding every cell into a running sum that is written
 * back to the cell. Sleeps 1000 microseconds after each full sweep.
 * Never returns.
 */
void iter()
{
    int row;
    int col;
    unsigned long running = 0;

    for (row = 0; row < BUF_SIZE; ++row) {
        for (col = 0; col < BUF_SIZE; ++col) {
            mem[row][col] = row * col;
        }
    }

    for (;;) {
        for (col = 0; col < BUF_SIZE; ++col) {
            __asm__ __volatile__("");
            for (row = 0; row < BUF_SIZE; ++row) {
                unsigned long *cell = &mem[row][col];

                running += *cell + col*row;
                *cell = running;
            }
        }
        usleep(1000);
    }
}

/* Runs the endless workload. */
int main()
{
    iter();
    return 0;
}
#define BUF_SIZE (4 * 1024)

/* BUF_SIZE row pointers, each row holding BUF_SIZE values; all allocated
 * with pmalloc(). */
unsigned long **mem;

/* Releases the first `rows` rows and the row-pointer table itself. */
static void release_rows(int rows)
{
    int r;

    for (r = 0; r < rows; ++r) {
        pfree(mem[r], BUF_SIZE * sizeof(unsigned long));
    }
    pfree(mem, BUF_SIZE * sizeof(unsigned long *));
    mem = NULL;
}

/*
 * Allocates the matrix with pmalloc(), fills it with row*column products
 * and then sweeps it forever in column-major order, folding every cell into
 * a running sum that is written back to the cell.
 *
 * Returns only when an allocation fails; whatever was allocated up to that
 * point is released first. Once the sweep starts the function never
 * returns, so there is no cleanup after the loop.
 */
void iter()
{
    int i;
    int j;
    unsigned long k;

    mem = (unsigned long **) pmalloc(BUF_SIZE * sizeof(unsigned long *));
    if (mem == NULL) {
        return;
    }

    for (i = 0; i < BUF_SIZE; ++i) {
        mem[i] = (unsigned long *) pmalloc(BUF_SIZE * sizeof(unsigned long));
        if (mem[i] == NULL) {
            release_rows(i);
            return;
        }
        for (j = 0; j < BUF_SIZE; ++j) {
            mem[i][j] = i * j;
        }
    }

    k = 0;
    while (1) {
        for (i = 0; i < BUF_SIZE; ++i) {
            __asm__ __volatile__("");
            for (j = 0; j < BUF_SIZE; ++j) {
                k += mem[j][i] + i*j;
                mem[j][i] = k;
            }
        }
    }
}

/* Runs the workload; reached again only if an allocation failed. */
int main()
{
    iter();
    return 0;
}
#define MAX_NUM_THREADS 128

/*
 * Thread body: allocates a 1 MiB zero-initialized buffer, increments every
 * byte once, releases the buffer and prints "exiting".
 *
 * The buffer comes from calloc so that the increments never read
 * indeterminate bytes, and it is touched through a volatile pointer so the
 * stores are still performed even though the buffer is freed afterwards.
 * If the allocation fails the sweep is skipped.
 */
void* worker(void* args)
{
    const size_t length = 1024*1024;
    char* array = (char*) calloc(length, 1);

    if (array != NULL) {
        volatile char* cells = array;
        size_t i;

        for (i = 0; i < length; i++) {
            cells[i] = cells[i] + 1;
        }
        free(array);
    }

    printf("exiting\n");
    return NULL;
}

/*
 * Starts four workers and waits for them. Returns 0 when all workers were
 * started, 1 otherwise; only threads that were actually created are joined.
 */
int main(int argc, char** argv)
{
    pthread_t thread[MAX_NUM_THREADS];
    int thread_count = 4;
    int started = 0;
    int i;

    for (i = 0; i < thread_count; i++) {
        if (pthread_create(&thread[i], NULL, worker, NULL) != 0) {
            break;
        }
        started++;
    }

    for (i = 0; i < started; i++) {
        pthread_join(thread[i], NULL);
    }

    return (started == thread_count) ? 0 : 1;
}