Repository: Lancern/asm2vec Branch: master Commit: d38a3bc3bc9c Files: 21 Total size: 56.6 KB Directory structure: gitextract_5xp0becm/ ├── .gitignore ├── README.md ├── asm2vec/ │ ├── __init__.py │ ├── asm.py │ ├── internal/ │ │ ├── __init__.py │ │ ├── atomic.py │ │ ├── parse.py │ │ ├── repr.py │ │ ├── sampling.py │ │ ├── training.py │ │ └── util.py │ ├── logging.py │ ├── model.py │ ├── parse.py │ └── repo.py ├── examples/ │ ├── estimating.s │ ├── training-estimating.py │ └── training.s └── tests/ ├── asm_test.py ├── parse_test.py └── utilities_test.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Created by https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python # Edit at https://www.toptal.com/developers/gitignore?templates=macos,intellij,virtualenv,python ### Intellij ### # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/**/workspace.xml .idea/**/tasks.xml .idea/**/usage.statistics.xml .idea/**/dictionaries .idea/**/shelf # Generated files .idea/**/contentModel.xml # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # Gradle and Maven with auto-import # When using Gradle or Maven with auto-import, you should exclude module files, # since they will be recreated, and may cause churn. Uncomment if using # auto-import. 
# .idea/artifacts # .idea/compiler.xml # .idea/jarRepositories.xml # .idea/modules.xml # .idea/*.iml # .idea/modules # *.iml # *.ipr # CMake cmake-build-*/ # Mongo Explorer plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest Client .idea/httpRequests # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser ### Intellij Patch ### # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 # *.iml # modules.xml # .idea/misc.xml # *.ipr # Sonarlint plugin # https://plugins.jetbrains.com/plugin/7973-sonarlint .idea/**/sonarlint/ # SonarQube Plugin # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin .idea/**/sonarIssues.xml # Markdown Navigator plugin # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced .idea/**/markdown-navigator.xml .idea/**/markdown-navigator-enh.xml .idea/**/markdown-navigator/ # Cache file creation bug # See https://youtrack.jetbrains.com/issue/JBR-2257 .idea/$CACHE_FILE$ # CodeStream plugin # https://plugins.jetbrains.com/plugin/12206-codestream .idea/codestream.xml ### macOS ### # General .DS_Store .AppleDouble .LSOverride # Icon must end with two \r Icon # Thumbnails ._* # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 .TemporaryItems .Trashes .VolumeIcon.icns .com.apple.timemachine.donotpresent # Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder Temporary Items .apdisk ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ 
develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ pytestdebug.log # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ doc/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ pythonenv* # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # profiling data .prof ### VirtualEnv ### # Virtualenv # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ [Bb]in [Ii]nclude [Ll]ib [Ll]ib64 [Ll]ocal [Ss]cripts pyvenv.cfg pip-selfcheck.json # End of https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python ================================================ FILE: README.md ================================================ # asm2vec This is an unofficial implementation of the `asm2vec` model as a standalone python package. The details of the model can be found in the original paper: [(sp'19) Asm2Vec: Boosting Static Representation Robustness for Binary Clone Search against Code Obfuscation and Compiler Optimization](https://www.computer.org/csdl/proceedings-article/sp/2019/666000a038/19skfc3ZfKo) ## Requirements This implementation is written in python 3.7 and it's recommended to use python 3.7+ as well. The only dependency of this package is `numpy` which can be installed as follows: ```shell python3 -m pip install numpy ``` ## How to use ### Import To install the package, execute the following commands: ```shell git clone https://github.com/lancern/asm2vec.git ``` Add the following line to the `.bashrc` file to add `asm2vec` to your python interpreter's search path for external packages: ```shell export PYTHONPATH="path/to/asm2vec:$PYTHONPATH" ``` Replace `path/to/asm2vec` with the directory you clone `asm2vec` into. 
Then execute the following commands to update `PYTHONPATH`:

```shell
source ~/.bashrc
```

You can also add the following code snippet to your python source code that refers to `asm2vec`, to help the python interpreter find the package successfully:

```python
import sys
sys.path.append('path/to/asm2vec')
```

In your python code, use the following `import` statement to import this package:

```python
import asm2vec
```

### Define CFGs And Training

You have 2 approaches to define the binary program that will be sent to the `asm2vec` model. The first approach is to build the CFG manually, as shown below:

```python
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import parse_instruction

block1 = BasicBlock()
block1.add_instruction(parse_instruction('mov eax, ebx'))
block1.add_instruction(parse_instruction('jmp _loc'))

block2 = BasicBlock()
block2.add_instruction(parse_instruction('xor eax, eax'))
block2.add_instruction(parse_instruction('ret'))

block1.add_successor(block2)

block3 = BasicBlock()
block3.add_instruction(parse_instruction('sub eax, [ebp]'))

f1 = Function(block1, 'some_func')
f2 = Function(block3, 'another_func')
# block4 is ignored here for clarity
f3 = Function(block4, 'estimate_func')
```

And then you can train a model with the following code:

```python
from asm2vec.model import Asm2Vec

model = Asm2Vec(d=200)
train_repo = model.make_function_repo([f1, f2, f3])
model.train(train_repo)
```

The second approach is using the `parse` module provided by `asm2vec` to build CFGs automatically from an assembly code source file:

```python
from asm2vec.parse import parse_fp

with open('source.asm', 'r') as fp:
    funcs = parse_fp(fp)
```

And then you can train a model with the following code:

```python
from asm2vec.model import Asm2Vec

model = Asm2Vec(d=200)
train_repo = model.make_function_repo(funcs)
model.train(train_repo)
```

### Estimation

You can use the `asm2vec.model.Asm2Vec.to_vec` method to convert a function into its vector
representation.

### Serialization

The implementation supports serialization on many of its internal data structures so that you can serialize the internal state of a trained model into disk for future use. You can serialize two data structures to primitive data: the function repository and the model memento.

> To be finished.

## Hyper Parameters

The constructor of `asm2vec.model.Asm2Vec` class accepts some keyword arguments as hyper parameters of the model. The following table lists all the hyper parameters available:

| Parameter Name | Type | Meaning | Default Value |
| ----------------------- | ------- | ------------------------------------------------------------------------------------------------------ | ------------- |
| `d` | `int` | The dimension of the vectors for tokens. | `200` |
| `initial_alpha` | `float` | The initial learning rate. | `0.05` |
| `alpha_update_interval` | `int` | How many tokens can be processed before changing the learning rate? | `10000` |
| `rnd_walks` | `int` | How many random walks to perform to sequentialize a function? | `3` |
| `neg_samples` | `int` | How many samples to take during negative sampling? | `25` |
| `iteration` | `int` | How many iterations to perform? (This parameter is reserved for future use and is not implemented now) | `1` |
| `jobs` | `int` | How many tasks to execute concurrently during training? | `4` |

## Notes

For simplicity, the Selective Callee Expansion is not implemented in this early implementation. You have to do it manually before sending CFG into `asm2vec`.
================================================ FILE: asm2vec/__init__.py ================================================ __all__ = ['asm', 'model', 'parse'] ================================================ FILE: asm2vec/asm.py ================================================ from typing import * class Instruction: def __init__(self, op: str, *args: str): self._op = op self._args = list(args) def op(self) -> str: return self._op def number_of_args(self) -> int: return len(self._args) def args(self) -> List[str]: return self._args def parse_instruction(code: str) -> Instruction: sep_index = code.find(' ') if sep_index == -1: return Instruction(code) op = code[:sep_index] # Operator args_list = list(map(str.strip, code[sep_index:].split(','))) # Operands return Instruction(op, *args_list) class BasicBlock: _next_unused_id: int = 1 def __init__(self): # Allocate a new unique ID for the basic block. self._id = self.__class__._next_unused_id self.__class__._next_unused_id += 1 self._instructions = [] self._predecessors = [] self._successors = [] def __iter__(self): return self._instructions.__iter__() def __len__(self): return len(self._instructions) def __hash__(self): return self._id.__hash__() def __eq__(self, other): if not isinstance(other, BasicBlock): return False return self._id == other.id() def __ne__(self, other): return not self.__eq__(other) def id(self) -> int: return self._id def add_instruction(self, instr: Instruction) -> None: self._instructions.append(instr) def body_instructions(self) -> List[Instruction]: return self._instructions[:-1] def instructions(self) -> List[Instruction]: return self._instructions def add_predecessor(self, predecessor: 'BasicBlock') -> None: self._predecessors.append(predecessor) predecessor._successors.append(self) def add_successor(self, successor: 'BasicBlock') -> None: self._successors.append(successor) successor._predecessors.append(self) def first_instruction(self) -> Instruction: return self._instructions[0] def 
last_instruction(self) -> Instruction: return self._instructions[-1] def predecessors(self) -> List['BasicBlock']: return self._predecessors def in_degree(self) -> int: return len(self._predecessors) def successors(self) -> List['BasicBlock']: return self._successors def out_degree(self) -> int: return len(self._successors) class CFGWalkerCallback: def __call__(self, *args, **kwargs): self.on_enter(*args) def on_enter(self, block: BasicBlock) -> None: pass def on_exit(self, block: BasicBlock) -> None: pass CFGWalkerCallbackType = Union[CFGWalkerCallback, Callable[[BasicBlock], Any]] def _walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType, visited: Set) -> None: if entry.id() in visited: return visited.add(entry.id()) action(entry) for successor in entry.successors(): _walk_cfg(successor, action, visited) if isinstance(action, CFGWalkerCallback): action.on_exit(entry) def walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType) -> None: _walk_cfg(entry, action, set()) class Function: _next_unused_id = 1 def __init__(self, entry: BasicBlock, name: str = None): # Allocate a unique ID for the current Function object. 
self._id = self.__class__._next_unused_id self.__class__._next_unused_id += 1 self._entry = entry self._name = name self._callees = [] # Functions that are called by this function self._callers = [] # Functions that call this function def __len__(self) -> int: instr_count = 0 def count_instr(block: BasicBlock) -> None: nonlocal instr_count instr_count += len(block) walk_cfg(self._entry, count_instr) return instr_count def __hash__(self): return self._id def __eq__(self, other): if not isinstance(other, Function): return False return self._id == other.id() def __ne__(self, other): return not self.__eq__(other) def id(self) -> int: return self._id def entry(self) -> BasicBlock: return self._entry def name(self) -> str: return self._name def add_callee(self, f: 'Function') -> None: self._callees.append(f) f._callers.append(self) def callees(self) -> List['Function']: return self._callees def out_degree(self) -> int: return len(self._callees) def add_caller(self, f: 'Function') -> None: self._callers.append(f) f._callees.append(self) def callers(self) -> List['Function']: return self._callers def in_degree(self) -> int: return len(self._callers) ================================================ FILE: asm2vec/internal/__init__.py ================================================ ================================================ FILE: asm2vec/internal/atomic.py ================================================ from typing import * import threading class LockContextManager: def __init__(self, lock: threading.Lock): self._lock = lock self._exited = False def __enter__(self): self._lock.acquire() def __exit__(self, exc_type, exc_val, exc_tb): self._exited = True self._lock.release() def exited(self) -> bool: return self._exited class Atomic: class AtomicContextManager(LockContextManager): def __init__(self, atomic: 'Atomic'): super().__init__(atomic._lock) self._atomic = atomic self._exited = False def __enter__(self): super().__enter__() return self def __exit__(self, 
exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) def value(self) -> Any: if self.exited(): raise RuntimeError('Trying to access AtomicContextManager after its exit.') return self._atomic._val def set(self, value: Any) -> None: if self.exited(): raise RuntimeError('Trying to access AtomicContextManager after its exit.') self._atomic._val = value def __init__(self, value: Any): self._val = value self._lock = threading.Lock() def lock(self) -> AtomicContextManager: return self.__class__.AtomicContextManager(self) def value(self) -> Any: with self.lock() as val: return val.value() ================================================ FILE: asm2vec/internal/parse.py ================================================ from typing import * import logging import asm2vec.asm class AssemblySyntaxError(Exception): def __init__(self, message: str = None): self._msg = message def message(self) -> str: return self._msg def raise_asm_syntax_error(expect: str, found: str) -> None: raise AssemblySyntaxError('Expect "{}", but "{}" was found.'.format(expect, found)) jmp_op = { 'jmp', 'ja', 'jae', 'jb', 'jbe', 'jc', 'jcxz', 'jecxz', 'jrcxz', 'je', 'jg', 'jge', 'jl', 'jle', 'jna', 'jnae', 'jnb', 'jnbe', 'jnc', 'jne', 'jng', 'jnge', 'jnl', 'jnle', 'jno', 'jnp', 'jns', 'jnz', 'jo', 'jp', 'jpe', 'jpo', 'js', 'jz' } call_op = { 'call' } ret_op = { 'ret' } x86_64_regs = { 'al', 'ah', 'bl', 'bh', 'cl', 'ch', 'dl', 'dh', 'spl', 'bpl', 'sil', 'dil', 'ax', 'bx', 'cx', 'dx', 'sp', 'bp', 'si', 'di', 'eax', 'ebx', 'ecx', 'edx', 'esp', 'ebp', 'esi', 'edi', 'rax', 'rdx', 'rcx', 'rdx', 'rsp', 'rbp', 'rsi', 'rdi', 'r8b', 'r9b', 'r10b', 'r11b', 'r12b', 'r13b', 'r14b', 'r15b', 'r8w', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w', 'r8d', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d', 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15', 'cs', 'ss', 'ds', 'es', 'fs', 'gs', 'ecs', 'ess', 'eds', 'ees', 'efs', 'egs', 'rcs', 'rss', 'rds', 'res', 'rfs', 'rgs' } def is_jmp(op: 
str) -> bool: return op.lower() in jmp_op def is_conditional_jmp(op: str) -> bool: return is_jmp(op) and op.lower() != 'jmp' def is_call(op: str) -> bool: return op.lower() in call_op def is_ret(op: str) -> bool: return op.lower() in ret_op def is_reg(arg: str) -> bool: return arg.lower() in x86_64_regs class CFGBuilder: def __init__(self, context: 'ParseContext'): self._context = context self._blocks: List[asm2vec.asm.BasicBlock] = [] self._active_block = -1 self._block_labels: Dict[str, int] = dict() def _logger(self) -> logging.Logger: return self._context.logger().getChild(self.__class__.__name__) def _allocate_block(self) -> int: self._blocks.append(asm2vec.asm.BasicBlock()) return len(self._blocks) - 1 def _allocate_named_block(self, name: str) -> int: if name in self._block_labels: return self._block_labels[name] else: idx = self._allocate_block() self._block_labels[name] = idx return idx def _get_active_block(self) -> asm2vec.asm.BasicBlock: return self._blocks[self._active_block] def _set_active_block(self, block_id: int) -> None: self._active_block = block_id def _has_active_block(self) -> bool: return self._active_block != -1 def _close_active_block(self) -> None: self._active_block = -1 def _add_jmp(self, op: str, args: List[str]) -> None: if len(args) != 1: raise_asm_syntax_error('Jump with single operand', '{} operands'.format(len(args))) cur_block = self._get_active_block() self._close_active_block() if is_conditional_jmp(op): # Allocate another basic block for more instructions since the current code point is reachable. # This may produce some empty basic blocks in the final output. self._set_active_block(self._allocate_block()) self._get_active_block().add_predecessor(cur_block) def add_instr(self, op: str, args: List[str]) -> None: if not self._has_active_block(): # Allocate a new basic block. 
self._set_active_block(self._allocate_block()) self._get_active_block().add_instruction(asm2vec.asm.Instruction(op, *args)) if is_jmp(op): self._add_jmp(op, args) elif is_ret(op): # `ret` instruction encountered. Close current active block. self._close_active_block() def set_label(self, label: str) -> None: block_id = self._block_labels.get(label, -1) if block_id == -1: # Test if the current active block is empty in which case we can reuse it. if self._has_active_block() and len(self._get_active_block()) == 0: self._block_labels[label] = self._active_block else: # Open a new block for the label. block_id = self._allocate_block() self._block_labels[label] = block_id # Link the new block with the previously-active block. if self._has_active_block(): self._get_active_block().add_successor(self._blocks[block_id]) self._set_active_block(block_id) else: self._set_active_block(block_id) def build(self) -> List[asm2vec.asm.Function]: func_entries: Dict[str, int] = dict() # Walk through all instructions and fix block relations formed by jump and call instructions. for blk in self._blocks: for inst in blk: if is_jmp(inst.op()): target = inst.args()[0] if target in self._block_labels: blk.add_successor(self._blocks[self._block_labels[target]]) elif is_call(inst.op()): target = inst.args()[0] if target in self._block_labels and target not in func_entries: func_entries[target] = self._block_labels[target] for func_name in self._context.options().func_names(): if func_name not in self._block_labels: self._logger().warning('Cannot find function "{}"', func_name) continue if func_name not in func_entries: func_entries[func_name] = self._block_labels[func_name] funcs: Dict[str, asm2vec.asm.Function] = \ dict(map(lambda x: (x[0], asm2vec.asm.Function(self._blocks[x[1]], x[0])), func_entries.items())) # Fix function call relation. 
for (name, f) in funcs.items(): def block_action(block: asm2vec.asm.BasicBlock) -> None: for instr in block: if is_call(instr.op()): callee_name = instr.args()[0] if callee_name in funcs: f.add_callee(funcs[callee_name]) asm2vec.asm.walk_cfg(f.entry(), block_action) # TODO: Implement Selective Callee Expansion here. return list(funcs.values()) class ParseOptions: def __init__(self, **kwargs): self._func_names = kwargs.get('func_names', []) def func_names(self) -> List[str]: return self._func_names class ParseContext: def __init__(self, **kwargs): self._builder = CFGBuilder(self) self._options = ParseOptions(**kwargs) self._logger = logging.getLogger('asm2vec.ParseContext') def logger(self) -> logging.Logger: return self._logger def options(self) -> ParseOptions: return self._options def builder(self) -> CFGBuilder: return self._builder ''' Parser rules for input assembly file: program : asm_line* ; asm_line : asm_label '\n' | BLANKS asm_instr '\n' ; asm_label : ASM_LABEL_ID ':' ; asm_instr : ASM_INSTR_OP ' ' asm_instr_arg_list ; asm_instr_arg_list : ASM_INSTR_ARG (',' asm_instr_arg_list)? 
| /* epsilon */
    ;

BLANKS : [ \n\t]+;
'''


def is_fullmatch(pattern, s: str) -> bool:
    # True when the compiled regex `pattern` matches the whole of `s`.
    return pattern.fullmatch(s) is not None


def parse_asm_label(ln: str, context: ParseContext) -> None:
    """Parse a label line ("name:") and bind a basic block to that label.

    Raises an AssemblySyntaxError (via raise_asm_syntax_error) when the line
    does not end with ':'.
    """
    stripped = ln.strip()
    if stripped[-1] != ':':
        raise_asm_syntax_error('asm_label', ln)
    context.builder().set_label(stripped[:-1])


def parse_asm_instr(ln: str, context: ParseContext) -> None:
    """Split an instruction line into operator and comma-separated operands,
    then feed it to the CFG builder."""
    delim_index = ln.find(' ')
    args = []
    if delim_index == -1:
        # No blank: the whole line is a zero-operand instruction.
        op = ln
    else:
        op = ln[:delim_index]
        args = list(map(lambda arg: arg.strip(), ln[delim_index + 1:].split(',')))
    context.builder().add_instr(op, args)


def parse_asm_line(ln: str, context: ParseContext) -> None:
    """Dispatch one source line to the label or instruction parser.

    Blank lines are ignored; per the grammar above, an indented line is an
    instruction and a non-indented line is a label.
    """
    if len(ln.strip()) == 0:
        return
    if ln[0].isspace():
        # Expect production asm_line -> BLANKS asm_instr '\n'
        parse_asm_instr(ln.strip(), context)
    else:
        # Expect production asm_line -> asm_label
        parse_asm_label(ln, context)


def parse_asm_lines(lines: Iterable[str], **kwargs) -> List[asm2vec.asm.Function]:
    """Parse an iterable of assembly source lines and build the functions' CFGs.

    Keyword arguments are forwarded to ParseOptions (e.g. func_names).
    """
    context = ParseContext(**kwargs)
    for ln in lines:
        parse_asm_line(ln, context)
    return context.builder().build()


================================================
FILE: asm2vec/internal/repr.py
================================================
import random
from typing import *
import concurrent.futures

from asm2vec.asm import Instruction
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import walk_cfg
from asm2vec.repo import SequentialFunction
from asm2vec.repo import VectorizedFunction
from asm2vec.repo import VectorizedToken
from asm2vec.repo import Token
from asm2vec.repo import FunctionRepository
from asm2vec.logging import asm2vec_logger
from asm2vec.internal.atomic import Atomic


def _random_walk(f: Function) -> List[Instruction]:
    """Collect the instructions along one random path through f's CFG.

    Starts at the entry block, repeatedly picks a random successor, and stops
    at a sink block or when a block would be visited a second time.
    """
    visited: Set[int] = set()
    current = f.entry()
    seq: List[Instruction] = []
    while current.id() not in visited:
        visited.add(current.id())
        for instr in current:
            seq.append(instr)
        if len(current.successors()) == 0:
            break
        current = random.choice(current.successors())
    return seq
def _edge_sampling(f: Function) -> List[List[Instruction]]: edges: List[Tuple[BasicBlock, BasicBlock]] = [] def collect_edges(block: BasicBlock) -> None: nonlocal edges for successor in block.successors(): edges.append((block, successor)) walk_cfg(f.entry(), collect_edges) visited_edges: Set[Tuple[int, int]] = set() sequences = [] while len(visited_edges) < len(edges): e = random.choice(edges) visited_edges.add((e[0].id(), e[1].id())) sequences.append(list(e[0]) + list(e[1])) return sequences def make_sequential_function(f: Function, num_of_random_walks: int = 10) -> SequentialFunction: seq: List[List[Instruction]] = [] for _ in range(num_of_random_walks): seq.append(_random_walk(f)) # seq += _edge_sampling(f) return SequentialFunction(f.id(), f.name(), seq) def _get_function_tokens(f: Function, dim: int = 200) -> List[VectorizedToken]: tokens: List[VectorizedToken] = [] def collect_tokens(block: BasicBlock) -> None: nonlocal tokens for ins in block: tk: List[str] = [ins.op()] + ins.args() for t in tk: tokens.append(VectorizedToken(t, None, None, dim)) walk_cfg(f.entry(), collect_tokens) return tokens def _make_function_repo_helper(vocab: Dict[str, Token], funcs: List[Function], dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository: progress = Atomic(1) vec_funcs_atomic = Atomic([]) vocab_atomic = Atomic(vocab) def func_handler(f: Function): with vec_funcs_atomic.lock() as vfa: vfa.value().append(VectorizedFunction(make_sequential_function(f, num_of_rnd_walks), dim=dim*2)) tokens = _get_function_tokens(f, dim) for tk in tokens: with vocab_atomic.lock() as va: if tk.name() in va.value(): va.value()[tk.name()].count += 1 else: va.value()[tk.name()] = Token(tk) asm2vec_logger().debug('Sequence generated for function "%s", progress: %f%%', f.name(), progress.value() / len(funcs) * 100) with progress.lock() as prog: prog.set(prog.value() + 1) executor = concurrent.futures.ThreadPoolExecutor(max_workers=jobs) fs = [] for fn in funcs: 
fs.append(executor.submit(func_handler, fn)) done, not_done = concurrent.futures.wait(fs, return_when=concurrent.futures.FIRST_EXCEPTION) if len(not_done) > 0 or any(map(lambda fut: fut.cancelled() or not fut.done(), done)): raise RuntimeError('Not all tasks finished successfully.') vec_funcs = vec_funcs_atomic.value() repo = FunctionRepository(vec_funcs, vocab) # Re-calculate the frequency of each token. for t in repo.vocab().values(): t.frequency = t.count / repo.num_of_tokens() return repo def make_function_repo(funcs: List[Function], dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository: return _make_function_repo_helper(dict(), funcs, dim, num_of_rnd_walks, jobs) def make_estimate_repo(vocabulary: Dict[str, Token], f: Function, dim: int, num_of_rnd_walks: int) -> FunctionRepository: # Make a copy of the function list and vocabulary to avoid the change to affect the original trained repo. vocab: Dict[str, Token] = dict(**vocabulary) return _make_function_repo_helper(vocab, [f], dim, num_of_rnd_walks, 1) ================================================ FILE: asm2vec/internal/sampling.py ================================================ from typing import * import random T = TypeVar('T') class NegativeSampler: def __init__(self, distribution: List[Tuple[T, float]], alpha: float = 3 / 4): self._values = list(map(lambda x: x[0], distribution)) self._weights = list(map(lambda x: x[1] ** alpha, distribution)) def sample(self, k: int) -> List[T]: return random.choices(self._values, self._weights, k=k) ================================================ FILE: asm2vec/internal/training.py ================================================ from typing import * import math import threading import concurrent.futures import numpy as np from asm2vec.asm import Instruction from asm2vec.internal.repr import FunctionRepository from asm2vec.internal.repr import VectorizedFunction from asm2vec.internal.repr import Token from asm2vec.internal.repr import VectorizedToken from 
asm2vec.internal.sampling import NegativeSampler
from asm2vec.internal.atomic import LockContextManager
from asm2vec.internal.atomic import Atomic
from asm2vec.logging import asm2vec_logger


class Asm2VecParams:
    """Hyper-parameters for Asm2Vec training/estimation, settable via kwargs."""

    def __init__(self, **kwargs):
        # d: dimension of a token vector (function vectors elsewhere default to 2 * d).
        self.d: int = kwargs.get('d', 200)
        # Initial learning rate; the dynamic update below is currently disabled.
        self.initial_alpha: float = kwargs.get('alpha', 0.0025)
        self.alpha_update_interval: int = kwargs.get('alpha_update_interval', 10000)
        self.num_of_rnd_walks: int = kwargs.get('rnd_walks', 3)
        # Number of negative samples drawn per target token.
        self.neg_samples: int = kwargs.get('neg_samples', 25)
        self.iteration: int = kwargs.get('iteration', 1)
        # Worker-thread count for the training thread pool.
        self.jobs: int = kwargs.get('jobs', 4)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the parameters into a plain dict (str keys)."""
        return {
            'd': self.d,
            'alpha': self.initial_alpha,
            'alpha_update_interval': self.alpha_update_interval,
            'num_of_rnd_walks': self.num_of_rnd_walks,
            'neg_samples': self.neg_samples,
            'iteration': self.iteration,
            'jobs': self.jobs
        }

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Load parameters from a dict with bytes keys (e.g. a decoded dump)."""
        self.d: int = rep.get(b'd', 200)
        self.initial_alpha: float = rep.get(b'alpha', 0.0025)
        self.alpha_update_interval: int = rep.get(b'alpha_update_interval', 10000)
        self.num_of_rnd_walks: int = rep.get(b'rnd_walks', 3)
        self.neg_samples: int = rep.get(b'neg_samples', 25)
        self.iteration: int = rep.get(b'iteration', 1)
        self.jobs: int = rep.get(b'jobs', 4)


class SequenceWindow:
    """Sliding three-instruction window (prev, current, next) over a sequence.

    Token names are resolved through the vocabulary to their VectorizedToken,
    so all windows share the same underlying vectors.
    """

    def __init__(self, sequence: List[Instruction], vocabulary: Dict[str, Token]):
        self._seq = sequence
        self._vocab = vocabulary
        # Start at index 1 so that both a predecessor and a successor exist.
        self._i = 1
        self._prev_ins = None
        self._curr_ins = None
        self._next_ins = None
        self._prev_ins_op = None
        self._prev_ins_args = None
        self._curr_ins_op = None
        self._curr_ins_args = None
        self._next_ins_op = None
        self._next_ins_args = None

    def move_next(self) -> bool:
        """Advance the window by one instruction; False when exhausted."""
        if self._i >= len(self._seq) - 1:
            return False

        def token_lookup(name) -> VectorizedToken:
            # KeyError here means the token is missing from the vocabulary.
            return self._vocab[name].vectorized()

        self._prev_ins = self._seq[self._i - 1]
        self._curr_ins = self._seq[self._i]
        self._next_ins = self._seq[self._i + 1]
        self._prev_ins_op = token_lookup(self._prev_ins.op())
        self._prev_ins_args = list(map(token_lookup, self._prev_ins.args()))
        self._curr_ins_op = token_lookup(self._curr_ins.op())
        self._curr_ins_args = list(map(token_lookup, self._curr_ins.args()))
        self._next_ins_op = token_lookup(self._next_ins.op())
        self._next_ins_args = list(map(token_lookup, self._next_ins.args()))
        self._i += 1
        return True

    def prev_ins(self) -> Instruction:
        return self._prev_ins

    def prev_ins_op(self) -> VectorizedToken:
        return self._prev_ins_op

    def prev_ins_args(self) -> List[VectorizedToken]:
        return self._prev_ins_args

    def curr_ins(self) -> Instruction:
        return self._curr_ins

    def curr_ins_op(self) -> VectorizedToken:
        return self._curr_ins_op

    def curr_ins_args(self) -> List[VectorizedToken]:
        return self._curr_ins_args

    def next_ins(self) -> Instruction:
        return self._next_ins

    def next_ins_op(self) -> VectorizedToken:
        return self._next_ins_op

    def next_ins_args(self) -> List[VectorizedToken]:
        return self._next_ins_args


class TrainingContext:
    """Shared, lock-protected state for one training or estimation run."""

    class Counter:
        """Integer counter whose operations are guarded by the context's lock."""

        def __init__(self, context: 'TrainingContext', name: str, initial: int = 0):
            self._context = context
            self._name = name
            self._val = initial

        def val(self) -> int:
            with self._context.lock():
                return self._val

        def inc(self) -> int:
            """Increment and return the new value atomically."""
            with self._context.lock():
                self._val += 1
                return self._val

        def reset(self) -> int:
            """Zero the counter and return the value it held."""
            with self._context.lock():
                v = self._val
                self._val = 0
                return v

    TOKENS_HANDLED_COUNTER: str = "tokens_handled"

    def __init__(self, repo: FunctionRepository, params: Asm2VecParams, is_estimating: bool = False):
        self._repo = repo
        self._params = params
        self._alpha = params.initial_alpha
        # Negative sampler weighted by each vocabulary token's frequency.
        self._sampler = NegativeSampler(list(map(lambda t: (t, t.frequency), repo.vocab().values())))
        # When estimating, token vectors are frozen and only the function vector moves.
        self._is_estimating = is_estimating
        self._counters = dict()
        self._lock = threading.Lock()

    def repo(self) -> FunctionRepository:
        return self._repo

    def params(self) -> Asm2VecParams:
        return self._params

    def lock(self) -> LockContextManager:
        """Return a context manager acquiring the shared training lock."""
        return LockContextManager(self._lock)

    def alpha(self) -> float:
        with self.lock():
            return self._alpha

    def set_alpha(self, alpha: float) -> None:
        with self.lock():
            self._alpha = alpha

    def sampler(self) -> NegativeSampler:
        return self._sampler

    def is_estimating(self) -> bool:
        return self._is_estimating

    def create_sequence_window(self, seq: List[Instruction]) -> SequenceWindow:
        return SequenceWindow(seq, self._repo.vocab())

    def get_counter(self, name: str) -> Counter:
        # Returns None when no counter of that name was registered.
        with self.lock():
            return self._counters.get(name)

    def add_counter(self, name: str, initial: int = 0) -> Counter:
        with self.lock():
            c = self.__class__.Counter(self, name, initial)
            self._counters[name] = c
            return c


def _sigmoid(x: float) -> float:
    return 1 / (1 + np.exp(-x))


def _identity(cond: bool) -> int:
    """Indicator function: 1 if cond else 0."""
    return 1 if cond else 0


def _dot_sigmoid(lhs: np.ndarray, rhs: np.ndarray) -> float:
    # noinspection PyTypeChecker
    return _sigmoid(np.dot(lhs, rhs))


def _get_inst_repr(op: VectorizedToken, args: List[VectorizedToken]) -> np.ndarray:
    """Build an instruction embedding: opcode vector stacked with the mean arg vector.

    With no arguments, a zero vector of the opcode's dimension is used, so the
    result always has twice the token dimension.
    """
    if len(args) == 0:
        arg_vec = np.zeros(len(op.v))
    else:
        arg_vec = np.average(list(map(lambda tk: tk.v, args)), axis=0)
    return np.hstack((op.v, arg_vec))


def _train_vectorized(wnd: SequenceWindow, f: VectorizedFunction, context: TrainingContext) -> None:
    """One PV-DM-style gradient step for the window's current instruction.

    `delta` is the context: average of the previous instruction, the function
    vector, and the next instruction. Each target token of the current
    instruction is trained against negative samples drawn from the vocabulary.
    """
    ct_prev = _get_inst_repr(wnd.prev_ins_op(), wnd.prev_ins_args())
    ct_next = _get_inst_repr(wnd.next_ins_op(), wnd.next_ins_args())
    delta = np.average([ct_prev, f.v, ct_next], axis=0)

    tokens = [wnd.curr_ins_op()] + wnd.curr_ins_args()
    f_grad = np.zeros(f.v.shape)
    for tk in tokens:
        # Negative sampling.
        sampled_tokens: Dict[str, VectorizedToken] = \
            dict(map(lambda x: (x.name(), x.vectorized()),
                     context.sampler().sample(context.params().neg_samples)))
        # Ensure the positive (target) token participates. Vocabulary tokens are
        # shared objects, so `tk is sp_tk` below identifies the positive sample.
        if tk.name() not in sampled_tokens:
            sampled_tokens[tk.name()] = tk

        # The following code block tries to update the learning rate when necessary. Not required for now.
        # tokens_handled_counter = context.get_counter(TrainingContext.TOKENS_HANDLED_COUNTER)
        # if tokens_handled_counter is not None:
        #     if tokens_handled_counter.val() % context.params().alpha_update_interval == 0:
        #         # Update the learning rate.
        #         alpha = 1 - tokens_handled_counter.val() / (
        #                 context.params().iteration * context.repo().num_of_tokens() + 1)
        #         context.set_alpha(max(alpha, context.params().initial_alpha * 0.0001))

        for sp_tk in sampled_tokens.values():
            # Accumulate gradient for function vector.
            # NOTE(review): sp_tk is only used in the indicator test; both the
            # dot product and the v_pred update use tk, not sp_tk. Looks like
            # negative samples' prediction vectors are never read/updated here
            # -- verify against the asm2vec paper before relying on this.
            g = (_dot_sigmoid(delta, tk.v_pred) - _identity(tk is sp_tk)) * context.alpha()
            # delta averages 3 components, hence the 1/3 factor on the gradient.
            f_grad += g / 3 * tk.v_pred
            if not context.is_estimating():
                with context.lock():
                    # Update v'_t
                    tk.v_pred -= g * delta

    # Apply function gradient.
    with context.lock():
        f.v -= f_grad

    if not context.is_estimating():
        # Apply gradient to instructions.
        # First half of f_grad goes to opcode vectors, second half is split
        # evenly across the argument vectors of each neighbor instruction.
        d = len(f_grad) // 2
        with context.lock():
            wnd.prev_ins_op().v -= f_grad[:d]
            if len(wnd.prev_ins_args()) > 0:
                prev_args_grad = f_grad[d:] / len(wnd.prev_ins_args())
                for t in wnd.prev_ins_args():
                    t.v -= prev_args_grad
            wnd.next_ins_op().v -= f_grad[:d]
            if len(wnd.next_ins_args()) > 0:
                next_args_grad = f_grad[d:] / len(wnd.next_ins_args())
                for t in wnd.next_ins_args():
                    t.v -= next_args_grad


def _train_sequence(f: VectorizedFunction, seq: List[Instruction], context: TrainingContext) -> None:
    """Slide a window over one instruction sequence and train on each position."""
    wnd = context.create_sequence_window(seq)
    while wnd.move_next():
        _train_vectorized(wnd, f, context)


def train(repository: FunctionRepository, params: Asm2VecParams) -> None:
    """Train all functions in the repository using a thread pool of params.jobs workers.

    Raises RuntimeError if any worker task fails (wait() stops at the first
    exception and leaves the remaining futures in not_done).
    """
    context = TrainingContext(repository, params)
    context.add_counter(TrainingContext.TOKENS_HANDLED_COUNTER)

    asm2vec_logger().debug('Total number of functions: %d', len(context.repo().funcs()))

    progress = Atomic(1)

    def train_function(fn: VectorizedFunction):
        for seq in fn.sequential().sequences():
            _train_sequence(fn, seq, context)
        # NOTE(review): progress is read for logging before it is incremented,
        # so the reported percentage lags by one function.
        asm2vec_logger().debug('Function "%s" trained, progress: %f%%',
                               fn.sequential().name(),
                               progress.value() / len(context.repo().funcs()) * 100)
        with progress.lock() as prog_proxy:
            prog_proxy.set(prog_proxy.value() + 1)

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=context.params().jobs)
    futures = []
    for f in context.repo().funcs():
        futures.append(executor.submit(train_function, f))

    done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
    if len(not_done) > 0:
        raise RuntimeError('Train failed due to one or more failed task.')


def estimate(f: VectorizedFunction, estimate_repo: FunctionRepository, params: Asm2VecParams) -> np.ndarray:
    """Infer a vector for an unseen function: token vectors stay frozen
    (is_estimating=True), only f.v is updated. Returns the trained f.v."""
    context = TrainingContext(estimate_repo, params, True)
    for seq in f.sequential().sequences():
        _train_sequence(f, seq, context)

    return f.v


================================================
FILE: asm2vec/internal/util.py
================================================
import numpy as np


def make_small_ndarray(dim: int) -> np.ndarray:
    """Return a random vector of length dim with entries in (-0.5/dim, 0.5/dim)."""
    rng = np.random.default_rng()
    return (rng.random(dim) - 0.5) / dim


================================================
FILE: asm2vec/logging.py
================================================
import logging


def asm2vec_logger() -> logging.Logger:
    """Return the package-wide 'asm2vec' logger."""
    return logging.getLogger('asm2vec')


def config_asm2vec_logging(**kwargs):
    """Configure the 'asm2vec' logger: level, handlers and filters via kwargs."""
    level = kwargs.get('level', logging.WARNING)
    handlers = kwargs.get('handlers', [])
    filters = kwargs.get('filters', [])

    asm2vec_logger().setLevel(level)
    for hd in handlers:
        asm2vec_logger().addHandler(hd)
    for ft in filters:
        asm2vec_logger().addFilter(ft)


================================================
FILE: asm2vec/model.py
================================================
from typing import *

import numpy as np

import asm2vec.asm
import asm2vec.repo
import asm2vec.internal.training
import asm2vec.internal.repr
import asm2vec.internal.util


class Asm2VecMemento:
    """Serializable snapshot of a trained model: parameters plus vocabulary."""

    def __init__(self):
        self.params: Optional[asm2vec.internal.training.Asm2VecParams] = None
        self.vocab: Optional[Dict[str, asm2vec.repo.Token]] = None

    def serialize(self) -> Dict[str, Any]:
        return {
            'params': self.params.to_dict(),
            'vocab': asm2vec.repo.serialize_vocabulary(self.vocab)
        }

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Restore state from a bytes-keyed representation (inverse of serialize)."""
        self.params = asm2vec.internal.training.Asm2VecParams()
        self.params.populate(rep[b'params'])
        self.vocab = asm2vec.repo.deserialize_vocabulary(rep[b'vocab'])


class Asm2Vec:
    """Public facade: build a function repository, train it, embed new functions."""

    def __init__(self, **kwargs):
        # kwargs are forwarded to Asm2VecParams (d, alpha, rnd_walks, ...).
        self._params = asm2vec.internal.training.Asm2VecParams(**kwargs)
        self._vocab = None

    def memento(self) -> Asm2VecMemento:
        memento = Asm2VecMemento()
        memento.params = self._params
        memento.vocab = self._vocab
        return memento

    def set_memento(self, memento: Asm2VecMemento) -> None:
        self._params = memento.params
        self._vocab = memento.vocab

    def make_function_repo(self, funcs: List[asm2vec.asm.Function]) -> asm2vec.repo.FunctionRepository:
        return asm2vec.internal.repr.make_function_repo(
            funcs, self._params.d, self._params.num_of_rnd_walks, self._params.jobs)

    def train(self, repo: asm2vec.repo.FunctionRepository) -> None:
        """Train on the repository and keep its vocabulary for later estimation."""
        asm2vec.internal.training.train(repo, self._params)
        self._vocab = repo.vocab()

    def to_vec(self, f: asm2vec.asm.Function) -> np.ndarray:
        """Embed an unseen function using the trained vocabulary; requires train() first."""
        estimate_repo = asm2vec.internal.repr.make_estimate_repo(
            self._vocab, f, self._params.d, self._params.num_of_rnd_walks)
        vf = estimate_repo.funcs()[0]
        asm2vec.internal.training.estimate(vf, estimate_repo, self._params)
        return vf.v


================================================
FILE: asm2vec/parse.py
================================================
from typing import *

import asm2vec.asm
import asm2vec.internal.parse
from asm2vec.internal.parse import AssemblySyntaxError


def parse_text(asm: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source given as a single string."""
    return asm2vec.internal.parse.parse_asm_lines(asm.split('\n'), **kwargs)


def parse_fp(fp, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source from an open file-like object (iterated by line)."""
    return asm2vec.internal.parse.parse_asm_lines(fp, **kwargs)


def parse(asm_file_name: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source from a file path."""
    with open(asm_file_name, mode='r') as fp:
        return parse_fp(fp, **kwargs)


================================================
FILE: 
asm2vec/repo.py
================================================
from typing import *

import numpy as np

import asm2vec.asm
import asm2vec.internal.util


class SequentialFunction:
    """A function reduced to its random-walk instruction sequences."""

    def __init__(self, fid: int, name: str, sequences: List[List[asm2vec.asm.Instruction]]):
        self._id = fid
        self._name = name
        self._seq = sequences

    def id(self) -> int:
        return self._id

    def name(self) -> str:
        return self._name

    def sequences(self) -> List[List[asm2vec.asm.Instruction]]:
        return self._seq


class VectorizedFunction:
    """A SequentialFunction paired with its trainable vector (default dim 400)."""

    def __init__(self, f: SequentialFunction, v: np.ndarray = None, dim: int = 400):
        self._f = f
        # If no vector is supplied, start from a small random one.
        self.v = v if v is not None else asm2vec.internal.util.make_small_ndarray(dim)

    def sequential(self) -> SequentialFunction:
        return self._f


class VectorizedToken:
    """A token with its embedding v (dim) and prediction vector v_pred (2 * dim)."""

    def __init__(self, name: str, v: np.ndarray = None, v_pred: np.ndarray = None, dim: int = 200):
        self._name = name
        # v starts at zero; v_pred starts small and random.
        self.v = v if v is not None else np.zeros(dim)
        self.v_pred = v_pred if v_pred is not None else asm2vec.internal.util.make_small_ndarray(dim * 2)

    def __eq__(self, other):
        # Equality is by token name only; vectors are ignored.
        if not isinstance(other, VectorizedToken):
            return False
        return self._name == other._name

    def __ne__(self, other):
        return not self.__eq__(other)

    def name(self) -> str:
        return self._name


class Token:
    """Vocabulary entry: a VectorizedToken plus occurrence count and frequency."""

    def __init__(self, vt: VectorizedToken, count: int = 1):
        self._vt = vt
        # count: raw number of occurrences; frequency: filled in externally.
        self.count: int = count
        self.frequency: float = 0

    def vectorized(self) -> VectorizedToken:
        return self._vt

    def name(self) -> str:
        return self._vt.name()


class FunctionRepository:
    """Container of vectorized functions plus the shared token vocabulary."""

    def __init__(self, funcs: List[VectorizedFunction], vocab: Dict[str, Token]):
        self._funcs = funcs
        self._vocab = vocab
        # Total token occurrences, cached at construction time.
        self._num_of_tokens = sum(map(lambda x: x.count, vocab.values()))

    def funcs(self) -> List[VectorizedFunction]:
        return self._funcs

    def vocab(self) -> Dict[str, Token]:
        return self._vocab

    def num_of_tokens(self) -> int:
        return self._num_of_tokens


def _serialize_token(token: Token) -> Dict[str, Any]:
    """Convert a Token into a plain dict of JSON-friendly values."""
    return {
        'name': token.name(),
        'v': list(token.vectorized().v),
        'v_pred': list(token.vectorized().v_pred),
        'count': token.count,
        'frequency': token.frequency
    }


def _deserialize_token(rep: Dict[bytes, Any]) -> Token:
    """Rebuild a Token from a bytes-keyed representation (inverse of _serialize_token)."""
    name = rep[b'name'].decode('utf-8')
    v = np.array(rep[b'v'])
    v_pred = np.array(rep[b'v_pred'])
    count = rep[b'count']
    frequency = rep[b'frequency']
    token = Token(VectorizedToken(name, v, v_pred))
    token.count = count
    token.frequency = frequency
    return token


def serialize_vocabulary(vocab: Dict[str, Token]) -> Dict[str, Any]:
    return dict(zip(vocab.keys(), map(_serialize_token, vocab.values())))


def deserialize_vocabulary(rep: Dict[bytes, Any]) -> Dict[str, Token]:
    return dict(zip(map(lambda b: b.decode('utf-8'), rep.keys()),
                    map(_deserialize_token, rep.values())))


def _serialize_sequence(seq: List[asm2vec.asm.Instruction]) -> List[Any]:
    # Each instruction is stored as a two-element list: [op, args].
    return list(map(lambda instr: [instr.op(), instr.args()], seq))


def _deserialize_sequence(rep: List[Any]) -> List[asm2vec.asm.Instruction]:
    return list(map(
        lambda instr_rep: asm2vec.asm.Instruction(instr_rep[0].decode('utf-8'), instr_rep[1].decode('utf-8')),
        rep))


def _serialize_vectorized_function(func: VectorizedFunction, include_sequences: bool) -> Dict[str, Any]:
    """Serialize one function; sequences are included only when requested."""
    data = {
        'id': func.sequential().id(),
        'name': func.sequential().name(),
        'v': list(func.v)
    }
    if include_sequences:
        data['sequences'] = list(map(_serialize_sequence, func.sequential().sequences()))
    return data


def _deserialize_vectorized_function(rep: Dict[bytes, Any]) -> VectorizedFunction:
    name = rep[b'name'].decode('utf-8')
    fid = rep[b'id']
    v = np.array(rep[b'v'])
    # Sequences are optional in the representation; default to none.
    sequences = list(map(_deserialize_sequence, rep.get(b'sequences', [])))
    return VectorizedFunction(SequentialFunction(fid, name, sequences), v)


# Bit flags selecting which parts of a repository to serialize.
SERIALIZE_VOCABULARY: int = 1
SERIALIZE_FUNCTION: int = 2
SERIALIZE_FUNCTION_SEQUENCES: int = 4
SERIALIZE_ALL: int = SERIALIZE_VOCABULARY | SERIALIZE_FUNCTION | SERIALIZE_FUNCTION_SEQUENCES


def serialize_function_repo(repo: FunctionRepository, flags: int) -> Dict[str, Any]:
    """Serialize the parts of repo selected by the SERIALIZE_* flags."""
    data = dict()
    if (flags & SERIALIZE_VOCABULARY) != 0:
        data['vocab'] = serialize_vocabulary(repo.vocab())
    if (flags & SERIALIZE_FUNCTION) != 0:
        include_sequences = ((flags & SERIALIZE_FUNCTION_SEQUENCES) != 0)
        data['funcs'] = list(map(
            lambda f: _serialize_vectorized_function(f, include_sequences),
            repo.funcs()))
    return data


def deserialize_function_repo(rep: Dict[bytes, Any]) -> FunctionRepository:
    """Rebuild a FunctionRepository; missing parts default to empty."""
    funcs = list(map(_deserialize_vectorized_function, rep.get(b'funcs', [])))
    vocab = deserialize_vocabulary(rep.get(b'vocab', dict()))
    return FunctionRepository(funcs, vocab)


================================================
FILE: examples/estimating.s
================================================
my_strlen_est:
    cmp BYTE PTR [rdi], 0
    je .L4
    mov rax, rdi
.L3:
    add rax, 1
    cmp BYTE PTR [rax], 0
    jne .L3
.L2:
    sub rax, rdi
    ret
.L4:
    mov rax, rdi
    jmp .L2
my_strcmp_est:
    movzx eax, BYTE PTR [rdi]
    test al, al
    je .L12
.L7:
    movzx edx, BYTE PTR [rsi]
    test dl, dl
    je .L15
    cmp dl, al
    jne .L16
    add rdi, 1
    add rsi, 1
    movzx eax, BYTE PTR [rdi]
    test al, al
    jne .L7
.L12:
    cmp BYTE PTR [rsi], 0
    setne dl
    movzx edx, dl
    neg edx
.L6:
    mov eax, edx
    ret
.L16:
    movsx eax, al
    movsx edx, dl
    sub eax, edx
    mov edx, eax
    jmp .L6
.L15:
    mov edx, 1
    test al, al
    jne .L6
    jmp .L12
.LC0:
    .string "%s"
.LC1:
    .string "%d\n"
main:
    sub rsp, 264
    lea rsi, [rsp+128]
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    mov rsi, rsp
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rdi, [rsp+128]
    call my_strlen_est
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov rsi, rsp
    lea rdi, [rsp+128]
    call my_strcmp_est
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov eax, 0
    add rsp, 264
    ret
================================================
FILE: examples/training-estimating.py
================================================
import numpy as np

import asm2vec.asm
import asm2vec.parse
import asm2vec.model


def cosine_similarity(v1, v2):
    """Cosine similarity of two vectors (undefined for zero-norm input)."""
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def main():
    # Train on one assembly file, then embed functions from a second one and
    # compare every (trained, estimated) pair by cosine similarity.
    training_funcs = asm2vec.parse.parse('training.s',
                                         func_names=['main', 'my_strlen_train', 'my_strcmp_train'])
    estimating_funcs = asm2vec.parse.parse('estimating.s',
                                           func_names=['main', 'my_strlen_est', 'my_strcmp_est'])
    print('# of training functions:', len(training_funcs))
    print('# of estimating functions:', len(estimating_funcs))

    model = asm2vec.model.Asm2Vec(d=200)
    training_repo = model.make_function_repo(training_funcs)
    model.train(training_repo)
    print('Training complete.')

    for tf in training_repo.funcs():
        print('Norm of trained function "{}" = {}'.format(tf.sequential().name(), np.linalg.norm(tf.v)))

    estimating_funcs_vec = list(map(lambda f: model.to_vec(f), estimating_funcs))
    print('Estimating complete.')

    for (ef, efv) in zip(estimating_funcs, estimating_funcs_vec):
        print('Norm of trained function "{}" = {}'.format(ef.name(), np.linalg.norm(efv)))

    for tf in training_repo.funcs():
        for (ef, efv) in zip(estimating_funcs, estimating_funcs_vec):
            sim = cosine_similarity(tf.v, efv)
            print('sim("{}", "{}") = {}'.format(tf.sequential().name(), ef.name(), sim))


if __name__ == '__main__':
    main()
================================================
FILE: examples/training.s
================================================
my_strlen_train:
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-24], rdi
    mov rax, QWORD PTR [rbp-24]
    mov QWORD PTR [rbp-8], rax
    jmp .L2
.L3:
    add QWORD PTR [rbp-8], 1
.L2:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    jne .L3
    mov rax, QWORD PTR [rbp-8]
    sub rax, QWORD PTR [rbp-24]
    pop rbp
    ret
my_strcmp_train:
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi
    mov QWORD PTR [rbp-16], rsi
    jmp .L6
.L10:
    mov rax, QWORD PTR [rbp-8]
    movzx edx, BYTE PTR [rax]
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    cmp dl, al
    je .L7
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    movsx edx, al
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    movsx eax, al
    sub edx, eax
    mov eax, edx
    jmp .L8
.L7:
    add QWORD PTR [rbp-8], 1
    add QWORD PTR [rbp-16], 1
.L6:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    je .L9
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    test al, al
    jne .L10
.L9:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    je .L11
    mov eax, 1
    jmp .L8
.L11:
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    test al, al
    je .L12
    mov eax, -1
    jmp .L8
.L12:
    mov eax, 0
.L8:
    pop rbp
    ret
.LC0:
    .string "%s"
.LC1:
    .string "%d\n"
main:
    push rbp
    mov rbp, rsp
    sub rsp, 256
    lea rax, [rbp-128]
    mov rsi, rax
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rax, [rbp-256]
    mov rsi, rax
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rax, [rbp-128]
    mov rdi, rax
    call my_strlen_train
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    lea rdx, [rbp-256]
    lea rax, [rbp-128]
    mov rsi, rdx
    mov rdi, rax
    call my_strcmp_train
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov eax, 0
    leave
    ret
================================================
FILE: tests/asm_test.py
================================================
import unittest as ut

import asm2vec.asm as asm


class InstructionTest(ut.TestCase):
    """Unit tests for asm.parse_instruction."""

    def test_parse_instruction(self):
        ins = asm.parse_instruction('mov eax, ebx')
        self.assertEqual('mov', ins.op(), 'Operators not equal')
        self.assertListEqual(['eax', 'ebx'], ins.args(), 'Operands not equal')

    def test_parse_instruction_one_operand(self):
        ins = asm.parse_instruction('inc eax')
        self.assertEqual('inc', ins.op(), 'Operators not equal')
        self.assertListEqual(['eax'], ins.args(), 'Operands not equal')

    def test_parse_instruction_no_operands(self):
        ins = asm.parse_instruction('ret')
        self.assertEqual('ret', ins.op(), 'Operators not equal')
        self.assertListEqual([], ins.args(), 'Operands not equal')


class BasicBlockTest(ut.TestCase):
    pass


class FunctionTest(ut.TestCase):
    pass
================================================
FILE: tests/parse_test.py
================================================
import unittest as ut

import asm2vec.parse

test_asm = """
my_strlen:
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-24], rdi
    mov rax, QWORD PTR [rbp-24]
    mov QWORD PTR [rbp-8], rax
    jmp .L2
.L3:
    add QWORD PTR [rbp-8], 1
.L2:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    jne .L3
    mov rax, QWORD PTR [rbp-8]
    sub rax, QWORD PTR [rbp-24]
    pop rbp
    ret
.LC0:
    .string "%s"
.LC1:
    .string "%d\\n"
main:
    push rbp
    mov rbp, rsp
    add rsp, -128
    lea rax, [rbp-128]
    mov rsi, rax
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rax, [rbp-128]
    mov rdi, rax
    call my_strlen
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov eax, 0
    leave
    ret
"""


class ParseTest(ut.TestCase):
    """End-to-end test of parse_text including caller/callee linkage."""

    def test_parse_text(self):
        funcs = asm2vec.parse.parse_text(test_asm, func_names=['main', 'my_strlen'])
        self.assertEqual(2, len(funcs))
        self.assertEqual({'main', 'my_strlen'}, set(map(lambda f: f.name(), funcs)))

        funcs = dict(map(lambda f: (f.name(), f), funcs))
        # NOTE(review): asm2vec.asm is used below for annotations but only
        # asm2vec.parse is imported; verify asm2vec.parse re-exposes it.
        main_func: asm2vec.asm.Function = funcs['main']
        my_strlen_func: asm2vec.asm.Function = funcs['my_strlen']
        self.assertListEqual(['my_strlen'], list(map(lambda f: f.name(), main_func.callees())))
        self.assertListEqual(['main'], list(map(lambda f: f.name(), my_strlen_func.callers())))
================================================
FILE: tests/utilities_test.py
================================================
import unittest as ut

import asm2vec.internal.util as utilities


class PermutationTest(ut.TestCase):
    """Tests for permute/inverse_permute.

    NOTE(review): the asm2vec/internal/util.py shown in this snapshot defines
    only make_small_ndarray -- permute and inverse_permute are not visible.
    Confirm these helpers exist before relying on this suite.
    """

    def test_permute(self):
        v = [10, 20, 30, 40, 50]
        p = [2, 4, 1, 0, 3]
        pv = utilities.permute(v, p)
        self.assertListEqual([30, 50, 20, 10, 40], pv, 'Permutated vectors not equal.')

    def test_inv_permute(self):
        v = [30, 50, 20, 10, 40]
        p = [2, 4, 1, 0, 3]
        pv = utilities.inverse_permute(v, p)
        self.assertListEqual([10, 20, 30, 40, 50], pv, 'Inverse permutated vectors not equal.')