Repository: Lancern/asm2vec
Branch: master
Commit: d38a3bc3bc9c
Files: 21
Total size: 56.6 KB
Directory structure:
gitextract_5xp0becm/
├── .gitignore
├── README.md
├── asm2vec/
│ ├── __init__.py
│ ├── asm.py
│ ├── internal/
│ │ ├── __init__.py
│ │ ├── atomic.py
│ │ ├── parse.py
│ │ ├── repr.py
│ │ ├── sampling.py
│ │ ├── training.py
│ │ └── util.py
│ ├── logging.py
│ ├── model.py
│ ├── parse.py
│ └── repo.py
├── examples/
│ ├── estimating.s
│ ├── training-estimating.py
│ └── training.s
└── tests/
├── asm_test.py
├── parse_test.py
└── utilities_test.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Created by https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python
# Edit at https://www.toptal.com/developers/gitignore?templates=macos,intellij,virtualenv,python
### Intellij ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Intellij Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# profiling data
.prof
### VirtualEnv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json
# End of https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python
================================================
FILE: README.md
================================================
# asm2vec
This is an unofficial implementation of the `asm2vec` model as a standalone python package. The details of the model can be found in the original paper: [(sp'19) Asm2Vec: Boosting Static Representation Robustness for Binary Clone Search against Code Obfuscation and Compiler Optimization](https://www.computer.org/csdl/proceedings-article/sp/2019/666000a038/19skfc3ZfKo)
## Requirements
This implementation is written in python 3.7 and it's recommended to use python 3.7+ as well. The only dependency of this package is `numpy` which can be installed as follows:
```shell
python3 -m pip install numpy
```
## How to use
### Import
To install the package, execute the following commands:
```shell
git clone https://github.com/lancern/asm2vec.git
```
Add the following line to the `.bashrc` file to add `asm2vec` to your python interpreter's search path for external packages:
```shell
export PYTHONPATH="path/to/asm2vec:$PYTHONPATH"
```
Replace `path/to/asm2vec` with the directory you clone `asm2vec` into. Then execute the following commands to update `PYTHONPATH`:
```shell
source ~/.bashrc
```
You can also add the following code snippet to the Python source code that uses `asm2vec`, to help the interpreter locate the package:
```python
import sys
sys.path.append('path/to/asm2vec')
```
In your python code, use the following `import` statement to import this package:
```python
import asm2vec.<module-name>
```
### Define CFGs And Training
You have 2 approaches to define the binary program that will be sent to the `asm2vec` model. The first approach is to build the CFG manually, as shown below:
```python
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import parse_instruction
block1 = BasicBlock()
block1.add_instruction(parse_instruction('mov eax, ebx'))
block1.add_instruction(parse_instruction('jmp _loc'))
block2 = BasicBlock()
block2.add_instruction(parse_instruction('xor eax, eax'))
block2.add_instruction(parse_instruction('ret'))
block1.add_successor(block2)
block3 = BasicBlock()
block3.add_instruction(parse_instruction('sub eax, [ebp]'))
f1 = Function(block1, 'some_func')
f2 = Function(block3, 'another_func')
# block4 is ignored here for clarity
f3 = Function(block4, 'estimate_func')
```
And then you can train a model with the following code:
```python
from asm2vec.model import Asm2Vec
model = Asm2Vec(d=200)
train_repo = model.make_function_repo([f1, f2, f3])
model.train(train_repo)
```
The second approach is using the `parse` module provided by `asm2vec` to build CFGs automatically from an assembly code source file:
```python
from asm2vec.parse import parse_fp
with open('source.asm', 'r') as fp:
funcs = parse_fp(fp)
```
And then you can train a model with the following code:
```python
from asm2vec.model import Asm2Vec
model = Asm2Vec(d=200)
train_repo = model.make_function_repo(funcs)
model.train(train_repo)
```
### Estimation
You can use the `asm2vec.model.Asm2Vec.to_vec` method to convert a function into its vector representation.
### Serialization
The implementation supports serialization of many of its internal data structures, so that you can serialize the internal state of a trained model to disk for future use.
You can serialize two data structures to primitive data: the function repository and the model memento.
> To be finished.
## Hyper Parameters
The constructor of `asm2vec.model.Asm2Vec` class accepts some keyword arguments as hyper parameters of the model. The following table lists all the hyper parameters available:
| Parameter Name | Type | Meaning | Default Value |
| ----------------------- | ------- | ------------------------------------------------------------------------------------------------------ | ------------- |
| `d`                     | `int`   | The dimension of the vectors for tokens.                                                               | `200`         |
| `alpha`                 | `float` | The initial learning rate.                                                                             | `0.0025`      |
| `alpha_update_interval` | `int` | How many tokens can be processed before changing the learning rate? | `10000` |
| `rnd_walks` | `int` | How many random walks to perform to sequentialize a function? | `3` |
| `neg_samples` | `int` | How many samples to take during negative sampling? | `25` |
| `iteration` | `int` | How many iterations to perform? (This parameter is reserved for future use and is not implemented now) | `1` |
| `jobs` | `int` | How many tasks to execute concurrently during training? | `4` |
## Notes
For simplicity, Selective Callee Expansion is not implemented in this early version. You have to expand callees manually before sending the CFG into `asm2vec`.
================================================
FILE: asm2vec/__init__.py
================================================
# Public sub-modules exposed by the `asm2vec` package.
__all__ = ['asm', 'model', 'parse']
================================================
FILE: asm2vec/asm.py
================================================
from typing import *
class Instruction:
    """A single assembly instruction: an operator plus its operand strings."""

    def __init__(self, op: str, *args: str):
        self._op = op
        self._args = list(args)

    def op(self) -> str:
        """Return the operator mnemonic."""
        return self._op

    def number_of_args(self) -> int:
        """Return how many operands this instruction carries."""
        return len(self._args)

    def args(self) -> List[str]:
        """Return the list of operand strings."""
        return self._args


def parse_instruction(code: str) -> Instruction:
    """Parse `op arg1, arg2, ...` text into an Instruction.

    A line without a space is treated as an operator with no operands.
    """
    head, sep, tail = code.partition(' ')
    if not sep:
        return Instruction(head)
    operands = [piece.strip() for piece in tail.split(',')]
    return Instruction(head, *operands)
class BasicBlock:
    """A straight-line run of instructions plus CFG edges to neighbouring blocks."""

    _next_unused_id: int = 1

    def __init__(self):
        # Hand out a process-wide unique identifier for this block.
        cls = type(self)
        self._id = cls._next_unused_id
        cls._next_unused_id += 1
        self._instructions = []
        self._predecessors = []
        self._successors = []

    def __iter__(self):
        return iter(self._instructions)

    def __len__(self):
        return len(self._instructions)

    def __hash__(self):
        return hash(self._id)

    def __eq__(self, other):
        return isinstance(other, BasicBlock) and self._id == other.id()

    def __ne__(self, other):
        return not self.__eq__(other)

    def id(self) -> int:
        """Unique integer identifier of this block."""
        return self._id

    def add_instruction(self, instr: Instruction) -> None:
        """Append one instruction at the end of the block."""
        self._instructions.append(instr)

    def body_instructions(self) -> List[Instruction]:
        """All instructions except the final (terminator) one."""
        return self._instructions[:-1]

    def instructions(self) -> List[Instruction]:
        """The full instruction list."""
        return self._instructions

    def add_predecessor(self, predecessor: 'BasicBlock') -> None:
        """Add an incoming edge, keeping both endpoints consistent."""
        self._predecessors.append(predecessor)
        predecessor._successors.append(self)

    def add_successor(self, successor: 'BasicBlock') -> None:
        """Add an outgoing edge, keeping both endpoints consistent."""
        self._successors.append(successor)
        successor._predecessors.append(self)

    def first_instruction(self) -> Instruction:
        return self._instructions[0]

    def last_instruction(self) -> Instruction:
        return self._instructions[-1]

    def predecessors(self) -> List['BasicBlock']:
        return self._predecessors

    def in_degree(self) -> int:
        """Number of incoming CFG edges."""
        return len(self._predecessors)

    def successors(self) -> List['BasicBlock']:
        return self._successors

    def out_degree(self) -> int:
        """Number of outgoing CFG edges."""
        return len(self._successors)


class CFGWalkerCallback:
    """Callable visitor with enter/exit hooks for CFG traversal."""

    def __call__(self, *args, **kwargs):
        self.on_enter(*args)

    def on_enter(self, block: BasicBlock) -> None:
        pass

    def on_exit(self, block: BasicBlock) -> None:
        pass


CFGWalkerCallbackType = Union[CFGWalkerCallback, Callable[[BasicBlock], Any]]


def _walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType, visited: Set) -> None:
    """Depth-first pre-order visit; `on_exit` fires after a node's subtree."""
    if entry.id() in visited:
        return
    visited.add(entry.id())
    action(entry)
    for nxt in entry.successors():
        _walk_cfg(nxt, action, visited)
    # Only the rich callback interface gets the post-order notification.
    if isinstance(action, CFGWalkerCallback):
        action.on_exit(entry)


def walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType) -> None:
    """Invoke `action` once per block reachable from `entry` (cycle-safe)."""
    _walk_cfg(entry, action, set())
class Function:
    """A named CFG rooted at an entry block, plus its call-graph edges."""

    _next_unused_id = 1

    def __init__(self, entry: BasicBlock, name: str = None):
        # Allocate a process-wide unique ID for this function.
        cls = type(self)
        self._id = cls._next_unused_id
        cls._next_unused_id += 1
        self._entry = entry
        self._name = name
        self._callees = []  # functions this function calls
        self._callers = []  # functions that call this function

    def __len__(self) -> int:
        """Total number of instructions reachable from the entry block."""
        total = 0

        def accumulate(block: BasicBlock) -> None:
            nonlocal total
            total += len(block)

        walk_cfg(self._entry, accumulate)
        return total

    def __hash__(self):
        return self._id

    def __eq__(self, other):
        return isinstance(other, Function) and self._id == other.id()

    def __ne__(self, other):
        return not self.__eq__(other)

    def id(self) -> int:
        """Unique integer identifier of this function."""
        return self._id

    def entry(self) -> BasicBlock:
        """Entry basic block of the CFG."""
        return self._entry

    def name(self) -> str:
        """Function name (may be None for anonymous functions)."""
        return self._name

    def add_callee(self, f: 'Function') -> None:
        """Record that this function calls `f`, updating both sides."""
        self._callees.append(f)
        f._callers.append(self)

    def callees(self) -> List['Function']:
        return self._callees

    def out_degree(self) -> int:
        """Number of distinct recorded callee edges."""
        return len(self._callees)

    def add_caller(self, f: 'Function') -> None:
        """Record that `f` calls this function, updating both sides."""
        self._callers.append(f)
        f._callees.append(self)

    def callers(self) -> List['Function']:
        return self._callers

    def in_degree(self) -> int:
        """Number of distinct recorded caller edges."""
        return len(self._callers)
================================================
FILE: asm2vec/internal/__init__.py
================================================
================================================
FILE: asm2vec/internal/atomic.py
================================================
from typing import *
import threading
class LockContextManager:
    """Context manager that holds `lock` for the duration of a `with` block.

    The manager is reusable: each `__enter__` re-acquires the lock and clears
    the exited flag, so `exited()` always describes the *most recent* cycle.
    (The original never reset the flag, so a reused manager permanently
    reported itself as exited, and `__enter__` returned None which broke
    `with manager as m:` usage.)
    """

    def __init__(self, lock: threading.Lock):
        self._lock = lock
        self._exited = False

    def __enter__(self):
        # Reset the flag so a reused manager reports its current cycle.
        self._exited = False
        self._lock.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._exited = True
        self._lock.release()

    def exited(self) -> bool:
        """True once the surrounding `with` block has exited."""
        return self._exited
class Atomic:
    """Mutex-guarded wrapper around a single mutable value."""

    class AtomicContextManager(LockContextManager):
        """Grants exclusive read/write access while the lock is held."""

        def __init__(self, atomic: 'Atomic'):
            super().__init__(atomic._lock)
            self._atomic = atomic
            self._exited = False

        def __enter__(self):
            super().__enter__()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            super().__exit__(exc_type, exc_val, exc_tb)

        def value(self) -> Any:
            # Accessing the value after release would be a data race.
            if self.exited():
                raise RuntimeError('Trying to access AtomicContextManager after its exit.')
            return self._atomic._val

        def set(self, value: Any) -> None:
            if self.exited():
                raise RuntimeError('Trying to access AtomicContextManager after its exit.')
            self._atomic._val = value

    def __init__(self, value: Any):
        self._val = value
        self._lock = threading.Lock()

    def lock(self) -> AtomicContextManager:
        """Return a manager that acquires exclusive access when entered."""
        return type(self).AtomicContextManager(self)

    def value(self) -> Any:
        """Atomically read the wrapped value."""
        with self.lock() as guard:
            return guard.value()
================================================
FILE: asm2vec/internal/parse.py
================================================
from typing import *
import logging
import asm2vec.asm
class AssemblySyntaxError(Exception):
    """Raised when the assembly source violates the expected grammar."""

    def __init__(self, message: str = None):
        # Forward the message to Exception so str(exc) and tracebacks show it;
        # the original dropped it, making raised errors print empty.
        super().__init__(message if message is not None else '')
        self._msg = message

    def message(self) -> str:
        """Return the human-readable description (may be None)."""
        return self._msg


def raise_asm_syntax_error(expect: str, found: str) -> None:
    """Raise an AssemblySyntaxError describing an expectation mismatch."""
    raise AssemblySyntaxError('Expect "{}", but "{}" was found.'.format(expect, found))
# Jump mnemonics (conditional and unconditional); these terminate basic blocks.
jmp_op = {
    'jmp', 'ja', 'jae', 'jb', 'jbe', 'jc', 'jcxz', 'jecxz', 'jrcxz', 'je', 'jg', 'jge', 'jl', 'jle', 'jna',
    'jnae', 'jnb', 'jnbe', 'jnc', 'jne', 'jng', 'jnge', 'jnl', 'jnle', 'jno', 'jnp', 'jns', 'jnz', 'jo', 'jp',
    'jpe', 'jpo', 'js', 'jz'
}
# Call mnemonics; their targets become function entry candidates.
call_op = {
    'call'
}
# Return mnemonics; these also terminate basic blocks.
ret_op = {
    'ret'
}
# x86-64 register names: 8/16/32/64-bit GPRs plus segment registers.
x86_64_regs = {
    'al', 'ah', 'bl', 'bh', 'cl', 'ch', 'dl', 'dh', 'spl', 'bpl', 'sil', 'dil',
    'ax', 'bx', 'cx', 'dx', 'sp', 'bp', 'si', 'di',
    'eax', 'ebx', 'ecx', 'edx', 'esp', 'ebp', 'esi', 'edi',
    # Fixed: 'rbx' was missing and 'rdx' was listed twice.
    'rax', 'rbx', 'rcx', 'rdx', 'rsp', 'rbp', 'rsi', 'rdi',
    'r8b', 'r9b', 'r10b', 'r11b', 'r12b', 'r13b', 'r14b', 'r15b',
    'r8w', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w',
    'r8d', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d',
    'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15',
    'cs', 'ss', 'ds', 'es', 'fs', 'gs',
    'ecs', 'ess', 'eds', 'ees', 'efs', 'egs',
    'rcs', 'rss', 'rds', 'res', 'rfs', 'rgs'
}
def is_jmp(op: str) -> bool:
    """True if `op` is any (conditional or unconditional) jump mnemonic."""
    return op.lower() in jmp_op


def is_conditional_jmp(op: str) -> bool:
    """True for jump mnemonics other than the unconditional `jmp`."""
    lowered = op.lower()
    return lowered in jmp_op and lowered != 'jmp'


def is_call(op: str) -> bool:
    """True if `op` is a call mnemonic."""
    return op.lower() in call_op


def is_ret(op: str) -> bool:
    """True if `op` is a return mnemonic."""
    return op.lower() in ret_op


def is_reg(arg: str) -> bool:
    """True if `arg` names an x86-64 register."""
    return arg.lower() in x86_64_regs
class CFGBuilder:
    """Incrementally builds control-flow graphs from a parsed assembly listing.

    The parser feeds labels and instructions in source order; the builder keeps
    an "active" basic block, opens/closes blocks around jumps, rets and labels,
    and finally resolves jump/call targets in `build()`.
    """

    def __init__(self, context: 'ParseContext'):
        self._context = context
        self._blocks: List[asm2vec.asm.BasicBlock] = []
        # Index into _blocks of the block currently receiving instructions,
        # or -1 when no block is open.
        self._active_block = -1
        # Maps a label name to the index of the block it starts.
        self._block_labels: Dict[str, int] = dict()

    def _logger(self) -> logging.Logger:
        return self._context.logger().getChild(self.__class__.__name__)

    def _allocate_block(self) -> int:
        """Append a fresh empty block and return its index."""
        self._blocks.append(asm2vec.asm.BasicBlock())
        return len(self._blocks) - 1

    def _allocate_named_block(self, name: str) -> int:
        """Return the block bound to `name`, allocating one if necessary."""
        if name in self._block_labels:
            return self._block_labels[name]
        else:
            idx = self._allocate_block()
            self._block_labels[name] = idx
            return idx

    def _get_active_block(self) -> asm2vec.asm.BasicBlock:
        return self._blocks[self._active_block]

    def _set_active_block(self, block_id: int) -> None:
        self._active_block = block_id

    def _has_active_block(self) -> bool:
        return self._active_block != -1

    def _close_active_block(self) -> None:
        self._active_block = -1

    def _add_jmp(self, op: str, args: List[str]) -> None:
        """Close the active block at a jump; for conditional jumps open a
        fall-through successor block."""
        if len(args) != 1:
            raise_asm_syntax_error('Jump with single operand', '{} operands'.format(len(args)))
        cur_block = self._get_active_block()
        self._close_active_block()
        if is_conditional_jmp(op):
            # Allocate another basic block for more instructions since the current code point is reachable.
            # This may produce some empty basic blocks in the final output.
            self._set_active_block(self._allocate_block())
            self._get_active_block().add_predecessor(cur_block)

    def add_instr(self, op: str, args: List[str]) -> None:
        """Append one instruction, opening/closing basic blocks as needed."""
        if not self._has_active_block():
            # Allocate a new basic block.
            self._set_active_block(self._allocate_block())
        self._get_active_block().add_instruction(asm2vec.asm.Instruction(op, *args))
        if is_jmp(op):
            self._add_jmp(op, args)
        elif is_ret(op):
            # `ret` instruction encountered. Close current active block.
            self._close_active_block()

    def set_label(self, label: str) -> None:
        """Bind `label` to a block, reusing an empty active block when possible."""
        block_id = self._block_labels.get(label, -1)
        if block_id == -1:
            # Test if the current active block is empty in which case we can reuse it.
            if self._has_active_block() and len(self._get_active_block()) == 0:
                self._block_labels[label] = self._active_block
            else:
                # Open a new block for the label.
                block_id = self._allocate_block()
                self._block_labels[label] = block_id
                # Link the new block with the previously-active block.
                if self._has_active_block():
                    self._get_active_block().add_successor(self._blocks[block_id])
                self._set_active_block(block_id)
        else:
            self._set_active_block(block_id)

    def build(self) -> List[asm2vec.asm.Function]:
        """Resolve jump/call targets and return the discovered functions."""
        func_entries: Dict[str, int] = dict()
        # Walk through all instructions and fix block relations formed by jump and call instructions.
        for blk in self._blocks:
            for inst in blk:
                if is_jmp(inst.op()):
                    target = inst.args()[0]
                    if target in self._block_labels:
                        blk.add_successor(self._blocks[self._block_labels[target]])
                elif is_call(inst.op()):
                    target = inst.args()[0]
                    if target in self._block_labels and target not in func_entries:
                        func_entries[target] = self._block_labels[target]
        # Also honour functions the caller explicitly named in the options.
        for func_name in self._context.options().func_names():
            if func_name not in self._block_labels:
                # Fixed: logging uses %-style placeholders; the original passed
                # a '{}' format string so the name was never substituted.
                self._logger().warning('Cannot find function "%s"', func_name)
                continue
            if func_name not in func_entries:
                func_entries[func_name] = self._block_labels[func_name]
        funcs: Dict[str, asm2vec.asm.Function] = \
            dict(map(lambda x: (x[0], asm2vec.asm.Function(self._blocks[x[1]], x[0])), func_entries.items()))
        # Fix function call relation.
        for (name, f) in funcs.items():
            def block_action(block: asm2vec.asm.BasicBlock) -> None:
                for instr in block:
                    if is_call(instr.op()):
                        callee_name = instr.args()[0]
                        if callee_name in funcs:
                            f.add_callee(funcs[callee_name])
            asm2vec.asm.walk_cfg(f.entry(), block_action)
        # TODO: Implement Selective Callee Expansion here.
        return list(funcs.values())
class ParseOptions:
    """Options controlling assembly parsing."""

    def __init__(self, **kwargs):
        # `func_names`: labels that must be treated as function entry points
        # even if they are never called in the listing.
        self._func_names = kwargs.get('func_names', [])

    def func_names(self) -> List[str]:
        """Names of the functions the caller explicitly requested."""
        return self._func_names
class ParseContext:
    """Bundles the CFG builder, options and logger for one parsing session."""

    def __init__(self, **kwargs):
        self._builder = CFGBuilder(self)
        self._options = ParseOptions(**kwargs)
        self._logger = logging.getLogger('asm2vec.ParseContext')

    def logger(self) -> logging.Logger:
        """Logger shared by the parsing components."""
        return self._logger

    def options(self) -> ParseOptions:
        """Options supplied at construction."""
        return self._options

    def builder(self) -> CFGBuilder:
        """The CFG builder accumulating parse results."""
        return self._builder
'''
Parser rules for input assembly file:
program
: asm_line*
;
asm_line
: asm_label '\n'
| BLANKS asm_instr '\n'
;
asm_label
: ASM_LABEL_ID ':'
;
asm_instr
: ASM_INSTR_OP ' ' asm_instr_arg_list
;
asm_instr_arg_list
: ASM_INSTR_ARG (',' asm_instr_arg_list)?
| /* epsilon */
;
BLANKS : [ \n\t]+;
'''
def is_fullmatch(pattern, s: str) -> bool:
    """True if the compiled regex `pattern` matches `s` in its entirety."""
    match = pattern.fullmatch(s)
    return match is not None
def parse_asm_label(ln: str, context: ParseContext) -> None:
    """Parse an `identifier:` line and register the label with the builder."""
    text = ln.strip()
    if text[-1] != ':':
        raise_asm_syntax_error('asm_label', ln)
    # Drop the trailing colon to get the bare label name.
    context.builder().set_label(text[:-1])
def parse_asm_instr(ln: str, context: ParseContext) -> None:
    """Split `op arg1, arg2, ...` text and feed it to the CFG builder."""
    space_at = ln.find(' ')
    if space_at == -1:
        op, operands = ln, []
    else:
        op = ln[:space_at]
        operands = [piece.strip() for piece in ln[space_at + 1:].split(',')]
    context.builder().add_instr(op, operands)
def parse_asm_line(ln: str, context: ParseContext) -> None:
    """Dispatch one source line to the instruction or label parser."""
    if not ln.strip():
        return  # skip blank lines
    if ln[0].isspace():
        # Indented lines hold instructions (asm_line -> BLANKS asm_instr '\n').
        parse_asm_instr(ln.strip(), context)
    else:
        # Unindented lines hold labels (asm_line -> asm_label '\n').
        parse_asm_label(ln, context)
def parse_asm_lines(lines: Iterable[str], **kwargs) -> List[asm2vec.asm.Function]:
    """Parse an iterable of assembly source lines into Function CFGs."""
    context = ParseContext(**kwargs)
    for line in lines:
        parse_asm_line(line, context)
    return context.builder().build()
================================================
FILE: asm2vec/internal/repr.py
================================================
import random
from typing import *
import concurrent.futures
from asm2vec.asm import Instruction
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import walk_cfg
from asm2vec.repo import SequentialFunction
from asm2vec.repo import VectorizedFunction
from asm2vec.repo import VectorizedToken
from asm2vec.repo import Token
from asm2vec.repo import FunctionRepository
from asm2vec.logging import asm2vec_logger
from asm2vec.internal.atomic import Atomic
def _random_walk(f: Function) -> List[Instruction]:
visited: Set[int] = set()
current = f.entry()
seq: List[Instruction] = []
while current.id() not in visited:
visited.add(current.id())
for instr in current:
seq.append(instr)
if len(current.successors()) == 0:
break
current = random.choice(current.successors())
return seq
def _edge_sampling(f: Function) -> List[List[Instruction]]:
    """Sample instruction sequences until every distinct CFG edge is covered.

    Each sample concatenates the instructions of a randomly chosen edge's two
    endpoint blocks. Sampling repeats (with replacement) until every distinct
    (source, destination) pair has been drawn at least once.
    """
    edges: List[Tuple[BasicBlock, BasicBlock]] = []

    def collect_edges(block: BasicBlock) -> None:
        nonlocal edges
        for successor in block.successors():
            edges.append((block, successor))

    walk_cfg(f.entry(), collect_edges)

    # Fixed: compare coverage against the number of *distinct* edges. The CFG
    # may contain duplicate (src, dst) pairs, and comparing the visited set
    # against len(edges) would then never terminate.
    distinct_edges = {(src.id(), dst.id()) for (src, dst) in edges}
    visited_edges: Set[Tuple[int, int]] = set()
    sequences = []
    while len(visited_edges) < len(distinct_edges):
        e = random.choice(edges)
        visited_edges.add((e[0].id(), e[1].id()))
        sequences.append(list(e[0]) + list(e[1]))
    return sequences
def make_sequential_function(f: Function, num_of_random_walks: int = 10) -> SequentialFunction:
    """Sequentialize `f` by sampling `num_of_random_walks` random CFG walks."""
    walks: List[List[Instruction]] = [_random_walk(f) for _ in range(num_of_random_walks)]
    # NOTE: edge sampling (_edge_sampling) is intentionally disabled for now.
    return SequentialFunction(f.id(), f.name(), walks)
def _get_function_tokens(f: Function, dim: int = 200) -> List[VectorizedToken]:
    """Collect one VectorizedToken per operator/operand occurrence in `f`."""
    tokens: List[VectorizedToken] = []

    def collect(block: BasicBlock) -> None:
        for instruction in block:
            for name in [instruction.op()] + instruction.args():
                tokens.append(VectorizedToken(name, None, None, dim))

    walk_cfg(f.entry(), collect)
    return tokens
def _make_function_repo_helper(vocab: Dict[str, Token], funcs: List[Function],
                               dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository:
    """Vectorize `funcs` concurrently and merge their tokens into `vocab`.

    Args:
        vocab: Token table (name -> Token) that is updated in place.
        funcs: Functions to sequentialize and vectorize.
        dim: Token vector dimension (function vectors use 2 * dim).
        num_of_rnd_walks: Random walks per function during sequentialization.
        jobs: Number of worker threads.

    Returns:
        A FunctionRepository whose per-token frequencies have been recomputed.

    Raises:
        RuntimeError: if any worker task failed or was cancelled.
    """
    progress = Atomic(1)
    vec_funcs_atomic = Atomic([])
    vocab_atomic = Atomic(vocab)

    def func_handler(f: Function):
        with vec_funcs_atomic.lock() as vfa:
            vfa.value().append(VectorizedFunction(make_sequential_function(f, num_of_rnd_walks), dim=dim*2))

        tokens = _get_function_tokens(f, dim)
        for tk in tokens:
            with vocab_atomic.lock() as va:
                if tk.name() in va.value():
                    va.value()[tk.name()].count += 1
                else:
                    va.value()[tk.name()] = Token(tk)

        asm2vec_logger().debug('Sequence generated for function "%s", progress: %f%%',
                               f.name(), progress.value() / len(funcs) * 100)
        with progress.lock() as prog:
            prog.set(prog.value() + 1)

    # Fixed: use the executor as a context manager so its worker threads are
    # always joined; the original never shut the pool down (thread leak).
    with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as executor:
        fs = [executor.submit(func_handler, fn) for fn in funcs]
        done, not_done = concurrent.futures.wait(fs, return_when=concurrent.futures.FIRST_EXCEPTION)
    if len(not_done) > 0 or any(map(lambda fut: fut.cancelled() or not fut.done(), done)):
        raise RuntimeError('Not all tasks finished successfully.')

    vec_funcs = vec_funcs_atomic.value()
    repo = FunctionRepository(vec_funcs, vocab)

    # Re-calculate the frequency of each token.
    for t in repo.vocab().values():
        t.frequency = t.count / repo.num_of_tokens()

    return repo
def make_function_repo(funcs: List[Function], dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository:
    """Build a repository with a fresh (empty) vocabulary from `funcs`."""
    empty_vocab: Dict[str, Token] = {}
    return _make_function_repo_helper(empty_vocab, funcs, dim, num_of_rnd_walks, jobs)
def make_estimate_repo(vocabulary: Dict[str, Token], f: Function,
                       dim: int, num_of_rnd_walks: int) -> FunctionRepository:
    """Build a single-function repository for estimation.

    Works on a shallow copy of `vocabulary` so estimation-time updates cannot
    disturb the trained repository.
    """
    vocab: Dict[str, Token] = dict(vocabulary)
    return _make_function_repo_helper(vocab, [f], dim, num_of_rnd_walks, 1)
================================================
FILE: asm2vec/internal/sampling.py
================================================
from typing import *
import random
T = TypeVar('T')
class NegativeSampler:
    """Draws samples with probability proportional to weight ** alpha."""

    def __init__(self, distribution: List[Tuple[T, float]], alpha: float = 3 / 4):
        # Pre-compute the smoothed weights once; random.choices reuses them.
        self._values = [value for value, _ in distribution]
        self._weights = [weight ** alpha for _, weight in distribution]

    def sample(self, k: int) -> List[T]:
        """Return `k` values drawn with replacement from the distribution."""
        return random.choices(self._values, self._weights, k=k)
================================================
FILE: asm2vec/internal/training.py
================================================
from typing import *
import math
import threading
import concurrent.futures
import numpy as np
from asm2vec.asm import Instruction
from asm2vec.internal.repr import FunctionRepository
from asm2vec.internal.repr import VectorizedFunction
from asm2vec.internal.repr import Token
from asm2vec.internal.repr import VectorizedToken
from asm2vec.internal.sampling import NegativeSampler
from asm2vec.internal.atomic import LockContextManager
from asm2vec.internal.atomic import Atomic
from asm2vec.logging import asm2vec_logger
class Asm2VecParams:
    """Hyper-parameters of the Asm2Vec model.

    Keyword arguments accepted by the constructor mirror the keys produced by
    `to_dict()` and consumed (as bytes keys) by `populate()`.
    """

    def __init__(self, **kwargs):
        self.d: int = kwargs.get('d', 200)  # token vector dimension
        self.initial_alpha: float = kwargs.get('alpha', 0.0025)  # starting learning rate
        self.alpha_update_interval: int = kwargs.get('alpha_update_interval', 10000)
        self.num_of_rnd_walks: int = kwargs.get('rnd_walks', 3)  # walks per function
        self.neg_samples: int = kwargs.get('neg_samples', 25)  # negative samples per token
        self.iteration: int = kwargs.get('iteration', 1)  # reserved, not used yet
        self.jobs: int = kwargs.get('jobs', 4)  # worker thread count

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the parameters to a plain dict with string keys."""
        return {
            'd': self.d,
            'alpha': self.initial_alpha,
            'alpha_update_interval': self.alpha_update_interval,
            # Fixed: was 'num_of_rnd_walks', which neither the constructor nor
            # populate() reads back, silently losing the value on round-trip.
            'rnd_walks': self.num_of_rnd_walks,
            'neg_samples': self.neg_samples,
            'iteration': self.iteration,
            'jobs': self.jobs
        }

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Load parameters from a dict keyed by bytes (e.g. msgpack output)."""
        self.d: int = rep.get(b'd', 200)
        self.initial_alpha: float = rep.get(b'alpha', 0.0025)
        self.alpha_update_interval: int = rep.get(b'alpha_update_interval', 10000)
        self.num_of_rnd_walks: int = rep.get(b'rnd_walks', 3)
        self.neg_samples: int = rep.get(b'neg_samples', 25)
        self.iteration: int = rep.get(b'iteration', 1)
        self.jobs: int = rep.get(b'jobs', 4)
class SequenceWindow:
    """Sliding three-instruction window (prev, current, next) over a sequence.

    Tokens are resolved through `vocabulary` so callers get the vectorized
    representation of each operator/operand of the three instructions.
    """

    def __init__(self, sequence: List[Instruction], vocabulary: Dict[str, Token]):
        self._seq = sequence
        self._vocab = vocabulary
        # Index of the *current* instruction; starts at 1 so that a previous
        # neighbour always exists once move_next() succeeds.
        self._i = 1
        # Cached views of the three instructions and their tokens; populated
        # by move_next(), None until the first successful call.
        self._prev_ins = None
        self._curr_ins = None
        self._next_ins = None
        self._prev_ins_op = None
        self._prev_ins_args = None
        self._curr_ins_op = None
        self._curr_ins_args = None
        self._next_ins_op = None
        self._next_ins_args = None

    def move_next(self) -> bool:
        """Advance the window by one instruction.

        Returns False once fewer than one instruction remains on each side of
        the current position (sequences shorter than 3 never yield a window);
        otherwise refreshes all cached instruction/token views and returns True.
        """
        if self._i >= len(self._seq) - 1:
            return False

        def token_lookup(name) -> VectorizedToken:
            # Raises KeyError if `name` is absent from the vocabulary.
            return self._vocab[name].vectorized()

        self._prev_ins = self._seq[self._i - 1]
        self._curr_ins = self._seq[self._i]
        self._next_ins = self._seq[self._i + 1]
        self._prev_ins_op = token_lookup(self._prev_ins.op())
        self._prev_ins_args = list(map(token_lookup, self._prev_ins.args()))
        self._curr_ins_op = token_lookup(self._curr_ins.op())
        self._curr_ins_args = list(map(token_lookup, self._curr_ins.args()))
        self._next_ins_op = token_lookup(self._next_ins.op())
        self._next_ins_args = list(map(token_lookup, self._next_ins.args()))
        self._i += 1
        return True

    def prev_ins(self) -> Instruction:
        """The instruction before the current one."""
        return self._prev_ins

    def prev_ins_op(self) -> VectorizedToken:
        """Vectorized operator token of the previous instruction."""
        return self._prev_ins_op

    def prev_ins_args(self) -> List[VectorizedToken]:
        """Vectorized operand tokens of the previous instruction."""
        return self._prev_ins_args

    def curr_ins(self) -> Instruction:
        """The current (center) instruction."""
        return self._curr_ins

    def curr_ins_op(self) -> VectorizedToken:
        """Vectorized operator token of the current instruction."""
        return self._curr_ins_op

    def curr_ins_args(self) -> List[VectorizedToken]:
        """Vectorized operand tokens of the current instruction."""
        return self._curr_ins_args

    def next_ins(self) -> Instruction:
        """The instruction after the current one."""
        return self._next_ins

    def next_ins_op(self) -> VectorizedToken:
        """Vectorized operator token of the next instruction."""
        return self._next_ins_op

    def next_ins_args(self) -> List[VectorizedToken]:
        """Vectorized operand tokens of the next instruction."""
        return self._next_ins_args
class TrainingContext:
    """Shared, lock-protected state for one training (or estimation) run."""

    class Counter:
        """Integer counter whose operations are guarded by the context lock."""

        def __init__(self, context: 'TrainingContext', name: str, initial: int = 0):
            self._context = context
            self._name = name
            self._val = initial

        def val(self) -> int:
            """Read the current value (thread-safe)."""
            with self._context.lock():
                return self._val

        def inc(self) -> int:
            """Increment and return the new value (thread-safe)."""
            with self._context.lock():
                self._val += 1
                return self._val

        def reset(self) -> int:
            """Zero the counter, returning the value it held (thread-safe)."""
            with self._context.lock():
                v = self._val
                self._val = 0
                return v

    # Well-known counter name registered by the training loop.
    TOKENS_HANDLED_COUNTER: str = "tokens_handled"

    def __init__(self, repo: FunctionRepository, params: Asm2VecParams, is_estimating: bool = False):
        self._repo = repo
        self._params = params
        self._alpha = params.initial_alpha
        # Negative-sampling distribution is weighted by token frequency.
        self._sampler = NegativeSampler(list(map(lambda t: (t, t.frequency), repo.vocab().values())))
        self._is_estimating = is_estimating
        self._counters = dict()
        self._lock = threading.Lock()

    def repo(self) -> FunctionRepository:
        """The function repository being trained on."""
        return self._repo

    def params(self) -> Asm2VecParams:
        """Hyper-parameters for this run."""
        return self._params

    def lock(self) -> LockContextManager:
        """Context manager acquiring the shared context lock."""
        return LockContextManager(self._lock)

    def alpha(self) -> float:
        """Current learning rate (thread-safe read)."""
        with self.lock():
            return self._alpha

    def set_alpha(self, alpha: float) -> None:
        """Replace the learning rate (thread-safe write)."""
        with self.lock():
            self._alpha = alpha

    def sampler(self) -> NegativeSampler:
        """The negative sampler built from the vocabulary distribution."""
        return self._sampler

    def is_estimating(self) -> bool:
        """True when running in estimation (inference) mode."""
        return self._is_estimating

    def create_sequence_window(self, seq: List[Instruction]) -> SequenceWindow:
        """Create a sliding window over `seq` using this repo's vocabulary."""
        return SequenceWindow(seq, self._repo.vocab())

    def get_counter(self, name: str) -> Counter:
        """Look up a registered counter; returns None if absent."""
        with self.lock():
            return self._counters.get(name)

    def add_counter(self, name: str, initial: int = 0) -> Counter:
        """Create, register and return a named counter."""
        with self.lock():
            c = self.__class__.Counter(self, name, initial)
            self._counters[name] = c
            return c
def _sigmoid(x: float) -> float:
    """Logistic function: maps any real x into the open interval (0, 1)."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)
def _identity(cond: bool) -> int:
    """Indicator function: 1 for a truthy condition, 0 otherwise."""
    return int(bool(cond))
def _dot_sigmoid(lhs: np.ndarray, rhs: np.ndarray) -> float:
    """Sigmoid of the inner product of two vectors."""
    inner_product = np.dot(lhs, rhs)
    # noinspection PyTypeChecker
    return _sigmoid(inner_product)
def _get_inst_repr(op: VectorizedToken, args: List[VectorizedToken]) -> np.ndarray:
    """Build an instruction representation.

    The representation is the operator's vector concatenated with the
    element-wise mean of the operand vectors (or zeros when the
    instruction has no operands).
    """
    if args:
        operand_vec = np.average([token.v for token in args], axis=0)
    else:
        # No operands: pad with zeros of the same width as the operator vector.
        operand_vec = np.zeros(len(op.v))
    return np.hstack((op.v, operand_vec))
def _train_vectorized(wnd: SequenceWindow, f: VectorizedFunction, context: TrainingContext) -> None:
    """Run one negative-sampling training step at the window's current position.

    The tokens of the current instruction are predicted from `delta`, the
    average of the previous instruction's representation, the function vector
    and the next instruction's representation. Gradients update the token
    prediction vectors and the function vector; the neighbouring instructions'
    own token vectors are updated only when not estimating.
    """
    # Joint representation (operator ++ averaged operands) of both neighbours.
    ct_prev = _get_inst_repr(wnd.prev_ins_op(), wnd.prev_ins_args())
    ct_next = _get_inst_repr(wnd.next_ins_op(), wnd.next_ins_args())
    # Context used to predict the current instruction's tokens.
    delta = np.average([ct_prev, f.v, ct_next], axis=0)
    tokens = [wnd.curr_ins_op()] + wnd.curr_ins_args()
    f_grad = np.zeros(f.v.shape)
    for tk in tokens:
        # Negative sampling.
        sampled_tokens: Dict[str, VectorizedToken] = \
            dict(map(lambda x: (x.name(), x.vectorized()), context.sampler().sample(context.params().neg_samples)))
        if tk.name() not in sampled_tokens:
            # Ensure the positive (current) token is always part of the sample.
            sampled_tokens[tk.name()] = tk
        # The following code block tries to update the learning rate when necessary. Not required for now.
        # tokens_handled_counter = context.get_counter(TrainingContext.TOKENS_HANDLED_COUNTER)
        # if tokens_handled_counter is not None:
        #     if tokens_handled_counter.val() % context.params().alpha_update_interval == 0:
        #         # Update the learning rate.
        #         alpha = 1 - tokens_handled_counter.val() / (
        #                 context.params().iteration * context.repo().num_of_tokens() + 1)
        #         context.set_alpha(max(alpha, context.params().initial_alpha * 0.0001))
        for sp_tk in sampled_tokens.values():
            # Accumulate gradient for function vector.
            # NOTE(review): `g` and the v_pred update below use tk.v_pred rather
            # than sp_tk.v_pred; textbook negative sampling would use the sampled
            # token's prediction vector for negatives — confirm this is intended.
            g = (_dot_sigmoid(delta, tk.v_pred) - _identity(tk is sp_tk)) * context.alpha()
            # The 1/3 factor presumably mirrors delta being an average of three
            # terms, so f.v receives a third of the gradient — TODO confirm.
            f_grad += g / 3 * tk.v_pred
            if not context.is_estimating():
                with context.lock():
                    # Update v'_t
                    tk.v_pred -= g * delta
    # Apply function gradient.
    with context.lock():
        f.v -= f_grad
    if not context.is_estimating():
        # Apply gradient to instructions.
        # Per _get_inst_repr, the first half of a representation is the operator
        # vector and the second half the averaged operand vector.
        d = len(f_grad) // 2
        with context.lock():
            wnd.prev_ins_op().v -= f_grad[:d]
            if len(wnd.prev_ins_args()) > 0:
                # Spread the operand gradient evenly across the operands.
                prev_args_grad = f_grad[d:] / len(wnd.prev_ins_args())
                for t in wnd.prev_ins_args():
                    t.v -= prev_args_grad
            wnd.next_ins_op().v -= f_grad[:d]
            if len(wnd.next_ins_args()) > 0:
                next_args_grad = f_grad[d:] / len(wnd.next_ins_args())
                for t in wnd.next_ins_args():
                    t.v -= next_args_grad
def _train_sequence(f: VectorizedFunction, seq: List[Instruction], context: TrainingContext) -> None:
    """Slide a window over seq and run one training step at each position."""
    window = context.create_sequence_window(seq)
    while window.move_next():
        _train_vectorized(window, f, context)
def train(repository: FunctionRepository, params: Asm2VecParams) -> None:
    """Train function and token vectors for every function in the repository.

    Functions are trained concurrently on a thread pool of params.jobs workers.

    Args:
        repository: the repository whose functions and vocabulary to train.
        params: training hyper-parameters.

    Raises:
        RuntimeError: if any training task fails. The first task failure is
            attached as the exception's __cause__ (the original code discarded
            it, and missed failures entirely when the failing task finished last).
    """
    context = TrainingContext(repository, params)
    context.add_counter(TrainingContext.TOKENS_HANDLED_COUNTER)

    asm2vec_logger().debug('Total number of functions: %d', len(context.repo().funcs()))

    progress = Atomic(1)

    def train_function(fn: VectorizedFunction):
        # One task: train every sampled sequence of a single function.
        for seq in fn.sequential().sequences():
            _train_sequence(fn, seq, context)
        asm2vec_logger().debug('Function "%s" trained, progress: %f%%',
                               fn.sequential().name(), progress.value() / len(context.repo().funcs()) * 100)
        with progress.lock() as prog_proxy:
            prog_proxy.set(prog_proxy.value() + 1)

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=context.params().jobs)
    try:
        futures = [executor.submit(train_function, f) for f in context.repo().funcs()]
        done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
        # Abort anything still pending once a task has failed.
        for fut in not_done:
            fut.cancel()
        # Collect failures from completed tasks so the real error surfaces.
        failures = [err for err in (fut.exception() for fut in done) if err is not None]
        if failures or len(not_done) > 0:
            if failures:
                raise RuntimeError('Train failed due to one or more failed task.') from failures[0]
            raise RuntimeError('Train failed due to one or more failed task.')
    finally:
        # The original code leaked the executor; always shut it down.
        executor.shutdown(wait=False)
def estimate(f: VectorizedFunction, estimate_repo: FunctionRepository, params: Asm2VecParams) -> np.ndarray:
    """Infer a vector for f against a trained vocabulary and return f.v.

    Runs the training loop in estimation mode, so only the function vector
    is updated while token vectors stay fixed.
    """
    ctx = TrainingContext(estimate_repo, params, True)
    for sequence in f.sequential().sequences():
        _train_sequence(f, sequence, ctx)
    return f.v
================================================
FILE: asm2vec/internal/util.py
================================================
import numpy as np
def make_small_ndarray(dim: int) -> np.ndarray:
    """Return a length-dim vector of uniform random values in [-0.5/dim, 0.5/dim)."""
    generator = np.random.default_rng()
    samples = generator.random(dim)
    return (samples - 0.5) / dim
================================================
FILE: asm2vec/logging.py
================================================
import logging
def asm2vec_logger() -> logging.Logger:
    """Return the shared logger used by the asm2vec package."""
    logger = logging.getLogger('asm2vec')
    return logger
def config_asm2vec_logging(**kwargs):
    """Configure the package logger.

    Keyword arguments:
        level: logging level to set (default logging.WARNING).
        handlers: iterable of handlers to attach (default none).
        filters: iterable of filters to attach (default none).
    """
    logger = asm2vec_logger()
    logger.setLevel(kwargs.get('level', logging.WARNING))
    for handler in kwargs.get('handlers', []):
        logger.addHandler(handler)
    for log_filter in kwargs.get('filters', []):
        logger.addFilter(log_filter)
================================================
FILE: asm2vec/model.py
================================================
from typing import *
import numpy as np
import asm2vec.asm
import asm2vec.repo
import asm2vec.internal.training
import asm2vec.internal.repr
import asm2vec.internal.util
class Asm2VecMemento:
    """Snapshot of a trained model: hyper-parameters plus token vocabulary."""

    def __init__(self):
        self.params: Optional[asm2vec.internal.training.Asm2VecParams] = None
        self.vocab: Optional[Dict[str, asm2vec.repo.Token]] = None

    def serialize(self) -> Dict[str, Any]:
        """Convert the memento into serializer-friendly primitives."""
        serialized_vocab = asm2vec.repo.serialize_vocabulary(self.vocab)
        return {'params': self.params.to_dict(), 'vocab': serialized_vocab}

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Restore state from a serialized representation (bytes-keyed)."""
        self.params = asm2vec.internal.training.Asm2VecParams()
        self.params.populate(rep[b'params'])
        self.vocab = asm2vec.repo.deserialize_vocabulary(rep[b'vocab'])
class Asm2Vec:
    """User-facing facade tying together repository building, training and estimation."""

    def __init__(self, **kwargs):
        self._params = asm2vec.internal.training.Asm2VecParams(**kwargs)
        self._vocab = None

    def memento(self) -> Asm2VecMemento:
        """Capture the current parameters and vocabulary."""
        snapshot = Asm2VecMemento()
        snapshot.params = self._params
        snapshot.vocab = self._vocab
        return snapshot

    def set_memento(self, memento: Asm2VecMemento) -> None:
        """Restore parameters and vocabulary from a previously captured memento."""
        self._params = memento.params
        self._vocab = memento.vocab

    def make_function_repo(self, funcs: List[asm2vec.asm.Function]) -> asm2vec.repo.FunctionRepository:
        """Build a trainable repository from parsed functions."""
        return asm2vec.internal.repr.make_function_repo(
            funcs, self._params.d, self._params.num_of_rnd_walks, self._params.jobs)

    def train(self, repo: asm2vec.repo.FunctionRepository) -> None:
        """Train on the repository and adopt its vocabulary for later estimation."""
        asm2vec.internal.training.train(repo, self._params)
        self._vocab = repo.vocab()

    def to_vec(self, f: asm2vec.asm.Function) -> np.ndarray:
        """Estimate an embedding for an unseen function using the trained vocabulary."""
        estimate_repo = asm2vec.internal.repr.make_estimate_repo(
            self._vocab, f, self._params.d, self._params.num_of_rnd_walks)
        target = estimate_repo.funcs()[0]
        asm2vec.internal.training.estimate(target, estimate_repo, self._params)
        return target.v
================================================
FILE: asm2vec/parse.py
================================================
from typing import *
import asm2vec.asm
import asm2vec.internal.parse
from asm2vec.internal.parse import AssemblySyntaxError
def parse_text(asm: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source given as a single string."""
    lines = asm.split('\n')
    return asm2vec.internal.parse.parse_asm_lines(lines, **kwargs)
def parse_fp(fp, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly read from a file-like object that yields lines."""
    parsed_funcs = asm2vec.internal.parse.parse_asm_lines(fp, **kwargs)
    return parsed_funcs
def parse(asm_file_name: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly from the named file.

    The file is opened with an explicit UTF-8 encoding instead of the
    platform default, so parsing results do not vary across systems.
    """
    with open(asm_file_name, mode='r', encoding='utf-8') as fp:
        return parse_fp(fp, **kwargs)
================================================
FILE: asm2vec/repo.py
================================================
from typing import *
import numpy as np
import asm2vec.asm
import asm2vec.internal.util
class SequentialFunction:
    """A function flattened into linear instruction sequences, keyed by id and name."""

    def __init__(self, fid: int, name: str, sequences: List[List[asm2vec.asm.Instruction]]):
        self._fid = fid
        self._func_name = name
        self._sequences = sequences

    def id(self) -> int:
        """Identifier of the original function."""
        return self._fid

    def name(self) -> str:
        """Name of the original function."""
        return self._func_name

    def sequences(self) -> List[List[asm2vec.asm.Instruction]]:
        """The instruction sequences belonging to this function."""
        return self._sequences
class VectorizedFunction:
    """Pairs a SequentialFunction with its trainable embedding vector."""

    def __init__(self, f: SequentialFunction, v: np.ndarray = None, dim: int = 400):
        self._func = f
        # Start from a small random vector when no vector is supplied.
        self.v = asm2vec.internal.util.make_small_ndarray(dim) if v is None else v

    def sequential(self) -> SequentialFunction:
        return self._func
class VectorizedToken:
    """A token (operator or operand name) together with its learned vectors.

    Equality is by token name only. `v` is the token embedding (zero
    initialized) and `v_pred` the prediction-side vector (small random,
    twice the embedding dimension).
    """

    def __init__(self, name: str, v: np.ndarray = None, v_pred: np.ndarray = None, dim: int = 200):
        self._name = name
        self.v = v if v is not None else np.zeros(dim)
        self.v_pred = v_pred if v_pred is not None else asm2vec.internal.util.make_small_ndarray(dim * 2)

    def __eq__(self, other):
        if not isinstance(other, VectorizedToken):
            return False
        return self._name == other._name

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ alone would set __hash__ to None and make instances
        # unhashable; hash by name to stay consistent with equality.
        return hash(self._name)

    def name(self) -> str:
        return self._name
class Token:
    """Vocabulary entry: a vectorized token plus its occurrence statistics."""

    def __init__(self, vt: VectorizedToken, count: int = 1):
        self._vectorized_token = vt
        self.count: int = count  # number of occurrences observed
        self.frequency: float = 0  # relative frequency, filled in externally

    def vectorized(self) -> VectorizedToken:
        return self._vectorized_token

    def name(self) -> str:
        return self._vectorized_token.name()
class FunctionRepository:
    """Holds the vectorized functions and the shared token vocabulary."""

    def __init__(self, funcs: List[VectorizedFunction], vocab: Dict[str, Token]):
        self._funcs = funcs
        self._vocab = vocab
        # Total token occurrences across the vocabulary, cached once.
        self._num_of_tokens = sum(token.count for token in vocab.values())

    def funcs(self) -> List[VectorizedFunction]:
        return self._funcs

    def vocab(self) -> Dict[str, Token]:
        return self._vocab

    def num_of_tokens(self) -> int:
        """Total number of token occurrences recorded in the vocabulary."""
        return self._num_of_tokens
def _serialize_token(token: Token) -> Dict[str, Any]:
    """Convert a Token into a dict of serializer-friendly primitives."""
    vectorized = token.vectorized()
    return {
        'name': token.name(),
        'v': list(vectorized.v),
        'v_pred': list(vectorized.v_pred),
        'count': token.count,
        'frequency': token.frequency,
    }
def _deserialize_token(rep: Dict[bytes, Any]) -> Token:
    """Rebuild a Token from its serialized, bytes-keyed representation."""
    vectorized = VectorizedToken(
        rep[b'name'].decode('utf-8'),
        np.array(rep[b'v']),
        np.array(rep[b'v_pred']))
    token = Token(vectorized)
    token.count = rep[b'count']
    token.frequency = rep[b'frequency']
    return token
def serialize_vocabulary(vocab: Dict[str, Token]) -> Dict[str, Any]:
    """Serialize every token in the vocabulary, keyed by token name."""
    return {name: _serialize_token(token) for name, token in vocab.items()}
def deserialize_vocabulary(rep: Dict[bytes, Any]) -> Dict[str, Token]:
    """Deserialize a vocabulary whose keys arrive as UTF-8 byte strings."""
    return {key.decode('utf-8'): _deserialize_token(value) for key, value in rep.items()}
def _serialize_sequence(seq: List[asm2vec.asm.Instruction]) -> List[Any]:
    """Serialize an instruction sequence as a list of [op, args] pairs."""
    return [[instruction.op(), instruction.args()] for instruction in seq]
def _deserialize_sequence(rep: List[Any]) -> List[asm2vec.asm.Instruction]:
    """Rebuild an instruction sequence serialized by _serialize_sequence.

    Each entry is an [op, args] pair where op round-trips as a UTF-8 byte
    string and args as a *list* of operand byte strings. The original code
    called .decode('utf-8') on the args list itself (lists have no decode
    method) and passed the result as a single operand; each operand is now
    decoded individually and unpacked into Instruction(op, *args).
    """
    def rebuild(instr_rep) -> asm2vec.asm.Instruction:
        op = instr_rep[0].decode('utf-8')
        args = [arg.decode('utf-8') for arg in instr_rep[1]]
        return asm2vec.asm.Instruction(op, *args)
    return [rebuild(instr_rep) for instr_rep in rep]
def _serialize_vectorized_function(func: VectorizedFunction, include_sequences: bool) -> Dict[str, Any]:
    """Serialize a function's id, name and vector; sequences only on request."""
    sequential = func.sequential()
    data = {
        'id': sequential.id(),
        'name': sequential.name(),
        'v': list(func.v),
    }
    if include_sequences:
        data['sequences'] = [_serialize_sequence(seq) for seq in sequential.sequences()]
    return data
def _deserialize_vectorized_function(rep: Dict[bytes, Any]) -> VectorizedFunction:
    """Rebuild a VectorizedFunction; the sequences section is optional."""
    sequences = [_deserialize_sequence(seq) for seq in rep.get(b'sequences', [])]
    sequential = SequentialFunction(rep[b'id'], rep[b'name'].decode('utf-8'), sequences)
    return VectorizedFunction(sequential, np.array(rep[b'v']))
# Bit flags selecting which parts of a FunctionRepository to serialize.
SERIALIZE_VOCABULARY: int = 1  # include the token vocabulary
SERIALIZE_FUNCTION: int = 2  # include function ids, names and vectors
SERIALIZE_FUNCTION_SEQUENCES: int = 4  # with SERIALIZE_FUNCTION, also include instruction sequences
SERIALIZE_ALL: int = SERIALIZE_VOCABULARY | SERIALIZE_FUNCTION | SERIALIZE_FUNCTION_SEQUENCES
def serialize_function_repo(repo: FunctionRepository, flags: int) -> Dict[str, Any]:
    """Serialize the repository sections selected by the SERIALIZE_* flags."""
    data = dict()
    if (flags & SERIALIZE_VOCABULARY) != 0:
        data['vocab'] = serialize_vocabulary(repo.vocab())
    if (flags & SERIALIZE_FUNCTION) != 0:
        with_sequences = (flags & SERIALIZE_FUNCTION_SEQUENCES) != 0
        data['funcs'] = [_serialize_vectorized_function(func, with_sequences) for func in repo.funcs()]
    return data
def deserialize_function_repo(rep: Dict[bytes, Any]) -> FunctionRepository:
    """Rebuild a FunctionRepository; missing sections default to empty."""
    functions = [_deserialize_vectorized_function(fr) for fr in rep.get(b'funcs', [])]
    vocabulary = deserialize_vocabulary(rep.get(b'vocab', dict()))
    return FunctionRepository(functions, vocabulary)
================================================
FILE: examples/estimating.s
================================================
my_strlen_est:
cmp BYTE PTR [rdi], 0
je .L4
mov rax, rdi
.L3:
add rax, 1
cmp BYTE PTR [rax], 0
jne .L3
.L2:
sub rax, rdi
ret
.L4:
mov rax, rdi
jmp .L2
my_strcmp_est:
movzx eax, BYTE PTR [rdi]
test al, al
je .L12
.L7:
movzx edx, BYTE PTR [rsi]
test dl, dl
je .L15
cmp dl, al
jne .L16
add rdi, 1
add rsi, 1
movzx eax, BYTE PTR [rdi]
test al, al
jne .L7
.L12:
cmp BYTE PTR [rsi], 0
setne dl
movzx edx, dl
neg edx
.L6:
mov eax, edx
ret
.L16:
movsx eax, al
movsx edx, dl
sub eax, edx
mov edx, eax
jmp .L6
.L15:
mov edx, 1
test al, al
jne .L6
jmp .L12
.LC0:
.string "%s"
.LC1:
.string "%d\n"
main:
sub rsp, 264
lea rsi, [rsp+128]
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call scanf
mov rsi, rsp
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call scanf
lea rdi, [rsp+128]
call my_strlen_est
mov esi, eax
mov edi, OFFSET FLAT:.LC1
mov eax, 0
call printf
mov rsi, rsp
lea rdi, [rsp+128]
call my_strcmp_est
mov esi, eax
mov edi, OFFSET FLAT:.LC1
mov eax, 0
call printf
mov eax, 0
add rsp, 264
ret
================================================
FILE: examples/training-estimating.py
================================================
import numpy as np
import asm2vec.asm
import asm2vec.parse
import asm2vec.model
def cosine_similarity(v1, v2):
    """Cosine of the angle between vectors v1 and v2."""
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product
def main():
    """Train on training.s, embed the functions from estimating.s, and compare them."""
    training_funcs = asm2vec.parse.parse('training.s',
                                         func_names=['main', 'my_strlen_train', 'my_strcmp_train'])
    estimating_funcs = asm2vec.parse.parse('estimating.s',
                                           func_names=['main', 'my_strlen_est', 'my_strcmp_est'])
    print('# of training functions:', len(training_funcs))
    print('# of estimating functions:', len(estimating_funcs))

    model = asm2vec.model.Asm2Vec(d=200)
    training_repo = model.make_function_repo(training_funcs)
    model.train(training_repo)
    print('Training complete.')

    for trained in training_repo.funcs():
        print('Norm of trained function "{}" = {}'.format(trained.sequential().name(), np.linalg.norm(trained.v)))

    estimating_funcs_vec = [model.to_vec(func) for func in estimating_funcs]
    print('Estimating complete.')
    for est_func, est_vec in zip(estimating_funcs, estimating_funcs_vec):
        print('Norm of trained function "{}" = {}'.format(est_func.name(), np.linalg.norm(est_vec)))

    # Pairwise similarity between every trained and every estimated function.
    for trained in training_repo.funcs():
        for est_func, est_vec in zip(estimating_funcs, estimating_funcs_vec):
            sim = cosine_similarity(trained.v, est_vec)
            print('sim("{}", "{}") = {}'.format(trained.sequential().name(), est_func.name(), sim))


if __name__ == '__main__':
    main()
================================================
FILE: examples/training.s
================================================
my_strlen_train:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-24], rdi
mov rax, QWORD PTR [rbp-24]
mov QWORD PTR [rbp-8], rax
jmp .L2
.L3:
add QWORD PTR [rbp-8], 1
.L2:
mov rax, QWORD PTR [rbp-8]
movzx eax, BYTE PTR [rax]
test al, al
jne .L3
mov rax, QWORD PTR [rbp-8]
sub rax, QWORD PTR [rbp-24]
pop rbp
ret
my_strcmp_train:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
mov QWORD PTR [rbp-16], rsi
jmp .L6
.L10:
mov rax, QWORD PTR [rbp-8]
movzx edx, BYTE PTR [rax]
mov rax, QWORD PTR [rbp-16]
movzx eax, BYTE PTR [rax]
cmp dl, al
je .L7
mov rax, QWORD PTR [rbp-8]
movzx eax, BYTE PTR [rax]
movsx edx, al
mov rax, QWORD PTR [rbp-16]
movzx eax, BYTE PTR [rax]
movsx eax, al
sub edx, eax
mov eax, edx
jmp .L8
.L7:
add QWORD PTR [rbp-8], 1
add QWORD PTR [rbp-16], 1
.L6:
mov rax, QWORD PTR [rbp-8]
movzx eax, BYTE PTR [rax]
test al, al
je .L9
mov rax, QWORD PTR [rbp-16]
movzx eax, BYTE PTR [rax]
test al, al
jne .L10
.L9:
mov rax, QWORD PTR [rbp-8]
movzx eax, BYTE PTR [rax]
test al, al
je .L11
mov eax, 1
jmp .L8
.L11:
mov rax, QWORD PTR [rbp-16]
movzx eax, BYTE PTR [rax]
test al, al
je .L12
mov eax, -1
jmp .L8
.L12:
mov eax, 0
.L8:
pop rbp
ret
.LC0:
.string "%s"
.LC1:
.string "%d\n"
main:
push rbp
mov rbp, rsp
sub rsp, 256
lea rax, [rbp-128]
mov rsi, rax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call scanf
lea rax, [rbp-256]
mov rsi, rax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call scanf
lea rax, [rbp-128]
mov rdi, rax
call my_strlen_train
mov esi, eax
mov edi, OFFSET FLAT:.LC1
mov eax, 0
call printf
lea rdx, [rbp-256]
lea rax, [rbp-128]
mov rsi, rdx
mov rdi, rax
call my_strcmp_train
mov esi, eax
mov edi, OFFSET FLAT:.LC1
mov eax, 0
call printf
mov eax, 0
leave
ret
================================================
FILE: tests/asm_test.py
================================================
import unittest as ut
import asm2vec.asm as asm
class InstructionTest(ut.TestCase):
    """Unit tests for asm.parse_instruction."""

    def test_parse_instruction(self):
        instruction = asm.parse_instruction('mov eax, ebx')
        self.assertEqual('mov', instruction.op(), 'Operators not equal')
        self.assertListEqual(['eax', 'ebx'], instruction.args(), 'Operands not equal')

    def test_parse_instruction_one_operand(self):
        instruction = asm.parse_instruction('inc eax')
        self.assertEqual('inc', instruction.op(), 'Operators not equal')
        self.assertListEqual(['eax'], instruction.args(), 'Operands not equal')

    def test_parse_instruction_no_operands(self):
        instruction = asm.parse_instruction('ret')
        self.assertEqual('ret', instruction.op(), 'Operators not equal')
        self.assertListEqual([], instruction.args(), 'Operands not equal')
class BasicBlockTest(ut.TestCase):
    # TODO: no BasicBlock tests yet; placeholder suite.
    pass
class FunctionTest(ut.TestCase):
    # TODO: no Function tests yet; placeholder suite.
    pass
================================================
FILE: tests/parse_test.py
================================================
import unittest as ut
import asm2vec.parse
# Assembly fixture for the parser tests: a helper function (my_strlen) and a
# main that calls it, so both callee and caller relations can be checked.
test_asm = """
my_strlen:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-24], rdi
mov rax, QWORD PTR [rbp-24]
mov QWORD PTR [rbp-8], rax
jmp .L2
.L3:
add QWORD PTR [rbp-8], 1
.L2:
mov rax, QWORD PTR [rbp-8]
movzx eax, BYTE PTR [rax]
test al, al
jne .L3
mov rax, QWORD PTR [rbp-8]
sub rax, QWORD PTR [rbp-24]
pop rbp
ret
.LC0:
.string "%s"
.LC1:
.string "%d\\n"
main:
push rbp
mov rbp, rsp
add rsp, -128
lea rax, [rbp-128]
mov rsi, rax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call scanf
lea rax, [rbp-128]
mov rdi, rax
call my_strlen
mov esi, eax
mov edi, OFFSET FLAT:.LC1
mov eax, 0
call printf
mov eax, 0
leave
ret
"""
class ParseTest(ut.TestCase):
    """Tests for asm2vec.parse.parse_text using the fixture above."""

    def test_parse_text(self):
        funcs = asm2vec.parse.parse_text(test_asm, func_names=['main', 'my_strlen'])
        self.assertEqual(2, len(funcs))
        self.assertEqual({'main', 'my_strlen'}, set(f.name() for f in funcs))

        by_name = dict((f.name(), f) for f in funcs)
        main_func: asm2vec.asm.Function = by_name['main']
        my_strlen_func: asm2vec.asm.Function = by_name['my_strlen']
        # main calls my_strlen, and the relation is recorded in both directions.
        self.assertListEqual(['my_strlen'], [f.name() for f in main_func.callees()])
        self.assertListEqual(['main'], [f.name() for f in my_strlen_func.callers()])
================================================
FILE: tests/utilities_test.py
================================================
import unittest as ut
import asm2vec.internal.util as utilities
class PermutationTest(ut.TestCase):
    # NOTE(review): these tests call utilities.permute / utilities.inverse_permute,
    # but asm2vec/internal/util.py only defines make_small_ndarray — as shown,
    # this suite would fail with AttributeError. Confirm whether the permutation
    # helpers were removed or should be restored.

    def test_permute(self):
        """Permuting with p places v[p[i]]... per the expected output below."""
        v = [10, 20, 30, 40, 50]
        p = [2, 4, 1, 0, 3]
        pv = utilities.permute(v, p)
        self.assertListEqual([30, 50, 20, 10, 40], pv, 'Permutated vectors not equal.')

    def test_inv_permute(self):
        """inverse_permute undoes the permutation applied in test_permute."""
        v = [30, 50, 20, 10, 40]
        p = [2, 4, 1, 0, 3]
        pv = utilities.inverse_permute(v, p)
        self.assertListEqual([10, 20, 30, 40, 50], pv, 'Inverse permutated vectors not equal.')
gitextract_5xp0becm/
├── .gitignore
├── README.md
├── asm2vec/
│ ├── __init__.py
│ ├── asm.py
│ ├── internal/
│ │ ├── __init__.py
│ │ ├── atomic.py
│ │ ├── parse.py
│ │ ├── repr.py
│ │ ├── sampling.py
│ │ ├── training.py
│ │ └── util.py
│ ├── logging.py
│ ├── model.py
│ ├── parse.py
│ └── repo.py
├── examples/
│ ├── estimating.s
│ ├── training-estimating.py
│ └── training.s
└── tests/
├── asm_test.py
├── parse_test.py
└── utilities_test.py
SYMBOL INDEX (209 symbols across 15 files)
FILE: asm2vec/asm.py
class Instruction (line 4) | class Instruction:
method __init__ (line 5) | def __init__(self, op: str, *args: str):
method op (line 9) | def op(self) -> str:
method number_of_args (line 12) | def number_of_args(self) -> int:
method args (line 15) | def args(self) -> List[str]:
function parse_instruction (line 19) | def parse_instruction(code: str) -> Instruction:
class BasicBlock (line 29) | class BasicBlock:
method __init__ (line 32) | def __init__(self):
method __iter__ (line 41) | def __iter__(self):
method __len__ (line 44) | def __len__(self):
method __hash__ (line 47) | def __hash__(self):
method __eq__ (line 50) | def __eq__(self, other):
method __ne__ (line 55) | def __ne__(self, other):
method id (line 58) | def id(self) -> int:
method add_instruction (line 61) | def add_instruction(self, instr: Instruction) -> None:
method body_instructions (line 64) | def body_instructions(self) -> List[Instruction]:
method instructions (line 67) | def instructions(self) -> List[Instruction]:
method add_predecessor (line 70) | def add_predecessor(self, predecessor: 'BasicBlock') -> None:
method add_successor (line 74) | def add_successor(self, successor: 'BasicBlock') -> None:
method first_instruction (line 78) | def first_instruction(self) -> Instruction:
method last_instruction (line 81) | def last_instruction(self) -> Instruction:
method predecessors (line 84) | def predecessors(self) -> List['BasicBlock']:
method in_degree (line 87) | def in_degree(self) -> int:
method successors (line 90) | def successors(self) -> List['BasicBlock']:
method out_degree (line 93) | def out_degree(self) -> int:
class CFGWalkerCallback (line 97) | class CFGWalkerCallback:
method __call__ (line 98) | def __call__(self, *args, **kwargs):
method on_enter (line 101) | def on_enter(self, block: BasicBlock) -> None:
method on_exit (line 104) | def on_exit(self, block: BasicBlock) -> None:
function _walk_cfg (line 111) | def _walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType, visited:...
function walk_cfg (line 125) | def walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType) -> None:
class Function (line 129) | class Function:
method __init__ (line 132) | def __init__(self, entry: BasicBlock, name: str = None):
method __len__ (line 142) | def __len__(self) -> int:
method __hash__ (line 152) | def __hash__(self):
method __eq__ (line 155) | def __eq__(self, other):
method __ne__ (line 160) | def __ne__(self, other):
method id (line 163) | def id(self) -> int:
method entry (line 166) | def entry(self) -> BasicBlock:
method name (line 169) | def name(self) -> str:
method add_callee (line 172) | def add_callee(self, f: 'Function') -> None:
method callees (line 176) | def callees(self) -> List['Function']:
method out_degree (line 179) | def out_degree(self) -> int:
method add_caller (line 182) | def add_caller(self, f: 'Function') -> None:
method callers (line 186) | def callers(self) -> List['Function']:
method in_degree (line 189) | def in_degree(self) -> int:
FILE: asm2vec/internal/atomic.py
class LockContextManager (line 5) | class LockContextManager:
method __init__ (line 6) | def __init__(self, lock: threading.Lock):
method __enter__ (line 10) | def __enter__(self):
method __exit__ (line 13) | def __exit__(self, exc_type, exc_val, exc_tb):
method exited (line 17) | def exited(self) -> bool:
class Atomic (line 21) | class Atomic:
class AtomicContextManager (line 22) | class AtomicContextManager(LockContextManager):
method __init__ (line 23) | def __init__(self, atomic: 'Atomic'):
method __enter__ (line 28) | def __enter__(self):
method __exit__ (line 32) | def __exit__(self, exc_type, exc_val, exc_tb):
method value (line 35) | def value(self) -> Any:
method set (line 40) | def set(self, value: Any) -> None:
method __init__ (line 45) | def __init__(self, value: Any):
method lock (line 49) | def lock(self) -> AtomicContextManager:
method value (line 52) | def value(self) -> Any:
FILE: asm2vec/internal/parse.py
class AssemblySyntaxError (line 7) | class AssemblySyntaxError(Exception):
method __init__ (line 8) | def __init__(self, message: str = None):
method message (line 11) | def message(self) -> str:
function raise_asm_syntax_error (line 15) | def raise_asm_syntax_error(expect: str, found: str) -> None:
function is_jmp (line 48) | def is_jmp(op: str) -> bool:
function is_conditional_jmp (line 52) | def is_conditional_jmp(op: str) -> bool:
function is_call (line 56) | def is_call(op: str) -> bool:
function is_ret (line 60) | def is_ret(op: str) -> bool:
function is_reg (line 64) | def is_reg(arg: str) -> bool:
class CFGBuilder (line 68) | class CFGBuilder:
method __init__ (line 69) | def __init__(self, context: 'ParseContext'):
method _logger (line 75) | def _logger(self) -> logging.Logger:
method _allocate_block (line 78) | def _allocate_block(self) -> int:
method _allocate_named_block (line 82) | def _allocate_named_block(self, name: str) -> int:
method _get_active_block (line 90) | def _get_active_block(self) -> asm2vec.asm.BasicBlock:
method _set_active_block (line 93) | def _set_active_block(self, block_id: int) -> None:
method _has_active_block (line 96) | def _has_active_block(self) -> bool:
method _close_active_block (line 99) | def _close_active_block(self) -> None:
method _add_jmp (line 102) | def _add_jmp(self, op: str, args: List[str]) -> None:
method add_instr (line 113) | def add_instr(self, op: str, args: List[str]) -> None:
method set_label (line 125) | def set_label(self, label: str) -> None:
method build (line 142) | def build(self) -> List[asm2vec.asm.Function]:
class ParseOptions (line 183) | class ParseOptions:
method __init__ (line 184) | def __init__(self, **kwargs):
method func_names (line 187) | def func_names(self) -> List[str]:
class ParseContext (line 191) | class ParseContext:
method __init__ (line 192) | def __init__(self, **kwargs):
method logger (line 197) | def logger(self) -> logging.Logger:
method options (line 200) | def options(self) -> ParseOptions:
method builder (line 203) | def builder(self) -> CFGBuilder:
function is_fullmatch (line 238) | def is_fullmatch(pattern, s: str) -> bool:
function parse_asm_label (line 242) | def parse_asm_label(ln: str, context: ParseContext) -> None:
function parse_asm_instr (line 250) | def parse_asm_instr(ln: str, context: ParseContext) -> None:
function parse_asm_line (line 262) | def parse_asm_line(ln: str, context: ParseContext) -> None:
function parse_asm_lines (line 274) | def parse_asm_lines(lines: Iterable[str], **kwargs) -> List[asm2vec.asm....
FILE: asm2vec/internal/repr.py
function _random_walk (line 19) | def _random_walk(f: Function) -> List[Instruction]:
function _edge_sampling (line 36) | def _edge_sampling(f: Function) -> List[List[Instruction]]:
function make_sequential_function (line 56) | def make_sequential_function(f: Function, num_of_random_walks: int = 10)...
function _get_function_tokens (line 67) | def _get_function_tokens(f: Function, dim: int = 200) -> List[Vectorized...
function _make_function_repo_helper (line 81) | def _make_function_repo_helper(vocab: Dict[str, Token], funcs: List[Func...
function make_function_repo (line 124) | def make_function_repo(funcs: List[Function], dim: int, num_of_rnd_walks...
function make_estimate_repo (line 128) | def make_estimate_repo(vocabulary: Dict[str, Token], f: Function,
FILE: asm2vec/internal/sampling.py
class NegativeSampler (line 7) | class NegativeSampler:
method __init__ (line 8) | def __init__(self, distribution: List[Tuple[T, float]], alpha: float =...
method sample (line 12) | def sample(self, k: int) -> List[T]:
FILE: asm2vec/internal/training.py
class Asm2VecParams (line 19) | class Asm2VecParams:
method __init__ (line 20) | def __init__(self, **kwargs):
method to_dict (line 29) | def to_dict(self) -> Dict[str, Any]:
method populate (line 40) | def populate(self, rep: Dict[bytes, Any]) -> None:
class SequenceWindow (line 50) | class SequenceWindow:
method __init__ (line 51) | def __init__(self, sequence: List[Instruction], vocabulary: Dict[str, ...
method move_next (line 67) | def move_next(self) -> bool:
method prev_ins (line 89) | def prev_ins(self) -> Instruction:
method prev_ins_op (line 92) | def prev_ins_op(self) -> VectorizedToken:
method prev_ins_args (line 95) | def prev_ins_args(self) -> List[VectorizedToken]:
method curr_ins (line 98) | def curr_ins(self) -> Instruction:
method curr_ins_op (line 101) | def curr_ins_op(self) -> VectorizedToken:
method curr_ins_args (line 104) | def curr_ins_args(self) -> List[VectorizedToken]:
method next_ins (line 107) | def next_ins(self) -> Instruction:
method next_ins_op (line 110) | def next_ins_op(self) -> VectorizedToken:
method next_ins_args (line 113) | def next_ins_args(self) -> List[VectorizedToken]:
class TrainingContext (line 117) | class TrainingContext:
class Counter (line 118) | class Counter:
method __init__ (line 119) | def __init__(self, context: 'TrainingContext', name: str, initial: i...
method val (line 124) | def val(self) -> int:
method inc (line 128) | def inc(self) -> int:
method reset (line 133) | def reset(self) -> int:
method __init__ (line 141) | def __init__(self, repo: FunctionRepository, params: Asm2VecParams, is...
method repo (line 150) | def repo(self) -> FunctionRepository:
method params (line 153) | def params(self) -> Asm2VecParams:
method lock (line 156) | def lock(self) -> LockContextManager:
method alpha (line 159) | def alpha(self) -> float:
method set_alpha (line 163) | def set_alpha(self, alpha: float) -> None:
method sampler (line 167) | def sampler(self) -> NegativeSampler:
method is_estimating (line 170) | def is_estimating(self) -> bool:
method create_sequence_window (line 173) | def create_sequence_window(self, seq: List[Instruction]) -> SequenceWi...
method get_counter (line 176) | def get_counter(self, name: str) -> Counter:
method add_counter (line 180) | def add_counter(self, name: str, initial: int = 0) -> Counter:
function _sigmoid (line 187) | def _sigmoid(x: float) -> float:
function _identity (line 191) | def _identity(cond: bool) -> int:
function _dot_sigmoid (line 195) | def _dot_sigmoid(lhs: np.ndarray, rhs: np.ndarray) -> float:
function _get_inst_repr (line 200) | def _get_inst_repr(op: VectorizedToken, args: List[VectorizedToken]) -> ...
function _train_vectorized (line 208) | def _train_vectorized(wnd: SequenceWindow, f: VectorizedFunction, contex...
function _train_sequence (line 264) | def _train_sequence(f: VectorizedFunction, seq: List[Instruction], conte...
function train (line 270) | def train(repository: FunctionRepository, params: Asm2VecParams) -> None:
function estimate (line 296) | def estimate(f: VectorizedFunction, estimate_repo: FunctionRepository, p...
FILE: asm2vec/internal/util.py
function make_small_ndarray (line 4) | def make_small_ndarray(dim: int) -> np.ndarray:
FILE: asm2vec/logging.py
function asm2vec_logger (line 4) | def asm2vec_logger() -> logging.Logger:
function config_asm2vec_logging (line 8) | def config_asm2vec_logging(**kwargs):
FILE: asm2vec/model.py
class Asm2VecMemento (line 13) | class Asm2VecMemento:
method __init__ (line 14) | def __init__(self):
method serialize (line 18) | def serialize(self) -> Dict[str, Any]:
method populate (line 24) | def populate(self, rep: Dict[bytes, Any]) -> None:
class Asm2Vec (line 30) | class Asm2Vec:
method __init__ (line 31) | def __init__(self, **kwargs):
method memento (line 35) | def memento(self) -> Asm2VecMemento:
method set_memento (line 41) | def set_memento(self, memento: Asm2VecMemento) -> None:
method make_function_repo (line 45) | def make_function_repo(self, funcs: List[asm2vec.asm.Function]) -> asm...
method train (line 49) | def train(self, repo: asm2vec.repo.FunctionRepository) -> None:
method to_vec (line 53) | def to_vec(self, f: asm2vec.asm.Function) -> np.ndarray:
FILE: asm2vec/parse.py
function parse_text (line 9) | def parse_text(asm: str, **kwargs) -> List[asm2vec.asm.Function]:
function parse_fp (line 13) | def parse_fp(fp, **kwargs) -> List[asm2vec.asm.Function]:
function parse (line 17) | def parse(asm_file_name: str, **kwargs) -> List[asm2vec.asm.Function]:
FILE: asm2vec/repo.py
class SequentialFunction (line 9) | class SequentialFunction:
method __init__ (line 10) | def __init__(self, fid: int, name: str, sequences: List[List[asm2vec.a...
method id (line 15) | def id(self) -> int:
method name (line 18) | def name(self) -> str:
method sequences (line 21) | def sequences(self) -> List[List[asm2vec.asm.Instruction]]:
class VectorizedFunction (line 25) | class VectorizedFunction:
method __init__ (line 26) | def __init__(self, f: SequentialFunction, v: np.ndarray = None, dim: i...
method sequential (line 30) | def sequential(self) -> SequentialFunction:
class VectorizedToken (line 34) | class VectorizedToken:
method __init__ (line 35) | def __init__(self, name: str, v: np.ndarray = None, v_pred: np.ndarray...
method __eq__ (line 40) | def __eq__(self, other):
method __ne__ (line 46) | def __ne__(self, other):
method name (line 49) | def name(self) -> str:
class Token (line 53) | class Token:
method __init__ (line 54) | def __init__(self, vt: VectorizedToken, count: int = 1):
method vectorized (line 59) | def vectorized(self) -> VectorizedToken:
method name (line 62) | def name(self) -> str:
class FunctionRepository (line 66) | class FunctionRepository:
method __init__ (line 67) | def __init__(self, funcs: List[VectorizedFunction], vocab: Dict[str, T...
method funcs (line 72) | def funcs(self) -> List[VectorizedFunction]:
method vocab (line 75) | def vocab(self) -> Dict[str, Token]:
method num_of_tokens (line 78) | def num_of_tokens(self) -> int:
function _serialize_token (line 82) | def _serialize_token(token: Token) -> Dict[str, Any]:
function _deserialize_token (line 92) | def _deserialize_token(rep: Dict[bytes, Any]) -> Token:
function serialize_vocabulary (line 105) | def serialize_vocabulary(vocab: Dict[str, Token]) -> Dict[str, Any]:
function deserialize_vocabulary (line 109) | def deserialize_vocabulary(rep: Dict[bytes, Any]) -> Dict[str, Token]:
function _serialize_sequence (line 113) | def _serialize_sequence(seq: List[asm2vec.asm.Instruction]) -> List[Any]:
function _deserialize_sequence (line 117) | def _deserialize_sequence(rep: List[Any]) -> List[asm2vec.asm.Instruction]:
function _serialize_vectorized_function (line 122) | def _serialize_vectorized_function(func: VectorizedFunction, include_seq...
function _deserialize_vectorized_function (line 135) | def _deserialize_vectorized_function(rep: Dict[bytes, Any]) -> Vectorize...
function serialize_function_repo (line 149) | def serialize_function_repo(repo: FunctionRepository, flags: int) -> Dic...
function deserialize_function_repo (line 162) | def deserialize_function_repo(rep: Dict[bytes, Any]) -> FunctionRepository:
FILE: examples/training-estimating.py
function cosine_similarity (line 8) | def cosine_similarity(v1, v2):
function main (line 12) | def main():
FILE: tests/asm_test.py
class InstructionTest (line 6) | class InstructionTest(ut.TestCase):
method test_parse_instruction (line 7) | def test_parse_instruction(self):
method test_parse_instruction_one_operand (line 12) | def test_parse_instruction_one_operand(self):
method test_parse_instruction_no_operands (line 17) | def test_parse_instruction_no_operands(self):
class BasicBlockTest (line 23) | class BasicBlockTest(ut.TestCase):
class FunctionTest (line 27) | class FunctionTest(ut.TestCase):
FILE: tests/parse_test.py
class ParseTest (line 51) | class ParseTest(ut.TestCase):
method test_parse_text (line 52) | def test_parse_text(self):
FILE: tests/utilities_test.py
class PermutationTest (line 6) | class PermutationTest(ut.TestCase):
method test_permute (line 7) | def test_permute(self):
method test_inv_permute (line 13) | def test_inv_permute(self):
Condensed preview — 21 files, each showing its path, character count, and a content snippet. Download the .json file, or copy it, to obtain the full structured content (61K chars).
[
{
"path": ".gitignore",
"chars": 5091,
"preview": "# Created by https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python\n# Edit at https://www.topt"
},
{
"path": "README.md",
"chars": 5194,
"preview": "# asm2vec\n\nThis is an unofficial implementation of the `asm2vec` model as a standalone python package. The details of th"
},
{
"path": "asm2vec/__init__.py",
"chars": 36,
"preview": "__all__ = ['asm', 'model', 'parse']\n"
},
{
"path": "asm2vec/asm.py",
"chars": 4852,
"preview": "from typing import *\n\n\nclass Instruction:\n def __init__(self, op: str, *args: str):\n self._op = op\n sel"
},
{
"path": "asm2vec/internal/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "asm2vec/internal/atomic.py",
"chars": 1504,
"preview": "from typing import *\nimport threading\n\n\nclass LockContextManager:\n def __init__(self, lock: threading.Lock):\n "
},
{
"path": "asm2vec/internal/parse.py",
"chars": 8622,
"preview": "from typing import *\nimport logging\n\nimport asm2vec.asm\n\n\nclass AssemblySyntaxError(Exception):\n def __init__(self, m"
},
{
"path": "asm2vec/internal/repr.py",
"chars": 4440,
"preview": "import random\nfrom typing import *\nimport concurrent.futures\n\nfrom asm2vec.asm import Instruction\nfrom asm2vec.asm impor"
},
{
"path": "asm2vec/internal/sampling.py",
"chars": 403,
"preview": "from typing import *\nimport random\n\nT = TypeVar('T')\n\n\nclass NegativeSampler:\n def __init__(self, distribution: List["
},
{
"path": "asm2vec/internal/training.py",
"chars": 10565,
"preview": "from typing import *\nimport math\nimport threading\nimport concurrent.futures\n\nimport numpy as np\n\nfrom asm2vec.asm import"
},
{
"path": "asm2vec/internal/util.py",
"chars": 144,
"preview": "import numpy as np\n\n\ndef make_small_ndarray(dim: int) -> np.ndarray:\n rng = np.random.default_rng()\n return (rng.r"
},
{
"path": "asm2vec/logging.py",
"chars": 432,
"preview": "import logging\n\n\ndef asm2vec_logger() -> logging.Logger:\n return logging.getLogger('asm2vec')\n\n\ndef config_asm2vec_lo"
},
{
"path": "asm2vec/model.py",
"chars": 1996,
"preview": "from typing import *\n\nimport numpy as np\n\nimport asm2vec.asm\nimport asm2vec.repo\n\nimport asm2vec.internal.training\nimpor"
},
{
"path": "asm2vec/parse.py",
"chars": 553,
"preview": "from typing import *\n\nimport asm2vec.asm\nimport asm2vec.internal.parse\n\nfrom asm2vec.internal.parse import AssemblySynta"
},
{
"path": "asm2vec/repo.py",
"chars": 5031,
"preview": "from typing import *\n\nimport numpy as np\n\nimport asm2vec.asm\nimport asm2vec.internal.util\n\n\nclass SequentialFunction:\n "
},
{
"path": "examples/estimating.s",
"chars": 1712,
"preview": "my_strlen_est:\n cmp BYTE PTR [rdi], 0\n je .L4\n mov rax, rdi\n.L3:\n add rax, "
},
{
"path": "examples/training-estimating.py",
"chars": 1501,
"preview": "import numpy as np\n\nimport asm2vec.asm\nimport asm2vec.parse\nimport asm2vec.model\n\n\ndef cosine_similarity(v1, v2):\n re"
},
{
"path": "examples/training.s",
"chars": 2741,
"preview": "my_strlen_train:\n push rbp\n mov rbp, rsp\n mov QWORD PTR [rbp-24], rdi\n mov ra"
},
{
"path": "tests/asm_test.py",
"chars": 872,
"preview": "import unittest as ut\n\nimport asm2vec.asm as asm\n\n\nclass InstructionTest(ut.TestCase):\n def test_parse_instruction(se"
},
{
"path": "tests/parse_test.py",
"chars": 1696,
"preview": "import unittest as ut\n\nimport asm2vec.parse\n\n\ntest_asm = \"\"\"\nmy_strlen:\n push rbp\n mov rbp, rsp\n "
},
{
"path": "tests/utilities_test.py",
"chars": 552,
"preview": "import unittest as ut\n\nimport asm2vec.internal.util as utilities\n\n\nclass PermutationTest(ut.TestCase):\n def test_perm"
}
]
About this extraction
This page contains the full source code of the Lancern/asm2vec GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 21 files (56.6 KB), approximately 15.5k tokens, and a symbol index with 209 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.