Full Code of Lancern/asm2vec for AI

master d38a3bc3bc9c cached
21 files
56.6 KB
15.5k tokens
209 symbols
1 requests
Download .txt
Repository: Lancern/asm2vec
Branch: master
Commit: d38a3bc3bc9c
Files: 21
Total size: 56.6 KB

Directory structure:
gitextract_5xp0becm/

├── .gitignore
├── README.md
├── asm2vec/
│   ├── __init__.py
│   ├── asm.py
│   ├── internal/
│   │   ├── __init__.py
│   │   ├── atomic.py
│   │   ├── parse.py
│   │   ├── repr.py
│   │   ├── sampling.py
│   │   ├── training.py
│   │   └── util.py
│   ├── logging.py
│   ├── model.py
│   ├── parse.py
│   └── repo.py
├── examples/
│   ├── estimating.s
│   ├── training-estimating.py
│   └── training.s
└── tests/
    ├── asm_test.py
    ├── parse_test.py
    └── utilities_test.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Created by https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python
# Edit at https://www.toptal.com/developers/gitignore?templates=macos,intellij,virtualenv,python

### Intellij ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn.  Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### Intellij Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/

# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml

# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/

# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$

# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# profiling data
.prof

### VirtualEnv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json

# End of https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python


================================================
FILE: README.md
================================================
# asm2vec

This is an unofficial implementation of the `asm2vec` model as a standalone python package. The details of the model can be found in the original paper: [(sp'19) Asm2Vec: Boosting Static Representation Robustness for Binary Clone Search against Code Obfuscation and Compiler Optimization](https://www.computer.org/csdl/proceedings-article/sp/2019/666000a038/19skfc3ZfKo)

## Requirements

This implementation is written in python 3.7 and it's recommended to use python 3.7+ as well. The only dependency of this package is `numpy` which can be installed as follows:

```shell
python3 -m pip install numpy
```

## How to use

### Import

To install the package, execute the following commands:

```shell
git clone https://github.com/lancern/asm2vec.git
```

Add the following line to the `.bashrc` file to add `asm2vec` to your python interpreter's search path for external packages:

```shell
export PYTHONPATH="path/to/asm2vec:$PYTHONPATH"
```

Replace `path/to/asm2vec` with the directory you clone `asm2vec` into. Then execute the following commands to update `PYTHONPATH`:

```shell
source ~/.bashrc
```

You can also add the following code snippet to any python source file that refers to `asm2vec`, to help the python interpreter find the package:

```python
import sys
sys.path.append('path/to/asm2vec')
```

In your python code, use the following `import` statement to import this package:

```python
import asm2vec.<module-name>
```

### Define CFGs And Training

You have 2 approaches to define the binary program that will be sent to the `asm2vec` model. The first approach is to build the CFG manually, as shown below:

```python
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import parse_instruction

block1 = BasicBlock()
block1.add_instruction(parse_instruction('mov eax, ebx'))
block1.add_instruction(parse_instruction('jmp _loc'))

block2 = BasicBlock()
block2.add_instruction(parse_instruction('xor eax, eax'))
block2.add_instruction(parse_instruction('ret'))

block1.add_successor(block2)

block3 = BasicBlock()
block3.add_instruction(parse_instruction('sub eax, [ebp]'))

f1 = Function(block1, 'some_func')
f2 = Function(block3, 'another_func')

# block4's definition is omitted here for clarity
f3 = Function(block4, 'estimate_func')
```

And then you can train a model with the following code:

```python
from asm2vec.model import Asm2Vec

model = Asm2Vec(d=200)
train_repo = model.make_function_repo([f1, f2, f3])
model.train(train_repo)
```

The second approach is using the `parse` module provided by `asm2vec` to build CFGs automatically from an assembly code source file:

```python
from asm2vec.parse import parse_fp

with open('source.asm', 'r') as fp:
    funcs = parse_fp(fp)
```

And then you can train a model with the following code:

```python
from asm2vec.model import Asm2Vec

model = Asm2Vec(d=200)
train_repo = model.make_function_repo(funcs)
model.train(train_repo)
```

### Estimation

You can use the `asm2vec.model.Asm2Vec.to_vec` method to convert a function into its vector representation.

### Serialization

The implementation supports serialization of many of its internal data structures, so you can save the internal state of a trained model to disk for future use.

You can serialize two data structures to primitive data: the function repository and the model memento.

> To be finished.

## Hyper Parameters

The constructor of `asm2vec.model.Asm2Vec` class accepts some keyword arguments as hyper parameters of the model. The following table lists all the hyper parameters available:

| Parameter Name          | Type    | Meaning                                                                                                | Default Value |
| ----------------------- | ------- | ------------------------------------------------------------------------------------------------------ | ------------- |
| `d`                     | `int`   | The dimension of the vectors for tokens.                                                               | `200`         |
| `alpha`                 | `float` | The initial learning rate.                                                                             | `0.0025`      |
| `alpha_update_interval` | `int`   | How many tokens can be processed before changing the learning rate?                                    | `10000`       |
| `rnd_walks`             | `int`   | How many random walks to perform to sequentialize a function?                                          | `3`           |
| `neg_samples`           | `int`   | How many samples to take during negative sampling?                                                     | `25`          |
| `iteration`             | `int`   | How many iterations to perform? (This parameter is reserved for future use and is not implemented now) | `1`           |
| `jobs`                  | `int`   | How many tasks to execute concurrently during training?                                                | `4`           |

## Notes

For simplicity, the Selective Callee Expansion is not implemented in this early implementation. You have to do it manually before sending CFG into `asm2vec` .


================================================
FILE: asm2vec/__init__.py
================================================
# Submodules exported when clients do ``from asm2vec import *``.
__all__ = ['asm', 'model', 'parse']


================================================
FILE: asm2vec/asm.py
================================================
from typing import *


class Instruction:
    """A single assembly instruction: an operator mnemonic plus its operands."""

    def __init__(self, op: str, *args: str):
        self._operator = op
        self._operands = list(args)

    def op(self) -> str:
        """Return the operator mnemonic."""
        return self._operator

    def number_of_args(self) -> int:
        """Return how many operands the instruction carries."""
        return len(self._operands)

    def args(self) -> List[str]:
        """Return the operand list (the live list, not a copy)."""
        return self._operands


def parse_instruction(code: str) -> Instruction:
    """Parse text such as ``'mov eax, ebx'`` into an ``Instruction``.

    The operator is everything before the first space; the rest is split on
    commas and each piece is stripped to form the operand list.
    """
    op, sep, tail = code.partition(' ')
    if not sep:
        # No space found: the whole string is a bare operator (e.g. 'ret').
        return Instruction(code)

    operands = [arg.strip() for arg in tail.split(',')]
    return Instruction(op, *operands)


class BasicBlock:
    """A basic block in a control flow graph: a straight-line instruction list
    plus predecessor/successor edges.

    Each instance receives a process-wide unique integer ID at construction;
    hashing and equality are based on that ID alone.
    """

    _next_unused_id: int = 1

    def __init__(self):
        # Take the next free ID and bump the class-level counter.
        cls = self.__class__
        self._id = cls._next_unused_id
        cls._next_unused_id += 1

        self._instructions = []
        self._predecessors = []
        self._successors = []

    def __iter__(self):
        return iter(self._instructions)

    def __len__(self):
        return len(self._instructions)

    def __hash__(self):
        return hash(self._id)

    def __eq__(self, other):
        return isinstance(other, BasicBlock) and self._id == other.id()

    def __ne__(self, other):
        return not self.__eq__(other)

    def id(self) -> int:
        """Return the unique ID of this block."""
        return self._id

    def add_instruction(self, instr: 'Instruction') -> None:
        """Append ``instr`` at the end of the block."""
        self._instructions.append(instr)

    def body_instructions(self) -> List['Instruction']:
        """Return every instruction except the final (terminator) one."""
        return self._instructions[:-1]

    def instructions(self) -> List['Instruction']:
        """Return the full instruction list (the live list, not a copy)."""
        return self._instructions

    def add_predecessor(self, predecessor: 'BasicBlock') -> None:
        """Add an incoming edge; the reverse edge is recorded on ``predecessor``."""
        self._predecessors.append(predecessor)
        predecessor._successors.append(self)

    def add_successor(self, successor: 'BasicBlock') -> None:
        """Add an outgoing edge; the reverse edge is recorded on ``successor``."""
        self._successors.append(successor)
        successor._predecessors.append(self)

    def first_instruction(self) -> 'Instruction':
        return self._instructions[0]

    def last_instruction(self) -> 'Instruction':
        return self._instructions[-1]

    def predecessors(self) -> List['BasicBlock']:
        return self._predecessors

    def in_degree(self) -> int:
        """Number of incoming CFG edges."""
        return len(self._predecessors)

    def successors(self) -> List['BasicBlock']:
        return self._successors

    def out_degree(self) -> int:
        """Number of outgoing CFG edges."""
        return len(self._successors)


class CFGWalkerCallback:
    """Callback interface for ``walk_cfg``: ``on_enter`` fires pre-order and
    ``on_exit`` fires post-order for each visited block."""

    def __call__(self, *args, **kwargs):
        # Makes instances usable wherever a plain callable is expected;
        # delegates to the pre-order hook.
        self.on_enter(*args)

    def on_enter(self, block: BasicBlock) -> None:
        """Called before the block's successors are visited."""
        pass

    def on_exit(self, block: BasicBlock) -> None:
        """Called after all of the block's successors have been visited."""
        pass


# A walker action is either a full CFGWalkerCallback (with enter/exit hooks)
# or a plain callable invoked once per visited block.
CFGWalkerCallbackType = Union[CFGWalkerCallback, Callable[[BasicBlock], Any]]


def _walk_cfg(entry: 'BasicBlock', action: 'CFGWalkerCallbackType', visited: Set) -> None:
    """Recursive depth-first walk over the CFG rooted at ``entry``.

    ``action`` is invoked pre-order on each block; blocks already present in
    ``visited`` (by ID) are skipped. Class-based callbacks additionally get a
    post-order ``on_exit`` notification.
    """
    block_id = entry.id()
    if block_id in visited:
        return
    visited.add(block_id)

    action(entry)
    for succ in entry.successors():
        _walk_cfg(succ, action, visited)

    if isinstance(action, CFGWalkerCallback):
        action.on_exit(entry)


def walk_cfg(entry: 'BasicBlock', action: 'CFGWalkerCallbackType') -> None:
    """Walk the CFG reachable from ``entry``, visiting each block exactly once."""
    seen: Set[int] = set()
    _walk_cfg(entry, action, seen)


class Function:
    """A function in the program: an entry basic block plus call-graph edges.

    Each instance receives a process-wide unique integer ID at construction;
    hashing and equality are based on that ID alone.
    """

    _next_unused_id = 1

    def __init__(self, entry: 'BasicBlock', name: str = None):
        # Take the next free ID and bump the class-level counter.
        cls = self.__class__
        self._id = cls._next_unused_id
        cls._next_unused_id += 1

        self._entry = entry
        self._name = name
        self._callees = []  # Functions invoked by this function.
        self._callers = []  # Functions that invoke this function.

    def __len__(self) -> int:
        """Total instruction count over all blocks reachable from the entry."""
        total = 0

        def accumulate(block: 'BasicBlock') -> None:
            nonlocal total
            total += len(block)

        walk_cfg(self._entry, accumulate)
        return total

    def __hash__(self):
        return self._id

    def __eq__(self, other):
        return isinstance(other, Function) and self._id == other.id()

    def __ne__(self, other):
        return not self.__eq__(other)

    def id(self) -> int:
        """Return the unique ID of this function."""
        return self._id

    def entry(self) -> 'BasicBlock':
        """Return the entry basic block of the CFG."""
        return self._entry

    def name(self) -> str:
        """Return the function name (``None`` when unnamed)."""
        return self._name

    def add_callee(self, f: 'Function') -> None:
        """Record that this function calls ``f`` (updates ``f``'s callers too)."""
        self._callees.append(f)
        f._callers.append(self)

    def callees(self) -> List['Function']:
        return self._callees

    def out_degree(self) -> int:
        """Call-graph out-degree (number of recorded callees)."""
        return len(self._callees)

    def add_caller(self, f: 'Function') -> None:
        """Record that ``f`` calls this function (updates ``f``'s callees too)."""
        self._callers.append(f)
        f._callees.append(self)

    def callers(self) -> List['Function']:
        return self._callers

    def in_degree(self) -> int:
        """Call-graph in-degree (number of recorded callers)."""
        return len(self._callers)


================================================
FILE: asm2vec/internal/__init__.py
================================================


================================================
FILE: asm2vec/internal/atomic.py
================================================
from typing import *
import threading


class LockContextManager:
    """Context manager that holds a ``threading.Lock`` for the duration of a
    ``with`` block and remembers whether the block has been exited."""

    def __init__(self, lock: threading.Lock):
        self._lock = lock
        self._exited = False

    def __enter__(self):
        self._lock.acquire()

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Mark the manager as dead before releasing, so later accesses
        # can be detected via exited().
        self._exited = True
        self._lock.release()

    def exited(self) -> bool:
        """True once the ``with`` block guarded by this manager has exited."""
        return self._exited


class Atomic:
    """A mutable value guarded by a lock.

    Reads and writes go through the context manager returned by ``lock()``;
    using that manager after its ``with`` block exits raises ``RuntimeError``.
    """

    class AtomicContextManager(LockContextManager):
        def __init__(self, atomic: 'Atomic'):
            super().__init__(atomic._lock)
            self._atomic = atomic
            self._exited = False

        def __enter__(self):
            super().__enter__()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            super().__exit__(exc_type, exc_val, exc_tb)

        def _check_alive(self) -> None:
            # Guard against use-after-exit, when the lock is no longer held.
            if self.exited():
                raise RuntimeError('Trying to access AtomicContextManager after its exit.')

        def value(self) -> Any:
            """Return the guarded value; only valid while the lock is held."""
            self._check_alive()
            return self._atomic._val

        def set(self, value: Any) -> None:
            """Replace the guarded value; only valid while the lock is held."""
            self._check_alive()
            self._atomic._val = value

    def __init__(self, value: Any):
        self._val = value
        self._lock = threading.Lock()

    def lock(self) -> AtomicContextManager:
        """Return a context manager granting exclusive access to the value."""
        return self.__class__.AtomicContextManager(self)

    def value(self) -> Any:
        """Convenience read: briefly acquires the lock and returns the value."""
        with self.lock() as guard:
            return guard.value()


================================================
FILE: asm2vec/internal/parse.py
================================================
from typing import *
import logging

import asm2vec.asm


class AssemblySyntaxError(Exception):
    """Raised when the assembly source violates the expected grammar."""

    def __init__(self, message: str = None):
        self._message = message

    def message(self) -> str:
        """Return the human-readable description of the syntax error."""
        return self._message


def raise_asm_syntax_error(expect: str, found: str) -> None:
    """Raise an ``AssemblySyntaxError`` describing what was expected vs. found."""
    raise AssemblySyntaxError('Expect "{}", but "{}" was found.'.format(expect, found))


# Jump mnemonics (conditional and unconditional) that transfer control to a label.
jmp_op = {
    'jmp', 'ja', 'jae', 'jb', 'jbe', 'jc', 'jcxz', 'jecxz', 'jrcxz', 'je', 'jg', 'jge', 'jl', 'jle', 'jna',
    'jnae', 'jnb', 'jnbe', 'jnc', 'jne', 'jng', 'jnge', 'jnl', 'jnle', 'jno', 'jnp', 'jns', 'jnz', 'jo', 'jp',
    'jpe', 'jpo', 'js', 'jz'
}

# Call mnemonics.
call_op = {
    'call'
}

# Return mnemonics.
ret_op = {
    'ret'
}

# Names recognized as register operands (lower-case).
x86_64_regs = {
    'al', 'ah', 'bl', 'bh', 'cl', 'ch', 'dl', 'dh', 'spl', 'bpl', 'sil', 'dil',
    'ax', 'bx', 'cx', 'dx', 'sp', 'bp', 'si', 'di',
    'eax', 'ebx', 'ecx', 'edx', 'esp', 'ebp', 'esi', 'edi',
    # Bug fix: the original listed 'rdx' twice and omitted 'rbx'.
    'rax', 'rbx', 'rcx', 'rdx', 'rsp', 'rbp', 'rsi', 'rdi',
    'r8b', 'r9b', 'r10b', 'r11b', 'r12b', 'r13b', 'r14b', 'r15b',
    'r8w', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w',
    'r8d', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d',
    'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15',
    'cs', 'ss', 'ds', 'es', 'fs', 'gs',
    # NOTE(review): the entries below are not standard x86-64 register names;
    # kept for backward compatibility with the original table — confirm intent.
    'ecs', 'ess', 'eds', 'ees', 'efs', 'egs',
    'rcs', 'rss', 'rds', 'res', 'rfs', 'rgs'
}


def is_jmp(op: str) -> bool:
    """True if ``op`` is a jump mnemonic (case-insensitive)."""
    return op.lower() in jmp_op


def is_conditional_jmp(op: str) -> bool:
    """True if ``op`` is any jump other than the unconditional ``jmp``."""
    return is_jmp(op) and op.lower() != 'jmp'


def is_call(op: str) -> bool:
    """True if ``op`` is a call mnemonic (case-insensitive)."""
    return op.lower() in call_op


def is_ret(op: str) -> bool:
    """True if ``op`` is a return mnemonic (case-insensitive)."""
    return op.lower() in ret_op


def is_reg(arg: str) -> bool:
    """True if operand ``arg`` names a known register (case-insensitive)."""
    return arg.lower() in x86_64_regs


class CFGBuilder:
    """Incrementally assembles basic blocks from a parsed instruction stream
    and resolves them into functions.

    The parser calls ``set_label``/``add_instr`` in source order; ``build``
    then wires up jump edges and call relations and returns the functions.
    """

    def __init__(self, context: 'ParseContext'):
        self._context = context
        self._blocks: List[asm2vec.asm.BasicBlock] = []
        # Index into self._blocks of the block currently receiving
        # instructions; -1 means no block is open.
        self._active_block = -1
        # Maps a label name to the index of the block it starts.
        self._block_labels: Dict[str, int] = dict()

    def _logger(self) -> logging.Logger:
        return self._context.logger().getChild(self.__class__.__name__)

    def _allocate_block(self) -> int:
        """Append a fresh empty block and return its index."""
        self._blocks.append(asm2vec.asm.BasicBlock())
        return len(self._blocks) - 1

    def _allocate_named_block(self, name: str) -> int:
        """Return the block index registered for ``name``, allocating one if needed."""
        if name in self._block_labels:
            return self._block_labels[name]
        else:
            idx = self._allocate_block()
            self._block_labels[name] = idx
            return idx

    def _get_active_block(self) -> asm2vec.asm.BasicBlock:
        return self._blocks[self._active_block]

    def _set_active_block(self, block_id: int) -> None:
        self._active_block = block_id

    def _has_active_block(self) -> bool:
        return self._active_block != -1

    def _close_active_block(self) -> None:
        self._active_block = -1

    def _add_jmp(self, op: str, args: List[str]) -> None:
        """Handle a jump terminator: close the active block and, for a
        conditional jump, open a fall-through successor block."""
        if len(args) != 1:
            raise_asm_syntax_error('Jump with single operand', '{} operands'.format(len(args)))
        cur_block = self._get_active_block()
        self._close_active_block()
        if is_conditional_jmp(op):
            # Allocate another basic block for more instructions since the current code point is reachable.
            # This may produce some empty basic blocks in the final output.
            self._set_active_block(self._allocate_block())
            self._get_active_block().add_predecessor(cur_block)

    def add_instr(self, op: str, args: List[str]) -> None:
        """Append one instruction to the active block, opening a block when
        none is active and closing it after control-transfer instructions."""
        if not self._has_active_block():
            # Allocate a new basic block.
            self._set_active_block(self._allocate_block())

        self._get_active_block().add_instruction(asm2vec.asm.Instruction(op, *args))
        if is_jmp(op):
            self._add_jmp(op, args)
        elif is_ret(op):
            # `ret` instruction encountered. Close current active block.
            self._close_active_block()

    def set_label(self, label: str) -> None:
        """Handle a label definition: reuse, create, or re-activate the block
        the label names, linking fall-through flow from the previous block."""
        block_id = self._block_labels.get(label, -1)
        if block_id == -1:
            # Test if the current active block is empty in which case we can reuse it.
            if self._has_active_block() and len(self._get_active_block()) == 0:
                self._block_labels[label] = self._active_block
            else:
                # Open a new block for the label.
                block_id = self._allocate_block()
                self._block_labels[label] = block_id
                # Link the new block with the previously-active block.
                if self._has_active_block():
                    self._get_active_block().add_successor(self._blocks[block_id])
                self._set_active_block(block_id)
        else:
            self._set_active_block(block_id)

    def build(self) -> List[asm2vec.asm.Function]:
        """Resolve jump/call targets into edges and return the discovered functions."""
        func_entries: Dict[str, int] = dict()

        # Walk through all instructions and fix block relations formed by jump and call instructions.
        for blk in self._blocks:
            for inst in blk:
                if is_jmp(inst.op()):
                    target = inst.args()[0]
                    if target in self._block_labels:
                        blk.add_successor(self._blocks[self._block_labels[target]])
                elif is_call(inst.op()):
                    target = inst.args()[0]
                    if target in self._block_labels and target not in func_entries:
                        func_entries[target] = self._block_labels[target]

        # Labels the caller explicitly named as functions are entries too.
        for func_name in self._context.options().func_names():
            if func_name not in self._block_labels:
                # Bug fix: logging interpolates with %-style placeholders;
                # the original passed a '{}' placeholder, so the message
                # failed to format when the warning was emitted.
                self._logger().warning('Cannot find function "%s"', func_name)
                continue
            if func_name not in func_entries:
                func_entries[func_name] = self._block_labels[func_name]

        funcs: Dict[str, asm2vec.asm.Function] = \
            dict(map(lambda x: (x[0], asm2vec.asm.Function(self._blocks[x[1]], x[0])), func_entries.items()))

        # Fix function call relation.
        for (name, f) in funcs.items():
            def block_action(block: asm2vec.asm.BasicBlock) -> None:
                for instr in block:
                    if is_call(instr.op()):
                        callee_name = instr.args()[0]
                        if callee_name in funcs:
                            f.add_callee(funcs[callee_name])

            asm2vec.asm.walk_cfg(f.entry(), block_action)

        # TODO: Implement Selective Callee Expansion here.

        return list(funcs.values())


class ParseOptions:
    """Keyword-argument bag controlling parser behavior."""

    def __init__(self, **kwargs):
        # 'func_names': labels that should always be treated as function entries.
        self._func_names = kwargs.get('func_names', [])

    def func_names(self) -> List[str]:
        """Return the list of labels to force-register as functions."""
        return self._func_names


class ParseContext:
    """Shared state for one parse run: the CFG builder, user options and a logger."""

    def __init__(self, **kwargs):
        # The builder keeps a back-reference to this context for options/logging.
        self._builder = CFGBuilder(self)
        self._options = ParseOptions(**kwargs)
        self._logger = logging.getLogger('asm2vec.ParseContext')

    def logger(self) -> logging.Logger:
        """Return the logger used by this parse run."""
        return self._logger

    def options(self) -> ParseOptions:
        """Return the options supplied at construction."""
        return self._options

    def builder(self) -> CFGBuilder:
        """Return the CFG builder accumulating parse results."""
        return self._builder


'''

Parser rules for input assembly file:

program
    : asm_line*
    ;

asm_line
    : asm_label '\n'
    | BLANKS asm_instr '\n'
    ;

asm_label
    : ASM_LABEL_ID ':'
    ;

asm_instr
    : ASM_INSTR_OP ' ' asm_instr_arg_list
    ;

asm_instr_arg_list
    : ASM_INSTR_ARG (',' asm_instr_arg_list)?
    | /* epsilon */
    ;

BLANKS : [ \n\t]+;

'''


def is_fullmatch(pattern, s: str) -> bool:
    """Return True when the compiled regex ``pattern`` matches the whole of ``s``."""
    match = pattern.fullmatch(s)
    return match is not None


def parse_asm_label(ln: str, context: 'ParseContext') -> None:
    """Parse an ``asm_label`` production (``name:``) and activate its block.

    Raises an assembly syntax error when the line does not end with ':'.
    """
    label_text = ln.strip()
    if label_text[-1] != ':':
        raise_asm_syntax_error('asm_label', ln)

    # Register/activate the block named by the label (colon dropped).
    context.builder().set_label(label_text[:-1])


def parse_asm_instr(ln: str, context: 'ParseContext') -> None:
    """Parse an ``asm_instr`` production: an operator optionally followed by a
    comma-separated operand list, and feed it to the builder."""
    op, sep, tail = ln.partition(' ')
    if sep:
        operands = [arg.strip() for arg in tail.split(',')]
    else:
        # The whole line is a bare operator with no operands.
        operands = []

    context.builder().add_instr(op, operands)


def parse_asm_line(ln: str, context: 'ParseContext') -> None:
    """Dispatch one source line: blank lines are ignored, indented lines are
    instructions, and unindented lines are labels."""
    stripped = ln.strip()
    if not stripped:
        return

    if ln[0].isspace():
        # Production: asm_line -> BLANKS asm_instr '\n'
        parse_asm_instr(stripped, context)
    else:
        # Production: asm_line -> asm_label '\n'
        parse_asm_label(ln, context)


def parse_asm_lines(lines: Iterable[str], **kwargs) -> List[asm2vec.asm.Function]:
    """Parse an iterable of assembly source lines and return the functions found.

    Keyword arguments are forwarded to ``ParseOptions`` (e.g. ``func_names``).
    """
    ctx = ParseContext(**kwargs)
    for line in lines:
        parse_asm_line(line, ctx)
    return ctx.builder().build()


================================================
FILE: asm2vec/internal/repr.py
================================================
import random
from typing import *
import concurrent.futures

from asm2vec.asm import Instruction
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import walk_cfg
from asm2vec.repo import SequentialFunction
from asm2vec.repo import VectorizedFunction
from asm2vec.repo import VectorizedToken
from asm2vec.repo import Token
from asm2vec.repo import FunctionRepository
from asm2vec.logging import asm2vec_logger

from asm2vec.internal.atomic import Atomic


def _random_walk(f: 'Function') -> List['Instruction']:
    """Collect instructions along one random path through ``f``'s CFG.

    Starts at the entry block and repeatedly hops to a uniformly random
    successor, stopping at a sink block or when a block would be revisited.
    """
    sequence: List['Instruction'] = []
    seen: Set[int] = set()
    block = f.entry()

    while block.id() not in seen:
        seen.add(block.id())
        sequence.extend(block)

        successors = block.successors()
        if not successors:
            break
        block = random.choice(successors)

    return sequence


def _edge_sampling(f: 'Function') -> List[List['Instruction']]:
    """Generate instruction sequences by sampling CFG edges of ``f`` until
    every distinct edge has been covered at least once.

    Each sample concatenates the instructions of the edge's two blocks.

    Bug fix: coverage is now measured against the number of *distinct*
    (source-id, target-id) pairs. The original compared ``visited_edges``
    (a set) against ``len(edges)`` (a list), which loops forever whenever the
    edge list contains duplicate pairs.
    """
    edges: List[Tuple['BasicBlock', 'BasicBlock']] = []

    def collect_edges(block: 'BasicBlock') -> None:
        nonlocal edges
        for successor in block.successors():
            edges.append((block, successor))

    walk_cfg(f.entry(), collect_edges)

    # Distinct edges, identified by the endpoint block IDs.
    distinct_edges: Set[Tuple[int, int]] = set((src.id(), dst.id()) for (src, dst) in edges)

    visited_edges: Set[Tuple[int, int]] = set()
    sequences = []
    while len(visited_edges) < len(distinct_edges):
        e = random.choice(edges)
        visited_edges.add((e[0].id(), e[1].id()))
        sequences.append(list(e[0]) + list(e[1]))

    return sequences


def make_sequential_function(f: 'Function', num_of_random_walks: int = 10) -> 'SequentialFunction':
    """Sequentialize ``f`` into ``num_of_random_walks`` random-walk instruction
    sequences and wrap them in a ``SequentialFunction``."""
    sequences: List[List['Instruction']] = [_random_walk(f) for _ in range(num_of_random_walks)]

    # Edge sampling is currently disabled:
    # sequences += _edge_sampling(f)

    return SequentialFunction(f.id(), f.name(), sequences)


def _get_function_tokens(f: 'Function', dim: int = 200) -> List['VectorizedToken']:
    """Collect one ``VectorizedToken`` per operator/operand occurrence found in
    any block reachable from ``f``'s entry."""
    tokens: List['VectorizedToken'] = []

    def collect_tokens(block: 'BasicBlock') -> None:
        nonlocal tokens
        for instr in block:
            # The operator and each operand each contribute one token.
            for name in [instr.op()] + instr.args():
                tokens.append(VectorizedToken(name, None, None, dim))

    walk_cfg(f.entry(), collect_tokens)
    return tokens


def _make_function_repo_helper(vocab: Dict[str, Token], funcs: List[Function],
                               dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository:
    """Vectorize ``funcs`` on a thread pool and merge their tokens into ``vocab``.

    ``vocab`` is mutated in place (token counts are updated) and the same dict
    object ends up inside the returned repository. ``jobs`` bounds the number
    of worker threads. Raises ``RuntimeError`` if any worker task fails.
    """
    # Count of functions processed; starts at 1 so the first progress log line
    # reports this function as already counted.
    progress = Atomic(1)

    vec_funcs_atomic = Atomic([])
    vocab_atomic = Atomic(vocab)

    def func_handler(f: Function):
        # Sequentialize the function and record its vectorized form.
        # NOTE(review): the function vector uses dimension 2*dim, presumably to
        # match the concatenated context representation used during training —
        # confirm against the training code.
        with vec_funcs_atomic.lock() as vfa:
            vfa.value().append(VectorizedFunction(make_sequential_function(f, num_of_rnd_walks), dim=dim*2))

        # Merge this function's tokens into the shared vocabulary under the lock.
        tokens = _get_function_tokens(f, dim)
        for tk in tokens:
            with vocab_atomic.lock() as va:
                if tk.name() in va.value():
                    va.value()[tk.name()].count += 1
                else:
                    va.value()[tk.name()] = Token(tk)

        # The progress counter is read before it is incremented below, so the
        # logged percentage counts this function as finished.
        asm2vec_logger().debug('Sequence generated for function "%s", progress: %f%%',
                               f.name(), progress.value() / len(funcs) * 100)
        with progress.lock() as prog:
            prog.set(prog.value() + 1)

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=jobs)
    fs = []
    for fn in funcs:
        fs.append(executor.submit(func_handler, fn))
    # Wait for every task; stop waiting early if any task raises.
    done, not_done = concurrent.futures.wait(fs, return_when=concurrent.futures.FIRST_EXCEPTION)

    if len(not_done) > 0 or any(map(lambda fut: fut.cancelled() or not fut.done(), done)):
        raise RuntimeError('Not all tasks finished successfully.')

    # ``vocab`` is the same object wrapped by vocab_atomic, so it already holds
    # every token recorded by the workers.
    vec_funcs = vec_funcs_atomic.value()
    repo = FunctionRepository(vec_funcs, vocab)

    # Re-calculate the frequency of each token.
    for t in repo.vocab().values():
        t.frequency = t.count / repo.num_of_tokens()

    return repo


def make_function_repo(funcs: List[Function], dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository:
    """Build a training repository from ``funcs`` with a fresh, empty vocabulary."""
    return _make_function_repo_helper(dict(), funcs, dim, num_of_rnd_walks, jobs)


def make_estimate_repo(vocabulary: Dict[str, Token], f: 'Function',
                       dim: int, num_of_rnd_walks: int) -> FunctionRepository:
    """Build a single-function repository for estimation.

    A shallow copy of ``vocabulary`` is taken so token counts recorded during
    estimation do not leak back into the trained repository.
    """
    vocab_copy: Dict[str, Token] = dict(**vocabulary)
    return _make_function_repo_helper(vocab_copy, [f], dim, num_of_rnd_walks, 1)


================================================
FILE: asm2vec/internal/sampling.py
================================================
from typing import *
import random

# Element type drawn by the sampler.
T = TypeVar('T')


class NegativeSampler:
    """Draws negative samples from a smoothed categorical distribution.

    Each weight is raised to the power ``alpha`` (default 3/4, as in word2vec)
    to flatten the distribution before sampling.
    """

    def __init__(self, distribution: List[Tuple[T, float]], alpha: float = 3 / 4):
        self._values = [value for (value, _) in distribution]
        self._weights = [weight ** alpha for (_, weight) in distribution]

    def sample(self, k: int) -> List[T]:
        """Draw ``k`` values with replacement, proportional to the smoothed weights."""
        return random.choices(self._values, self._weights, k=k)


================================================
FILE: asm2vec/internal/training.py
================================================
from typing import *
import math
import threading
import concurrent.futures

import numpy as np

from asm2vec.asm import Instruction
from asm2vec.internal.repr import FunctionRepository
from asm2vec.internal.repr import VectorizedFunction
from asm2vec.internal.repr import Token
from asm2vec.internal.repr import VectorizedToken
from asm2vec.internal.sampling import NegativeSampler
from asm2vec.internal.atomic import LockContextManager
from asm2vec.internal.atomic import Atomic
from asm2vec.logging import asm2vec_logger


class Asm2VecParams:
    """Hyper-parameters of an Asm2Vec run.

    Constructed from keyword arguments with defaults; round-trips through
    `to_dict` / `populate` (the latter expects bytes keys, as produced by
    binary codecs such as msgpack).
    """

    def __init__(self, **kwargs):
        self.d: int = kwargs.get('d', 200)  # token vector dimension (function vectors are 2*d)
        self.initial_alpha: float = kwargs.get('alpha', 0.0025)  # initial learning rate
        self.alpha_update_interval: int = kwargs.get('alpha_update_interval', 10000)
        self.num_of_rnd_walks: int = kwargs.get('rnd_walks', 3)  # random walks per function
        self.neg_samples: int = kwargs.get('neg_samples', 25)  # negative samples per token
        self.iteration: int = kwargs.get('iteration', 1)
        self.jobs: int = kwargs.get('jobs', 4)  # worker threads

    def to_dict(self) -> Dict[str, Any]:
        """Dump all parameters into a plain, serializable dict."""
        return {
            'd': self.d,
            'alpha': self.initial_alpha,
            'alpha_update_interval': self.alpha_update_interval,
            'num_of_rnd_walks': self.num_of_rnd_walks,
            'neg_samples': self.neg_samples,
            'iteration': self.iteration,
            'jobs': self.jobs
        }

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Restore parameters from a serialized dict with bytes keys.

        Bug fix: `to_dict` emits the key 'num_of_rnd_walks', but this method
        previously looked up only b'rnd_walks', so the value was silently reset
        to its default on every serialize/deserialize round trip. Accept both
        keys, preferring the one `to_dict` writes.
        """
        self.d = rep.get(b'd', 200)
        self.initial_alpha = rep.get(b'alpha', 0.0025)
        self.alpha_update_interval = rep.get(b'alpha_update_interval', 10000)
        self.num_of_rnd_walks = rep.get(b'num_of_rnd_walks', rep.get(b'rnd_walks', 3))
        self.neg_samples = rep.get(b'neg_samples', 25)
        self.iteration = rep.get(b'iteration', 1)
        self.jobs = rep.get(b'jobs', 4)


class SequenceWindow:
    """A sliding three-instruction window (previous, current, next) over an
    instruction sequence, with every token resolved through the vocabulary.

    Call `move_next()` to advance; the accessor methods expose the cached
    instructions and their vectorized op/operand tokens for the current
    position.
    """

    def __init__(self, sequence: List[Instruction], vocabulary: Dict[str, Token]):
        self._seq = sequence
        self._vocab = vocabulary
        self._i = 1  # index of the "current" instruction; starts at the second one

        # Cached views of the window; populated by move_next().
        self._prev_ins = self._curr_ins = self._next_ins = None
        self._prev_ins_op = self._curr_ins_op = self._next_ins_op = None
        self._prev_ins_args = self._curr_ins_args = self._next_ins_args = None

    def move_next(self) -> bool:
        """Advance the window one instruction.

        Returns False (leaving the cached views untouched) once the current
        instruction would no longer have both a predecessor and a successor.
        """
        if self._i + 1 >= len(self._seq):
            return False

        def lookup(name) -> VectorizedToken:
            return self._vocab[name].vectorized()

        triple = self._seq[self._i - 1:self._i + 2]
        self._prev_ins, self._curr_ins, self._next_ins = triple

        resolved = [(lookup(ins.op()), [lookup(a) for a in ins.args()]) for ins in triple]
        (self._prev_ins_op, self._prev_ins_args), \
            (self._curr_ins_op, self._curr_ins_args), \
            (self._next_ins_op, self._next_ins_args) = resolved

        self._i += 1
        return True

    def prev_ins(self) -> Instruction:
        return self._prev_ins

    def prev_ins_op(self) -> VectorizedToken:
        return self._prev_ins_op

    def prev_ins_args(self) -> List[VectorizedToken]:
        return self._prev_ins_args

    def curr_ins(self) -> Instruction:
        return self._curr_ins

    def curr_ins_op(self) -> VectorizedToken:
        return self._curr_ins_op

    def curr_ins_args(self) -> List[VectorizedToken]:
        return self._curr_ins_args

    def next_ins(self) -> Instruction:
        return self._next_ins

    def next_ins_op(self) -> VectorizedToken:
        return self._next_ins_op

    def next_ins_args(self) -> List[VectorizedToken]:
        return self._next_ins_args


class TrainingContext:
    """Shared, thread-safe state for one training or estimation run.

    Bundles the function repository, hyper-parameters, current learning rate,
    the negative sampler and named counters behind a single lock. The lock is
    a plain (non-reentrant) threading.Lock, so a method that acquires it must
    not be called while the lock is already held.
    """

    class Counter:
        """An integer counter whose operations are guarded by the owning context's lock."""

        def __init__(self, context: 'TrainingContext', name: str, initial: int = 0):
            self._context = context
            self._name = name
            self._val = initial

        def val(self) -> int:
            """Current value, read under the context lock."""
            with self._context.lock():
                return self._val

        def inc(self) -> int:
            """Atomically add one and return the new value."""
            with self._context.lock():
                self._val += 1
                return self._val

        def reset(self) -> int:
            """Atomically zero the counter and return the value it had before."""
            with self._context.lock():
                v = self._val
                self._val = 0
                return v

    # Well-known counter name, registered by train() for the (currently
    # disabled) learning-rate decay logic in _train_vectorized.
    TOKENS_HANDLED_COUNTER: str = "tokens_handled"

    def __init__(self, repo: FunctionRepository, params: Asm2VecParams, is_estimating: bool = False):
        self._repo = repo
        self._params = params
        self._alpha = params.initial_alpha
        # Negative sampler weighted by each token's corpus frequency
        # (smoothed inside NegativeSampler).
        self._sampler = NegativeSampler(list(map(lambda t: (t, t.frequency), repo.vocab().values())))
        self._is_estimating = is_estimating
        self._counters = dict()
        self._lock = threading.Lock()

    def repo(self) -> FunctionRepository:
        return self._repo

    def params(self) -> Asm2VecParams:
        return self._params

    def lock(self) -> LockContextManager:
        """Context manager that holds the shared (non-reentrant) lock for its duration."""
        return LockContextManager(self._lock)

    def alpha(self) -> float:
        """Current learning rate, read under the lock."""
        with self.lock():
            return self._alpha

    def set_alpha(self, alpha: float) -> None:
        """Replace the learning rate under the lock."""
        with self.lock():
            self._alpha = alpha

    def sampler(self) -> NegativeSampler:
        return self._sampler

    def is_estimating(self) -> bool:
        """True when only the function vector is trained; token and instruction
        vectors are left untouched (see _train_vectorized)."""
        return self._is_estimating

    def create_sequence_window(self, seq: List[Instruction]) -> SequenceWindow:
        """Make a sliding window over `seq`, resolving tokens via this repo's vocabulary."""
        return SequenceWindow(seq, self._repo.vocab())

    def get_counter(self, name: str) -> Counter:
        # Note: returns None when no counter of that name was registered.
        with self.lock():
            return self._counters.get(name)

    def add_counter(self, name: str, initial: int = 0) -> Counter:
        """Register (or replace) a named counter and return it."""
        with self.lock():
            c = self.__class__.Counter(self, name, initial)
            self._counters[name] = c
            return c


def _sigmoid(x: float) -> float:
    return 1 / (1 + np.exp(-x))


def _identity(cond: bool) -> int:
    return 1 if cond else 0


def _dot_sigmoid(lhs: np.ndarray, rhs: np.ndarray) -> float:
    """Sigmoid of the inner product of the two vectors."""
    # noinspection PyTypeChecker
    return _sigmoid(lhs.dot(rhs))


def _get_inst_repr(op: VectorizedToken, args: List[VectorizedToken]) -> np.ndarray:
    """Vector representation of a single instruction: the opcode vector
    concatenated with the mean of the operand vectors (zeros when the
    instruction has no operands)."""
    if args:
        arg_vec = np.average([tk.v for tk in args], axis=0)
    else:
        arg_vec = np.zeros(len(op.v))
    return np.hstack((op.v, arg_vec))


def _train_vectorized(wnd: SequenceWindow, f: VectorizedFunction, context: TrainingContext) -> None:
    """Run one negative-sampling training step on the current window position.

    The function vector and the two neighbouring instruction representations
    jointly predict every token of the current instruction. When
    `context.is_estimating()` is true, only the function vector is updated.
    """
    # delta: average of previous-instruction repr, function vector and
    # next-instruction repr (all of dimension 2*d).
    ct_prev = _get_inst_repr(wnd.prev_ins_op(), wnd.prev_ins_args())
    ct_next = _get_inst_repr(wnd.next_ins_op(), wnd.next_ins_args())
    delta = np.average([ct_prev, f.v, ct_next], axis=0)

    tokens = [wnd.curr_ins_op()] + wnd.curr_ins_args()

    f_grad = np.zeros(f.v.shape)
    for tk in tokens:
        # Negative sampling: draw `neg_samples` candidates and make sure the
        # positive token `tk` itself is among them.
        sampled_tokens: Dict[str, VectorizedToken] = \
            dict(map(lambda x: (x.name(), x.vectorized()), context.sampler().sample(context.params().neg_samples)))
        if tk.name() not in sampled_tokens:
            sampled_tokens[tk.name()] = tk

        # NOTE: learning-rate decay driven by TOKENS_HANDLED_COUNTER used to be
        # stubbed out here; not required for now.

        for sp_tk in sampled_tokens.values():
            # Bug fix: the gradient must be computed against the *sampled*
            # token's prediction vector (sp_tk.v_pred), not the positive
            # token's. Previously `tk.v_pred` was used throughout this loop,
            # so negative samples contributed nothing and their prediction
            # vectors were never updated. The label is 1 only for the
            # positive token (matched by name, consistent with the dict key).
            g = (_dot_sigmoid(delta, sp_tk.v_pred) - _identity(sp_tk.name() == tk.name())) * context.alpha()
            f_grad += g / 3 * sp_tk.v_pred

            if not context.is_estimating():
                with context.lock():
                    # Update v'_t of the sampled token.
                    sp_tk.v_pred -= g * delta

    # Apply the accumulated gradient to the function vector.
    with context.lock():
        f.v -= f_grad

    if not context.is_estimating():
        # Propagate the gradient to the neighbouring instructions: the first
        # half of f_grad corresponds to opcode vectors, the second half is
        # split evenly across the operand vectors.
        d = len(f_grad) // 2

        with context.lock():
            wnd.prev_ins_op().v -= f_grad[:d]
            if len(wnd.prev_ins_args()) > 0:
                prev_args_grad = f_grad[d:] / len(wnd.prev_ins_args())
                for t in wnd.prev_ins_args():
                    t.v -= prev_args_grad

            wnd.next_ins_op().v -= f_grad[:d]
            if len(wnd.next_ins_args()) > 0:
                next_args_grad = f_grad[d:] / len(wnd.next_ins_args())
                for t in wnd.next_ins_args():
                    t.v -= next_args_grad


def _train_sequence(f: VectorizedFunction, seq: List[Instruction], context: TrainingContext) -> None:
    """Slide a window across `seq` and run one training step at every position."""
    window = context.create_sequence_window(seq)
    has_more = window.move_next()
    while has_more:
        _train_vectorized(window, f, context)
        has_more = window.move_next()


def train(repository: FunctionRepository, params: Asm2VecParams) -> None:
    """Train every function vector in `repository` in parallel.

    Raises RuntimeError if any worker task fails; the first worker exception
    is attached as the cause.
    """
    context = TrainingContext(repository, params)
    context.add_counter(TrainingContext.TOKENS_HANDLED_COUNTER)

    asm2vec_logger().debug('Total number of functions: %d', len(context.repo().funcs()))
    progress = Atomic(1)

    def train_function(fn: VectorizedFunction):
        # Train the function over every generated instruction sequence.
        for seq in fn.sequential().sequences():
            _train_sequence(fn, seq, context)

        asm2vec_logger().debug('Function "%s" trained, progress: %f%%',
                               fn.sequential().name(), progress.value() / len(context.repo().funcs()) * 100)
        with progress.lock() as prog_proxy:
            prog_proxy.set(prog_proxy.value() + 1)

    # Use the executor as a context manager so its worker threads are always
    # joined and released (the original leaked the executor on failure).
    with concurrent.futures.ThreadPoolExecutor(max_workers=context.params().jobs) as executor:
        futures = [executor.submit(train_function, f) for f in context.repo().funcs()]
        done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)

    if len(not_done) > 0:
        raise RuntimeError('Train failed due to one or more failed task.')
    for fut in done:
        exc = fut.exception()
        if exc is not None:
            # Surface the first worker failure instead of swallowing it.
            raise RuntimeError('Train failed due to one or more failed task.') from exc


def estimate(f: VectorizedFunction, estimate_repo: FunctionRepository, params: Asm2VecParams) -> np.ndarray:
    """Infer the vector of a single unseen function against a trained vocabulary.

    Runs the training loop in estimation mode (token vectors frozen) and
    returns the resulting function vector.
    """
    ctx = TrainingContext(estimate_repo, params, is_estimating=True)
    for sequence in f.sequential().sequences():
        _train_sequence(f, sequence, ctx)
    return f.v


================================================
FILE: asm2vec/internal/util.py
================================================
import numpy as np


def make_small_ndarray(dim: int) -> np.ndarray:
    """Random vector of length `dim` with entries uniform in [-0.5/dim, 0.5/dim)."""
    generator = np.random.default_rng()
    centered = generator.random(dim) - 0.5
    return centered / dim


================================================
FILE: asm2vec/logging.py
================================================
import logging


def asm2vec_logger() -> logging.Logger:
    """Return the shared package-wide 'asm2vec' logger."""
    logger_name = 'asm2vec'
    return logging.getLogger(logger_name)


def config_asm2vec_logging(**kwargs):
    """Configure the package logger.

    Keyword arguments:
        level: logging level (default logging.WARNING).
        handlers: iterable of handlers to attach (default none).
        filters: iterable of filters to attach (default none).
    """
    logger = asm2vec_logger()
    logger.setLevel(kwargs.get('level', logging.WARNING))
    for handler in kwargs.get('handlers', []):
        logger.addHandler(handler)
    for log_filter in kwargs.get('filters', []):
        logger.addFilter(log_filter)


================================================
FILE: asm2vec/model.py
================================================
from typing import *

import numpy as np

import asm2vec.asm
import asm2vec.repo

import asm2vec.internal.training
import asm2vec.internal.repr
import asm2vec.internal.util


class Asm2VecMemento:
    """Serializable snapshot of a trained model: hyper-parameters plus vocabulary."""

    def __init__(self):
        # Both fields are filled in by Asm2Vec.memento() or populate().
        self.params: Optional[asm2vec.internal.training.Asm2VecParams] = None
        self.vocab: Optional[Dict[str, asm2vec.repo.Token]] = None

    def serialize(self) -> Dict[str, Any]:
        """Dump to a plain dict of primitives suitable for binary codecs."""
        rep = {'params': self.params.to_dict()}
        rep['vocab'] = asm2vec.repo.serialize_vocabulary(self.vocab)
        return rep

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Restore state from a serialized dict whose keys are bytes."""
        restored = asm2vec.internal.training.Asm2VecParams()
        restored.populate(rep[b'params'])
        self.params = restored
        self.vocab = asm2vec.repo.deserialize_vocabulary(rep[b'vocab'])


class Asm2Vec:
    """High-level facade over the internal asm2vec training and estimation machinery."""

    def __init__(self, **kwargs):
        # Hyper-parameters are taken from keyword arguments; the vocabulary is
        # learned during train().
        self._params = asm2vec.internal.training.Asm2VecParams(**kwargs)
        self._vocab = None

    def memento(self) -> Asm2VecMemento:
        """Snapshot the current hyper-parameters and vocabulary."""
        snapshot = Asm2VecMemento()
        snapshot.params = self._params
        snapshot.vocab = self._vocab
        return snapshot

    def set_memento(self, memento: Asm2VecMemento) -> None:
        """Restore hyper-parameters and vocabulary from a snapshot."""
        self._params = memento.params
        self._vocab = memento.vocab

    def make_function_repo(self, funcs: List[asm2vec.asm.Function]) -> asm2vec.repo.FunctionRepository:
        """Build a training repository from raw parsed functions."""
        return asm2vec.internal.repr.make_function_repo(
            funcs, self._params.d, self._params.num_of_rnd_walks, self._params.jobs)

    def train(self, repo: asm2vec.repo.FunctionRepository) -> None:
        """Train on `repo` and adopt its vocabulary for later estimation."""
        asm2vec.internal.training.train(repo, self._params)
        self._vocab = repo.vocab()

    def to_vec(self, f: asm2vec.asm.Function) -> np.ndarray:
        """Estimate the embedding vector of a previously unseen function."""
        estimate_repo = asm2vec.internal.repr.make_estimate_repo(
            self._vocab, f, self._params.d, self._params.num_of_rnd_walks)
        target = estimate_repo.funcs()[0]
        asm2vec.internal.training.estimate(target, estimate_repo, self._params)
        return target.v


================================================
FILE: asm2vec/parse.py
================================================
from typing import *

import asm2vec.asm
import asm2vec.internal.parse

from asm2vec.internal.parse import AssemblySyntaxError


def parse_text(asm: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source held in a string; kwargs are forwarded to the parser."""
    lines = asm.split('\n')
    return asm2vec.internal.parse.parse_asm_lines(lines, **kwargs)


def parse_fp(fp, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly read from a file-like object yielding text lines."""
    return asm2vec.internal.parse.parse_asm_lines(fp, **kwargs)


def parse(asm_file_name: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Open the file at `asm_file_name` and parse the functions it contains."""
    with open(asm_file_name, 'r') as source:
        return parse_fp(source, **kwargs)


================================================
FILE: asm2vec/repo.py
================================================
from typing import *

import numpy as np

import asm2vec.asm
import asm2vec.internal.util


class SequentialFunction:
    """A function reduced to linear instruction sequences, identified by id and name."""

    def __init__(self, fid: int, name: str, sequences: List[List[asm2vec.asm.Instruction]]):
        self._id = fid
        self._name = name
        self._sequences = sequences

    def id(self) -> int:
        """Numeric identifier of the original function."""
        return self._id

    def name(self) -> str:
        """Symbolic name of the original function."""
        return self._name

    def sequences(self) -> List[List[asm2vec.asm.Instruction]]:
        """The linear instruction sequences generated for this function."""
        return self._sequences


class VectorizedFunction:
    """Pairs a SequentialFunction with its trainable embedding vector `v`."""

    def __init__(self, f: SequentialFunction, v: np.ndarray = None, dim: int = 400):
        self._f = f
        # `dim` is consulted only when no vector is supplied; function vectors
        # are twice the token dimension, hence 400 for the default d=200.
        if v is None:
            v = asm2vec.internal.util.make_small_ndarray(dim)
        self.v = v

    def sequential(self) -> SequentialFunction:
        """The underlying sequential representation."""
        return self._f


class VectorizedToken:
    """A vocabulary token with its embedding `v` and prediction vector `v_pred`.

    `v` defaults to zeros of length `dim`; `v_pred` defaults to small random
    noise of length 2*dim. Equality (and hashing) is by token name only.
    """

    def __init__(self, name: str, v: np.ndarray = None, v_pred: np.ndarray = None, dim: int = 200):
        self._name = name
        self.v = v if v is not None else np.zeros(dim)
        self.v_pred = v_pred if v_pred is not None else asm2vec.internal.util.make_small_ndarray(dim * 2)

    def __eq__(self, other):
        if not isinstance(other, VectorizedToken):
            return False

        return self._name == other._name

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Bug fix: defining __eq__ without __hash__ made instances unhashable
        # (Python implicitly sets __hash__ = None). Hash by name, consistent
        # with __eq__; the name never changes after construction.
        return hash(self._name)

    def name(self) -> str:
        return self._name


class Token:
    """Bookkeeping wrapper around a VectorizedToken: occurrence count and frequency."""

    def __init__(self, vt: VectorizedToken, count: int = 1):
        self._vt = vt
        self.count: int = count  # number of occurrences seen so far
        self.frequency: float = 0  # count / total tokens; recomputed by the repo builder

    def vectorized(self) -> VectorizedToken:
        """The wrapped token carrying the embedding vectors."""
        return self._vt

    def name(self) -> str:
        """Delegates to the wrapped token's name."""
        return self._vt.name()


class FunctionRepository:
    """Holds the vectorized functions and the vocabulary used for training/estimation."""

    def __init__(self, funcs: List[VectorizedFunction], vocab: Dict[str, Token]):
        self._funcs = funcs
        self._vocab = vocab
        # Cache the corpus size: the sum of every token's occurrence count.
        self._num_of_tokens = sum(token.count for token in vocab.values())

    def funcs(self) -> List[VectorizedFunction]:
        return self._funcs

    def vocab(self) -> Dict[str, Token]:
        return self._vocab

    def num_of_tokens(self) -> int:
        """Total token occurrences across the corpus (computed at construction)."""
        return self._num_of_tokens


def _serialize_token(token: Token) -> Dict[str, Any]:
    """Flatten a Token and its vectors into serialization-friendly primitives."""
    vt = token.vectorized()
    return {
        'name': token.name(),
        'v': list(vt.v),
        'v_pred': list(vt.v_pred),
        'count': token.count,
        'frequency': token.frequency
    }


def _deserialize_token(rep: Dict[bytes, Any]) -> Token:
    """Rebuild a Token from its serialized form (bytes keys, bytes strings)."""
    vt = VectorizedToken(rep[b'name'].decode('utf-8'),
                         np.array(rep[b'v']),
                         np.array(rep[b'v_pred']))
    token = Token(vt, rep[b'count'])
    token.frequency = rep[b'frequency']
    return token


def serialize_vocabulary(vocab: Dict[str, Token]) -> Dict[str, Any]:
    """Serialize every token in the vocabulary, keyed by token name."""
    return {name: _serialize_token(token) for name, token in vocab.items()}


def deserialize_vocabulary(rep: Dict[bytes, Any]) -> Dict[str, Token]:
    """Inverse of serialize_vocabulary; keys arrive as bytes and are decoded."""
    return {name.decode('utf-8'): _deserialize_token(token_rep)
            for name, token_rep in rep.items()}


def _serialize_sequence(seq: List[asm2vec.asm.Instruction]) -> List[Any]:
    """Serialize one instruction sequence as [op, args-list] pairs."""
    return [[instr.op(), instr.args()] for instr in seq]


def _deserialize_sequence(rep: List[Any]) -> List[asm2vec.asm.Instruction]:
    """Rebuild an instruction sequence from serialized [op, args-list] pairs.

    Bug fix: _serialize_sequence stores the operands as a *list* (instr.args()),
    but this function previously called .decode('utf-8') on the list itself —
    an AttributeError for any instruction with operands — and passed it as one
    positional argument. Decode each operand and splat them into
    Instruction(op, *args), matching Instruction's (op, *args) signature.
    """
    def rebuild(pair: List[Any]) -> asm2vec.asm.Instruction:
        op = pair[0].decode('utf-8')
        args = [a.decode('utf-8') for a in pair[1]]
        return asm2vec.asm.Instruction(op, *args)

    return [rebuild(p) for p in rep]


def _serialize_vectorized_function(func: VectorizedFunction, include_sequences: bool) -> Dict[str, Any]:
    """Flatten a vectorized function; sequences are optional because they can be large."""
    seq_func = func.sequential()
    data = {
        'id': seq_func.id(),
        'name': seq_func.name(),
        'v': list(func.v)
    }

    if include_sequences:
        data['sequences'] = [_serialize_sequence(s) for s in seq_func.sequences()]

    return data


def _deserialize_vectorized_function(rep: Dict[bytes, Any]) -> VectorizedFunction:
    """Rebuild a VectorizedFunction; absent sequences deserialize to an empty list."""
    sequences = [_deserialize_sequence(s) for s in rep.get(b'sequences', [])]
    seq_func = SequentialFunction(rep[b'id'], rep[b'name'].decode('utf-8'), sequences)
    return VectorizedFunction(seq_func, np.array(rep[b'v']))


SERIALIZE_VOCABULARY: int = 1
SERIALIZE_FUNCTION: int = 2
SERIALIZE_FUNCTION_SEQUENCES: int = 4
SERIALIZE_ALL: int = SERIALIZE_VOCABULARY | SERIALIZE_FUNCTION | SERIALIZE_FUNCTION_SEQUENCES


def serialize_function_repo(repo: FunctionRepository, flags: int) -> Dict[str, Any]:
    """Serialize the parts of `repo` selected by the SERIALIZE_* bit flags."""
    data = dict()
    if flags & SERIALIZE_VOCABULARY:
        data['vocab'] = serialize_vocabulary(repo.vocab())
    if flags & SERIALIZE_FUNCTION:
        # Sequences are only embedded when their flag is also set.
        with_sequences = bool(flags & SERIALIZE_FUNCTION_SEQUENCES)
        data['funcs'] = [_serialize_vectorized_function(f, with_sequences)
                         for f in repo.funcs()]

    return data


def deserialize_function_repo(rep: Dict[bytes, Any]) -> FunctionRepository:
    """Inverse of serialize_function_repo; missing sections default to empty."""
    funcs = [_deserialize_vectorized_function(fr) for fr in rep.get(b'funcs', [])]
    vocab = deserialize_vocabulary(rep.get(b'vocab', dict()))
    return FunctionRepository(funcs, vocab)


================================================
FILE: examples/estimating.s
================================================
my_strlen_est:
        cmp     BYTE PTR [rdi], 0
        je      .L4
        mov     rax, rdi
.L3:
        add     rax, 1
        cmp     BYTE PTR [rax], 0
        jne     .L3
.L2:
        sub     rax, rdi
        ret
.L4:
        mov     rax, rdi
        jmp     .L2
my_strcmp_est:
        movzx   eax, BYTE PTR [rdi]
        test    al, al
        je      .L12
.L7:
        movzx   edx, BYTE PTR [rsi]
        test    dl, dl
        je      .L15
        cmp     dl, al
        jne     .L16
        add     rdi, 1
        add     rsi, 1
        movzx   eax, BYTE PTR [rdi]
        test    al, al
        jne     .L7
.L12:
        cmp     BYTE PTR [rsi], 0
        setne   dl
        movzx   edx, dl
        neg     edx
.L6:
        mov     eax, edx
        ret
.L16:
        movsx   eax, al
        movsx   edx, dl
        sub     eax, edx
        mov     edx, eax
        jmp     .L6
.L15:
        mov     edx, 1
        test    al, al
        jne     .L6
        jmp     .L12
.LC0:
        .string "%s"
.LC1:
        .string "%d\n"
main:
        sub     rsp, 264
        lea     rsi, [rsp+128]
        mov     edi, OFFSET FLAT:.LC0
        mov     eax, 0
        call    scanf
        mov     rsi, rsp
        mov     edi, OFFSET FLAT:.LC0
        mov     eax, 0
        call    scanf
        lea     rdi, [rsp+128]
        call    my_strlen_est
        mov     esi, eax
        mov     edi, OFFSET FLAT:.LC1
        mov     eax, 0
        call    printf
        mov     rsi, rsp
        lea     rdi, [rsp+128]
        call    my_strcmp_est
        mov     esi, eax
        mov     edi, OFFSET FLAT:.LC1
        mov     eax, 0
        call    printf
        mov     eax, 0
        add     rsp, 264
        ret

================================================
FILE: examples/training-estimating.py
================================================
import numpy as np

import asm2vec.asm
import asm2vec.parse
import asm2vec.model


def cosine_similarity(v1, v2):
    """Cosine of the angle between two vectors (undefined for zero-norm input)."""
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product


def main():
    """Train on training.s, then estimate the functions of estimating.s and
    print pairwise cosine similarities."""
    training_funcs = asm2vec.parse.parse('training.s',
                                         func_names=['main', 'my_strlen_train', 'my_strcmp_train'])
    estimating_funcs = asm2vec.parse.parse('estimating.s',
                                           func_names=['main', 'my_strlen_est', 'my_strcmp_est'])

    print('# of training functions:', len(training_funcs))
    print('# of estimating functions:', len(estimating_funcs))

    model = asm2vec.model.Asm2Vec(d=200)
    training_repo = model.make_function_repo(training_funcs)
    model.train(training_repo)
    print('Training complete.')

    trained = training_repo.funcs()
    for tf in trained:
        print('Norm of trained function "{}" = {}'.format(tf.sequential().name(), np.linalg.norm(tf.v)))

    estimating_funcs_vec = [model.to_vec(f) for f in estimating_funcs]
    print('Estimating complete.')

    estimated_pairs = list(zip(estimating_funcs, estimating_funcs_vec))
    for ef, efv in estimated_pairs:
        print('Norm of trained function "{}" = {}'.format(ef.name(), np.linalg.norm(efv)))

    for tf in trained:
        for ef, efv in estimated_pairs:
            sim = cosine_similarity(tf.v, efv)
            print('sim("{}", "{}") = {}'.format(tf.sequential().name(), ef.name(), sim))


if __name__ == '__main__':
    main()


================================================
FILE: examples/training.s
================================================
my_strlen_train:
        push    rbp
        mov     rbp, rsp
        mov     QWORD PTR [rbp-24], rdi
        mov     rax, QWORD PTR [rbp-24]
        mov     QWORD PTR [rbp-8], rax
        jmp     .L2
.L3:
        add     QWORD PTR [rbp-8], 1
.L2:
        mov     rax, QWORD PTR [rbp-8]
        movzx   eax, BYTE PTR [rax]
        test    al, al
        jne     .L3
        mov     rax, QWORD PTR [rbp-8]
        sub     rax, QWORD PTR [rbp-24]
        pop     rbp
        ret
my_strcmp_train:
        push    rbp
        mov     rbp, rsp
        mov     QWORD PTR [rbp-8], rdi
        mov     QWORD PTR [rbp-16], rsi
        jmp     .L6
.L10:
        mov     rax, QWORD PTR [rbp-8]
        movzx   edx, BYTE PTR [rax]
        mov     rax, QWORD PTR [rbp-16]
        movzx   eax, BYTE PTR [rax]
        cmp     dl, al
        je      .L7
        mov     rax, QWORD PTR [rbp-8]
        movzx   eax, BYTE PTR [rax]
        movsx   edx, al
        mov     rax, QWORD PTR [rbp-16]
        movzx   eax, BYTE PTR [rax]
        movsx   eax, al
        sub     edx, eax
        mov     eax, edx
        jmp     .L8
.L7:
        add     QWORD PTR [rbp-8], 1
        add     QWORD PTR [rbp-16], 1
.L6:
        mov     rax, QWORD PTR [rbp-8]
        movzx   eax, BYTE PTR [rax]
        test    al, al
        je      .L9
        mov     rax, QWORD PTR [rbp-16]
        movzx   eax, BYTE PTR [rax]
        test    al, al
        jne     .L10
.L9:
        mov     rax, QWORD PTR [rbp-8]
        movzx   eax, BYTE PTR [rax]
        test    al, al
        je      .L11
        mov     eax, 1
        jmp     .L8
.L11:
        mov     rax, QWORD PTR [rbp-16]
        movzx   eax, BYTE PTR [rax]
        test    al, al
        je      .L12
        mov     eax, -1
        jmp     .L8
.L12:
        mov     eax, 0
.L8:
        pop     rbp
        ret
.LC0:
        .string "%s"
.LC1:
        .string "%d\n"
main:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 256
        lea     rax, [rbp-128]
        mov     rsi, rax
        mov     edi, OFFSET FLAT:.LC0
        mov     eax, 0
        call    scanf
        lea     rax, [rbp-256]
        mov     rsi, rax
        mov     edi, OFFSET FLAT:.LC0
        mov     eax, 0
        call    scanf
        lea     rax, [rbp-128]
        mov     rdi, rax
        call    my_strlen_train
        mov     esi, eax
        mov     edi, OFFSET FLAT:.LC1
        mov     eax, 0
        call    printf
        lea     rdx, [rbp-256]
        lea     rax, [rbp-128]
        mov     rsi, rdx
        mov     rdi, rax
        call    my_strcmp_train
        mov     esi, eax
        mov     edi, OFFSET FLAT:.LC1
        mov     eax, 0
        call    printf
        mov     eax, 0
        leave
        ret

================================================
FILE: tests/asm_test.py
================================================
import unittest as ut

import asm2vec.asm as asm


class InstructionTest(ut.TestCase):
    """Tests for asm.parse_instruction across operand counts."""

    def _check_parsed(self, text, expected_op, expected_args):
        # Shared assertion helper: parse `text` and compare op and operands.
        ins = asm.parse_instruction(text)
        self.assertEqual(expected_op, ins.op(), 'Operators not equal')
        self.assertListEqual(expected_args, ins.args(), 'Operands not equal')

    def test_parse_instruction(self):
        self._check_parsed('mov eax, ebx', 'mov', ['eax', 'ebx'])

    def test_parse_instruction_one_operand(self):
        self._check_parsed('inc eax', 'inc', ['eax'])

    def test_parse_instruction_no_operands(self):
        self._check_parsed('ret', 'ret', [])


class BasicBlockTest(ut.TestCase):
    # Placeholder: no BasicBlock tests have been written yet.
    pass


class FunctionTest(ut.TestCase):
    # Placeholder: no Function tests have been written yet.
    pass


================================================
FILE: tests/parse_test.py
================================================
import unittest as ut

import asm2vec.parse


test_asm = """
my_strlen:
        push    rbp
        mov     rbp, rsp
        mov     QWORD PTR [rbp-24], rdi
        mov     rax, QWORD PTR [rbp-24]
        mov     QWORD PTR [rbp-8], rax
        jmp     .L2
.L3:
        add     QWORD PTR [rbp-8], 1
.L2:
        mov     rax, QWORD PTR [rbp-8]
        movzx   eax, BYTE PTR [rax]
        test    al, al
        jne     .L3
        mov     rax, QWORD PTR [rbp-8]
        sub     rax, QWORD PTR [rbp-24]
        pop     rbp
        ret
.LC0:
        .string "%s"
.LC1:
        .string "%d\\n"
main:
        push    rbp
        mov     rbp, rsp
        add     rsp, -128
        lea     rax, [rbp-128]
        mov     rsi, rax
        mov     edi, OFFSET FLAT:.LC0
        mov     eax, 0
        call    scanf
        lea     rax, [rbp-128]
        mov     rdi, rax
        call    my_strlen
        mov     esi, eax
        mov     edi, OFFSET FLAT:.LC1
        mov     eax, 0
        call    printf
        mov     eax, 0
        leave
        ret
"""


class ParseTest(ut.TestCase):
    """End-to-end check of text parsing and caller/callee wiring."""

    def test_parse_text(self):
        funcs = asm2vec.parse.parse_text(test_asm, func_names=['main', 'my_strlen'])
        self.assertEqual(2, len(funcs))
        self.assertEqual({'main', 'my_strlen'}, set(map(lambda f: f.name(), funcs)))

        by_name = {f.name(): f for f in funcs}
        main_func = by_name['main']
        my_strlen_func = by_name['my_strlen']

        # main calls my_strlen, and only my_strlen; the reverse edge must exist too.
        self.assertListEqual(['my_strlen'], [g.name() for g in main_func.callees()])
        self.assertListEqual(['main'], [g.name() for g in my_strlen_func.callers()])


================================================
FILE: tests/utilities_test.py
================================================
import unittest as ut

import asm2vec.internal.util as utilities


class PermutationTest(ut.TestCase):
    """Tests for the permute / inverse_permute helpers in asm2vec.internal.util.

    Convention exercised by the fixtures: ``permute(v, p)[i] == v[p[i]]``,
    and ``inverse_permute`` undoes that mapping.
    """

    def test_permute(self):
        """permute places v[p[i]] at output position i."""
        v = [10, 20, 30, 40, 50]
        p = [2, 4, 1, 0, 3]
        pv = utilities.permute(v, p)
        # e.g. pv[0] == v[2] == 30, pv[1] == v[4] == 50, ...
        self.assertListEqual([30, 50, 20, 10, 40], pv, 'Permuted vectors not equal.')

    def test_inv_permute(self):
        """inverse_permute recovers the original order from a permuted vector."""
        v = [30, 50, 20, 10, 40]
        p = [2, 4, 1, 0, 3]
        pv = utilities.inverse_permute(v, p)
        self.assertListEqual([10, 20, 30, 40, 50], pv, 'Inverse permuted vectors not equal.')
Download .txt
gitextract_5xp0becm/

├── .gitignore
├── README.md
├── asm2vec/
│   ├── __init__.py
│   ├── asm.py
│   ├── internal/
│   │   ├── __init__.py
│   │   ├── atomic.py
│   │   ├── parse.py
│   │   ├── repr.py
│   │   ├── sampling.py
│   │   ├── training.py
│   │   └── util.py
│   ├── logging.py
│   ├── model.py
│   ├── parse.py
│   └── repo.py
├── examples/
│   ├── estimating.s
│   ├── training-estimating.py
│   └── training.s
└── tests/
    ├── asm_test.py
    ├── parse_test.py
    └── utilities_test.py
Download .txt
SYMBOL INDEX (209 symbols across 15 files)

FILE: asm2vec/asm.py
  class Instruction (line 4) | class Instruction:
    method __init__ (line 5) | def __init__(self, op: str, *args: str):
    method op (line 9) | def op(self) -> str:
    method number_of_args (line 12) | def number_of_args(self) -> int:
    method args (line 15) | def args(self) -> List[str]:
  function parse_instruction (line 19) | def parse_instruction(code: str) -> Instruction:
  class BasicBlock (line 29) | class BasicBlock:
    method __init__ (line 32) | def __init__(self):
    method __iter__ (line 41) | def __iter__(self):
    method __len__ (line 44) | def __len__(self):
    method __hash__ (line 47) | def __hash__(self):
    method __eq__ (line 50) | def __eq__(self, other):
    method __ne__ (line 55) | def __ne__(self, other):
    method id (line 58) | def id(self) -> int:
    method add_instruction (line 61) | def add_instruction(self, instr: Instruction) -> None:
    method body_instructions (line 64) | def body_instructions(self) -> List[Instruction]:
    method instructions (line 67) | def instructions(self) -> List[Instruction]:
    method add_predecessor (line 70) | def add_predecessor(self, predecessor: 'BasicBlock') -> None:
    method add_successor (line 74) | def add_successor(self, successor: 'BasicBlock') -> None:
    method first_instruction (line 78) | def first_instruction(self) -> Instruction:
    method last_instruction (line 81) | def last_instruction(self) -> Instruction:
    method predecessors (line 84) | def predecessors(self) -> List['BasicBlock']:
    method in_degree (line 87) | def in_degree(self) -> int:
    method successors (line 90) | def successors(self) -> List['BasicBlock']:
    method out_degree (line 93) | def out_degree(self) -> int:
  class CFGWalkerCallback (line 97) | class CFGWalkerCallback:
    method __call__ (line 98) | def __call__(self, *args, **kwargs):
    method on_enter (line 101) | def on_enter(self, block: BasicBlock) -> None:
    method on_exit (line 104) | def on_exit(self, block: BasicBlock) -> None:
  function _walk_cfg (line 111) | def _walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType, visited:...
  function walk_cfg (line 125) | def walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType) -> None:
  class Function (line 129) | class Function:
    method __init__ (line 132) | def __init__(self, entry: BasicBlock, name: str = None):
    method __len__ (line 142) | def __len__(self) -> int:
    method __hash__ (line 152) | def __hash__(self):
    method __eq__ (line 155) | def __eq__(self, other):
    method __ne__ (line 160) | def __ne__(self, other):
    method id (line 163) | def id(self) -> int:
    method entry (line 166) | def entry(self) -> BasicBlock:
    method name (line 169) | def name(self) -> str:
    method add_callee (line 172) | def add_callee(self, f: 'Function') -> None:
    method callees (line 176) | def callees(self) -> List['Function']:
    method out_degree (line 179) | def out_degree(self) -> int:
    method add_caller (line 182) | def add_caller(self, f: 'Function') -> None:
    method callers (line 186) | def callers(self) -> List['Function']:
    method in_degree (line 189) | def in_degree(self) -> int:

FILE: asm2vec/internal/atomic.py
  class LockContextManager (line 5) | class LockContextManager:
    method __init__ (line 6) | def __init__(self, lock: threading.Lock):
    method __enter__ (line 10) | def __enter__(self):
    method __exit__ (line 13) | def __exit__(self, exc_type, exc_val, exc_tb):
    method exited (line 17) | def exited(self) -> bool:
  class Atomic (line 21) | class Atomic:
    class AtomicContextManager (line 22) | class AtomicContextManager(LockContextManager):
      method __init__ (line 23) | def __init__(self, atomic: 'Atomic'):
      method __enter__ (line 28) | def __enter__(self):
      method __exit__ (line 32) | def __exit__(self, exc_type, exc_val, exc_tb):
      method value (line 35) | def value(self) -> Any:
      method set (line 40) | def set(self, value: Any) -> None:
    method __init__ (line 45) | def __init__(self, value: Any):
    method lock (line 49) | def lock(self) -> AtomicContextManager:
    method value (line 52) | def value(self) -> Any:

FILE: asm2vec/internal/parse.py
  class AssemblySyntaxError (line 7) | class AssemblySyntaxError(Exception):
    method __init__ (line 8) | def __init__(self, message: str = None):
    method message (line 11) | def message(self) -> str:
  function raise_asm_syntax_error (line 15) | def raise_asm_syntax_error(expect: str, found: str) -> None:
  function is_jmp (line 48) | def is_jmp(op: str) -> bool:
  function is_conditional_jmp (line 52) | def is_conditional_jmp(op: str) -> bool:
  function is_call (line 56) | def is_call(op: str) -> bool:
  function is_ret (line 60) | def is_ret(op: str) -> bool:
  function is_reg (line 64) | def is_reg(arg: str) -> bool:
  class CFGBuilder (line 68) | class CFGBuilder:
    method __init__ (line 69) | def __init__(self, context: 'ParseContext'):
    method _logger (line 75) | def _logger(self) -> logging.Logger:
    method _allocate_block (line 78) | def _allocate_block(self) -> int:
    method _allocate_named_block (line 82) | def _allocate_named_block(self, name: str) -> int:
    method _get_active_block (line 90) | def _get_active_block(self) -> asm2vec.asm.BasicBlock:
    method _set_active_block (line 93) | def _set_active_block(self, block_id: int) -> None:
    method _has_active_block (line 96) | def _has_active_block(self) -> bool:
    method _close_active_block (line 99) | def _close_active_block(self) -> None:
    method _add_jmp (line 102) | def _add_jmp(self, op: str, args: List[str]) -> None:
    method add_instr (line 113) | def add_instr(self, op: str, args: List[str]) -> None:
    method set_label (line 125) | def set_label(self, label: str) -> None:
    method build (line 142) | def build(self) -> List[asm2vec.asm.Function]:
  class ParseOptions (line 183) | class ParseOptions:
    method __init__ (line 184) | def __init__(self, **kwargs):
    method func_names (line 187) | def func_names(self) -> List[str]:
  class ParseContext (line 191) | class ParseContext:
    method __init__ (line 192) | def __init__(self, **kwargs):
    method logger (line 197) | def logger(self) -> logging.Logger:
    method options (line 200) | def options(self) -> ParseOptions:
    method builder (line 203) | def builder(self) -> CFGBuilder:
  function is_fullmatch (line 238) | def is_fullmatch(pattern, s: str) -> bool:
  function parse_asm_label (line 242) | def parse_asm_label(ln: str, context: ParseContext) -> None:
  function parse_asm_instr (line 250) | def parse_asm_instr(ln: str, context: ParseContext) -> None:
  function parse_asm_line (line 262) | def parse_asm_line(ln: str, context: ParseContext) -> None:
  function parse_asm_lines (line 274) | def parse_asm_lines(lines: Iterable[str], **kwargs) -> List[asm2vec.asm....

FILE: asm2vec/internal/repr.py
  function _random_walk (line 19) | def _random_walk(f: Function) -> List[Instruction]:
  function _edge_sampling (line 36) | def _edge_sampling(f: Function) -> List[List[Instruction]]:
  function make_sequential_function (line 56) | def make_sequential_function(f: Function, num_of_random_walks: int = 10)...
  function _get_function_tokens (line 67) | def _get_function_tokens(f: Function, dim: int = 200) -> List[Vectorized...
  function _make_function_repo_helper (line 81) | def _make_function_repo_helper(vocab: Dict[str, Token], funcs: List[Func...
  function make_function_repo (line 124) | def make_function_repo(funcs: List[Function], dim: int, num_of_rnd_walks...
  function make_estimate_repo (line 128) | def make_estimate_repo(vocabulary: Dict[str, Token], f: Function,

FILE: asm2vec/internal/sampling.py
  class NegativeSampler (line 7) | class NegativeSampler:
    method __init__ (line 8) | def __init__(self, distribution: List[Tuple[T, float]], alpha: float =...
    method sample (line 12) | def sample(self, k: int) -> List[T]:

FILE: asm2vec/internal/training.py
  class Asm2VecParams (line 19) | class Asm2VecParams:
    method __init__ (line 20) | def __init__(self, **kwargs):
    method to_dict (line 29) | def to_dict(self) -> Dict[str, Any]:
    method populate (line 40) | def populate(self, rep: Dict[bytes, Any]) -> None:
  class SequenceWindow (line 50) | class SequenceWindow:
    method __init__ (line 51) | def __init__(self, sequence: List[Instruction], vocabulary: Dict[str, ...
    method move_next (line 67) | def move_next(self) -> bool:
    method prev_ins (line 89) | def prev_ins(self) -> Instruction:
    method prev_ins_op (line 92) | def prev_ins_op(self) -> VectorizedToken:
    method prev_ins_args (line 95) | def prev_ins_args(self) -> List[VectorizedToken]:
    method curr_ins (line 98) | def curr_ins(self) -> Instruction:
    method curr_ins_op (line 101) | def curr_ins_op(self) -> VectorizedToken:
    method curr_ins_args (line 104) | def curr_ins_args(self) -> List[VectorizedToken]:
    method next_ins (line 107) | def next_ins(self) -> Instruction:
    method next_ins_op (line 110) | def next_ins_op(self) -> VectorizedToken:
    method next_ins_args (line 113) | def next_ins_args(self) -> List[VectorizedToken]:
  class TrainingContext (line 117) | class TrainingContext:
    class Counter (line 118) | class Counter:
      method __init__ (line 119) | def __init__(self, context: 'TrainingContext', name: str, initial: i...
      method val (line 124) | def val(self) -> int:
      method inc (line 128) | def inc(self) -> int:
      method reset (line 133) | def reset(self) -> int:
    method __init__ (line 141) | def __init__(self, repo: FunctionRepository, params: Asm2VecParams, is...
    method repo (line 150) | def repo(self) -> FunctionRepository:
    method params (line 153) | def params(self) -> Asm2VecParams:
    method lock (line 156) | def lock(self) -> LockContextManager:
    method alpha (line 159) | def alpha(self) -> float:
    method set_alpha (line 163) | def set_alpha(self, alpha: float) -> None:
    method sampler (line 167) | def sampler(self) -> NegativeSampler:
    method is_estimating (line 170) | def is_estimating(self) -> bool:
    method create_sequence_window (line 173) | def create_sequence_window(self, seq: List[Instruction]) -> SequenceWi...
    method get_counter (line 176) | def get_counter(self, name: str) -> Counter:
    method add_counter (line 180) | def add_counter(self, name: str, initial: int = 0) -> Counter:
  function _sigmoid (line 187) | def _sigmoid(x: float) -> float:
  function _identity (line 191) | def _identity(cond: bool) -> int:
  function _dot_sigmoid (line 195) | def _dot_sigmoid(lhs: np.ndarray, rhs: np.ndarray) -> float:
  function _get_inst_repr (line 200) | def _get_inst_repr(op: VectorizedToken, args: List[VectorizedToken]) -> ...
  function _train_vectorized (line 208) | def _train_vectorized(wnd: SequenceWindow, f: VectorizedFunction, contex...
  function _train_sequence (line 264) | def _train_sequence(f: VectorizedFunction, seq: List[Instruction], conte...
  function train (line 270) | def train(repository: FunctionRepository, params: Asm2VecParams) -> None:
  function estimate (line 296) | def estimate(f: VectorizedFunction, estimate_repo: FunctionRepository, p...

FILE: asm2vec/internal/util.py
  function make_small_ndarray (line 4) | def make_small_ndarray(dim: int) -> np.ndarray:

FILE: asm2vec/logging.py
  function asm2vec_logger (line 4) | def asm2vec_logger() -> logging.Logger:
  function config_asm2vec_logging (line 8) | def config_asm2vec_logging(**kwargs):

FILE: asm2vec/model.py
  class Asm2VecMemento (line 13) | class Asm2VecMemento:
    method __init__ (line 14) | def __init__(self):
    method serialize (line 18) | def serialize(self) -> Dict[str, Any]:
    method populate (line 24) | def populate(self, rep: Dict[bytes, Any]) -> None:
  class Asm2Vec (line 30) | class Asm2Vec:
    method __init__ (line 31) | def __init__(self, **kwargs):
    method memento (line 35) | def memento(self) -> Asm2VecMemento:
    method set_memento (line 41) | def set_memento(self, memento: Asm2VecMemento) -> None:
    method make_function_repo (line 45) | def make_function_repo(self, funcs: List[asm2vec.asm.Function]) -> asm...
    method train (line 49) | def train(self, repo: asm2vec.repo.FunctionRepository) -> None:
    method to_vec (line 53) | def to_vec(self, f: asm2vec.asm.Function) -> np.ndarray:

FILE: asm2vec/parse.py
  function parse_text (line 9) | def parse_text(asm: str, **kwargs) -> List[asm2vec.asm.Function]:
  function parse_fp (line 13) | def parse_fp(fp, **kwargs) -> List[asm2vec.asm.Function]:
  function parse (line 17) | def parse(asm_file_name: str, **kwargs) -> List[asm2vec.asm.Function]:

FILE: asm2vec/repo.py
  class SequentialFunction (line 9) | class SequentialFunction:
    method __init__ (line 10) | def __init__(self, fid: int, name: str, sequences: List[List[asm2vec.a...
    method id (line 15) | def id(self) -> int:
    method name (line 18) | def name(self) -> str:
    method sequences (line 21) | def sequences(self) -> List[List[asm2vec.asm.Instruction]]:
  class VectorizedFunction (line 25) | class VectorizedFunction:
    method __init__ (line 26) | def __init__(self, f: SequentialFunction, v: np.ndarray = None, dim: i...
    method sequential (line 30) | def sequential(self) -> SequentialFunction:
  class VectorizedToken (line 34) | class VectorizedToken:
    method __init__ (line 35) | def __init__(self, name: str, v: np.ndarray = None, v_pred: np.ndarray...
    method __eq__ (line 40) | def __eq__(self, other):
    method __ne__ (line 46) | def __ne__(self, other):
    method name (line 49) | def name(self) -> str:
  class Token (line 53) | class Token:
    method __init__ (line 54) | def __init__(self, vt: VectorizedToken, count: int = 1):
    method vectorized (line 59) | def vectorized(self) -> VectorizedToken:
    method name (line 62) | def name(self) -> str:
  class FunctionRepository (line 66) | class FunctionRepository:
    method __init__ (line 67) | def __init__(self, funcs: List[VectorizedFunction], vocab: Dict[str, T...
    method funcs (line 72) | def funcs(self) -> List[VectorizedFunction]:
    method vocab (line 75) | def vocab(self) -> Dict[str, Token]:
    method num_of_tokens (line 78) | def num_of_tokens(self) -> int:
  function _serialize_token (line 82) | def _serialize_token(token: Token) -> Dict[str, Any]:
  function _deserialize_token (line 92) | def _deserialize_token(rep: Dict[bytes, Any]) -> Token:
  function serialize_vocabulary (line 105) | def serialize_vocabulary(vocab: Dict[str, Token]) -> Dict[str, Any]:
  function deserialize_vocabulary (line 109) | def deserialize_vocabulary(rep: Dict[bytes, Any]) -> Dict[str, Token]:
  function _serialize_sequence (line 113) | def _serialize_sequence(seq: List[asm2vec.asm.Instruction]) -> List[Any]:
  function _deserialize_sequence (line 117) | def _deserialize_sequence(rep: List[Any]) -> List[asm2vec.asm.Instruction]:
  function _serialize_vectorized_function (line 122) | def _serialize_vectorized_function(func: VectorizedFunction, include_seq...
  function _deserialize_vectorized_function (line 135) | def _deserialize_vectorized_function(rep: Dict[bytes, Any]) -> Vectorize...
  function serialize_function_repo (line 149) | def serialize_function_repo(repo: FunctionRepository, flags: int) -> Dic...
  function deserialize_function_repo (line 162) | def deserialize_function_repo(rep: Dict[bytes, Any]) -> FunctionRepository:

FILE: examples/training-estimating.py
  function cosine_similarity (line 8) | def cosine_similarity(v1, v2):
  function main (line 12) | def main():

FILE: tests/asm_test.py
  class InstructionTest (line 6) | class InstructionTest(ut.TestCase):
    method test_parse_instruction (line 7) | def test_parse_instruction(self):
    method test_parse_instruction_one_operand (line 12) | def test_parse_instruction_one_operand(self):
    method test_parse_instruction_no_operands (line 17) | def test_parse_instruction_no_operands(self):
  class BasicBlockTest (line 23) | class BasicBlockTest(ut.TestCase):
  class FunctionTest (line 27) | class FunctionTest(ut.TestCase):

FILE: tests/parse_test.py
  class ParseTest (line 51) | class ParseTest(ut.TestCase):
    method test_parse_text (line 52) | def test_parse_text(self):

FILE: tests/utilities_test.py
  class PermutationTest (line 6) | class PermutationTest(ut.TestCase):
    method test_permute (line 7) | def test_permute(self):
    method test_inv_permute (line 13) | def test_inv_permute(self):
Condensed preview — 21 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (61K chars).
[
  {
    "path": ".gitignore",
    "chars": 5091,
    "preview": "# Created by https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python\n# Edit at https://www.topt"
  },
  {
    "path": "README.md",
    "chars": 5194,
    "preview": "# asm2vec\n\nThis is an unofficial implementation of the `asm2vec` model as a standalone python package. The details of th"
  },
  {
    "path": "asm2vec/__init__.py",
    "chars": 36,
    "preview": "__all__ = ['asm', 'model', 'parse']\n"
  },
  {
    "path": "asm2vec/asm.py",
    "chars": 4852,
    "preview": "from typing import *\n\n\nclass Instruction:\n    def __init__(self, op: str, *args: str):\n        self._op = op\n        sel"
  },
  {
    "path": "asm2vec/internal/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "asm2vec/internal/atomic.py",
    "chars": 1504,
    "preview": "from typing import *\nimport threading\n\n\nclass LockContextManager:\n    def __init__(self, lock: threading.Lock):\n        "
  },
  {
    "path": "asm2vec/internal/parse.py",
    "chars": 8622,
    "preview": "from typing import *\nimport logging\n\nimport asm2vec.asm\n\n\nclass AssemblySyntaxError(Exception):\n    def __init__(self, m"
  },
  {
    "path": "asm2vec/internal/repr.py",
    "chars": 4440,
    "preview": "import random\nfrom typing import *\nimport concurrent.futures\n\nfrom asm2vec.asm import Instruction\nfrom asm2vec.asm impor"
  },
  {
    "path": "asm2vec/internal/sampling.py",
    "chars": 403,
    "preview": "from typing import *\nimport random\n\nT = TypeVar('T')\n\n\nclass NegativeSampler:\n    def __init__(self, distribution: List["
  },
  {
    "path": "asm2vec/internal/training.py",
    "chars": 10565,
    "preview": "from typing import *\nimport math\nimport threading\nimport concurrent.futures\n\nimport numpy as np\n\nfrom asm2vec.asm import"
  },
  {
    "path": "asm2vec/internal/util.py",
    "chars": 144,
    "preview": "import numpy as np\n\n\ndef make_small_ndarray(dim: int) -> np.ndarray:\n    rng = np.random.default_rng()\n    return (rng.r"
  },
  {
    "path": "asm2vec/logging.py",
    "chars": 432,
    "preview": "import logging\n\n\ndef asm2vec_logger() -> logging.Logger:\n    return logging.getLogger('asm2vec')\n\n\ndef config_asm2vec_lo"
  },
  {
    "path": "asm2vec/model.py",
    "chars": 1996,
    "preview": "from typing import *\n\nimport numpy as np\n\nimport asm2vec.asm\nimport asm2vec.repo\n\nimport asm2vec.internal.training\nimpor"
  },
  {
    "path": "asm2vec/parse.py",
    "chars": 553,
    "preview": "from typing import *\n\nimport asm2vec.asm\nimport asm2vec.internal.parse\n\nfrom asm2vec.internal.parse import AssemblySynta"
  },
  {
    "path": "asm2vec/repo.py",
    "chars": 5031,
    "preview": "from typing import *\n\nimport numpy as np\n\nimport asm2vec.asm\nimport asm2vec.internal.util\n\n\nclass SequentialFunction:\n  "
  },
  {
    "path": "examples/estimating.s",
    "chars": 1712,
    "preview": "my_strlen_est:\n        cmp     BYTE PTR [rdi], 0\n        je      .L4\n        mov     rax, rdi\n.L3:\n        add     rax, "
  },
  {
    "path": "examples/training-estimating.py",
    "chars": 1501,
    "preview": "import numpy as np\n\nimport asm2vec.asm\nimport asm2vec.parse\nimport asm2vec.model\n\n\ndef cosine_similarity(v1, v2):\n    re"
  },
  {
    "path": "examples/training.s",
    "chars": 2741,
    "preview": "my_strlen_train:\n        push    rbp\n        mov     rbp, rsp\n        mov     QWORD PTR [rbp-24], rdi\n        mov     ra"
  },
  {
    "path": "tests/asm_test.py",
    "chars": 872,
    "preview": "import unittest as ut\n\nimport asm2vec.asm as asm\n\n\nclass InstructionTest(ut.TestCase):\n    def test_parse_instruction(se"
  },
  {
    "path": "tests/parse_test.py",
    "chars": 1696,
    "preview": "import unittest as ut\n\nimport asm2vec.parse\n\n\ntest_asm = \"\"\"\nmy_strlen:\n        push    rbp\n        mov     rbp, rsp\n   "
  },
  {
    "path": "tests/utilities_test.py",
    "chars": 552,
    "preview": "import unittest as ut\n\nimport asm2vec.internal.util as utilities\n\n\nclass PermutationTest(ut.TestCase):\n    def test_perm"
  }
]

About this extraction

This page contains the full source code of the Lancern/asm2vec GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 21 files (56.6 KB), approximately 15.5k tokens, and a symbol index with 209 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!