Repository: Lancern/asm2vec Branch: master Commit: d38a3bc3bc9c Files: 21 Total size: 56.6 KB Directory structure: gitextract_5xp0becm/ ├── .gitignore ├── README.md ├── asm2vec/ │ ├── __init__.py │ ├── asm.py │ ├── internal/ │ │ ├── __init__.py │ │ ├── atomic.py │ │ ├── parse.py │ │ ├── repr.py │ │ ├── sampling.py │ │ ├── training.py │ │ └── util.py │ ├── logging.py │ ├── model.py │ ├── parse.py │ └── repo.py ├── examples/ │ ├── estimating.s │ ├── training-estimating.py │ └── training.s └── tests/ ├── asm_test.py ├── parse_test.py └── utilities_test.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Created by https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python # Edit at https://www.toptal.com/developers/gitignore?templates=macos,intellij,virtualenv,python ### Intellij ### # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/**/workspace.xml .idea/**/tasks.xml .idea/**/usage.statistics.xml .idea/**/dictionaries .idea/**/shelf # Generated files .idea/**/contentModel.xml # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # Gradle and Maven with auto-import # When using Gradle or Maven with auto-import, you should exclude module files, # since they will be recreated, and may cause churn. Uncomment if using # auto-import. 
# .idea/artifacts # .idea/compiler.xml # .idea/jarRepositories.xml # .idea/modules.xml # .idea/*.iml # .idea/modules # *.iml # *.ipr # CMake cmake-build-*/ # Mongo Explorer plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest Client .idea/httpRequests # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser ### Intellij Patch ### # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 # *.iml # modules.xml # .idea/misc.xml # *.ipr # Sonarlint plugin # https://plugins.jetbrains.com/plugin/7973-sonarlint .idea/**/sonarlint/ # SonarQube Plugin # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin .idea/**/sonarIssues.xml # Markdown Navigator plugin # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced .idea/**/markdown-navigator.xml .idea/**/markdown-navigator-enh.xml .idea/**/markdown-navigator/ # Cache file creation bug # See https://youtrack.jetbrains.com/issue/JBR-2257 .idea/$CACHE_FILE$ # CodeStream plugin # https://plugins.jetbrains.com/plugin/12206-codestream .idea/codestream.xml ### macOS ### # General .DS_Store .AppleDouble .LSOverride # Icon must end with two \r Icon # Thumbnails ._* # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 .TemporaryItems .Trashes .VolumeIcon.icns .com.apple.timemachine.donotpresent # Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder Temporary Items .apdisk ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ 
develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ pytestdebug.log # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ doc/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ pythonenv* # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # profiling data .prof ### VirtualEnv ### # Virtualenv # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ [Bb]in [Ii]nclude [Ll]ib [Ll]ib64 [Ll]ocal [Ss]cripts pyvenv.cfg pip-selfcheck.json # End of https://www.toptal.com/developers/gitignore/api/macos,intellij,virtualenv,python ================================================ FILE: README.md ================================================ # asm2vec This is an unofficial implementation of the `asm2vec` model as a standalone python package. The details of the model can be found in the original paper: [(sp'19) Asm2Vec: Boosting Static Representation Robustness for Binary Clone Search against Code Obfuscation and Compiler Optimization](https://www.computer.org/csdl/proceedings-article/sp/2019/666000a038/19skfc3ZfKo) ## Requirements This implementation is written in python 3.7 and it's recommended to use python 3.7+ as well. The only dependency of this package is `numpy` which can be installed as follows: ```shell python3 -m pip install numpy ``` ## How to use ### Import To install the package, execute the following commands: ```shell git clone https://github.com/lancern/asm2vec.git ``` Add the following line to the `.bashrc` file to add `asm2vec` to your python interpreter's search path for external packages: ```shell export PYTHONPATH="path/to/asm2vec:$PYTHONPATH" ``` Replace `path/to/asm2vec` with the directory you clone `asm2vec` into. 
Then execute the following commands to update `PYTHONPATH`:

```shell
source ~/.bashrc
```

You can also add the following code snippet to your python source code that refers to `asm2vec`, to help the python interpreter find the package successfully:

```python
import sys
sys.path.append('path/to/asm2vec')
```

In your python code, use the following `import` statement to import this package:

```python
import asm2vec
```

### Define CFGs And Training

You have 2 approaches to define the binary program that will be sent to the `asm2vec` model. The first approach is to build the CFG manually, as shown below:

```python
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import parse_instruction

block1 = BasicBlock()
block1.add_instruction(parse_instruction('mov eax, ebx'))
block1.add_instruction(parse_instruction('jmp _loc'))

block2 = BasicBlock()
block2.add_instruction(parse_instruction('xor eax, eax'))
block2.add_instruction(parse_instruction('ret'))

block1.add_successor(block2)

block3 = BasicBlock()
block3.add_instruction(parse_instruction('sub eax, [ebp]'))

f1 = Function(block1, 'some_func')
f2 = Function(block3, 'another_func')
# block4 is ignored here for clarity
f3 = Function(block4, 'estimate_func')
```

And then you can train a model with the following code:

```python
from asm2vec.model import Asm2Vec

model = Asm2Vec(d=200)
train_repo = model.make_function_repo([f1, f2, f3])
model.train(train_repo)
```

The second approach is using the `parse` module provided by `asm2vec` to build CFGs automatically from an assembly code source file:

```python
from asm2vec.parse import parse_fp

with open('source.asm', 'r') as fp:
    funcs = parse_fp(fp)
```

And then you can train a model with the following code:

```python
from asm2vec.model import Asm2Vec

model = Asm2Vec(d=200)
train_repo = model.make_function_repo(funcs)
model.train(train_repo)
```

### Estimation

You can use the `asm2vec.model.Asm2Vec.to_vec` method to convert a function into its vector
representation.

### Serialization

The implementation supports serialization on many of its internal data structures so that you can serialize the internal state of a trained model into disk for future use. You can serialize two data structures to primitive data: the function repository and the model memento.

> To be finished.

## Hyper Parameters

The constructor of `asm2vec.model.Asm2Vec` class accepts some keyword arguments as hyper parameters of the model. The following table lists all the hyper parameters available:

| Parameter Name | Type | Meaning | Default Value |
| ----------------------- | ------- | ------------------------------------------------------------------------------------------------------ | ------------- |
| `d` | `int` | The dimension of the vectors for tokens. | `200` |
| `initial_alpha` | `float` | The initial learning rate. | `0.05` |
| `alpha_update_interval` | `int` | How many tokens can be processed before changing the learning rate? | `10000` |
| `rnd_walks` | `int` | How many random walks to perform to sequentialize a function? | `3` |
| `neg_samples` | `int` | How many samples to take during negative sampling? | `25` |
| `iteration` | `int` | How many iterations to perform? (This parameter is reserved for future use and is not implemented now) | `1` |
| `jobs` | `int` | How many tasks to execute concurrently during training? | `4` |

## Notes

For simplicity, the Selective Callee Expansion is not implemented in this early implementation. You have to do it manually before sending CFG into `asm2vec`.
================================================ FILE: asm2vec/__init__.py ================================================ __all__ = ['asm', 'model', 'parse'] ================================================ FILE: asm2vec/asm.py ================================================ from typing import * class Instruction: def __init__(self, op: str, *args: str): self._op = op self._args = list(args) def op(self) -> str: return self._op def number_of_args(self) -> int: return len(self._args) def args(self) -> List[str]: return self._args def parse_instruction(code: str) -> Instruction: sep_index = code.find(' ') if sep_index == -1: return Instruction(code) op = code[:sep_index] # Operator args_list = list(map(str.strip, code[sep_index:].split(','))) # Operands return Instruction(op, *args_list) class BasicBlock: _next_unused_id: int = 1 def __init__(self): # Allocate a new unique ID for the basic block. self._id = self.__class__._next_unused_id self.__class__._next_unused_id += 1 self._instructions = [] self._predecessors = [] self._successors = [] def __iter__(self): return self._instructions.__iter__() def __len__(self): return len(self._instructions) def __hash__(self): return self._id.__hash__() def __eq__(self, other): if not isinstance(other, BasicBlock): return False return self._id == other.id() def __ne__(self, other): return not self.__eq__(other) def id(self) -> int: return self._id def add_instruction(self, instr: Instruction) -> None: self._instructions.append(instr) def body_instructions(self) -> List[Instruction]: return self._instructions[:-1] def instructions(self) -> List[Instruction]: return self._instructions def add_predecessor(self, predecessor: 'BasicBlock') -> None: self._predecessors.append(predecessor) predecessor._successors.append(self) def add_successor(self, successor: 'BasicBlock') -> None: self._successors.append(successor) successor._predecessors.append(self) def first_instruction(self) -> Instruction: return self._instructions[0] def 
last_instruction(self) -> Instruction: return self._instructions[-1] def predecessors(self) -> List['BasicBlock']: return self._predecessors def in_degree(self) -> int: return len(self._predecessors) def successors(self) -> List['BasicBlock']: return self._successors def out_degree(self) -> int: return len(self._successors) class CFGWalkerCallback: def __call__(self, *args, **kwargs): self.on_enter(*args) def on_enter(self, block: BasicBlock) -> None: pass def on_exit(self, block: BasicBlock) -> None: pass CFGWalkerCallbackType = Union[CFGWalkerCallback, Callable[[BasicBlock], Any]] def _walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType, visited: Set) -> None: if entry.id() in visited: return visited.add(entry.id()) action(entry) for successor in entry.successors(): _walk_cfg(successor, action, visited) if isinstance(action, CFGWalkerCallback): action.on_exit(entry) def walk_cfg(entry: BasicBlock, action: CFGWalkerCallbackType) -> None: _walk_cfg(entry, action, set()) class Function: _next_unused_id = 1 def __init__(self, entry: BasicBlock, name: str = None): # Allocate a unique ID for the current Function object. 
self._id = self.__class__._next_unused_id self.__class__._next_unused_id += 1 self._entry = entry self._name = name self._callees = [] # Functions that are called by this function self._callers = [] # Functions that call this function def __len__(self) -> int: instr_count = 0 def count_instr(block: BasicBlock) -> None: nonlocal instr_count instr_count += len(block) walk_cfg(self._entry, count_instr) return instr_count def __hash__(self): return self._id def __eq__(self, other): if not isinstance(other, Function): return False return self._id == other.id() def __ne__(self, other): return not self.__eq__(other) def id(self) -> int: return self._id def entry(self) -> BasicBlock: return self._entry def name(self) -> str: return self._name def add_callee(self, f: 'Function') -> None: self._callees.append(f) f._callers.append(self) def callees(self) -> List['Function']: return self._callees def out_degree(self) -> int: return len(self._callees) def add_caller(self, f: 'Function') -> None: self._callers.append(f) f._callees.append(self) def callers(self) -> List['Function']: return self._callers def in_degree(self) -> int: return len(self._callers) ================================================ FILE: asm2vec/internal/__init__.py ================================================ ================================================ FILE: asm2vec/internal/atomic.py ================================================ from typing import * import threading class LockContextManager: def __init__(self, lock: threading.Lock): self._lock = lock self._exited = False def __enter__(self): self._lock.acquire() def __exit__(self, exc_type, exc_val, exc_tb): self._exited = True self._lock.release() def exited(self) -> bool: return self._exited class Atomic: class AtomicContextManager(LockContextManager): def __init__(self, atomic: 'Atomic'): super().__init__(atomic._lock) self._atomic = atomic self._exited = False def __enter__(self): super().__enter__() return self def __exit__(self, 
exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) def value(self) -> Any: if self.exited(): raise RuntimeError('Trying to access AtomicContextManager after its exit.') return self._atomic._val def set(self, value: Any) -> None: if self.exited(): raise RuntimeError('Trying to access AtomicContextManager after its exit.') self._atomic._val = value def __init__(self, value: Any): self._val = value self._lock = threading.Lock() def lock(self) -> AtomicContextManager: return self.__class__.AtomicContextManager(self) def value(self) -> Any: with self.lock() as val: return val.value() ================================================ FILE: asm2vec/internal/parse.py ================================================ from typing import * import logging import asm2vec.asm class AssemblySyntaxError(Exception): def __init__(self, message: str = None): self._msg = message def message(self) -> str: return self._msg def raise_asm_syntax_error(expect: str, found: str) -> None: raise AssemblySyntaxError('Expect "{}", but "{}" was found.'.format(expect, found)) jmp_op = { 'jmp', 'ja', 'jae', 'jb', 'jbe', 'jc', 'jcxz', 'jecxz', 'jrcxz', 'je', 'jg', 'jge', 'jl', 'jle', 'jna', 'jnae', 'jnb', 'jnbe', 'jnc', 'jne', 'jng', 'jnge', 'jnl', 'jnle', 'jno', 'jnp', 'jns', 'jnz', 'jo', 'jp', 'jpe', 'jpo', 'js', 'jz' } call_op = { 'call' } ret_op = { 'ret' } x86_64_regs = { 'al', 'ah', 'bl', 'bh', 'cl', 'ch', 'dl', 'dh', 'spl', 'bpl', 'sil', 'dil', 'ax', 'bx', 'cx', 'dx', 'sp', 'bp', 'si', 'di', 'eax', 'ebx', 'ecx', 'edx', 'esp', 'ebp', 'esi', 'edi', 'rax', 'rdx', 'rcx', 'rdx', 'rsp', 'rbp', 'rsi', 'rdi', 'r8b', 'r9b', 'r10b', 'r11b', 'r12b', 'r13b', 'r14b', 'r15b', 'r8w', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w', 'r8d', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d', 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15', 'cs', 'ss', 'ds', 'es', 'fs', 'gs', 'ecs', 'ess', 'eds', 'ees', 'efs', 'egs', 'rcs', 'rss', 'rds', 'res', 'rfs', 'rgs' } def is_jmp(op: 
str) -> bool: return op.lower() in jmp_op def is_conditional_jmp(op: str) -> bool: return is_jmp(op) and op.lower() != 'jmp' def is_call(op: str) -> bool: return op.lower() in call_op def is_ret(op: str) -> bool: return op.lower() in ret_op def is_reg(arg: str) -> bool: return arg.lower() in x86_64_regs class CFGBuilder: def __init__(self, context: 'ParseContext'): self._context = context self._blocks: List[asm2vec.asm.BasicBlock] = [] self._active_block = -1 self._block_labels: Dict[str, int] = dict() def _logger(self) -> logging.Logger: return self._context.logger().getChild(self.__class__.__name__) def _allocate_block(self) -> int: self._blocks.append(asm2vec.asm.BasicBlock()) return len(self._blocks) - 1 def _allocate_named_block(self, name: str) -> int: if name in self._block_labels: return self._block_labels[name] else: idx = self._allocate_block() self._block_labels[name] = idx return idx def _get_active_block(self) -> asm2vec.asm.BasicBlock: return self._blocks[self._active_block] def _set_active_block(self, block_id: int) -> None: self._active_block = block_id def _has_active_block(self) -> bool: return self._active_block != -1 def _close_active_block(self) -> None: self._active_block = -1 def _add_jmp(self, op: str, args: List[str]) -> None: if len(args) != 1: raise_asm_syntax_error('Jump with single operand', '{} operands'.format(len(args))) cur_block = self._get_active_block() self._close_active_block() if is_conditional_jmp(op): # Allocate another basic block for more instructions since the current code point is reachable. # This may produce some empty basic blocks in the final output. self._set_active_block(self._allocate_block()) self._get_active_block().add_predecessor(cur_block) def add_instr(self, op: str, args: List[str]) -> None: if not self._has_active_block(): # Allocate a new basic block. 
self._set_active_block(self._allocate_block()) self._get_active_block().add_instruction(asm2vec.asm.Instruction(op, *args)) if is_jmp(op): self._add_jmp(op, args) elif is_ret(op): # `ret` instruction encountered. Close current active block. self._close_active_block() def set_label(self, label: str) -> None: block_id = self._block_labels.get(label, -1) if block_id == -1: # Test if the current active block is empty in which case we can reuse it. if self._has_active_block() and len(self._get_active_block()) == 0: self._block_labels[label] = self._active_block else: # Open a new block for the label. block_id = self._allocate_block() self._block_labels[label] = block_id # Link the new block with the previously-active block. if self._has_active_block(): self._get_active_block().add_successor(self._blocks[block_id]) self._set_active_block(block_id) else: self._set_active_block(block_id) def build(self) -> List[asm2vec.asm.Function]: func_entries: Dict[str, int] = dict() # Walk through all instructions and fix block relations formed by jump and call instructions. for blk in self._blocks: for inst in blk: if is_jmp(inst.op()): target = inst.args()[0] if target in self._block_labels: blk.add_successor(self._blocks[self._block_labels[target]]) elif is_call(inst.op()): target = inst.args()[0] if target in self._block_labels and target not in func_entries: func_entries[target] = self._block_labels[target] for func_name in self._context.options().func_names(): if func_name not in self._block_labels: self._logger().warning('Cannot find function "{}"', func_name) continue if func_name not in func_entries: func_entries[func_name] = self._block_labels[func_name] funcs: Dict[str, asm2vec.asm.Function] = \ dict(map(lambda x: (x[0], asm2vec.asm.Function(self._blocks[x[1]], x[0])), func_entries.items())) # Fix function call relation. 
for (name, f) in funcs.items(): def block_action(block: asm2vec.asm.BasicBlock) -> None: for instr in block: if is_call(instr.op()): callee_name = instr.args()[0] if callee_name in funcs: f.add_callee(funcs[callee_name]) asm2vec.asm.walk_cfg(f.entry(), block_action) # TODO: Implement Selective Callee Expansion here. return list(funcs.values()) class ParseOptions: def __init__(self, **kwargs): self._func_names = kwargs.get('func_names', []) def func_names(self) -> List[str]: return self._func_names class ParseContext: def __init__(self, **kwargs): self._builder = CFGBuilder(self) self._options = ParseOptions(**kwargs) self._logger = logging.getLogger('asm2vec.ParseContext') def logger(self) -> logging.Logger: return self._logger def options(self) -> ParseOptions: return self._options def builder(self) -> CFGBuilder: return self._builder ''' Parser rules for input assembly file: program : asm_line* ; asm_line : asm_label '\n' | BLANKS asm_instr '\n' ; asm_label : ASM_LABEL_ID ':' ; asm_instr : ASM_INSTR_OP ' ' asm_instr_arg_list ; asm_instr_arg_list : ASM_INSTR_ARG (',' asm_instr_arg_list)? 
| /* epsilon */
    ;

BLANKS : [ \n\t]+;
'''


def is_fullmatch(pattern, s: str) -> bool:
    # True when the compiled regex `pattern` matches the whole of `s`.
    return pattern.fullmatch(s) is not None


def parse_asm_label(ln: str, context: ParseContext) -> None:
    """Parse a label line ("name:") and bind a basic block to that label.

    Raises an AssemblySyntaxError (via raise_asm_syntax_error) when the line
    does not end with ':'.
    """
    stripped = ln.strip()
    if stripped[-1] != ':':
        raise_asm_syntax_error('asm_label', ln)
    context.builder().set_label(stripped[:-1])


def parse_asm_instr(ln: str, context: ParseContext) -> None:
    """Split an instruction line into operator and comma-separated operands,
    then feed it to the CFG builder."""
    delim_index = ln.find(' ')
    args = []
    if delim_index == -1:
        # No blank: the whole line is a zero-operand instruction.
        op = ln
    else:
        op = ln[:delim_index]
        args = list(map(lambda arg: arg.strip(), ln[delim_index + 1:].split(',')))
    context.builder().add_instr(op, args)


def parse_asm_line(ln: str, context: ParseContext) -> None:
    """Dispatch one source line to the label or instruction parser.

    Blank lines are ignored; per the grammar above, an indented line is an
    instruction and a non-indented line is a label.
    """
    if len(ln.strip()) == 0:
        return
    if ln[0].isspace():
        # Expect production asm_line -> BLANKS asm_instr '\n'
        parse_asm_instr(ln.strip(), context)
    else:
        # Expect production asm_line -> asm_label
        parse_asm_label(ln, context)


def parse_asm_lines(lines: Iterable[str], **kwargs) -> List[asm2vec.asm.Function]:
    """Parse an iterable of assembly source lines and build the functions' CFGs.

    Keyword arguments are forwarded to ParseOptions (e.g. func_names).
    """
    context = ParseContext(**kwargs)
    for ln in lines:
        parse_asm_line(ln, context)
    return context.builder().build()


================================================
FILE: asm2vec/internal/repr.py
================================================
import random
from typing import *
import concurrent.futures

from asm2vec.asm import Instruction
from asm2vec.asm import BasicBlock
from asm2vec.asm import Function
from asm2vec.asm import walk_cfg
from asm2vec.repo import SequentialFunction
from asm2vec.repo import VectorizedFunction
from asm2vec.repo import VectorizedToken
from asm2vec.repo import Token
from asm2vec.repo import FunctionRepository
from asm2vec.logging import asm2vec_logger
from asm2vec.internal.atomic import Atomic


def _random_walk(f: Function) -> List[Instruction]:
    """Collect the instructions along one random path through f's CFG.

    Starts at the entry block, repeatedly picks a random successor, and stops
    at a sink block or when a block would be visited a second time.
    """
    visited: Set[int] = set()
    current = f.entry()
    seq: List[Instruction] = []
    while current.id() not in visited:
        visited.add(current.id())
        for instr in current:
            seq.append(instr)
        if len(current.successors()) == 0:
            break
        current = random.choice(current.successors())
    return seq
def _edge_sampling(f: Function) -> List[List[Instruction]]: edges: List[Tuple[BasicBlock, BasicBlock]] = [] def collect_edges(block: BasicBlock) -> None: nonlocal edges for successor in block.successors(): edges.append((block, successor)) walk_cfg(f.entry(), collect_edges) visited_edges: Set[Tuple[int, int]] = set() sequences = [] while len(visited_edges) < len(edges): e = random.choice(edges) visited_edges.add((e[0].id(), e[1].id())) sequences.append(list(e[0]) + list(e[1])) return sequences def make_sequential_function(f: Function, num_of_random_walks: int = 10) -> SequentialFunction: seq: List[List[Instruction]] = [] for _ in range(num_of_random_walks): seq.append(_random_walk(f)) # seq += _edge_sampling(f) return SequentialFunction(f.id(), f.name(), seq) def _get_function_tokens(f: Function, dim: int = 200) -> List[VectorizedToken]: tokens: List[VectorizedToken] = [] def collect_tokens(block: BasicBlock) -> None: nonlocal tokens for ins in block: tk: List[str] = [ins.op()] + ins.args() for t in tk: tokens.append(VectorizedToken(t, None, None, dim)) walk_cfg(f.entry(), collect_tokens) return tokens def _make_function_repo_helper(vocab: Dict[str, Token], funcs: List[Function], dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository: progress = Atomic(1) vec_funcs_atomic = Atomic([]) vocab_atomic = Atomic(vocab) def func_handler(f: Function): with vec_funcs_atomic.lock() as vfa: vfa.value().append(VectorizedFunction(make_sequential_function(f, num_of_rnd_walks), dim=dim*2)) tokens = _get_function_tokens(f, dim) for tk in tokens: with vocab_atomic.lock() as va: if tk.name() in va.value(): va.value()[tk.name()].count += 1 else: va.value()[tk.name()] = Token(tk) asm2vec_logger().debug('Sequence generated for function "%s", progress: %f%%', f.name(), progress.value() / len(funcs) * 100) with progress.lock() as prog: prog.set(prog.value() + 1) executor = concurrent.futures.ThreadPoolExecutor(max_workers=jobs) fs = [] for fn in funcs: 
fs.append(executor.submit(func_handler, fn)) done, not_done = concurrent.futures.wait(fs, return_when=concurrent.futures.FIRST_EXCEPTION) if len(not_done) > 0 or any(map(lambda fut: fut.cancelled() or not fut.done(), done)): raise RuntimeError('Not all tasks finished successfully.') vec_funcs = vec_funcs_atomic.value() repo = FunctionRepository(vec_funcs, vocab) # Re-calculate the frequency of each token. for t in repo.vocab().values(): t.frequency = t.count / repo.num_of_tokens() return repo def make_function_repo(funcs: List[Function], dim: int, num_of_rnd_walks: int, jobs: int) -> FunctionRepository: return _make_function_repo_helper(dict(), funcs, dim, num_of_rnd_walks, jobs) def make_estimate_repo(vocabulary: Dict[str, Token], f: Function, dim: int, num_of_rnd_walks: int) -> FunctionRepository: # Make a copy of the function list and vocabulary to avoid the change to affect the original trained repo. vocab: Dict[str, Token] = dict(**vocabulary) return _make_function_repo_helper(vocab, [f], dim, num_of_rnd_walks, 1) ================================================ FILE: asm2vec/internal/sampling.py ================================================ from typing import * import random T = TypeVar('T') class NegativeSampler: def __init__(self, distribution: List[Tuple[T, float]], alpha: float = 3 / 4): self._values = list(map(lambda x: x[0], distribution)) self._weights = list(map(lambda x: x[1] ** alpha, distribution)) def sample(self, k: int) -> List[T]: return random.choices(self._values, self._weights, k=k) ================================================ FILE: asm2vec/internal/training.py ================================================ from typing import * import math import threading import concurrent.futures import numpy as np from asm2vec.asm import Instruction from asm2vec.internal.repr import FunctionRepository from asm2vec.internal.repr import VectorizedFunction from asm2vec.internal.repr import Token from asm2vec.internal.repr import VectorizedToken from 
asm2vec.internal.sampling import NegativeSampler
from asm2vec.internal.atomic import LockContextManager
from asm2vec.internal.atomic import Atomic
from asm2vec.logging import asm2vec_logger


class Asm2VecParams:
    """Hyper-parameters for Asm2Vec training/estimation, settable via kwargs."""

    def __init__(self, **kwargs):
        # d: dimension of a token vector (function vectors elsewhere default to 2 * d).
        self.d: int = kwargs.get('d', 200)
        # Initial learning rate; the dynamic update below is currently disabled.
        self.initial_alpha: float = kwargs.get('alpha', 0.0025)
        self.alpha_update_interval: int = kwargs.get('alpha_update_interval', 10000)
        self.num_of_rnd_walks: int = kwargs.get('rnd_walks', 3)
        # Number of negative samples drawn per target token.
        self.neg_samples: int = kwargs.get('neg_samples', 25)
        self.iteration: int = kwargs.get('iteration', 1)
        # Worker-thread count for the training thread pool.
        self.jobs: int = kwargs.get('jobs', 4)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the parameters into a plain dict (str keys)."""
        return {
            'd': self.d,
            'alpha': self.initial_alpha,
            'alpha_update_interval': self.alpha_update_interval,
            'num_of_rnd_walks': self.num_of_rnd_walks,
            'neg_samples': self.neg_samples,
            'iteration': self.iteration,
            'jobs': self.jobs
        }

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Load parameters from a dict with bytes keys (e.g. a decoded dump)."""
        self.d: int = rep.get(b'd', 200)
        self.initial_alpha: float = rep.get(b'alpha', 0.0025)
        self.alpha_update_interval: int = rep.get(b'alpha_update_interval', 10000)
        self.num_of_rnd_walks: int = rep.get(b'rnd_walks', 3)
        self.neg_samples: int = rep.get(b'neg_samples', 25)
        self.iteration: int = rep.get(b'iteration', 1)
        self.jobs: int = rep.get(b'jobs', 4)


class SequenceWindow:
    """Sliding three-instruction window (prev, current, next) over a sequence.

    Token names are resolved through the vocabulary to their VectorizedToken,
    so all windows share the same underlying vectors.
    """

    def __init__(self, sequence: List[Instruction], vocabulary: Dict[str, Token]):
        self._seq = sequence
        self._vocab = vocabulary
        # Start at index 1 so that both a predecessor and a successor exist.
        self._i = 1
        self._prev_ins = None
        self._curr_ins = None
        self._next_ins = None
        self._prev_ins_op = None
        self._prev_ins_args = None
        self._curr_ins_op = None
        self._curr_ins_args = None
        self._next_ins_op = None
        self._next_ins_args = None

    def move_next(self) -> bool:
        """Advance the window by one instruction; False when exhausted."""
        if self._i >= len(self._seq) - 1:
            return False

        def token_lookup(name) -> VectorizedToken:
            # KeyError here means the token is missing from the vocabulary.
            return self._vocab[name].vectorized()

        self._prev_ins = self._seq[self._i - 1]
        self._curr_ins = self._seq[self._i]
        self._next_ins = self._seq[self._i + 1]
        self._prev_ins_op = token_lookup(self._prev_ins.op())
        self._prev_ins_args = list(map(token_lookup, self._prev_ins.args()))
        self._curr_ins_op = token_lookup(self._curr_ins.op())
        self._curr_ins_args = list(map(token_lookup, self._curr_ins.args()))
        self._next_ins_op = token_lookup(self._next_ins.op())
        self._next_ins_args = list(map(token_lookup, self._next_ins.args()))
        self._i += 1
        return True

    def prev_ins(self) -> Instruction:
        return self._prev_ins

    def prev_ins_op(self) -> VectorizedToken:
        return self._prev_ins_op

    def prev_ins_args(self) -> List[VectorizedToken]:
        return self._prev_ins_args

    def curr_ins(self) -> Instruction:
        return self._curr_ins

    def curr_ins_op(self) -> VectorizedToken:
        return self._curr_ins_op

    def curr_ins_args(self) -> List[VectorizedToken]:
        return self._curr_ins_args

    def next_ins(self) -> Instruction:
        return self._next_ins

    def next_ins_op(self) -> VectorizedToken:
        return self._next_ins_op

    def next_ins_args(self) -> List[VectorizedToken]:
        return self._next_ins_args


class TrainingContext:
    """Shared, lock-protected state for one training or estimation run."""

    class Counter:
        """Integer counter whose operations are guarded by the context's lock."""

        def __init__(self, context: 'TrainingContext', name: str, initial: int = 0):
            self._context = context
            self._name = name
            self._val = initial

        def val(self) -> int:
            with self._context.lock():
                return self._val

        def inc(self) -> int:
            """Increment and return the new value atomically."""
            with self._context.lock():
                self._val += 1
                return self._val

        def reset(self) -> int:
            """Zero the counter and return the value it held."""
            with self._context.lock():
                v = self._val
                self._val = 0
                return v

    TOKENS_HANDLED_COUNTER: str = "tokens_handled"

    def __init__(self, repo: FunctionRepository, params: Asm2VecParams, is_estimating: bool = False):
        self._repo = repo
        self._params = params
        self._alpha = params.initial_alpha
        # Negative sampler weighted by each vocabulary token's frequency.
        self._sampler = NegativeSampler(list(map(lambda t: (t, t.frequency), repo.vocab().values())))
        # When estimating, token vectors are frozen and only the function vector moves.
        self._is_estimating = is_estimating
        self._counters = dict()
        self._lock = threading.Lock()

    def repo(self) -> FunctionRepository:
        return self._repo

    def params(self) -> Asm2VecParams:
        return self._params

    def lock(self) -> LockContextManager:
        """Return a context manager acquiring the shared training lock."""
        return LockContextManager(self._lock)

    def alpha(self) -> float:
        with self.lock():
            return self._alpha

    def set_alpha(self, alpha: float) -> None:
        with self.lock():
            self._alpha = alpha

    def sampler(self) -> NegativeSampler:
        return self._sampler

    def is_estimating(self) -> bool:
        return self._is_estimating

    def create_sequence_window(self, seq: List[Instruction]) -> SequenceWindow:
        return SequenceWindow(seq, self._repo.vocab())

    def get_counter(self, name: str) -> Counter:
        # Returns None when no counter of that name was registered.
        with self.lock():
            return self._counters.get(name)

    def add_counter(self, name: str, initial: int = 0) -> Counter:
        with self.lock():
            c = self.__class__.Counter(self, name, initial)
            self._counters[name] = c
            return c


def _sigmoid(x: float) -> float:
    return 1 / (1 + np.exp(-x))


def _identity(cond: bool) -> int:
    """Indicator function: 1 if cond else 0."""
    return 1 if cond else 0


def _dot_sigmoid(lhs: np.ndarray, rhs: np.ndarray) -> float:
    # noinspection PyTypeChecker
    return _sigmoid(np.dot(lhs, rhs))


def _get_inst_repr(op: VectorizedToken, args: List[VectorizedToken]) -> np.ndarray:
    """Build an instruction embedding: opcode vector stacked with the mean arg vector.

    With no arguments, a zero vector of the opcode's dimension is used, so the
    result always has twice the token dimension.
    """
    if len(args) == 0:
        arg_vec = np.zeros(len(op.v))
    else:
        arg_vec = np.average(list(map(lambda tk: tk.v, args)), axis=0)
    return np.hstack((op.v, arg_vec))


def _train_vectorized(wnd: SequenceWindow, f: VectorizedFunction, context: TrainingContext) -> None:
    """One PV-DM-style gradient step for the window's current instruction.

    `delta` is the context: average of the previous instruction, the function
    vector, and the next instruction. Each target token of the current
    instruction is trained against negative samples drawn from the vocabulary.
    """
    ct_prev = _get_inst_repr(wnd.prev_ins_op(), wnd.prev_ins_args())
    ct_next = _get_inst_repr(wnd.next_ins_op(), wnd.next_ins_args())
    delta = np.average([ct_prev, f.v, ct_next], axis=0)

    tokens = [wnd.curr_ins_op()] + wnd.curr_ins_args()
    f_grad = np.zeros(f.v.shape)
    for tk in tokens:
        # Negative sampling.
        sampled_tokens: Dict[str, VectorizedToken] = \
            dict(map(lambda x: (x.name(), x.vectorized()),
                     context.sampler().sample(context.params().neg_samples)))
        # Ensure the positive (target) token participates. Vocabulary tokens are
        # shared objects, so `tk is sp_tk` below identifies the positive sample.
        if tk.name() not in sampled_tokens:
            sampled_tokens[tk.name()] = tk

        # The following code block tries to update the learning rate when necessary. Not required for now.
        # tokens_handled_counter = context.get_counter(TrainingContext.TOKENS_HANDLED_COUNTER)
        # if tokens_handled_counter is not None:
        #     if tokens_handled_counter.val() % context.params().alpha_update_interval == 0:
        #         # Update the learning rate.
        #         alpha = 1 - tokens_handled_counter.val() / (
        #                 context.params().iteration * context.repo().num_of_tokens() + 1)
        #         context.set_alpha(max(alpha, context.params().initial_alpha * 0.0001))

        for sp_tk in sampled_tokens.values():
            # Accumulate gradient for function vector.
            # NOTE(review): sp_tk is only used in the indicator test; both the
            # dot product and the v_pred update use tk, not sp_tk. Looks like
            # negative samples' prediction vectors are never read/updated here
            # -- verify against the asm2vec paper before relying on this.
            g = (_dot_sigmoid(delta, tk.v_pred) - _identity(tk is sp_tk)) * context.alpha()
            # delta averages 3 components, hence the 1/3 factor on the gradient.
            f_grad += g / 3 * tk.v_pred
            if not context.is_estimating():
                with context.lock():
                    # Update v'_t
                    tk.v_pred -= g * delta

    # Apply function gradient.
    with context.lock():
        f.v -= f_grad

    if not context.is_estimating():
        # Apply gradient to instructions.
        # First half of f_grad goes to opcode vectors, second half is split
        # evenly across the argument vectors of each neighbor instruction.
        d = len(f_grad) // 2
        with context.lock():
            wnd.prev_ins_op().v -= f_grad[:d]
            if len(wnd.prev_ins_args()) > 0:
                prev_args_grad = f_grad[d:] / len(wnd.prev_ins_args())
                for t in wnd.prev_ins_args():
                    t.v -= prev_args_grad
            wnd.next_ins_op().v -= f_grad[:d]
            if len(wnd.next_ins_args()) > 0:
                next_args_grad = f_grad[d:] / len(wnd.next_ins_args())
                for t in wnd.next_ins_args():
                    t.v -= next_args_grad


def _train_sequence(f: VectorizedFunction, seq: List[Instruction], context: TrainingContext) -> None:
    """Slide a window over one instruction sequence and train on each position."""
    wnd = context.create_sequence_window(seq)
    while wnd.move_next():
        _train_vectorized(wnd, f, context)


def train(repository: FunctionRepository, params: Asm2VecParams) -> None:
    """Train all functions in the repository using a thread pool of params.jobs workers.

    Raises RuntimeError if any worker task fails (wait() stops at the first
    exception and leaves the remaining futures in not_done).
    """
    context = TrainingContext(repository, params)
    context.add_counter(TrainingContext.TOKENS_HANDLED_COUNTER)

    asm2vec_logger().debug('Total number of functions: %d', len(context.repo().funcs()))

    progress = Atomic(1)

    def train_function(fn: VectorizedFunction):
        for seq in fn.sequential().sequences():
            _train_sequence(fn, seq, context)
        # NOTE(review): progress is read for logging before it is incremented,
        # so the reported percentage lags by one function.
        asm2vec_logger().debug('Function "%s" trained, progress: %f%%',
                               fn.sequential().name(),
                               progress.value() / len(context.repo().funcs()) * 100)
        with progress.lock() as prog_proxy:
            prog_proxy.set(prog_proxy.value() + 1)

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=context.params().jobs)
    futures = []
    for f in context.repo().funcs():
        futures.append(executor.submit(train_function, f))

    done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)
    if len(not_done) > 0:
        raise RuntimeError('Train failed due to one or more failed task.')


def estimate(f: VectorizedFunction, estimate_repo: FunctionRepository, params: Asm2VecParams) -> np.ndarray:
    """Infer a vector for an unseen function: token vectors stay frozen
    (is_estimating=True), only f.v is updated. Returns the trained f.v."""
    context = TrainingContext(estimate_repo, params, True)
    for seq in f.sequential().sequences():
        _train_sequence(f, seq, context)

    return f.v


================================================
FILE: asm2vec/internal/util.py
================================================
import numpy as np


def make_small_ndarray(dim: int) -> np.ndarray:
    """Return a random vector of length dim with entries in (-0.5/dim, 0.5/dim)."""
    rng = np.random.default_rng()
    return (rng.random(dim) - 0.5) / dim


================================================
FILE: asm2vec/logging.py
================================================
import logging


def asm2vec_logger() -> logging.Logger:
    """Return the package-wide 'asm2vec' logger."""
    return logging.getLogger('asm2vec')


def config_asm2vec_logging(**kwargs):
    """Configure the 'asm2vec' logger: level, handlers and filters via kwargs."""
    level = kwargs.get('level', logging.WARNING)
    handlers = kwargs.get('handlers', [])
    filters = kwargs.get('filters', [])

    asm2vec_logger().setLevel(level)
    for hd in handlers:
        asm2vec_logger().addHandler(hd)
    for ft in filters:
        asm2vec_logger().addFilter(ft)


================================================
FILE: asm2vec/model.py
================================================
from typing import *

import numpy as np

import asm2vec.asm
import asm2vec.repo
import asm2vec.internal.training
import asm2vec.internal.repr
import asm2vec.internal.util


class Asm2VecMemento:
    """Serializable snapshot of a trained model: parameters plus vocabulary."""

    def __init__(self):
        self.params: Optional[asm2vec.internal.training.Asm2VecParams] = None
        self.vocab: Optional[Dict[str, asm2vec.repo.Token]] = None

    def serialize(self) -> Dict[str, Any]:
        return {
            'params': self.params.to_dict(),
            'vocab': asm2vec.repo.serialize_vocabulary(self.vocab)
        }

    def populate(self, rep: Dict[bytes, Any]) -> None:
        """Restore state from a bytes-keyed representation (inverse of serialize)."""
        self.params = asm2vec.internal.training.Asm2VecParams()
        self.params.populate(rep[b'params'])
        self.vocab = asm2vec.repo.deserialize_vocabulary(rep[b'vocab'])


class Asm2Vec:
    """Public facade: build a function repository, train it, embed new functions."""

    def __init__(self, **kwargs):
        # kwargs are forwarded to Asm2VecParams (d, alpha, rnd_walks, ...).
        self._params = asm2vec.internal.training.Asm2VecParams(**kwargs)
        self._vocab = None

    def memento(self) -> Asm2VecMemento:
        memento = Asm2VecMemento()
        memento.params = self._params
        memento.vocab = self._vocab
        return memento

    def set_memento(self, memento: Asm2VecMemento) -> None:
        self._params = memento.params
        self._vocab = memento.vocab

    def make_function_repo(self, funcs: List[asm2vec.asm.Function]) -> asm2vec.repo.FunctionRepository:
        return asm2vec.internal.repr.make_function_repo(
            funcs, self._params.d, self._params.num_of_rnd_walks, self._params.jobs)

    def train(self, repo: asm2vec.repo.FunctionRepository) -> None:
        """Train on the repository and keep its vocabulary for later estimation."""
        asm2vec.internal.training.train(repo, self._params)
        self._vocab = repo.vocab()

    def to_vec(self, f: asm2vec.asm.Function) -> np.ndarray:
        """Embed an unseen function using the trained vocabulary; requires train() first."""
        estimate_repo = asm2vec.internal.repr.make_estimate_repo(
            self._vocab, f, self._params.d, self._params.num_of_rnd_walks)
        vf = estimate_repo.funcs()[0]
        asm2vec.internal.training.estimate(vf, estimate_repo, self._params)
        return vf.v


================================================
FILE: asm2vec/parse.py
================================================
from typing import *

import asm2vec.asm
import asm2vec.internal.parse
from asm2vec.internal.parse import AssemblySyntaxError


def parse_text(asm: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source given as a single string."""
    return asm2vec.internal.parse.parse_asm_lines(asm.split('\n'), **kwargs)


def parse_fp(fp, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source from an open file-like object (iterated by line)."""
    return asm2vec.internal.parse.parse_asm_lines(fp, **kwargs)


def parse(asm_file_name: str, **kwargs) -> List[asm2vec.asm.Function]:
    """Parse assembly source from a file path."""
    with open(asm_file_name, mode='r') as fp:
        return parse_fp(fp, **kwargs)


================================================
FILE: 
asm2vec/repo.py
================================================
from typing import *

import numpy as np

import asm2vec.asm
import asm2vec.internal.util


class SequentialFunction:
    """A function reduced to its random-walk instruction sequences."""

    def __init__(self, fid: int, name: str, sequences: List[List[asm2vec.asm.Instruction]]):
        self._id = fid
        self._name = name
        self._seq = sequences

    def id(self) -> int:
        return self._id

    def name(self) -> str:
        return self._name

    def sequences(self) -> List[List[asm2vec.asm.Instruction]]:
        return self._seq


class VectorizedFunction:
    """A SequentialFunction paired with its trainable vector (default dim 400)."""

    def __init__(self, f: SequentialFunction, v: np.ndarray = None, dim: int = 400):
        self._f = f
        # If no vector is supplied, start from a small random one.
        self.v = v if v is not None else asm2vec.internal.util.make_small_ndarray(dim)

    def sequential(self) -> SequentialFunction:
        return self._f


class VectorizedToken:
    """A token with its embedding v (dim) and prediction vector v_pred (2 * dim)."""

    def __init__(self, name: str, v: np.ndarray = None, v_pred: np.ndarray = None, dim: int = 200):
        self._name = name
        # v starts at zero; v_pred starts small and random.
        self.v = v if v is not None else np.zeros(dim)
        self.v_pred = v_pred if v_pred is not None else asm2vec.internal.util.make_small_ndarray(dim * 2)

    def __eq__(self, other):
        # Equality is by token name only; vectors are ignored.
        if not isinstance(other, VectorizedToken):
            return False
        return self._name == other._name

    def __ne__(self, other):
        return not self.__eq__(other)

    def name(self) -> str:
        return self._name


class Token:
    """Vocabulary entry: a VectorizedToken plus occurrence count and frequency."""

    def __init__(self, vt: VectorizedToken, count: int = 1):
        self._vt = vt
        # count: raw number of occurrences; frequency: filled in externally.
        self.count: int = count
        self.frequency: float = 0

    def vectorized(self) -> VectorizedToken:
        return self._vt

    def name(self) -> str:
        return self._vt.name()


class FunctionRepository:
    """Container of vectorized functions plus the shared token vocabulary."""

    def __init__(self, funcs: List[VectorizedFunction], vocab: Dict[str, Token]):
        self._funcs = funcs
        self._vocab = vocab
        # Total token occurrences, cached at construction time.
        self._num_of_tokens = sum(map(lambda x: x.count, vocab.values()))

    def funcs(self) -> List[VectorizedFunction]:
        return self._funcs

    def vocab(self) -> Dict[str, Token]:
        return self._vocab

    def num_of_tokens(self) -> int:
        return self._num_of_tokens


def _serialize_token(token: Token) -> Dict[str, Any]:
    """Convert a Token into a plain dict of JSON-friendly values."""
    return {
        'name': token.name(),
        'v': list(token.vectorized().v),
        'v_pred': list(token.vectorized().v_pred),
        'count': token.count,
        'frequency': token.frequency
    }


def _deserialize_token(rep: Dict[bytes, Any]) -> Token:
    """Rebuild a Token from a bytes-keyed representation (inverse of _serialize_token)."""
    name = rep[b'name'].decode('utf-8')
    v = np.array(rep[b'v'])
    v_pred = np.array(rep[b'v_pred'])
    count = rep[b'count']
    frequency = rep[b'frequency']
    token = Token(VectorizedToken(name, v, v_pred))
    token.count = count
    token.frequency = frequency
    return token


def serialize_vocabulary(vocab: Dict[str, Token]) -> Dict[str, Any]:
    return dict(zip(vocab.keys(), map(_serialize_token, vocab.values())))


def deserialize_vocabulary(rep: Dict[bytes, Any]) -> Dict[str, Token]:
    return dict(zip(map(lambda b: b.decode('utf-8'), rep.keys()),
                    map(_deserialize_token, rep.values())))


def _serialize_sequence(seq: List[asm2vec.asm.Instruction]) -> List[Any]:
    # Each instruction is stored as a two-element list: [op, args].
    return list(map(lambda instr: [instr.op(), instr.args()], seq))


def _deserialize_sequence(rep: List[Any]) -> List[asm2vec.asm.Instruction]:
    return list(map(
        lambda instr_rep: asm2vec.asm.Instruction(instr_rep[0].decode('utf-8'), instr_rep[1].decode('utf-8')),
        rep))


def _serialize_vectorized_function(func: VectorizedFunction, include_sequences: bool) -> Dict[str, Any]:
    """Serialize one function; sequences are included only when requested."""
    data = {
        'id': func.sequential().id(),
        'name': func.sequential().name(),
        'v': list(func.v)
    }
    if include_sequences:
        data['sequences'] = list(map(_serialize_sequence, func.sequential().sequences()))
    return data


def _deserialize_vectorized_function(rep: Dict[bytes, Any]) -> VectorizedFunction:
    name = rep[b'name'].decode('utf-8')
    fid = rep[b'id']
    v = np.array(rep[b'v'])
    # Sequences are optional in the representation; default to none.
    sequences = list(map(_deserialize_sequence, rep.get(b'sequences', [])))
    return VectorizedFunction(SequentialFunction(fid, name, sequences), v)


# Bit flags selecting which parts of a repository to serialize.
SERIALIZE_VOCABULARY: int = 1
SERIALIZE_FUNCTION: int = 2
SERIALIZE_FUNCTION_SEQUENCES: int = 4
SERIALIZE_ALL: int = SERIALIZE_VOCABULARY | SERIALIZE_FUNCTION | SERIALIZE_FUNCTION_SEQUENCES


def serialize_function_repo(repo: FunctionRepository, flags: int) -> Dict[str, Any]:
    """Serialize the parts of repo selected by the SERIALIZE_* flags."""
    data = dict()
    if (flags & SERIALIZE_VOCABULARY) != 0:
        data['vocab'] = serialize_vocabulary(repo.vocab())
    if (flags & SERIALIZE_FUNCTION) != 0:
        include_sequences = ((flags & SERIALIZE_FUNCTION_SEQUENCES) != 0)
        data['funcs'] = list(map(
            lambda f: _serialize_vectorized_function(f, include_sequences),
            repo.funcs()))
    return data


def deserialize_function_repo(rep: Dict[bytes, Any]) -> FunctionRepository:
    """Rebuild a FunctionRepository; missing parts default to empty."""
    funcs = list(map(_deserialize_vectorized_function, rep.get(b'funcs', [])))
    vocab = deserialize_vocabulary(rep.get(b'vocab', dict()))
    return FunctionRepository(funcs, vocab)


================================================
FILE: examples/estimating.s
================================================
my_strlen_est:
    cmp BYTE PTR [rdi], 0
    je .L4
    mov rax, rdi
.L3:
    add rax, 1
    cmp BYTE PTR [rax], 0
    jne .L3
.L2:
    sub rax, rdi
    ret
.L4:
    mov rax, rdi
    jmp .L2
my_strcmp_est:
    movzx eax, BYTE PTR [rdi]
    test al, al
    je .L12
.L7:
    movzx edx, BYTE PTR [rsi]
    test dl, dl
    je .L15
    cmp dl, al
    jne .L16
    add rdi, 1
    add rsi, 1
    movzx eax, BYTE PTR [rdi]
    test al, al
    jne .L7
.L12:
    cmp BYTE PTR [rsi], 0
    setne dl
    movzx edx, dl
    neg edx
.L6:
    mov eax, edx
    ret
.L16:
    movsx eax, al
    movsx edx, dl
    sub eax, edx
    mov edx, eax
    jmp .L6
.L15:
    mov edx, 1
    test al, al
    jne .L6
    jmp .L12
.LC0:
    .string "%s"
.LC1:
    .string "%d\n"
main:
    sub rsp, 264
    lea rsi, [rsp+128]
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    mov rsi, rsp
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rdi, [rsp+128]
    call my_strlen_est
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov rsi, rsp
    lea rdi, [rsp+128]
    call my_strcmp_est
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov eax, 0
    add rsp, 264
    ret
================================================
FILE: examples/training-estimating.py
================================================
import numpy as np

import asm2vec.asm
import asm2vec.parse
import asm2vec.model


def cosine_similarity(v1, v2):
    """Cosine similarity of two vectors (undefined for zero-norm input)."""
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def main():
    # Train on one assembly file, then embed functions from a second one and
    # compare every (trained, estimated) pair by cosine similarity.
    training_funcs = asm2vec.parse.parse('training.s',
                                         func_names=['main', 'my_strlen_train', 'my_strcmp_train'])
    estimating_funcs = asm2vec.parse.parse('estimating.s',
                                           func_names=['main', 'my_strlen_est', 'my_strcmp_est'])
    print('# of training functions:', len(training_funcs))
    print('# of estimating functions:', len(estimating_funcs))

    model = asm2vec.model.Asm2Vec(d=200)
    training_repo = model.make_function_repo(training_funcs)
    model.train(training_repo)
    print('Training complete.')

    for tf in training_repo.funcs():
        print('Norm of trained function "{}" = {}'.format(tf.sequential().name(), np.linalg.norm(tf.v)))

    estimating_funcs_vec = list(map(lambda f: model.to_vec(f), estimating_funcs))
    print('Estimating complete.')

    for (ef, efv) in zip(estimating_funcs, estimating_funcs_vec):
        print('Norm of trained function "{}" = {}'.format(ef.name(), np.linalg.norm(efv)))

    for tf in training_repo.funcs():
        for (ef, efv) in zip(estimating_funcs, estimating_funcs_vec):
            sim = cosine_similarity(tf.v, efv)
            print('sim("{}", "{}") = {}'.format(tf.sequential().name(), ef.name(), sim))


if __name__ == '__main__':
    main()
================================================
FILE: examples/training.s
================================================
my_strlen_train:
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-24], rdi
    mov rax, QWORD PTR [rbp-24]
    mov QWORD PTR [rbp-8], rax
    jmp .L2
.L3:
    add QWORD PTR [rbp-8], 1
.L2:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    jne .L3
    mov rax, QWORD PTR [rbp-8]
    sub rax, QWORD PTR [rbp-24]
    pop rbp
    ret
my_strcmp_train:
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-8], rdi
    mov QWORD PTR [rbp-16], rsi
    jmp .L6
.L10:
    mov rax, QWORD PTR [rbp-8]
    movzx edx, BYTE PTR [rax]
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    cmp dl, al
    je .L7
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    movsx edx, al
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    movsx eax, al
    sub edx, eax
    mov eax, edx
    jmp .L8
.L7:
    add QWORD PTR [rbp-8], 1
    add QWORD PTR [rbp-16], 1
.L6:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    je .L9
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    test al, al
    jne .L10
.L9:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    je .L11
    mov eax, 1
    jmp .L8
.L11:
    mov rax, QWORD PTR [rbp-16]
    movzx eax, BYTE PTR [rax]
    test al, al
    je .L12
    mov eax, -1
    jmp .L8
.L12:
    mov eax, 0
.L8:
    pop rbp
    ret
.LC0:
    .string "%s"
.LC1:
    .string "%d\n"
main:
    push rbp
    mov rbp, rsp
    sub rsp, 256
    lea rax, [rbp-128]
    mov rsi, rax
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rax, [rbp-256]
    mov rsi, rax
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rax, [rbp-128]
    mov rdi, rax
    call my_strlen_train
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    lea rdx, [rbp-256]
    lea rax, [rbp-128]
    mov rsi, rdx
    mov rdi, rax
    call my_strcmp_train
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov eax, 0
    leave
    ret
================================================
FILE: tests/asm_test.py
================================================
import unittest as ut

import asm2vec.asm as asm


class InstructionTest(ut.TestCase):
    """Unit tests for asm.parse_instruction."""

    def test_parse_instruction(self):
        ins = asm.parse_instruction('mov eax, ebx')
        self.assertEqual('mov', ins.op(), 'Operators not equal')
        self.assertListEqual(['eax', 'ebx'], ins.args(), 'Operands not equal')

    def test_parse_instruction_one_operand(self):
        ins = asm.parse_instruction('inc eax')
        self.assertEqual('inc', ins.op(), 'Operators not equal')
        self.assertListEqual(['eax'], ins.args(), 'Operands not equal')

    def test_parse_instruction_no_operands(self):
        ins = asm.parse_instruction('ret')
        self.assertEqual('ret', ins.op(), 'Operators not equal')
        self.assertListEqual([], ins.args(), 'Operands not equal')


class BasicBlockTest(ut.TestCase):
    pass


class FunctionTest(ut.TestCase):
    pass
================================================
FILE: tests/parse_test.py
================================================
import unittest as ut

import asm2vec.parse

test_asm = """
my_strlen:
    push rbp
    mov rbp, rsp
    mov QWORD PTR [rbp-24], rdi
    mov rax, QWORD PTR [rbp-24]
    mov QWORD PTR [rbp-8], rax
    jmp .L2
.L3:
    add QWORD PTR [rbp-8], 1
.L2:
    mov rax, QWORD PTR [rbp-8]
    movzx eax, BYTE PTR [rax]
    test al, al
    jne .L3
    mov rax, QWORD PTR [rbp-8]
    sub rax, QWORD PTR [rbp-24]
    pop rbp
    ret
.LC0:
    .string "%s"
.LC1:
    .string "%d\\n"
main:
    push rbp
    mov rbp, rsp
    add rsp, -128
    lea rax, [rbp-128]
    mov rsi, rax
    mov edi, OFFSET FLAT:.LC0
    mov eax, 0
    call scanf
    lea rax, [rbp-128]
    mov rdi, rax
    call my_strlen
    mov esi, eax
    mov edi, OFFSET FLAT:.LC1
    mov eax, 0
    call printf
    mov eax, 0
    leave
    ret
"""


class ParseTest(ut.TestCase):
    """End-to-end test of parse_text including caller/callee linkage."""

    def test_parse_text(self):
        funcs = asm2vec.parse.parse_text(test_asm, func_names=['main', 'my_strlen'])
        self.assertEqual(2, len(funcs))
        self.assertEqual({'main', 'my_strlen'}, set(map(lambda f: f.name(), funcs)))

        funcs = dict(map(lambda f: (f.name(), f), funcs))
        # NOTE(review): asm2vec.asm is used below for annotations but only
        # asm2vec.parse is imported; verify asm2vec.parse re-exposes it.
        main_func: asm2vec.asm.Function = funcs['main']
        my_strlen_func: asm2vec.asm.Function = funcs['my_strlen']
        self.assertListEqual(['my_strlen'], list(map(lambda f: f.name(), main_func.callees())))
        self.assertListEqual(['main'], list(map(lambda f: f.name(), my_strlen_func.callers())))
================================================
FILE: tests/utilities_test.py
================================================
import unittest as ut

import asm2vec.internal.util as utilities


class PermutationTest(ut.TestCase):
    """Tests for permute/inverse_permute.

    NOTE(review): the asm2vec/internal/util.py shown in this snapshot defines
    only make_small_ndarray -- permute and inverse_permute are not visible.
    Confirm these helpers exist before relying on this suite.
    """

    def test_permute(self):
        v = [10, 20, 30, 40, 50]
        p = [2, 4, 1, 0, 3]
        pv = utilities.permute(v, p)
        self.assertListEqual([30, 50, 20, 10, 40], pv, 'Permutated vectors not equal.')

    def test_inv_permute(self):
        v = [30, 50, 20, 10, 40]
        p = [2, 4, 1, 0, 3]
        pv = utilities.inverse_permute(v, p)
        self.assertListEqual([10, 20, 30, 40, 50], pv, 'Inverse permutated vectors not equal.')