Repository: TakahiroHaruyama/ida_haru
Branch: master
Commit: 29bd253294c3
Files: 27
Total size: 210.7 KB

Directory structure:
gitextract_b2x18g5t/

├── .gitignore
├── ADVobfuscator/
│   ├── README.org
│   └── idapy3_ADVobfuscator_deob.py
├── LICENSE
├── README.org
├── bindiff/
│   ├── README.org
│   ├── bindiff.py
│   ├── bindiff_export.idc
│   ├── save_func_names.py
│   └── save_func_names_7x.py
├── callstrings/
│   ├── README.org
│   ├── hexrays_utils.py
│   ├── ida_callstrings_dbg.py
│   ├── ida_callstrings_flare_emu.py
│   └── ida_callstrings_static.py
├── eset_crackme/
│   ├── README.org
│   ├── loaders/
│   │   └── ida_loader_drv_vm.py
│   └── procs/
│       └── ida_processor_drv_vm.py
├── fn_fuzzy/
│   ├── README.org
│   ├── cli_export.py
│   ├── dump_types.py
│   ├── fn_fuzzy.py
│   ├── fn_fuzzy_7x.py
│   ├── yara_fn.py
│   └── yara_fn_7x.py
└── stackstring_static/
    ├── README.org
    └── stackstring_static.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/


================================================
FILE: ADVobfuscator/README.org
================================================
* IDAPython script deobfuscating ADVobfuscator strings, applied to a TrickBoot sample

The script requires [[https://github.com/fireeye/flare-emu][flare-emu]].
The tested sample is [[https://www.virustotal.com/gui/file/c1f1bc58456cff7413d7234e348d47a8acfdc9d019ae7a4aba1afc1b3ed55ffa/details][491115422a6b94dc952982e6914adc39]] (TrickBot's UEFI firmware reconnaissance module called "TrickBoot").

Note: The script may not work as-is on a sample built with a different compiler or different compiler flags, because the decoder byte patterns would differ. However, the same approach (decoder function pattern matching + emulation) should still be applicable.

A result example:

#+BEGIN_SRC 
[*] 0x1000a124: xor2-encoded function detected (size = 0x2f)
[*] 0x1000b92c: emulating from 0x1000b71b to 0x1000b92c
[+] 0x1000b92c: uefi_expl_port_writeDeviceIoControl() ERROR %d
#+END_SRC

[[./img/adv_result.png]]

** Reference

- https://github.com/andrivet/ADVobfuscator
- https://eclypsium.com/2020/12/03/trickbot-now-offers-trickboot-persist-brick-profit/
- [[http://antonioparata.blogspot.com/2020/06/deobfuscating-c-advobfuscator-with.html]]


================================================
FILE: ADVobfuscator/idapy3_ADVobfuscator_deob.py
================================================
# idapy3_ADVobfuscator_deob.py - IDAPython script deobfuscating ADVobfuscator strings, applied to a TrickBoot sample
# Takahiro Haruyama (@cci_forensics)

from idc import *
from idautils import *
import idaapi

try:
    import flare_emu
except ImportError as e:
    # Python 3 exceptions have no ``.message`` attribute; formatting the
    # exception object itself gives the error text without raising an
    # AttributeError inside this handler.
    print("Could not import flare_emu: {}\nExiting.".format(e))
    raise

import re, unicorn
'''
dec 


'''
# Byte patterns of the string decoder functions. Each pattern is anchored
# with ^...$ and is searched against the raw bytes of one function in main().
# re.DOTALL lets "." match any byte value, including 0x0a.
#
# Subtraction decoder: group 1 is used by main() as the key byte and
# group 2 as the string length.
g_pat_sub = re.compile(rb'^\x33\xD2\x8A\x04\x0A\x0F\xBE\xC0\x83\xE8(.)\x88\x04\x0A\x42\x83\xFA(.)\x72\xEE\x8B\xC1\xC3$', re.DOTALL)
# XOR decoder with a fixed key: group 1 is used by main() as the string
# length. The uncaptured "." matches one byte that may vary between instances.
g_pat_xor1 = re.compile(rb'^\x53\x55\x56\x57\x8b\xf9\x6a(.)\x5d\x8d\x47\x04\x8a\x10\x0f\xbe\x37\x0f\xbe\xca\x33\xce\x88\x08\x40\x83\xed\x01\x75\xee\xc6\x47.\x00\x8d\x47\x04\x5f\x5e\x5d\x5b\xc3$', re.DOTALL)
# XOR decoder whose key is combined with the byte index (see emulate()):
# group 1 is used by main() as the string length.
g_pat_xor2 = re.compile(rb'^\x53\x56\x57\x8b\xf1\x33\xdb\x8a\x54\x1e\x04\x8b\x06\x02\xc3\x0f\xbe\xca\x33\xc1\x88\x44\x1e\x04\x43\x83\xfb(.)\x72\xe9\x5f\xc6\x46.\x00\x8d\x46\x04\x5e\x5b\xc3$', re.DOTALL)
# Decrement-by-one decoder: group 1 is used by main() as the string length.
g_pat_dec = re.compile(rb'^\x33\xd2\x8a\x04\x0a\x0f\xbe\xc0\x48\x88\x04\x0a\x42\x83\xfa(.)\x72\xf0\x8b\xc1\xc3$', re.DOTALL)
# Pattern name -> compiled pattern. The name selects the decoding arithmetic
# in emulate() and becomes part of the name given to the matched function.
g_pats = {
    'sub': g_pat_sub,
    'xor1': g_pat_xor1,
    'xor2': g_pat_xor2,
    'dec': g_pat_dec,
}

def info(msg):
    """Print *msg* on its own line, prefixed with the ``[*]`` marker."""
    print("[*] " + format(msg))

def success(msg):
    """Print *msg* on its own line, prefixed with the ``[+]`` marker."""
    print("[+] " + format(msg))

def error(msg):
    """Print *msg* on its own line, prefixed with the ``[!]`` marker."""
    print("[!] " + format(msg))

def set_decomplier_cmt(ea, cmt):
    """Attach *cmt* as a user comment in the decompiler output at *ea*.

    The function containing *ea* is decompiled, a tree location pointing at
    *ea* (placement ``ITP_SEMI``) is built, and the comment is stored and
    saved through the resulting cfunc object.

    Failures are reported through error() and never propagate, so a
    decompiler problem does not interrupt the caller.

    :param ea: address the comment is attached to
    :param cmt: comment text
    """
    try:
        cfunc = idaapi.decompile(ea)
        if not cfunc:
            error("Decompile failed: {:#x}".format(ea))
            return
        tl = idaapi.treeloc_t()
        tl.ea = ea
        tl.itp = idaapi.ITP_SEMI
        cfunc.set_user_cmt(tl, cmt)
        cfunc.save_user_cmts()
    except Exception as exc:
        # Catch Exception rather than everything: KeyboardInterrupt and
        # SystemExit must still be able to stop the script. The reason is
        # included so that a failure can be diagnosed from the log.
        error("Decompile failed: {:#x} ({})".format(ea, exc))

def add_bookmark(ea, comment):
    """Store a bookmark for *ea* with the text *comment*.

    Bookmark slots 0..1023 are probed in order with get_bookmark(). The
    first slot that is reported as BADADDR, or that already holds *ea*, is
    written with put_bookmark().

    :return: True if a slot was written, False if no probed slot qualified
    """
    for slot in range(1024):
        occupant = get_bookmark(slot)
        if occupant == BADADDR or occupant == ea:
            # unused slot, or an existing bookmark for the same address
            put_bookmark(ea, 0, 0, 0, slot, comment)
            return True
    return False

def get_emu_range(ea):
    """Return the address range to emulate in order to reach *ea*.

    The range starts at the beginning of the first basic block (in
    FlowChart order) whose ``start_ea <= ea <= end_ea`` and ends at *ea*.

    :return: ``(start, ea)``, or ``(None, None)`` when *ea* is not inside a
             function or no basic block matches
    """
    owner = idaapi.get_func(ea)
    if owner is not None:
        for block in idaapi.FlowChart(owner):
            # NOTE(review): the upper bound is inclusive here; if end_ea is
            # exclusive, an address that starts a block can also match the
            # block ending there - confirm this is intended.
            if block.start_ea <= ea <= block.end_ea:
                return block.start_ea, ea
    return None, None

# enable a step into emulation for the decoder (disabled)
def call_hook(address, argv, funcName, userData):
    """Call hook that sets ``userData['skipCalls']`` per called function.

    The flag is set to False when *funcName* equals
    ``userData['dec_fn_name']`` and to True for every other function.
    *address* and *argv* are not used.
    """
    is_decoder = funcName == userData["dec_fn_name"]
    # presumably the emulator skips a call while this flag is True, so only
    # the decoder is stepped into - confirm against flare-emu
    userData['skipCalls'] = not is_decoder

# validate the emulation result, based on the encoded buf ptr (disabled)
def inst_hook(uc, address, size, userData):
    """Instruction hook that records and later dumps the decoder buffer.

    * At ``userData['ref']`` the ECX register is read, logged and stored in
      ``userData['enc_ea']``.
    * At ``userData['end']``, once ``enc_ea`` is set, ``userData['size']``
      bytes are read and printed: from ``enc_ea`` when the decoder name
      contains ``'sub'``, otherwise from ``enc_ea + 4``.

    Any other address is ignored. Unicorn errors are reported through
    error() instead of being raised. *size* is not used.
    """
    at_ref = address == userData['ref']
    if not at_ref and not (address == userData['end'] and userData.get('enc_ea')):
        return

    # the "EmuHelper" entry is not set by this script; presumably flare-emu
    # adds it to the hook data - confirm
    helper = userData["EmuHelper"]
    try:
        pc = uc.reg_read(helper.regs["pc"])
        if at_ref:
            enc_ea = uc.reg_read(helper.regs["ecx"])
            info('pc = {:#x}, address = {:#x}), enc_ea = {:#x}'.format(pc, address, enc_ea))
            userData["enc_ea"] = enc_ea
        else:
            # the xor variants keep 4 bytes in front of the encoded data
            skip = 0 if 'sub' in userData["dec_fn_name"] else 4
            dec = uc.mem_read(userData["enc_ea"] + skip, userData['size'])
            success('{:#x}: {}'.format(userData['ref'], dec))
    except unicorn.UcError as e:
        error("emulation error: {}".format(str(e)))

def _decode_adv_bytes(pname, enc, key):
    """Decode the encoded bytes *enc* according to the decoder kind *pname*.

    Every result byte is truncated to 8 bits before the text is decoded with
    the default codec.

    :param pname: 'sub', 'dec', 'xor1' or anything else (treated as 'xor2')
    :param enc: iterable of encoded byte values
    :param key: key byte; ignored for 'dec'
    :raises UnicodeDecodeError: if the decoded bytes are not valid text
    """
    if pname == 'sub':
        plain = bytes([(x - key) & 0xff for x in enc])
    elif pname == 'dec':
        plain = bytes([(x - 1) & 0xff for x in enc])
    elif pname == 'xor1':
        plain = bytes([x ^ key for x in enc])
    else:  # xor2
        # key + i can exceed 0xff; without the mask bytes() raises ValueError
        plain = bytes([(x ^ (key + i)) & 0xff for i, x in enumerate(enc)])
    return plain.decode()


def emulate(pname, eh, dec_fn, size, key):
    """Decode the string passed at every call site of the decoder *dec_fn*.

    For each code reference to *dec_fn* whose mnemonic is ``call``, the code
    from the start of the enclosing basic block up to the call is emulated.
    The value of ECX afterwards is used as the address of the encoded
    buffer:

    * 'sub' / 'dec': *size* bytes are read directly from that address.
    * otherwise (xor): the first byte at that address is the key and *size*
      bytes are read starting 4 bytes later.

    Each decoded string is printed, set as a disassembly comment and a
    decompiler comment, and bookmarked. Emulation and decoding errors are
    reported per call site and do not stop the loop.

    NOTE(review): GetMnem/MakeComm are legacy IDAPython names; presumably
    they rely on a compatibility layer - confirm for the target IDA version.

    :param pname: decoder kind, a key of g_pats
    :param eh: flare_emu EmuHelper instance
    :param dec_fn: address of the decoder function
    :param size: number of encoded bytes
    :param key: key byte for 'sub'; unused for the other kinds
    :return: number of strings decoded
    """
    cnt = 0

    for ref in CodeRefsTo(dec_fn, False):
        if GetMnem(ref) != 'call':
            continue

        start, end = get_emu_range(ref)
        if not (start and end):
            continue

        info('{:#x}: emulating from {:#x} to {:#x}'.format(ref, start, end))

        # Reset per call site so the error handler never reports an unbound
        # or stale address when emulation fails before ECX is read.
        ea = None
        try:
            eh.emulateRange(start, endAddr=end)

            ea = eh.uc.reg_read(eh.regs["ecx"])
            if pname in ('sub', 'dec'):
                dec_key = key
                enc = eh.uc.mem_read(ea, size)
            else:
                dec_key = eh.uc.mem_read(ea, 4)[0]
                enc = eh.uc.mem_read(ea + 4, size)
            dec = _decode_adv_bytes(pname, enc, dec_key)

            success('{:#x}: {}'.format(ref, dec))
            MakeComm(ref, dec)
            set_decomplier_cmt(ref, dec)
            add_bookmark(ref, 'decoded: {}'.format(dec))
            cnt += 1

        except unicorn.UcError as e:
            pc = eh.uc.reg_read(eh.regs["pc"])
            target = 'an unknown address' if ea is None else '{:#x}'.format(ea)
            error("{:#x}: {} when reading {}".format(pc, str(e), target))

        except UnicodeDecodeError as e:
            # one undecodable buffer must not abort the whole run
            error("{:#x}: decoded bytes are not valid text: {}".format(ref, e))

        finally:
            eh.resetEmulatorHeapAndStack()

    return cnt

def main():
    """Find the decoder functions, rename them and decode their strings.

    Every function that is not flagged as library or thunk is matched
    against the patterns in g_pats. On the first matching pattern the
    function is renamed to ``fn_ADVobfuscator_decode_<kind>_len<size>`` and
    emulate() is run on its call sites. A per-kind count of decoded strings
    is printed at the end.

    NOTE(review): idaapi.do_name_anyway is a legacy IDAPython name;
    presumably it relies on a compatibility layer - confirm for the target
    IDA version.
    """
    info('start')
    eh = flare_emu.EmuHelper()

    # search the decoding functions
    cnts = {}
    for fva in Functions():
        # Use the names provided by ``from idc import *``; the ``idc``
        # module object itself is not imported by this file.
        if get_func_flags(fva) & (FUNC_LIB | FUNC_THUNK):
            continue

        fn_bytes = get_bytes(fva, get_func_attr(fva, FUNCATTR_END) - fva)
        if not fn_bytes:
            # nothing to match against (bytes unavailable or empty function)
            continue

        for pname, pat in g_pats.items():
            m = pat.search(fn_bytes)
            if not m:
                continue

            # every capture group is exactly one byte
            if pname == 'sub':
                key = int.from_bytes(m.group(1), 'little')
                size = int.from_bytes(m.group(2), 'little')
            else:
                key = None
                size = int.from_bytes(m.group(1), 'little')

            print('\n')
            info('{:#x}: {}-encoded function detected (size = {:#x})'.format(fva, pname, size))
            idaapi.do_name_anyway(fva, 'fn_ADVobfuscator_decode_{}_len{}'.format(pname, size))

            cnts[pname] = cnts.get(pname, 0) + emulate(pname, eh, fva, size, key)
            break

    info('number of decoded strings: {}'.format(cnts))
    info('done')

if __name__ == '__main__':
    main()
    

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.org
================================================
#+OPTIONS: ^:{}
* ida_haru

Scripts/plugins for IDA Pro

Note: Old scripts don't work for IDA 8.x, but I leave them just for reference. 

** eset_crackme

IDA Pro loader/processor modules for ESET CrackMe driver VM

** stackstring_static

IDAPython script that statically recovers strings constructed on the stack

** fn_fuzzy

IDAPython script for fast multiple binary diffing triage

** bindiff

python script for multiple binary diffing by BinDiff

** ADVobfuscator

IDAPython script deobfuscating ADVobfuscator strings, applied to a TrickBoot sample

** HexRaysDeob

modified version for defeating APT10 ANEL's code obfuscations (located in a [[https://github.com/carbonblack/HexRaysDeob][corporate github repository]])

** callstrings

scripts for defeating "polymorphic stack strings" obfuscation used by Hodur sample


================================================
FILE: bindiff/README.org
================================================
#+OPTIONS: ^:{}

#+TITLE: BinDiff wrapper script for multiple binary diffing

* Purpose

multiple binary diffing up to 100 samples ([[https://github.com/TakahiroHaruyama/ida_haru/tree/master/fn_fuzzy][fn_fuzzy]] is better for more samples)

* Requirements

- IDA 7.6 and BinDiff 6
- python packages: pefile macholib pyelftools python-idb prettytable

* How to Use

Before using it, you have to edit the paths for executables/scripts in bindiff.py.
#+BEGIN_SRC 
# paths (should be edited)
g_out_dir = r'Z:\haru\analysis\tics\bindiff_db' 
g_ida_dir = r'C:\work\tool\IDAx64'
g_exp_path = r'Z:\cloud\gd\python\IDAPython\ida_haru\bindiff\bindiff_export.idc'
g_differ_path = r"C:\Program Files\BinDiff\bin\bindiff.exe"
#g_differ_path = r'C:\Program Files (x86)\zynamics\BinDiff 4.2\bin\differ64.exe'
g_save_fname_path = r'Z:\cloud\gd\python\IDAPython\ida_haru\bindiff\save_func_names.py'
#+END_SRC

You can check the command line options by -h or --help.
#+BEGIN_EXAMPLE
Z:\cloud\gd\work\python\IDAPython\bindiff>python bindiff.py -h
usage: bindiff.py [-h] [--out_dir OUT_DIR] [--ws_th WS_TH] [--fs_th FS_TH] [--ins_th INS_TH] [--bb_th BB_TH] [--size_th SIZE_TH] [--func_regex FUNC_REGEX] [--debug]
                  [--clear] [--noidb] [--use_pyidb]
                  primary {1,m} ...

positional arguments:
  primary               primary binary to compare
  {1,m}                 mode: 1, m
    1                   BinDiff 1 to 1
    m                   BinDiff 1 to many

optional arguments:
  -h, --help            show this help message and exit
  --out_dir OUT_DIR, -o OUT_DIR
                        output directory including .BinExport/.BinDiff (default: Z:\haru\analysis\tics\bindiff_db)
  --ws_th WS_TH, -w WS_TH
                        whole binary similarity threshold (default: 0.2)
  --fs_th FS_TH, -f FS_TH
                        function similarity threshold (default: 0.8)
  --ins_th INS_TH, -i INS_TH
                        instruction threshold (default: 30)
  --bb_th BB_TH, -b BB_TH
                        basic block threshold (default: 1)
  --size_th SIZE_TH, -s SIZE_TH
                        file size threshold (MB) (default: 10)
  --func_regex FUNC_REGEX, -e FUNC_REGEX
                        function name regex to reduce noise (default: sub_|fn_|chg_)
  --debug, -d           print debug output (default: False)
  --clear, -c           clear .BinExport, .BinDiff and function name cache (default: False)
  --noidb, -n           skip a secondary binary without idb (default: False)
  --use_pyidb           use python-idb (default: False)
#+END_EXAMPLE

There are 2 modes. One is "1 to 1" mode, the other is "1 to many" mode.

** "1 to 1" mode example

In "1 to 1" mode, we should specify executable file paths for primary and secondary targets.

#+BEGIN_EXAMPLE
Z:\cloud\gd\work\python\IDAPython\bindiff>python bindiff.py Z:\haru\analysis\tics\hoge\[redacted]_worker_fixed
1 Z:\haru\analysis\tics\hoge\samples\checked\[redacted]c2f05
---------------------------------------------
[*] BinDiff result
[*] elapsed time = 0.390000104904 sec, number of diffing = 1
[*] primary binary: (([redacted]_worker_fixed))

============== 1 high similar binaries (>0.2) ================
+----------------+--------------------------------------+
|   similarity   |           secondary binary           |
+----------------+--------------------------------------+
| 0.211967127395 | [redacted]c2f05                      |
+----------------+--------------------------------------+
---------------------------------------------
#+END_EXAMPLE

"high similar binaries" means some binaries are found with whole binary similarities. You can adjust the similarity by -w option.

** "1 to many" mode example

In "1 to many" mode, we should specify an executable file path for a primary target and a folder path for secondary targets. We can specify to compare secondary binaries recursively (-r option).

#+BEGIN_EXAMPLE
Z:\cloud\gd\work\python\IDAPython\bindiff>python bindiff.py Z:\haru\analysis\tics\hoge\samples\attacker\[redacted]_worker_fixed
m Z:\haru\analysis\tics\hoge\samples\tmp
---------------------------------------------
[*] BinDiff result
[*] elapsed time = 6.71900010109 sec, number of diffing = 3
[*] primary binary: (([redacted]_worker_fixed))

============== 10 high similar functions (>0.8), except high similar binaries ================
+----------------+--------------+--------------------------------+----------------+----------------------------------+-----------------+
|   similarity   | primary addr |          primary name          | secondary addr |          secondary name          |secondary binary |
+----------------+--------------+--------------------------------+----------------+----------------------------------+-----------------+
|      1.0       | 0x180067720  |       Virt_sub_180067720       |  0x180004c30   |          sub_180004c30           | [redacted]e6504 |
|      1.0       | 0x1800674b0  |         sub_1800674b0          |  0x180004930   |          sub_180004930           | [redacted]e6504 |
|      1.0       | 0x1800673a0  | chg_peparse_Virt_sub_1800673A0 |  0x180004820   |          sub_180004820           | [redacted]e6504 |
|      1.0       | 0x1800672b0  |       Virt_sub_1800672B0       |  0x180004730   |          sub_180004730           | [redacted]e6504 |
|      1.0       | 0x18005fd84  |         sub_18005fd84          |  0x13f69af94   |          sub_13f69af94           | [redacted]fb841 |
|      1.0       | 0x18005fd84  |         sub_18005fd84          |  0x180012648   |         __crtMessageBoxW         | [redacted]e6504 |
|      1.0       | 0x180050f30  |         sub_180050f30          |  0x1800019f0   | ?erase@?$basic_string@DU?$char_t | [redacted]e6504 |
| 0.98987073046  | 0x1800677e0  | chg_peparse_Virt_sub_1800677E0 |  0x180004cf0   |          sub_180004cf0           | [redacted]e6504 |
| 0.963708558784 | 0x180067560  |         sub_180067560          |  0x1800049e0   |          sub_1800049e0           | [redacted]e6504 |
| 0.946399194338 | 0x180018780  |    chg_rotate_sub_180018780    |  0x140004360   |          sub_140004360           | [redacted]92023 |
+----------------+--------------+--------------------------------+----------------+----------------------------------+-----------------+
---------------------------------------------
#+END_EXAMPLE
"high similar functions" lists functions whose similarity exceeds the function similarity threshold even though their binary's whole-binary similarity is below the threshold. You can adjust the function similarity threshold with the -f option.

The function similarity result is very noisy so library/thunk functions are filtered out by the script. Additionally, we can specify the number of instructions/basic blocks, file size, and so on to reduce the noise.

And by default, the script newly creates idbs for the target binaries if not found. If you want to only compare existing idbs, please specify -n.

* Notes

- If you can't get the function similarities correctly, adjust the function similarity threshold (--fs_th), instruction threshold (--ins_th), basic block threshold (--bb_th) and function name filter rule (--func_regex) options. The script excludes the matches of small codes because function similarity results of multiple binaries are noisy.

- BinDiff 5.0 and later contains a [[https://issuetracker.google.com/issues/129600738][bug]] that we can't load existing .BinDiff files and import symbols/comments due to missing .BinExport files. I hope it will be fixed someday. 

- python-idb doesn't work for IDA 7.6 IDBs. So by default it's not used (enable --use_pyidb option if needed).


================================================
FILE: bindiff/bindiff.py
================================================
# bindiff.py - BinDiff wrapper script for multiple binary diffing
# Takahiro Haruyama (@cci_forensics)

import argparse, subprocess, os, sqlite3, time, pickle, re, multiprocessing, sys, struct, logging
from prettytable import PrettyTable
import pefile
from macholib.MachO import MachO
from macholib.mach_o import *
from elftools.elf.elffile import ELFFile
import idb

logging.basicConfig(level=logging.ERROR) # to suppress python-idb warning

# paths (should be edited for the local installation)
# Windows
#g_out_dir = r'C:\analysisw\tmp\bindiff'
#g_ida_dir = r'C:\analysisw\tool\IDA'
#g_differ_path = r"C:\Program Files\BinDiff\bin\bindiff.exe"
# MacOS
g_out_dir = r'/Users/haru/analysis/tmp/bindiff' # default for --out_dir (.BinExport/.BinDiff/function name caches)
#g_ida_dir = r'/Applications/IDA/ida.app/Contents/MacOS'
g_ida32_path = r'/Applications/IDA/ida.app/Contents/MacOS/ida' # IDA executable used for 32-bit binaries
g_ida64_path = r'/Applications/IDA/ida64.app/Contents/MacOS/ida64' # IDA executable used for 64-bit binaries
g_differ_path = r"/Applications/BinDiff/BinDiff.app/Contents/MacOS/bin/bindiff" # BinDiff command line differ
# helper scripts are looked up next to this file
g_exp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bindiff_export.idc')
g_save_fname_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'save_func_names_7x.py')

# parameters (defaults of the command line options)
g_ws_th = 0.15 # whole binary similarity threshold
g_fs_th = 0.70 # function similarity threshold
g_ins_th = 10 # instruction threshold
g_bb_th = 0 # basic block threshold
g_size_th = 10 # file size threshold (MB)
#g_func_regex = r'sub_|fn_|chg_' # function name filter rule
g_func_regex = r'.*' # function name filter rule

class LocalError(Exception):
    """Base class of every error raised by this script (main() catches it)."""


class ProcExportError(LocalError):
    """A binary could not be exported to .BinExport (or has an unsupported format)."""


class ProcDiffError(LocalError):
    """The BinDiff differ process failed."""


class LoadFuncNamesError(LocalError):
    """Function names could not be saved/loaded."""


# NOTE: the two names below shadow Python 3 builtins inside this module.
class FileNotFoundError(LocalError):
    """A required tool or helper script is missing."""


class ChildProcessError(LocalError):
    """Error related to a child process."""

class BinDiff(object):
    
    def __init__ (self, primary, out_dir, ws_th, fs_th, ins_th, bb_th, size_th, func_regex, debug=False, clear=False, newidb=False, use_pyidb=False):
    #def __init__ (self, primary, out_dir, ws_th, fs_th, ins_th, bb_th, size_th, debug=False, clear=False, noidb=False, use_pyidb=False):        
        self._debug = debug
        self._clear = clear
        self._newidb = newidb
        self._lock = multiprocessing.Lock()        
        self._primary = primary
        self._ws_th = ws_th
        self._fs_th = fs_th
        self._ins_th = ins_th
        self._bb_th = bb_th
        self._size_th = size_th
        self._out_dir = out_dir
        self.use_pyidb = use_pyidb
        
        self._format, self._arch = self._get_machine_type(primary)
        if self._format is None:
            raise ProcExportError('primary binary should be PE/Mach-O/ELF'.format(primary))
        self._dprint('primary binary format: {}'.format(self._format))
        self._dprint('primary binary architecture: {}'.format(self._arch))
        
        self._ida_path = self._get_ida_path(self._arch)
        res = self._files_not_found()
        if res is not None:
            raise FileNotFoundError('file is not found: {}'.format(res))
        self._dprint('IDA binary path for primary: {}'.format(self._ida_path))
        
        if self._make_BinExport(self._primary, self._ida_path) != 0:
            raise ProcExportError('primary BinExport failed: {}'.format(primary))

        if self.use_pyidb:
            idb_path = self._get_idb_path(primary, self._arch)
            self._func_names = self._load_func_names_pyidb(idb_path)
        else:
            self._func_p = re.compile(func_regex)
            self._func_regex = func_regex
            self._func_names = self._load_func_names_default(func_regex, primary,
                                                             self._ida_path)
        
        self._high_ws = {}
        self._high_fs = {}
        self._diff_cnt = 0

    def _dprint(self, msg):
        if self._debug:
            self._lock.acquire()            
            print('[+] [{}]: {}'.format(os.getpid(), msg))
            self._lock.release()

    def _get_machine_type(self, path):
        """Detect the executable format and bitness of `path`.

        The file is tried as PE, then Mach-O, then ELF.

        Returns a (format, arch) tuple where format is 'PE', 'Mach-O' or
        'ELF' and arch is '32-bit' or '64-bit'; returns (None, None) when
        none of the parsers accepts the file.
        """
        try:
            pe = pefile.PE(path)
            format_ = 'PE'
            if pefile.MACHINE_TYPE[pe.FILE_HEADER.Machine].find('I386') != -1:
                arch = '32-bit'
            else:
                arch = '64-bit'
        except (pefile.PEFormatError, KeyError) as detail:
            try:
                self._dprint(detail)
                m = MachO(path)
                format_ = 'Mach-O'
                # with several headers the last one decides
                for header in m.headers:
                    if CPU_TYPE_NAMES.get(header.header.cputype, header.header.cputype) == 'x86_64':
                        arch = '64-bit'
                    else:
                        arch = '32-bit'
            except Exception:  # not a Mach-O: fall back to ELF (no longer swallows KeyboardInterrupt/SystemExit)
                try:
                    # close the handle deterministically; the header is read while it is still open
                    with open(path, 'rb') as f:
                        elffile = ELFFile(f)
                        format_ = 'ELF'
                        e_ident = elffile.header['e_ident']
                        if e_ident['EI_CLASS'] == 'ELFCLASS64':
                            arch = '64-bit'
                        else:
                            arch = '32-bit'
                except Exception:
                    return None, None
        return format_, arch

    def _files_not_found(self):
        """Return the first required path that is not an existing file, or None if all exist."""
        required = (self._ida_path, g_exp_path, g_differ_path)
        return next((candidate for candidate in required if not os.path.isfile(candidate)), None)

    def _get_db_path_noext(self, target):
        return os.path.join(self._out_dir, os.path.splitext(os.path.basename(target))[0])
        #return os.path.join(self._out_dir, os.path.basename(target))

    def _get_idb_path(self, target, arch):
        db_ext = '.idb' if arch == '32-bit' else '.i64'
        target_split = os.path.splitext(target)[0]
        
        if os.path.exists(target_split + db_ext):
            return target_split + db_ext
        else:
            return target + db_ext # for recent IDA versions

    def _get_ida_path(self, arch):
        """Return the configured IDA executable for `arch` ('32-bit' selects the 32-bit IDA)."""
        if arch == '32-bit':
            return g_ida32_path
        return g_ida64_path

    def _load_func_names_pyidb(self, idb_path): # exlcude library/thunk functions
        pickle_path = os.path.splitext(os.path.join(self._out_dir, os.path.basename(idb_path)))[0] + '_func_names.pickle'
        if self._clear or not os.path.exists(pickle_path):        
            func_names = {}        
            with idb.from_file(idb_path) as db:
                api = idb.IDAPython(db)
                for ea in api.idautils.Functions(api.idc.MinEA(), api.idc.MaxEA()):
                    flags = api.idc.GetFunctionFlags(ea)
                    if flags & api.ida_funcs.FUNC_LIB or flags & api.ida_funcs.FUNC_THUNK:
                        continue
                    func_name = api.idc.GetFunctionName(ea)
                    func_names[ea] = func_name
            with open(pickle_path, 'wb') as f:
                pickle.dump(func_names, f)

        with open(pickle_path, 'rb') as f:
            self._dprint('function names loaded: {}'.format(idb_path))
            return pickle.load(f)
                        
    # default function without python-idb
    def _load_func_names_default(self, func_regex, path, ida_path):
        pickle_path = os.path.splitext(os.path.join(self._out_dir, os.path.basename(path)))[0] + '_func_names.pickle'
        if self._clear or not os.path.exists(pickle_path):
            cmd = [ida_path, '-A', '-S{}'.format(g_save_fname_path), '-Osave_func_names:{}:{}'.format(func_regex, pickle_path), path]
            #cmd = [ida_path, '-S{}'.format(g_save_fname_path), '-Osave_func_names:{}:{}'.format(func_regex, pickle_path), path]

            self._dprint('saving function names for {}'.format(path))
            self._dprint(' '.join(cmd))
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = proc.communicate()            
            if proc.returncode != 0:
                raise LoadFuncNamesError('function names saving failed: {}'.format(path))
            
        with open(pickle_path, 'rb') as f:
            self._dprint('function names loaded: {}'.format(path))
            return pickle.load(f)
        
        raise LoadFuncNamesError('function names loading failed: {}'.format(path))

    def _make_BinExport(self, target, ida_path):
        binexp_path = self._get_db_path_noext(target) + '.BinExport'
        #binexp_path = os.path.splitext(target)[0] + '.BinExport'
        if not self._clear and os.path.exists(binexp_path):
            self._dprint('already existed BinExport: {}'.format(binexp_path))
            return 0

        #cmd = [ida_path, '-A', '-S{}'.format(g_exp_path), '-OExporterModule:{}'.format(binexp_path), target]  # the .BinExport filename should be specified in 4.3
        #if self._debug:
            #cmd = [ida_path, '-S{}'.format(g_exp_path), '-OBinExportModule:{}'.format(binexp_path), target]
        #else:
        cmd = [ida_path, '-A', '-S{}'.format(g_exp_path), '-OBinExportModule:{}'.format(binexp_path), target]
        #print cmd
        
        self._dprint('getting BinExport for {}'.format(target))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        return proc.returncode

    def _get_BinDiff_path(self, secondary):
        primary_noext = self._get_db_path_noext(self._primary)
        secondary_noext = os.path.splitext(secondary)[0]
        return primary_noext + '_vs_' + os.path.basename(secondary_noext) + '.BinDiff'

    def _make_BinDiff(self, secondary):
        pri_binexp = self._get_db_path_noext(self._primary) + '.BinExport'
        sec_binexp = self._get_db_path_noext(secondary) + '.BinExport'
        #pri_binexp = os.path.splitext(self._primary)[0] + '.BinExport'
        #sec_binexp = os.path.splitext(secondary)[0] + '.BinExport'
        bindiff_path = self._get_BinDiff_path(secondary)
        if not self._clear and os.path.exists(bindiff_path):
            self._dprint('already existed BinDiff: {}'.format(bindiff_path))
            return 0, None            
        
        cmd = [g_differ_path, '--primary={}'.format(pri_binexp), '--secondary={}'.format(sec_binexp), '--output_dir={}'.format(self._out_dir)]
        #print cmd
        
        self._dprint('diffing the binaries..')
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        self._dprint('differ output:')
        self._dprint(stdout)
        self._dprint(stderr)
        return proc.returncode, cmd

    def is_skipped(self, secondary):
        # file check (in case of the same dir)
        #if os.path.splitext(self._primary)[0] == os.path.splitext(secondary)[0]:
        if self._primary == secondary:
            return True
        
        # target at executables
        if os.path.splitext(secondary)[1] in ('.BinExport', '.BinDiff', '.idb', '.i64'):
            return True
        
        # size check
        if (os.path.getsize(secondary) >> 20) > self._size_th:
            self._dprint('The size is bigger (skipped): {}'.format(secondary))
            return True
        
        # format/arch check
        format_, arch = self._get_machine_type(secondary)
        if format_ is None:
            return True
        #elif format_ != self._format or arch != self._arch:
        elif format_ != self._format: # only check the format 
            self._dprint('different executable format (skipped): {}'.format(secondary))
            return True

        # skip if idb not found
        idb_path = self._get_idb_path(secondary, arch)
        if not self._newidb and not os.path.exists(idb_path):
            self._dprint('no existing idb (skipped): {}'.format(secondary))
            return True
        
        return False

    def check_similarity(self, secondary, q=None):
        format_, arch = self._get_machine_type(secondary)
        ida_path = self._get_ida_path(arch)
        self._dprint('IDA binary path for secondary: {}'.format(ida_path))        
        if self._make_BinExport(secondary, ida_path) != 0:
            if q is not None:
                q.put((None, None))            
            raise ProcExportError('secondary BinExport failed: {}'.format(secondary))

        retcode, cmd = self._make_BinDiff(secondary)
        if retcode != 0:
            if q is not None:
                q.put((None, None))            
            raise ProcDiffError('BinDiff failed: {}'.format(cmd))

        conn = sqlite3.connect(self._get_BinDiff_path(secondary))
        c = conn.cursor()
        try:
            c.execute("SELECT similarity,confidence FROM metadata")
        except sqlite3.OperationalError as detail:
            print('[!] .BinDiff database ({}) is something wrong: {}'.format(self._get_BinDiff_path(secondary), detail))
            return
            
        ws, wc = c.fetchone()
        self._dprint('whole binary similarity={} confidence={}'.format(ws, wc))
        c.execute("SELECT address1,address2,similarity,confidence FROM function WHERE similarity > ? and instructions > ? and basicblocks > ?", (self._fs_th, self._ins_th, self._bb_th))
        frows = c.fetchall()
        self._dprint('{} similar functions detected'.format(len(frows)))
        conn.close()

        c_high_ws = {}
        c_high_fs = {}
        if ws > self._ws_th:
            c_high_ws[secondary] = {'similarity':ws, 'confidence':wc}
        elif frows:
            if self.use_pyidb:
                idb_path = self._get_idb_path(secondary, arch)
                func_names = self._load_func_names_pyidb(idb_path)
            else:
                func_names = self._load_func_names_default(self._func_regex, secondary,
                                                           ida_path)
            for row in frows:
                addr1, addr2, fs, fc = row
                self._dprint('addr1={:#x}, addr2={:#x}, similarity={}, confidence={}'.format(addr1, addr2, fs, fc))
                if addr1 in self._func_names and addr2 in func_names:
                    c_high_fs[(addr1, self._func_names[addr1], addr2, func_names[addr2], secondary)] = {'similarity':fs, 'confidence':fc}
            if not c_high_fs and not self._debug:
                os.remove(self._get_BinDiff_path(secondary))
        else:
            if not self._debug:
                os.remove(self._get_BinDiff_path(secondary))

        #self._dprint(c_high_ws)
        #self._dprint(c_high_fs)
        if q is None:
            self._high_ws = c_high_ws
            self._high_fs = c_high_fs
        else:
            q.put((c_high_ws, c_high_fs))

    def check_similarities(self, secondary_dir, recursively):
        if recursively:
            seconds = [os.path.join(root, file_) for root, dirs, files in os.walk(secondary_dir) for file_ in files]
        else:
            seconds = [os.path.join(secondary_dir, entry) for entry in os.listdir(secondary_dir) if os.path.isfile(os.path.join(secondary_dir, entry))]

        procs = []            
        for secondary in seconds:
            if self.is_skipped(secondary):
                continue
            q = multiprocessing.Queue()
            p = multiprocessing.Process(target=self.check_similarity, args=(secondary, q))
            p.start()
            procs.append((p,q))
        self._diff_cnt = len(procs)
        for p,q in procs:
            c_high_ws, c_high_fs = q.get()
            self._high_ws.update(c_high_ws)
            self._high_fs.update(c_high_fs)
            p.join()

    def increment_count(self):
        self._diff_cnt += 1
    
    def get_result(self):
        return self._high_ws, self._high_fs, self._diff_cnt

    
def main():
    """Command line entry point: diff the primary binary against one binary or a directory.

    Parses the options, runs the comparison and prints the binaries and
    functions whose similarity exceeds the thresholds. A missing primary
    binary, secondary binary or secondary directory is reported instead of
    being ignored silently.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('primary', help="primary binary to compare")
    parser.add_argument('--out_dir', '-o', default=g_out_dir, help="output directory including .BinExport/.BinDiff")
    parser.add_argument('--ws_th', '-w', type=float, default=g_ws_th, help="whole binary similarity threshold")
    parser.add_argument('--fs_th', '-f', type=float, default=g_fs_th, help="function similarity threshold")
    parser.add_argument('--ins_th', '-i', type=int, default=g_ins_th, help="instruction threshold")
    parser.add_argument('--bb_th', '-b', type=int, default=g_bb_th, help="basic block threshold")
    parser.add_argument('--size_th', '-s', type=int, default=g_size_th, help="file size threshold (MB)")
    parser.add_argument('--func_regex', '-e', default=g_func_regex, help="function name regex to include in the result")
    parser.add_argument('--debug', '-d', action='store_true', help="print debug output")
    parser.add_argument('--clear', '-c', action='store_true', help="clear .BinExport, .BinDiff and function name cache")
    parser.add_argument('--newidb', '-n', action='store_true', help="create an idb for the secondary binary")
    parser.add_argument('--use_pyidb', action='store_true', help="use python-idb")

    subparsers = parser.add_subparsers(dest='mode', help='mode: 1, m')
    parser_1 = subparsers.add_parser('1', help='BinDiff 1 to 1')
    parser_1.add_argument('secondary', help="secondary binary to compare")
    parser_m = subparsers.add_parser('m', help='BinDiff 1 to many')
    parser_m.add_argument('secondary_dir', help="secondary directory including binaries to compare")
    parser_m.add_argument('--recursively', '-r', action='store_true', help="getting binaries recursively")

    args = parser.parse_args()

    # validate the inputs up front so that a typo does not end in a silent no-op
    if not os.path.isfile(args.primary):
        print('[!] primary binary is not found: {}'.format(args.primary))
        return
    if args.mode == '1' and not os.path.isfile(args.secondary):
        print('[!] secondary binary is not found: {}'.format(args.secondary))
        return
    if args.mode == 'm' and not os.path.isdir(args.secondary_dir):
        print('[!] secondary directory is not found: {}'.format(args.secondary_dir))
        return

    start = time.time()
    try:
        bd = BinDiff(args.primary, args.out_dir, args.ws_th, args.fs_th, args.ins_th, args.bb_th, args.size_th, args.func_regex, args.debug, args.clear, args.newidb, args.use_pyidb)
        if args.mode == '1':
            if not bd.is_skipped(args.secondary):
                bd.check_similarity(args.secondary)
                bd.increment_count()
        elif args.mode == 'm':
            bd.check_similarities(args.secondary_dir, args.recursively)
        high_ws, high_fs, cnt = bd.get_result()
    except LocalError as e:
        print('[!] {} ({})'.format(str(e), type(e)))
        return
    elapsed = time.time() - start

    print('---------------------------------------------')
    print('[*] BinDiff result')
    print('[*] elapsed time = {} sec, number of diffing = {}'.format(elapsed, cnt))
    print('[*] primary binary: (({}))'.format(os.path.basename(args.primary)))
    if high_ws:
        print('\n============== {} high similar binaries (>{}) ================'.format(len(high_ws), args.ws_th))
        table = PrettyTable(['similarity', 'secondary binary'])
        # most similar binaries first
        for path,res in sorted(high_ws.items(), key=lambda x:x[1]['similarity'], reverse=True):
            table.add_row([res['similarity'], '(({}))'.format(os.path.basename(path))])
        print(table)
    if high_fs:
        print('\n============== {} high similar functions (>{}), except high similar binaries ================'.format(len(high_fs), args.fs_th))
        table = PrettyTable(['similarity', 'primary addr', 'primary name', 'secondary addr', 'secondary name', 'secondary binary'])
        # ordered by similarity, then by primary address (both descending)
        for key,res in sorted(high_fs.items(), key=lambda x:(x[1]['similarity'], x[0][0]), reverse=True):
            addr1, func_name1, addr2, func_name2, path = key
            # function names are truncated to 0x20 characters to keep the table readable
            table.add_row([res['similarity'], '{:#x}'.format(addr1), func_name1[:0x20], '{:#x}'.format(addr2), func_name2[:0x20], '{}'.format(os.path.basename(path))])
        print(table)
    if (not high_ws) and (not high_fs):
        print('\nno similar binaries/functions found')
    print('---------------------------------------------')
        
# run only when executed as a script
if __name__ == "__main__":
    main()


================================================
FILE: bindiff/bindiff_export.idc
================================================
#include <idc.idc>
// Entry point of the export script that bindiff.py passes to IDA with -S.
// Runs the BinExport plugin and terminates IDA with an exit code derived
// from the plugin result.
static main()
{
    // do not save the database when IDA exits
    ChangeConfig("ABANDON_DATABASE=YES");
    Batch(0);
    // wait for the end of the auto analysis before exporting
    Wait();
    // plugin names tried for other BinExport versions are kept below for reference
    //RunPlugin("binexport11", 2 );
    //Exit( 1 - RunPlugin("zynamics_binexport_9", 2 ));
    //Exit( 1 - RunPlugin("zynamics_binexport_8", 2 ));
    //Exit( 1 - RunPlugin("binexport10", 2 ));
    //Exit( 1 - RunPlugin("binexport11", 2 ));
    //RunPlugin("binexport12_ida", 2 );
    // exit code = 1 - RunPlugin() result, so a result of 1 gives exit code 0;
    // bindiff.py treats any non-zero exit code as an export failure
    Exit( 1 - RunPlugin("binexport12_ida", 2 ));
}


================================================
FILE: bindiff/save_func_names.py
================================================
import os, pickle, re

g_track_parent_th = 2 # parent function tracking level threshold
g_parent_func_exclude_list = ['__NMSG_WRITE', '__fassign_l']
# Addresses of the excluded parent functions that exist in this database.
# LocByName() gives BADADDR for an absent name, and get_pfuncs() can also
# yield BADADDR for a reference located outside any function; keeping BADADDR
# here would make such unrelated functions look like children of an excluded parent.
g_pfe_list = [ea for ea in (LocByName(p) for p in g_parent_func_exclude_list) if ea != BADADDR]

def get_pfuncs(ea, track_th):
    """Return the start addresses of the functions referencing `ea`, following up to `track_th` levels.

    The direct parents come first, followed by the ancestors found
    recursively; duplicates are not removed.
    """
    parents = []
    for xref in CodeRefsTo(ea, False):
        parents.append(GetFunctionAttr(xref, FUNCATTR_START))

    remaining = track_th - 1
    if remaining > 0:
        ancestors = []
        for parent in parents:
            ancestors.extend(get_pfuncs(parent, remaining))
        parents.extend(ancestors)
    return parents

def main():
    """Pickle {address: name} of the functions selected by the -Osave_func_names options, then exit IDA.

    Option format: save_func_names:<function name regex>:<pickle path>.
    Library and thunk functions, and functions reached from one of the
    excluded parent functions, are left out.
    """
    # not change the database to maintain the window setting
    process_config_line("ABANDON_DATABASE=YES")

    # the first field is the regex; the rest is the path, re-joined because it may contain ':'
    plugin_opts = idaapi.get_plugin_options("save_func_names").split(':')
    func_regex, path_parts = plugin_opts[0], plugin_opts[1:]
    pickle_path = ':'.join(path_parts)
    matcher = re.compile(func_regex)

    func_names = {}
    with open(pickle_path, 'wb') as out:
        for ea in Functions(MinEA(), MaxEA()):
            func_name = GetFunctionName(ea)
            if not matcher.search(func_name):
                continue
            flags = GetFunctionFlags(ea)
            if flags & FUNC_LIB or flags & FUNC_THUNK:
                continue
            if set(get_pfuncs(ea, g_track_parent_th)) & set(g_pfe_list):
                continue
            func_names[ea] = func_name
        pickle.dump(func_names, out)

    Exit(0)

    #with open(os.path.splitext(GetIdbPath())[0] + '_func_names.pickle', 'rb') as f:
    #    func_names = pickle.load(f)
    #    print func_names

# run only when executed as a script
if __name__ == "__main__":
    main()


================================================
FILE: bindiff/save_func_names_7x.py
================================================
import os, pickle, re
from idautils import *

g_track_parent_th = 2 # parent function tracking level threshold
g_parent_func_exclude_list = ['__NMSG_WRITE', '__fassign_l']
# Addresses of the excluded parent functions that exist in this database.
# get_name_ea_simple() gives BADADDR for an absent name, and get_pfuncs() can
# also yield BADADDR for a reference located outside any function; keeping
# BADADDR here would make such unrelated functions look like children of an
# excluded parent.
g_pfe_list = [ea for ea in (get_name_ea_simple(p) for p in g_parent_func_exclude_list) if ea != BADADDR]

def get_pfuncs(ea, track_th):
    """Return the start addresses of the functions referencing `ea`, following up to `track_th` levels.

    The direct parents come first, followed by the ancestors found
    recursively; duplicates are not removed.
    """
    parents = []
    for xref in CodeRefsTo(ea, False):
        parents.append(get_func_attr(xref, FUNCATTR_START))

    remaining = track_th - 1
    if remaining > 0:
        ancestors = []
        for parent in parents:
            ancestors.extend(get_pfuncs(parent, remaining))
        parents.extend(ancestors)
    return parents

def main():
    """Pickle {address: name} of the functions selected by the -Osave_func_names options, then exit IDA.

    Option format: save_func_names:<function name regex>:<pickle path>.
    Library and thunk functions, and functions reached from one of the
    excluded parent functions, are left out.
    """
    #Wait()

    # not change the database to maintain the window setting
    process_config_line("ABANDON_DATABASE=YES")

    # -Odecomp:option1:option2:option3
    # the first field is the regex; the rest is the path, re-joined because it may contain ':'
    options = idaapi.get_plugin_options("save_func_names").split(':')
    func_regex = options[0]
    pickle_path = ':'.join(options[1:])
    p = re.compile(func_regex)

    func_names = {}
    with open(pickle_path, 'wb') as f:
        # Functions() without arguments covers the whole database; this avoids
        # the inf.minEA/inf.maxEA attribute names of the 6.95 compatibility layer
        for ea in Functions():
            func_name = idc.get_func_name(ea)
            if p.search(func_name):
                flags = idc.get_func_attr(ea, FUNCATTR_FLAGS)
                if flags & FUNC_LIB or flags & FUNC_THUNK:
                    continue
                pfuncs = get_pfuncs(ea, g_track_parent_th)
                if not (set(pfuncs) & set(g_pfe_list)):
                    func_names[ea] = func_name
        pickle.dump(func_names, f)

    ida_pro.qexit(0)

    #with open(os.path.splitext(idc.get_idb_path())[0] + '_func_names.pickle', 'rb') as f:
    #    func_names = pickle.load(f)
    #    print func_names

# run only when executed as a script
if __name__ == "__main__":
    main()


================================================
FILE: callstrings/README.org
================================================
#+OPTIONS: ^:{}
* callstrings - deobfuscating Hodur's global string encryption

- Recover strings using various methods (static decoding, emulation, IDA debug hook)
- Apply API function types to the local variable pointers

The script comparison is below:
[[./img/comparison.png]]

- As the comparison shows, ida_callstrings_dbg.py and ida_callstrings_flare_emu.py (except emulateSelection) can work for other malware.
- As the reference slides say, it is recommended to use modified [[https://github.com/TakahiroHaruyama/flare-emu/tree/xorloop][flare-emu]] and [[https://github.com/TakahiroHaruyama/capa/tree/comment_insertion][CAPA]] to make ida_callstrings_flare_emu.py work better.

** Reference

- https://speakerdeck.com/takahiro_haruyama/the-art-of-malware-c2-scanning-how-to-reverse-and-emulate-protocol-obfuscated-by-compiler


================================================
FILE: callstrings/hexrays_utils.py
================================================
'''
hexrays_utils.py - common classes/functions using Hex-Rays decompiler APIs
Takahiro Haruyama (@cci_forensics)
'''

#from abc import ABCMeta, abstractmethod

from idc import *
import idaapi, ida_ida, ida_ua, ida_typeinf, ida_kernwin
from ida_hexrays import *
from ida_allins import NN_callni, NN_call, NN_callfi
import idautils
import re

# Global options/variables
g_DEBUG = True # debug() prints its messages only while this is true
g_CACHE = True # NOTE(review): not referenced in the visible part of this file; presumably the default for decompiler cache use -- confirm
# upper-cased spellings of string pointer types
# (presumably compared against upper-cased argument type names -- TODO confirm against the callers)
g_ASCII_TYPES = ['CHAR *', 'CONST CHAR *', 'LPSTR', 'LPCSTR']
g_UNICODE_TYPES = ['WCHAR *', 'CONST WCHAR *', 'LPWSTR', 'LPCWSTR']
g_STR_TYPES = g_ASCII_TYPES + g_UNICODE_TYPES
g_stub_GetProcAddress = 'fn_resolve_API_addr' # NOTE(review): looks like the name of the sample's API resolving function -- confirm
g_RENAME_RETRY_CNT = 100 # NOTE(review): presumably the maximum number of rename attempts -- confirm

def info(msg):
    """Print an informational message prefixed with a bold blue [*]."""
    print(f"\033[34m\033[1m[*]\033[0m {msg}")

def success(msg):
    """Print a success message prefixed with a bold green [+]."""
    print(f"\033[32m\033[1m[+]\033[0m {msg}")
    
def error(msg):
    """Print an error message prefixed with a bold red [!]."""
    print(f"\033[31m\033[1m[!]\033[0m {msg}")

def debug(msg):
    """Print a debug message prefixed with a bold yellow [D]; silent unless g_DEBUG is set."""
    if not g_DEBUG:
        return
    print(f"\033[33m\033[1m[D]\033[0m {msg}")


def extract_ascii(data):
    """Return the run of two or more printable ASCII bytes at the start of `data`.

    The pattern is anchored at the beginning, so the result is a list with
    at most one decoded string (empty when `data` does not start with such a run).
    """
    leading_run = rb'^(?:[\x20-\x7E]){2,}'
    decoded = {chunk.decode('ascii') for chunk in re.findall(leading_run, data)}
    return list(decoded)

def extract_unicode(data):
    """Return the run of two or more printable UTF-16LE characters at the start of `data`.

    `data` is a bytes object. The pattern is anchored at the beginning, so
    the result is a list with at most one decoded string.
    """
    # the pattern must be a bytes pattern (like extract_ascii's): a str
    # pattern cannot be applied to bytes and raised TypeError
    pat = re.compile(rb'^(?:[\x20-\x7E][\x00]){2,}')
    return list(set([w.decode('utf-16le') for w in pat.findall(data)]))

def get_ctree_root(ea, cache=True):
    """Decompile the function at `ea` and return the result of decompile().

    ea    -- address handed to the decompiler
    cache -- when false, DECOMP_NO_CACHE is passed so that a cached result is not reused

    Returns None (after printing an error) when the decompilation raises.
    """
    cfunc = None
    try:
        if cache:
            cfunc = decompile(ea)
        else:
            cfunc = decompile(ea, flags=DECOMP_NO_CACHE)
    except Exception:
        # best effort: report and return None, but no longer swallow
        # KeyboardInterrupt/SystemExit as the bare except did
        error('Decompilation of a function {:#x} failed'.format(ea))

    return cfunc

# Detect constant value used in string decoding
class cnt_val_finder_t(ctree_visitor_t):
    """Ctree visitor looking for `x ^= (y + C) ^ C` or the equivalent `x ^= (y - (0x100 - C)) ^ C`.

    The constant C (0 < C < 0xff) of the first match is kept and the
    traversal is stopped.
    """

    def __init__(self):

        ctree_visitor_t.__init__(self, CV_FAST)

        self.cst_val = None

    @staticmethod
    def _unwrap(node, wanted_op):
        """Return `node`, or the operand of its cast, when the opcode is `wanted_op`; else None."""
        if node.op == wanted_op:
            return node
        if node.op == cot_cast and node.x.op == wanted_op:
            return node.x
        return None

    def visit_expr(self, expr):

        is_candidate = expr.op == cot_asgxor and expr.y.op == cot_xor and expr.y.y.op == cot_num
        if not is_candidate:
            return 0

        cst = expr.y.y.n._value

        # (opcode of the inner arithmetic, relation between its constant and cst)
        # x ^ (y - 0x1d) ^ 0xe3 == x ^ (y + 0xe3) ^ 0xe3
        shapes = (
            (cot_add, lambda operand: operand == cst),
            (cot_sub, lambda operand: operand + cst == 0x100),
        )
        for wanted_op, is_related in shapes:
            inner = self._unwrap(expr.y.x, wanted_op)
            if inner and inner.y.op == cot_num and \
                is_related(inner.y.n._value) and (0 < cst < 0xff):
                success(f'{expr.ea:#x}: string decoding constant value {cst:#x} detected')
                self.cst_val = cst
                return 1

        return 0

    def get_cnt_val(self):
        """Return the detected constant, or None when nothing matched."""

        return self.cst_val

# Detect assignments when inserting comments
class asg_parent_finder_t(ctree_visitor_t):
    """Ctree visitor finding the assignment whose right side is the call at `call_ea`.

    The call may be wrapped in one cast. `asg_ea` receives the address of
    the assignment and stays BADADDR when there is none.
    """

    def __init__(self, call_ea):

        ctree_visitor_t.__init__(self, CV_PARENTS)
        self.call_ea = call_ea
        self.asg_ea = BADADDR

    def visit_expr(self, expr):

        if expr.op != cot_asg:
            return 0

        # look through a single cast around the assigned value
        rhs = expr.y
        if rhs.op == cot_cast:
            rhs = rhs.x

        if rhs.op != cot_call or rhs.ea != self.call_ea:
            return 0

        self.asg_ea = expr.ea
        info(f'{self.call_ea:#x}: assignment detected, replaced with the ea {self.asg_ea:#x}')
        return 1

# Change type/name of the specified lvar name
class my_lvar_modifier_t(user_lvar_modifier_t):
    """Modifier renaming and/or retyping the saved local variable named `target_name`.

    new_name -- new variable name (optional)
    new_decl -- C declaration parsed into the new type (optional, takes precedence over new_tif)
    new_tif  -- type object used as the new type (optional)
    """

    def __init__(self, target_name, new_name=None, new_decl=None, new_tif=None):

        user_lvar_modifier_t.__init__(self)
        self.target_name = target_name
        self.new_name = new_name
        self.new_decl = new_decl
        self.new_tif = new_tif

    def modify_lvars(self, lvars):
        """Apply the changes to the first matching entry; return True when one was found."""

        # Note: Variables without user-specified info are not present in lvvec
        if len(lvars.lvvec) == 0:
            error('modify_lvars: len(lvars.lvvec) == 0')

        for lvar in lvars.lvvec:
            debug('modify_lvars: target_name = "{}" current = "{}"'.format(self.target_name, lvar.name))

            if lvar.name != self.target_name:
                continue

            if self.new_name:
                lvar.name = self.new_name
                info('modify_lvars: Name "{}" set to {}'.format(lvar.name, self.target_name))

            # pick the new type: a parsed declaration first, a ready type object second
            new_type = None
            if self.new_decl:
                new_type = ida_typeinf.tinfo_t()
                ida_typeinf.parse_decl(new_type, None, self.new_decl, 0)
            elif self.new_tif:
                new_type = self.new_tif

            if new_type:
                lvar.type = new_type
                info('modify_lvars: Type "{}" set to {}'.format(str(new_type), lvar.name))

            return True

        return False

#class HexRaysUtils(metaclass=ABCMeta):
class HexRaysUtils():
    """Helpers for recovering string arguments of calls and annotating pseudocode.

    The value accessors (get_reg_value, get_ptr_value, get_string, get_bytes)
    raise NotImplementedError here; a subclass has to provide them from a
    concrete source of register/memory state.

    Attributes:
        cmts: dict of call address -> comment text produced by get_arg_strings().
        call_eas: list of call addresses get_arg_strings() has already handled.
    """

    # Registers holding the first four arguments of a call typed as __fastcall
    _FASTCALL_ARG_REGS = ("RCX", "RDX", "R8", "R9")

    def __init__(self):

        self.cmts = {}
        self.call_eas = []

    #@abstractmethod
    def get_reg_value(self, reg_name):
        """Return the value of the register reg_name (to be overridden)."""
        raise NotImplementedError()

    #@abstractmethod
    def get_ptr_value(self, ptr):
        """Return the pointer-sized value stored at address ptr (to be overridden)."""
        raise NotImplementedError()

    #@abstractmethod
    def get_string(self, ea, is_unicode=False):
        """Return the string stored at ea, or a falsy value if there is none (to be overridden)."""
        raise NotImplementedError()

    def get_bytes(self, ea):
        """Return raw bytes stored at ea (to be overridden)."""
        raise NotImplementedError()

    def get_fn_offset(self, ea):
        """Format ea as '<function name>+<hex offset from the function start>'."""
        func_ea = get_func_attr(ea, FUNCATTR_START)
        return get_name(func_ea) + f'+{ea-func_ea:#x}'

    def set_decomplier_cmt(self, cfunc, ea, cmt):
        """Attach cmt to the pseudocode of cfunc at ea and refresh the text.

        When the call at ea is the right-hand side of an assignment, the
        comment is attached to the assignment address instead.
        """
        # Prevent orphan comment issues in assignments
        finder = asg_parent_finder_t(ea)
        finder.apply_to_exprs(cfunc.body, None)
        cmt_ea = ea if finder.asg_ea == BADADDR else finder.asg_ea

        tl = idaapi.treeloc_t()
        tl.ea = cmt_ea
        tl.itp = idaapi.ITP_SEMI

        cfunc.set_user_cmt(tl, cmt)
        cfunc.save_user_cmts()
        cfunc.refresh_func_ctext()

    # This function was ported from https://github.com/RolfRolles/Miscellaneous/blob/master/PrintTypeSignature.py
    # If an indirect API call still has a cast after the var type is set, apply "Force call type" on the var in Pseudocode view
    def GetTypeSignature(self, apiName):
        """Return a pointer-to-function tinfo_t for the named prototype, or None.

        Args:
            apiName: name of the type to look up with get_named_type().
        """
        # Look up the prototype by name from the main TIL
        o = ida_typeinf.get_named_type(None, apiName, ida_typeinf.NTF_SYMU)
        if o is None:
            return None

        code, type_str, fields_str, cmt, field_cmts, sclass, value = o

        # Create a tinfo_t by deserializing the data returned above
        t = ida_typeinf.tinfo_t()
        if not t.deserialize(None, type_str, fields_str, field_cmts):
            return None

        # And change the prototype into a function pointer
        ptrType = ida_typeinf.tinfo_t()
        ptrType.create_ptr(t)
        return ptrType

    # IDA decompiler has no API forcing lvar name
    def force_rename_lvar(self, ea, var, new_name):
        """Rename var to new_name, retrying with '_1', '_2', ... suffixes.

        Up to g_RENAME_RETRY_CNT suffixed names are tried after the plain one.
        var.name is updated to the name that was accepted.
        """
        func_ea = get_func_attr(ea, FUNCATTR_START)
        debug('force_rename_lvar: function ea = {:#x}'.format(func_ea))
        old_name = var.name

        if rename_lvar(func_ea, var.name, new_name):
            info('force_rename_lvar {:#x}: lvar name changed "{}" ->  "{}"'.format(ea, old_name, new_name))
            var.name = new_name # to refresh immediately
            return

        for i in range(g_RENAME_RETRY_CNT):
            candidate = new_name + '_{}'.format(i + 1)
            if rename_lvar(func_ea, var.name, candidate):
                info('force_rename_lvar {:#x}: lvar name changed "{}" -> "{}"'.format(ea, old_name, candidate))
                var.name = candidate
                break
        else:
            error('{:#x}: renaming {} failed (rename_lvar, {} times)'.format(ea, var.name, g_RENAME_RETRY_CNT))

    @staticmethod
    def _arg_location(callee_type, idx):
        """Tell where argument idx lives right before the call is executed.

        Args:
            callee_type: printed type of the callee (str(expr.x.type)).
            idx: zero-based argument index.

        Returns:
            (label, reg_name, stack_offset). When stack_offset is None the
            argument value is the register reg_name itself; otherwise it is
            the pointer-sized slot at reg_name + stack_offset.
        """
        if callee_type.find('__thiscall') != -1:
            if idx == 0:
                return 'thiscall', 'ECX', None
            return 'thiscall', 'ESP', (idx - 1) * 4

        if callee_type.find('__fastcall') != -1:
            # NOTE(review): __fastcall is treated as the Microsoft x64
            # convention here (as the register names imply); a 32-bit
            # __fastcall callee is not handled.
            if idx < len(HexRaysUtils._FASTCALL_ARG_REGS):
                return 'fastcall', HexRaysUtils._FASTCALL_ARG_REGS[idx], None
            # Stack slots are 8 bytes wide and the first four slots are the
            # home (shadow) space of the register arguments, so argument idx
            # sits at RSP + idx * 8 before the return address is pushed.
            return 'fastcall', 'RSP', idx * 8

        # __stdcall, __cdecl, etc.
        return 'other calling conventions', 'ESP', idx * 4

    def _retype_resolved_api_var(self, cfunc, expr, address, api_name):
        """Name and type the lvar that is assigned the result of call expr after api_name."""
        p_item = cfunc.body.find_parent_of(expr)
        p_expr = p_item.cexpr

        # Look through a cast wrapped around the call
        if p_expr.op == cot_cast:
            p_item = cfunc.body.find_parent_of(p_expr)
            p_expr = p_item.cexpr

        if p_expr.op == cot_asg and p_expr.x.op == cot_var:
            var = p_expr.x.v.getv()
            tif = self.GetTypeSignature(api_name)
            # We need to use rename_lvar calling modify_user_lvar_info indirectly to add the var into lvvec
            self.force_rename_lvar(address, var, api_name)
            my_lvar_mod = my_lvar_modifier_t(var.name, new_tif=tif)
            modify_user_lvars(get_func_attr(address, FUNCATTR_START), my_lvar_mod)

    def get_arg_strings(self, address):
        """Collect the string arguments of the call at address and comment them.

        Each address is processed once. String-typed arguments (and every
        argument of a GetProcAddress stub) are read through get_reg_value /
        get_ptr_value / get_string; the result is stored in self.cmts and set
        as a pseudocode comment.
        """
        if address in self.call_eas:
            info(f'{address:#x} ({self.get_fn_offset(address)}): already-visited call')
            return
        self.call_eas.append(address)

        cfunc = get_ctree_root(address, cache=g_CACHE)
        if not cfunc:
            return

        item = cfunc.body.find_closest_addr(address)
        # Nothing to do when no ctree item matches or it is not a call
        if item is None or item.op != cot_call:
            return

        expr = item.cexpr
        print('-' * 80)

        if expr.x.obj_ea == BADADDR:
            # dynamically-resolved API
            if expr.x.op == cot_var:
                callee_name = expr.x.v.getv().name
            elif expr.x.op == cot_cast and expr.x.x.op == cot_var:
                callee_name = expr.x.x.v.getv().name
                # Force call type (remove the cast)
                tif = ida_typeinf.tinfo_t()
                if print_insn_mnem(expr.ea) == 'call' and not ida_nalt.get_op_tinfo(tif, expr.ea, 0): # Skip an already-specified operand
                    tif = self.GetTypeSignature(callee_name)
                    if tif:
                        if ida_nalt.set_op_tinfo(expr.ea, 0, tif):
                            success(f'{expr.ea:#x}: Force call type "{str(tif)}" to the operand "{callee_name}"')
                        else:
                            error(f'{expr.ea:#x}: Force call type failed')
            else:
                callee_name = 'UNRESOLVED'
        else:
            callee_name = get_name(expr.x.obj_ea)

        info(f'{address:#x} ({self.get_fn_offset(address)}): call {callee_name} ({expr.x.obj_ea:#x})')
        debug(f'{str(expr.x.type)}')

        debug(f'argc = {expr.a.size()}')
        is_getproc_stub = callee_name.find(g_stub_GetProcAddress) != -1
        arg_strs = []
        for i in range(expr.a.size()):
            arg = expr.a.at(i)
            arg_type = str(arg.type).upper()

            # Sometimes the arg type in stubs is int *
            if arg_type not in g_STR_TYPES and not is_getproc_stub:
                continue
            debug(f'arg{i} = {str(arg.type)}')

            label, reg_name, stack_off = self._arg_location(str(expr.x.type), i)
            debug(label)
            ea = self.get_reg_value(reg_name)
            if stack_off is not None:
                ea = self.get_ptr_value(ea + stack_off)
            debug(f'{ea=:#x}')

            if arg_type in g_ASCII_TYPES or is_getproc_stub:
                res = self.get_string(ea)
            else: # g_UNICODE_TYPES
                res = self.get_string(ea, is_unicode=True)
            if not res:
                continue

            arg_strs.append(f'arg{i} = {res}')
            debug(f'arg{i} = {res}')

            # Set the function prototype if the callee is the GetProcAddress stubs or GetProcAddress API
            if (i == 0 and is_getproc_stub) or (i == 1 and callee_name == "GetProcAddress"):
                self._retype_resolved_api_var(cfunc, expr, address, res)

        # Set the arguments comment at the call instruction address
        if arg_strs:
            cmt = f'{address:#x} ({self.get_fn_offset(address)}): {",".join(arg_strs)}'
            success(cmt)
            # set_decomplier_cmt() also refreshes the pseudocode text
            self.set_decomplier_cmt(cfunc, address, cmt)
            self.cmts[address] = cmt

    def print_summary(self):
        """Print every comment recorded in self.cmts (nothing when it is empty)."""
        if self.cmts:
            success('Summary:')
            for v in self.cmts.values():
                print(f'{v}')

    def decode(self, enc, cst_val):
        """Return enc with byte i XORed with ((i + cst_val) & 0xff) ^ cst_val."""
        return bytes([enc[i] ^ ((i + cst_val) & 0xff) ^ cst_val for i in range(len(enc))])


================================================
FILE: callstrings/ida_callstrings_dbg.py
================================================
'''
ida_callstrings_dbg.py - string deobfuscation using IDA debug hook class
Takahiro Haruyama (@cci_forensics)
'''

import idaapi
idaapi.require('hexrays_utils', package='*')
from hexrays_utils import *
from ida_dbg import *

# Global options/variables
g_DEBUG = False # True: debug() prints its messages
g_MAX_INSTRUCTIONS = 0 # 0 = disabled; otherwise dbg_trace() requests a suspend once this many instructions have been traced

def info(msg):
    """Print msg prefixed with a bold blue "[*]" marker (ANSI escape codes)."""
    line = "\033[34m\033[1m[*]\033[0m {}".format(msg)
    print(line)

def success(msg):
    """Print msg prefixed with a bold green "[+]" marker (ANSI escape codes)."""
    line = "\033[32m\033[1m[+]\033[0m {}".format(msg)
    print(line)
    
def error(msg):
    """Print msg prefixed with a bold red "[!]" marker (ANSI escape codes)."""
    line = "\033[31m\033[1m[!]\033[0m {}".format(msg)
    print(line)

def debug(msg):
    """Print msg with a bold yellow "[D]" marker, but only when g_DEBUG is set."""
    if not g_DEBUG:
        return
    line = "\033[33m\033[1m[D]\033[0m {}".format(msg)
    print(line)


class TraceHook(DBG_Hooks, HexRaysUtils):
    """Debugger hook that passes every traced call instruction to get_arg_strings().

    Registers, pointers and strings are read with get_reg_val, get_qword /
    get_wide_dword and get_strlit_contents.
    """

    def __init__(self, target_ea):
        """Initialise both base classes and remember the address tracing runs to."""
        DBG_Hooks.__init__(self)
        HexRaysUtils.__init__(self)

        self.target_ea = target_ea
        self.traces = 0  # number of dbg_trace() callbacks handled so far
        #self.current_tid = get_current_thread()

    def get_reg_value(self, reg_name):
        """Return the value of register reg_name via get_reg_val()."""
        value = get_reg_val(reg_name)
        return value

    def get_ptr_value(self, ptr):
        """Read a qword (64-bit database) or a dword (otherwise) at ptr."""
        reader = get_qword if idaapi.get_inf_structure().is_64bit() else get_wide_dword
        return reader(ptr)

    def get_string(self, ea, is_unicode=False):
        """Return the decoded string literal at ea, or None when nothing is read."""
        if is_unicode:
            raw = get_strlit_contents(ea, strtype=STRTYPE_C_16)
        else:
            raw = get_strlit_contents(ea)

        if not raw:
            return None
        return raw.decode()

    def dbg_trace(self, tid, ea):
        """Step-trace callback: handle call instructions and enforce g_MAX_INSTRUCTIONS."""
        debug("[tid %X] trace %08X" % (tid, ea))

        if not (ida_ida.inf_get_min_ea() <= ea <= ida_ida.inf_get_max_ea()):
            raise Exception(
                "Received a trace callback for an address outside this database!"
            )

        insn = ida_ua.insn_t()
        insn_size = ida_ua.decode_insn(insn, ea)
        owner_name = get_name(get_func_attr(ea, FUNCATTR_START))
        is_call = insn_size > 0 and insn.itype in [NN_callni, NN_call, NN_callfi]
        # Calls made from inside the GetProcAddress stubs are not inspected
        if is_call and owner_name.find(g_stub_GetProcAddress) == -1:
            refresh_debugger_memory()
            self.get_arg_strings(ea)

        self.traces += 1
        limit_reached = g_MAX_INSTRUCTIONS and self.traces >= g_MAX_INSTRUCTIONS
        if limit_reached:
            request_disable_step_trace()
            request_suspend_process()

            if run_requests():
                info('Requests suspending the process executed (g_MAX_INSTRUCTIONS)')
            else:
                error('Requests suspending the process failed (g_MAX_INSTRUCTIONS)')

        #return 1
        return 0 # log it

    def dbg_thread_start(self, pid, tid, ea):
        """Report a newly started thread."""
        info(f'[Thread {tid:#x}] {ea:#x}: New thread started')
        '''
        add_bpt(ea)
        select_thread(tid)
        request_suspend_process()

        #if tid != self.current_tid:
        if not self.unhook():
            error("Error uninstalling hooks!")
        else:
            info('Hooks uninstalled')
        #self.current_tid = tid
        end = prev_head(get_func_attr(ea, FUNCATTR_END))
        self.target_ea = end
        info(f'Selecting the new thread to trace until {end:#x}')
        #dbg_del_thread(self.current_tid)
        #suspend_thread(self.current_tid)
        select_thread(tid)
        set_trace_base_address(ea)
        dbg_add_thread(tid)
        self.hook()
        enable_step_trace(1) # needed per thread?
        set_step_trace_options(ST_OPTIONS_MASK)
        request_enable_step_trace(1)
        request_run_to(end)
        #request_continue_process()

        if run_requests():
            info('Requests successful')
        else:
            error('Requests failed')
        '''

    def dbg_thread_exit(self, pid, tid, ea, exit_code):
        """Report a thread exit together with its exit code."""
        info(f'[Thread {tid:#x}] {ea:#x}: Thread exited with {exit_code:#x}')

    def dbg_run_to(self, pid, tid=0, ea=0):
        """Report where execution stopped, then print the trace count and the summary."""
        if ea == self.target_ea:
            info(f'[Thread {tid:#x}] Reached to the target {self.get_fn_offset(ea)}')
        else:
            if pid != 0:
                error(f'[Thread {tid:#x}] The suspended address {self.get_fn_offset(ea)} is different from the target {self.get_fn_offset(self.target_ea)}. Probably another breakpoint set?')
            else:
                error(f'[Thread {tid:#x}] The suspended address {self.get_fn_offset(ea)} is different from the target {self.get_fn_offset(self.target_ea)}. Probably suspended by users manually?')

        info(f"Traced {self.traces} instructions")
        refresh_debugger_memory()
        self.print_summary()

    def dbg_process_exit(self, pid, tid, ea, code):
        """Report that the process exited before the target, then print the summary."""
        error(f"[Thread {tid:#x}] Process exited with {code:#x} before reaching to the target")
        info(f"Traced {self.traces} instructions")
        self.print_summary()

        return 0
    '''
    def dbg_suspend_process(self):

        self.dbg_run_to(0, ea=get_ip_val())
    '''

            
def main():
    """Step-trace from the current EIP to the last item of its function.

    Requires a running debugger session; a TraceHook is installed for the
    duration of the run and removed afterwards.
    """
    info('start')

    if not is_debugger_on():
        error("Please run the process first!")
        return

    func_end = get_func_attr(get_reg_val("EIP"), FUNCATTR_END)
    end = prev_head(func_end)
    info(f"Tracing to the end of function {end:#x}")

    hook = TraceHook(end)
    hook.hook()
    enable_step_trace(1) # Only the same thread works
    set_step_trace_options(ST_OPTIONS_MASK) # all included

    run_to(end)

    # Keep consuming debugger events for as long as the process is running
    while get_process_state() == DSTATE_RUN:
        wait_for_next_event(WFNE_ANY, 0)

    if hook.unhook():
        info('Hooks uninstalled')
    else:
        error("Error uninstalling hooks!")
    del hook

    info('done')

# Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
    

================================================
FILE: callstrings/ida_callstrings_flare_emu.py
================================================
'''
ida_callstrings_flare_emu.py - string deobfuscation using flare-emu
Takahiro Haruyama (@cci_forensics)
'''

import idaapi
#idaapi.require('logging') # <- This suppresses the flare-emu debug messages!
import logging, hexdump
#logging.basicConfig(level=logging.DEBUG, force=True)

idaapi.require('hexrays_utils', package='*')
from hexrays_utils import *

idaapi.require('flare_emu')
idaapi.require('flare_emu_hooks')
import flare_emu, flare_emu_hooks, unicorn

# Global options
g_DEBUG = False # True: debug() and debug_bin() print their output
g_DEBUG_FLARE_EMU = False # True: main() creates EmuHelper(verbose=10) and sets its logger to DEBUG
g_FLAG_ALL_PATHS = False # True: iterateAllPaths, False: emulateRange
g_MAX_SAME_STATE_VAR = 0x1000 # to detect infinite loop by CFF (threshold used by inst_hook_cff)
g_MAX_INST_VISIT = 10000 # to detect infinite loop (per-address visit threshold used by inst_hook)
#g_MAX_EMU_INSN = 1000000
g_MAX_STACK_BUF = 0x100 # number of bytes above the stack pointer that are examined for the encoded string
#g_ENC_OFFSET = 0x0

def info(msg):
    """Print msg prefixed with a bold blue "[*]" marker (ANSI escape codes)."""
    line = "\033[34m\033[1m[*]\033[0m {}".format(msg)
    print(line)

def success(msg):
    """Print msg prefixed with a bold green "[+]" marker (ANSI escape codes)."""
    line = "\033[32m\033[1m[+]\033[0m {}".format(msg)
    print(line)
    
def error(msg):
    """Print msg prefixed with a bold red "[!]" marker (ANSI escape codes)."""
    line = "\033[31m\033[1m[!]\033[0m {}".format(msg)
    print(line)

def debug(msg):
    """Print msg with a bold yellow "[D]" marker, but only when g_DEBUG is set."""
    if not g_DEBUG:
        return
    line = "\033[33m\033[1m[D]\033[0m {}".format(msg)
    print(line)

def debug_bin(n, v):
    """When g_DEBUG is set, print the label n via debug() and hex-dump the buffer v."""
    if not g_DEBUG:
        return
    debug(n)
    hexdump.hexdump(v)


class HexRaysEmu(HexRaysUtils):
    """HexRaysUtils accessors implemented on top of an emulator helper object (eh)."""

    def __init__(self, eh):
        """Keep a reference to the emulator helper used for all reads."""
        HexRaysUtils.__init__(self)
        self.eh = eh

    def get_reg_value(self, reg_name):
        """Return the emulated register value; the name is lower-cased for the helper."""
        lowered = reg_name.lower()
        return self.eh.getRegVal(lowered)

    def get_ptr_value(self, ptr):
        """Return the pointer value the helper reads at ptr."""
        return self.eh.getEmuPtr(ptr)

    def get_string(self, ea, is_unicode=False):
        """Return the string at ea, decoded as UTF-16 when is_unicode is set."""
        if is_unicode:
            return self.eh.getEmuWideString(ea).decode('utf-16')
        return self.eh.getEmuString(ea).decode()

    def get_bytes(self, ea):
        """Return 0x20 bytes of emulated memory starting at ea."""
        return self.eh.getEmuBytes(ea, 0x20)


def call_hook(address, argv, funcName, userData):
    """Call hook: recover the string arguments of the call at address.

    userData["hremu"] provides get_arg_strings(); a unicorn.UcError raised
    while reading the arguments is reported and swallowed.
    """
    debug(f'call_hook at {address:#x}')

    utils = userData["hremu"]
    try:
        utils.get_arg_strings(address)
    except unicorn.UcError as e:
        error(f'{address:#x} ({utils.get_fn_offset(address)}): Unicorn emulation exception in get_arg_strings() ({e})')

def mem_write_hook(unicornObject, accessType, memAccessAddress, memAccessSize, memValue, userData):
    """Memory hook: remember which instruction wrote to each near-stack address.

    For a write strictly between esp and esp + g_MAX_STACK_BUF, the current
    instruction pointer is mapped to the written address in
    userData["enc_heads"].
    """
    if accessType != unicorn.UC_MEM_WRITE:
        return

    emu = userData["hremu"].eh
    stack_ptr = emu.getRegVal('esp')
    insn_ptr = emu.getRegVal('ip')

    in_window = stack_ptr < memAccessAddress < stack_ptr + g_MAX_STACK_BUF
    if in_window:
        userData["enc_heads"][insn_ptr] = memAccessAddress

def is_high_entropy(v):
    """Tell whether v looks like a random-looking 32-bit constant.

    Args:
        v: integer to classify.

    Returns:
        False when v does not fit in an unsigned 32-bit value, when any of
        its four bytes is zero (e.g., 0, 1, 0x10000000) or when all four bytes
        are identical (e.g., 0x11111111, 0xffffffff); True otherwise.
    """
    # int.to_bytes(4, ...) raises OverflowError for negative or wider values;
    # such operands/registers are simply not 32-bit constants.
    if not 0 <= v <= 0xffffffff:
        return False

    vbytes = v.to_bytes(4, 'little')
    if 0 in vbytes:
        return False

    # A single repeated byte value is not considered high entropy
    return len(set(vbytes)) > 1

def inst_hook_cff(unicornObject, address, instructionSize, userData):
    """Instruction hook that breaks loops on a state variable compare/cmovz.

    Reads from userData: "EmuHelper", "state_var_cnt" (dict counting
    (address, state value) pairs) and "state_excluded" (list of addresses that
    were already patched). When the same mismatching state value is seen
    g_MAX_SAME_STATE_VAR times at one address, the register is overwritten
    with the compared value, the address is excluded and all counts are reset.
    """
    eh = userData["EmuHelper"]
    state_var_cnt = userData["state_var_cnt"]
    state_excluded = userData["state_excluded"]
    abort = False

    mnem = print_insn_mnem(address)
    if mnem == 'cmp' and get_operand_type(address, 0) == o_reg and get_operand_type(address, 1) == o_imm and \
        is_high_entropy(get_operand_value(address, 1)) and print_insn_mnem(next_head(address)) in ['jz', 'jnz']:
        # cmp <reg>, <high-entropy imm> followed by jz/jnz

        reg_name = print_operand(address, 0)
        state_var = eh.getRegVal(reg_name)
        cmp_var = get_operand_value(address, 1)

        if state_var != cmp_var:
            abort = True

    elif mnem == 'cmovz' and get_operand_type(address, 0) == o_reg:

        reg_name = print_operand(address, 0)
        state_var = eh.getRegVal(reg_name)

        cmp_var = None
        if is_high_entropy(state_var):

            op1type = get_operand_type(address, 1)
            if op1type == o_imm:
                cmp_var = get_operand_value(address, 1)
            elif op1type == o_reg:
                op1_reg_name = print_operand(address, 1)
                cmp_var = eh.getRegVal(op1_reg_name)

            if cmp_var and state_var != cmp_var:
                abort = True

    if not abort or address in state_excluded:
        return

    uid = (address, state_var)
    state_var_cnt[uid] = state_var_cnt.get(uid, 0) + 1

    if state_var_cnt[uid] < g_MAX_SAME_STATE_VAR:
        return

    error(f'{address:#x}: CFF infinite loop detected. Update the state variable {state_var:#x} with the new one {cmp_var:#x}')
    debug([f'{ea:#x}: {var=:#x}, {cnt=}' for (ea, var), cnt in state_var_cnt.items()])
    debug(f'excluded: {[f"{e:#x}" for e in state_excluded]}')

    eh.uc.reg_write(eh.regs[reg_name], cmp_var)
    state_excluded.append(address)
    # Reset the counts of the external loops. The dict is shared through
    # userData, so it has to be emptied in place; rebinding the local name
    # would leave the shared counts untouched.
    state_var_cnt.clear()

def inst_hook(unicornObject, address, instructionSize, userData):
    """Instruction hook: count visits per address and stop on a suspected infinite loop.

    The counters live in userData["inst_visit_cnt"]; emulation is stopped via
    userData["EmuHelper"] once an address was visited g_MAX_INST_VISIT times.
    """
    eh = userData["EmuHelper"]
    visits = userData["inst_visit_cnt"]

    visits[address] = visits.get(address, 0) + 1
    if visits[address] < g_MAX_INST_VISIT:
        return

    error(f'{address:#x}: Infinite loop detected. Aborted.')
    eh.stopEmulation(userData)

def noop(*args):
    """Accept any positional arguments and do nothing (placeholder callback)."""
    return None

def _decode_selection(eh, hremu, selection):
    """Emulate the selected range and decode the string it builds on the stack."""
    info(f'Emulating the selection {selection[1]:#x} to {selection[2]:#x}')
    enc_heads = {}
    userData = {
        'hremu': hremu,
        'enc_heads': enc_heads
    }
    eh.emulateSelection(memAccessHook=mem_write_hook, hookData=userData)

    # Get the head of encoded string: the first byte in the range 'A'..'z'
    stack_buf = eh.getEmuBytes(eh.getRegVal('esp'), g_MAX_STACK_BUF)
    debug_bin('stack', stack_buf)
    offset = next((i for i, b in enumerate(stack_buf) if 65 <= b <= 122), 0)
    #offset = 0x48 # Sometimes you need to adjust the offset manually :-(
    debug(f'detected offset = {offset:#x}')

    # Decode the string after detecting the constant value
    cfunc = get_ctree_root(selection[1], cache=g_CACHE)
    if not cfunc:
        error(f'{selection[1]:#x}: no ctree is available for the selection')
        return

    cvf = cnt_val_finder_t()
    cvf.apply_to_exprs(cfunc.body, None)
    cnt_val = cvf.get_cnt_val()

    if not cnt_val:
        error('A constant value for decoding is not found')
        return

    # A zero byte right after the head is taken as a UTF-16-LE string; a head
    # located at the very end of the buffer is handled as ascii.
    if offset + 1 >= len(stack_buf) or stack_buf[offset + 1] != 0:
        enc = stack_buf[offset:]
        debug(f'enc {enc} is ascii')
    else:
        enc = eh.getEmuWideString(eh.getRegVal('esp') + offset).decode('utf-16-le')
        enc = enc.encode()
        debug(f'enc {enc} is unicode (utf-16-le)')
    dec = hremu.decode(enc, cnt_val)
    debug_bin('dec', dec)

    # Extract the ascii strings (no null termination)
    head = eh.getRegVal('esp') + offset
    ascs = extract_ascii(dec)
    if ascs:
        # The comment goes to the instruction that wrote the head, if unique
        keys = [k for k, v in enc_heads.items() if v == head]
        if len(keys) == 1:
            success(f'{keys[0]:#x}: string decoded "{ascs[0]}"')
            hremu.set_decomplier_cmt(cfunc, keys[0], ascs[0])
        else:
            success(f'string decoded "{ascs[0]}"')


def _emulate_functions(eh, hremu, fvas):
    """Emulate each function in fvas with call_hook collecting call strings."""
    for fva in fvas:
        if get_func_flags(fva) & (FUNC_LIB | FUNC_THUNK):
            debug(f"{fva:#x}: skipping library or thunk function")
            continue

        fn_name = get_name(get_func_attr(fva, FUNCATTR_START))
        if fn_name.find(g_stub_GetProcAddress) != -1:
            debug(f"{fva:#x}: skipping GetProcAddress stub function")
            continue

        print('-' * 100)
        info(f'{get_name(fva)} ({fva:#x})')

        # inst_hook_cff is an alternative instructionHook; it additionally
        # reads 'state_var_cnt' and 'state_excluded' from hookData.
        inst_visit_cnt = {}
        userData = {
            'hremu': hremu,
            'inst_visit_cnt': inst_visit_cnt,
        }

        try:
            if g_FLAG_ALL_PATHS:
                info('The mode is iterateAllPaths')
                eh.iterateAllPaths(fva, noop, hookData=userData, callHook=call_hook)
            else:
                info('The mode is emulateRange')
                eh.emulateRange(fva, callHook=call_hook, instructionHook=inst_hook, hookData=userData)
        except unicorn.UcError as e:
            error(f'{fva:#x}: unicorn error ({e})')

        refresh_idaview_anyway()
        eh.resetEmulatorHeapAndStack()


def main():
    """Decode strings by emulation.

    With a range selected, the selection is emulated and the string left on
    the stack is decoded. Otherwise the user is asked whether to emulate only
    the function at the cursor or every function, and string arguments of the
    calls are collected. A summary of the recorded comments is printed last.
    """
    info('start')

    if g_DEBUG_FLARE_EMU:
        eh = flare_emu.EmuHelper(verbose=10)
        eh.logger.setLevel(logging.DEBUG)
    else:
        eh = flare_emu.EmuHelper()

    hremu = HexRaysEmu(eh)

    selection = idaapi.read_range_selection(None)
    if selection[0]:
        _decode_selection(eh, hremu, selection)
    else:
        ans = ida_kernwin.ask_yn(0, 'only decode the selected function?')
        if ans == ida_kernwin.ASKBTN_YES:
            fvas = [get_func_attr(get_screen_ea(), FUNCATTR_START)]
        elif ans == ida_kernwin.ASKBTN_NO:
            fvas = idautils.Functions()
        else:
            info('canceled')
            return

        _emulate_functions(eh, hremu, fvas)

    print('-' * 100)
    hremu.print_summary()

    info('done')

# Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
    

================================================
FILE: callstrings/ida_callstrings_static.py
================================================
'''
ida_callstrings_static.py - string deobfuscation for Hodur
Takahiro Haruyama (@cci_forensics)
'''

import idaapi
idaapi.require('hexrays_utils', package='*')
from hexrays_utils import *

g_DEBUG = False # True: debug() prints its messages
g_CACHE = True # passed as the cache argument of get_ctree_root()
g_memcpy_names = ['qmemcpy', 'wmemcpy', 'strcpy'] # helper names whose calls static_decoder_t.visit_expr() inspects

def info(msg):
    """Print msg prefixed with a bold blue "[*]" marker (ANSI escape codes)."""
    line = "\033[34m\033[1m[*]\033[0m {}".format(msg)
    print(line)

def success(msg):
    """Print msg prefixed with a bold green "[+]" marker (ANSI escape codes)."""
    line = "\033[32m\033[1m[+]\033[0m {}".format(msg)
    print(line)
    
def error(msg):
    """Print msg prefixed with a bold red "[!]" marker (ANSI escape codes)."""
    line = "\033[31m\033[1m[!]\033[0m {}".format(msg)
    print(line)

def debug(msg):
    """Print msg with a bold yellow "[D]" marker, but only when g_DEBUG is set."""
    if not g_DEBUG:
        return
    line = "\033[33m\033[1m[D]\033[0m {}".format(msg)
    print(line)


class static_decoder_t(ctree_visitor_t, HexRaysUtils):
    """Ctree visitor that decodes string literals copied by the g_memcpy_names helpers.

    Each decoded string is set as a pseudocode comment at the helper call.
    """

    def __init__(self, cst_val, cfunc):
        """
        Args:
            cst_val: constant value passed to decode() for every string.
            cfunc: decompiled function the comments are written to.
        """
        ctree_visitor_t.__init__(self, CV_PARENTS | CV_POST | CV_RESTART)
        HexRaysUtils.__init__(self)

        self.cst_val = cst_val
        self.cfunc = cfunc

    def visit_expr(self, expr):
        """Decode the source string of a matching helper call; always returns 0."""
        # Only calls to the copy helpers are of interest
        if not (expr.op == cot_call and expr.x.op == cot_helper and expr.x.helper in g_memcpy_names):
            return 0

        info(f'{expr.ea:#x}: target helper function "{expr.x.helper}" is called')
        arg_src = expr.a.at(1)

        # The source has to be a string literal, possibly behind a cast
        if arg_src.op == cot_str:
            enc = arg_src.string
        elif arg_src.op == cot_cast and arg_src.x.op == cot_str:
            enc = arg_src.x.string
        else:
            return 0

        enc = enc.encode('utf-16-le') if expr.x.helper == 'wmemcpy' else enc.encode()
        info(f'{expr.ea:#x}: src bytes = {enc}')

        # Decode the src string by the constant value. A result that is not
        # valid text (UnicodeDecodeError is a ValueError) is reported below
        # instead of aborting the traversal of the remaining calls.
        try:
            dec = self.decode(enc, self.cst_val).decode()
        except ValueError:
            dec = None

        if dec:
            success(f'{expr.ea:#x}: string decoded "{dec}"')
            self.set_decomplier_cmt(self.cfunc, expr.ea, dec)
        else:
            error(f'{expr.ea:#x}: string decoding failed using a constant value ({self.cst_val:#x})')

        return 0
    

def main():
    """Statically decode strings in the function at the cursor or in every function.

    The user chooses the scope; library/thunk functions and GetProcAddress
    stubs are skipped. For each remaining function the decoding constant is
    located with cnt_val_finder_t and static_decoder_t is run over its ctree.
    """
    info('start')

    ans = ida_kernwin.ask_yn(0, 'only decode the selected function?')
    if ans == ida_kernwin.ASKBTN_YES:
        fvas = [get_func_attr(get_screen_ea(), FUNCATTR_START)]
    elif ans == ida_kernwin.ASKBTN_NO:
        fvas = idautils.Functions()
    else:
        info('canceled')
        return

    for fva in fvas:
        if get_func_flags(fva) & (FUNC_LIB | FUNC_THUNK):
            debug(f"{fva:#x}: skipping library or thunk function")
            continue

        fn_name = get_name(get_func_attr(fva, FUNCATTR_START))
        if fn_name.find(g_stub_GetProcAddress) != -1:
            debug(f"{fva:#x}: skipping GetProcAddress stub function")
            continue

        print('-' * 100)
        info(f'{get_name(fva)} ({fva:#x})')

        cfunc = get_ctree_root(fva, cache=g_CACHE)
        if not cfunc:
            # One function without a ctree must not abort the whole run
            error(f'{fva:#x}: no ctree is available, skipped')
            continue

        cvf = cnt_val_finder_t()
        cvf.apply_to_exprs(cfunc.body, None)
        cnt_val = cvf.get_cnt_val()

        if cnt_val:
            sd = static_decoder_t(cnt_val, cfunc)
            sd.apply_to_exprs(cfunc.body, None)
        else:
            error(f'{fva:#x}: A constant value for decoding is not found')

        refresh_idaview_anyway()

    print('-' * 100)

    info('done')

# Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()

================================================
FILE: eset_crackme/README.org
================================================
* IDA Pro loader/processor modules for ESET CrackMe driver VM

You can download the initial sample for the CrackMe challenge from [[https://join.eset.com/en/challenges/crack-me][here]].

before:

[[./img/eset_before.png]]

after:

[[./img/eset_after.png]]

** Reference

- https://quequero.org/2016/01/eset-crackme-challenge-2015-walkthrough/
- http://mshetta.blogspot.jp/2016/11/join-eset-crackme-2015-solution.html


================================================
FILE: eset_crackme/loaders/ida_loader_drv_vm.py
================================================
import idaapi
import ida_segment
from idc import *
from struct import *

# Start address at which load_file() creates the VM_DATA segment
DATA_SEG_START = 0x10000 # may be changed

def accept_file(li, filename):
    """Tell whether the input starts with one of the known 16-bit signatures.

    Args:
        li: loader input object positioned anywhere in the file.
        filename: name of the input file (unused).

    Returns:
        A dict with the 'format' name when the signature matches, 0 otherwise.
    """
    # Always read the signature from the beginning of the file
    li.seek(0)
    header = li.read(2)

    # A file shorter than the signature (or a failed read) cannot match
    if not header or len(header) < 2:
        return 0

    if int16(header) in (0x3713, 0x481c, 0x1337):
        return {'format': "ESET Crackme driver VM program"}
    return 0

def int16(b):
    """Unpack the 2-byte buffer b as a little-endian unsigned 16-bit integer."""
    (value,) = unpack('<H', b)
    return value
    
def int32(b):
    """Unpack the 4-byte buffer b as a little-endian unsigned 32-bit integer."""
    (value,) = unpack('<I', b)
    return value

def myAddSeg(startea, endea, base, use32, name, clas):
    """Create the segment [startea, endea) named name with class clas via idaapi.add_segm().

    Args:
        startea: start address of the segment.
        endea: end address of the segment.
        base: value passed to setup_selector() and as the first argument of add_segm().
        use32: stored in the local segment_t's bitness field only.
        name: segment name.
        clas: segment class name.
    """
    # NOTE(review): this segment_t is filled in but never handed to IDA --
    # the add_segm_ex() call that used it is commented out -- so use32, align
    # and comb do not reach add_segm() below. setup_selector() is still
    # called; whether its effect is required is not visible here -- confirm.
    s = idaapi.segment_t()
    s.start_ea = startea
    s.end_ea   = endea
    s.sel      = idaapi.setup_selector(base)
    s.bitness  = use32
    s.align    = idaapi.saRelPara
    s.comb     = idaapi.scPub
    #idaapi.add_segm_ex(s, name, clas, idaapi.ADDSEG_NOSREG|idaapi.ADDSEG_OR_DIE)
    idaapi.add_segm(base, startea, endea, name, clas)

def load_file(li, neflags, format):
    """Load the VM program: parse the header, create VM_CODE/VM_DATA and fill them.

    Args:
        li: loader input object the file is read from.
        neflags: not used by this function.
        format: not used by this function.

    Returns:
        1 (always).
    """
    # Header fields are read sequentially: 2 + 4 + 4 + 4 + 4 = 0x12 bytes
    li.seek(0) # needed to read signature
    sig = int16(li.read(2)) 
    size = int32(li.read(4)) # the program size
    code_off = int32(li.read(4)) # the code segment offset
    if sig != 0x3713: # for inline VM
        code_off = 0x12 
    data_off = int32(li.read(4)) # the data segment offset
    flag_kernel_mode = int32(li.read(4)) # read to advance past the field; the value is not used below
    
    #set_processor_type('eset_vm', SETPROC_USER | SETPROC_LOADER)
    set_processor_type('eset_vm', SETPROC_LOADER)

    # Create segment & Populate
    # The '#' in front of the next ''' keeps the first layout active: code is
    # placed at address 0 and data at DATA_SEG_START. Both file2base() calls
    # take the current file position (li.tell()) as the source offset. The
    # ''' ... ''' pair further down encloses the alternative layout as an
    # inert string literal.
    #'''
    myAddSeg(0, data_off - code_off, 0, 1, 'VM_CODE', "CODE")
    li.file2base(li.tell(), 0, data_off - code_off, 1)
    myAddSeg(DATA_SEG_START, DATA_SEG_START + size - data_off, 0, 1, 'VM_DATA', "DATA") # flat memory space
    #myAddSeg(DATA_SEG_START, DATA_SEG_START + size - data_off, DATA_SEG_START >> 4, 1, 'VM_DATA', "DATA") # segmentation (base should be in paragraphs 16-bits)    
    li.file2base(li.tell(), DATA_SEG_START, DATA_SEG_START + size - data_off, 1)
    '''
    myAddSeg(code_off, data_off, 0, 1, 'VM_CODE', "CODE")
    li.file2base(li.tell(), code_off, data_off, 1)
    myAddSeg(data_off, size, 0, 1, 'VM_DATA', "DATA")
    li.file2base(li.tell(), data_off, size, 1)
    '''

    # initialize: the start address, start IP and start CS are all set to 0,
    # and address 0 is registered as the entry point "start"
    set_inf_attr(INF_START_EA, 0)
    set_inf_attr(INF_START_IP, 0)
    set_inf_attr(INF_START_CS, 0)
    #add_entry(0, ep, "start", 1)
    add_entry(0, 0, "start", 1)

    # should return 1 or terminate immediately
    return 1 


================================================
FILE: eset_crackme/procs/ida_processor_drv_vm.py
================================================
import sys
import copy

import ida_idaapi
import ida_idp
import ida_ua
import ida_bytes
import ida_xref
import ida_offset
import ida_problems
import ida_lines
import ida_segment

from ida_idp import CF_USE1, CF_USE2, CF_CHG1, CF_CHG2, CF_STOP, CF_JUMP, CF_SHFT, CF_CALL

# enum definitions from VM engine idb
# enum_vm_size: operand size codes (set_operand() passes them on as IDA dtype values)
SIZE_BYTE = 0
SIZE_WORD = 1
SIZE_DWORD = 2
# enum_vm_type: operand kinds decoded by set_operand()
TYPE_REG_VAL = 0   # register value
TYPE_REG_PTR = 1   # register used as a pointer
TYPE_IMM_VAL = 2   # immediate read from the instruction stream
TYPE_DATA_OFF = 3  # dword offset into the data segment
# enum_vm_cmp: compare sub-operations
CMP_EQUAL = 0
CMP_NOT_EQUAL = 1
CMP_LESS_THAN = 2
# enum_vm_arith: arithmetic sub-operations
ARITH_XOR = 0
ARITH_ADD = 1
ARITH_SUB = 2
ARITH_SHL = 3
ARITH_SHR = 4
ARITH_ROL = 5
ARITH_ROR = 6
ARITH_MOD = 7

# ----------------------------------------------------------------------
class eset_drv_vm_processor_t(ida_idp.processor_t):
    """
    Processor module classes must derive from ida_idp.processor_t
    """

    # IDP id ( Numbers above 0x8000 are reserved for the third-party modules)
    id = 0x8fff

    # Processor features
    flag = ida_idp.PRN_HEX | ida_idp.PR_RNAMESOK 

    # Number of bits in a byte for code segments (usually 8)
    # IDA supports values up to 32 bits
    cnbits = 8

    # Number of bits in a byte for non-code segments (usually 8)
    # IDA supports values up to 32 bits
    dnbits = 8

    # short processor names
    # Each name should be shorter than 9 characters
    psnames = ['eset_vm']

    # long processor names
    # No restriction on name lengths.
    plnames = ['ESET Crackme driver VM processor']

    # size of a segment register in bytes
    segreg_size = 0

    # Array of instructions
    instruc = [
      {'name': '',      'feature': 0},  # placeholder for "not an instruction"
      {'name': 'hlt',   'feature': CF_STOP,   'cmt': "halt CPU"},
      {'name': 'mov',   'feature': CF_USE1 | CF_USE2 | CF_CHG1,   'cmt': "move"},      
      {'name': 'ncall', 'feature': CF_USE1 | CF_CALL,   'cmt': "call native function"},
      {'name': 'lcall', 'feature': CF_USE1 | CF_USE2 | CF_CALL,   'cmt': "call library function"},
      {'name': 'push',  'feature': CF_USE1,   'cmt': "push to stack"},
      {'name': 'pop',   'feature': CF_USE1 | CF_CHG1,   'cmt': "pop from stack"},      
      {'name': 'cmpeq', 'feature': CF_USE1 | CF_USE2,   'cmt': "compare #0 (equal)"},
      {'name': 'cmpne', 'feature': CF_USE1 | CF_USE2,   'cmt': "compare #1 (not equal)"},
      {'name': 'cmpb',  'feature': CF_USE1 | CF_USE2,   'cmt': "compare #2 (less than)"},
      {'name': 'jmp',   'feature': CF_USE1 | CF_JUMP | CF_STOP,   'cmt': "jump #0 (unconditional)"},
      {'name': 'cjmp',  'feature': CF_USE1 | CF_JUMP,   'cmt': "jump #1 (conditional)"},
      {'name': 'call',  'feature': CF_USE1 | CF_CALL,   'cmt': "call VM function"},
      {'name': 'ret',   'feature': 0,   'cmt': "return"},
      {'name': 'xor',   'feature': CF_USE1 | CF_USE2 | CF_CHG1,   'cmt': "arithmetic operation #0 (xor)"},
      {'name': 'add',   'feature': CF_USE1 | CF_USE2 | CF_CHG1,   'cmt': "arithmetic operation #1 (add)"},
      {'name': 'sub',   'feature': CF_USE1 | CF_USE2 | CF_CHG1,   'cmt': "arithmetic operation #2 (sub)"},
      {'name': 'shl',   'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT,   'cmt': "arithmetic operation #3 (shift left)"},
      {'name': 'shr',   'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT,   'cmt': "arithmetic operation #4 (shift right)"},
      {'name': 'rol',   'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT,   'cmt': "arithmetic operation #5 (rotation left)"},
      {'name': 'ror',   'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT,   'cmt': "arithmetic operation #6 (rotation right)"},
      {'name': 'mod',   'feature': CF_USE1 | CF_USE2 | CF_CHG1,   'cmt': "arithmetic operation #7 (modulo)"},
      {'name': 'alloc', 'feature': CF_USE1,   'cmt': "allocate buffer"},
      {'name': 'free',  'feature': CF_USE1,   'cmt': "free buffer"},
      {'name': 'loadVM','feature': CF_USE1 | CF_USE2,   'cmt': "load another VM"},
      {'name': 'nop',   'feature': 0,   'cmt': "nop"},      
    ]

    # icode of the first instruction
    instruc_start = 0

    # icode of the last instruction + 1
    instruc_end = len(instruc) + 1

    # Size of long double (tbyte) for this processor (meaningful only if ash.a_tbyte != NULL) (optional)
    # tbyte_size = 0

    #
    # Number of digits in floating numbers after the decimal point.
    # If an element of this array equals 0, then the corresponding
    # floating point data is not used for the processor.
    # This array is used to align numbers in the output.
    #      real_width[0] - number of digits for short floats (only PDP-11 has them)
    #      real_width[1] - number of digits for "float"
    #      real_width[2] - number of digits for "double"
    #      real_width[3] - number of digits for "long double"
    # Example: IBM PC module has { 0,7,15,19 }
    #
    # (optional)
    #real_width = (0, 7, 0, 0)


    # only one assembler is supported
    assembler = {
        # flag (mostly for the format)
        'flag' : ida_idp.ASH_HEXF3 | ida_idp.ASD_DECF0 | ida_idp.ASO_OCTF5 | ida_idp.ASB_BINF0 | ida_idp.AS_N2CHR,

        # user defined flags (local only for IDP) (optional)
        #'uflag' : 0,

        # Assembler name (displayed in menus)
        'name': "ESET Crackme driver VM assembler",

        # array of automatically generated header lines they appear at the start of disassembled text (optional)
        'header': [".esetvm"],

        # array of unsupported instructions (array of insn.itype) (optional)
        #'badworks': [],

        # org directive
        'origin': ".org",

        # end directive
        'end': ".end",

        # comment string (see also cmnt2)
        'cmnt': ";",

        # ASCII string delimiter
        'ascsep': "\"",

        # ASCII char constant delimiter
        'accsep': "'",

        # ASCII special chars (they can't appear in character and ascii constants)
        'esccodes': "\"'",

        #
        #      Data representation (db,dw,...):
        #
        # ASCII string directive
        'a_ascii': ".char",

        # byte directive
        'a_byte': "db",

        # word directive
        'a_word': "dw",

        # remove if not allowed
        'a_dword': "dd",

        # remove if not allowed
        # 'a_qword': "dq",

        # float;  4bytes; remove if not allowed
        #'a_float': ".float",

        # uninitialized data directive (should include '%s' for the size of data)
        'a_bss': ".space %s",

        # 'equ' Used if AS_UNEQU is set (optional)
        #'a_equ': ".equ",

        # 'seg ' prefix (example: push seg seg001)
        'a_seg': "seg",

        # current IP (instruction pointer) symbol in assembler
        'a_curip': "$",

        # "public" name keyword. NULL-gen default, ""-do not generate
        'a_public': ".def",

        # "weak"   name keyword. NULL-gen default, ""-do not generate
        'a_weak': "",

        # "extrn"  name keyword
        'a_extrn': ".ref",

        # "comm" (communal variable)
        'a_comdef': "",

        # "align" keyword
        'a_align': ".align",

        # Left and right braces used in complex expressions
        'lbrace': "(",
        'rbrace': ")",

        # %  mod     assembler time operation
        'a_mod': "%",

        # &  bit and assembler time operation
        'a_band': "&",

        # |  bit or  assembler time operation
        'a_bor': "|",

        # ^  bit xor assembler time operation
        'a_xor': "^",

        # ~  bit not assembler time operation
        'a_bnot': "~",

        # << shift left assembler time operation
        'a_shl': "<<",

        # >> shift right assembler time operation
        'a_shr': ">>",

        # size of type (format string) (optional)
        'a_sizeof_fmt': "size %s",

        'flag2': 0,

        # the include directive (format string) (optional)
        'a_include_fmt': '.include "%s"',
    } # Assembler


    # ----------------------------------------------------------------------
    # The following callbacks are optional
    #

    #def notify_newprc(self, nproc):
    #    """
    #    Before changing proccesor type
    #    nproc - processor number in the array of processor names
    #    return 1-ok,0-prohibit
    #    """
    #    return 1

    #def notify_assemble(self, ea, cs, ip, use32, line):
    #    """
    #    Assemble an instruction
    #     (make sure that ida_idp.PR_ASSEMBLE flag is set in the processor flags)
    #     (display a warning if an error occurs)
    #     args:
    #       ea -  linear address of instruction
    #       cs -  cs of instruction
    #       ip -  ip of instruction
    #       use32 - is 32bit segment?
    #       line - line to assemble
    #    returns the opcode string
    #    """
    #    pass

    def notify_get_frame_retsize(self, func_ea):
        """
        Report the size in bytes of a function's return address.

        The same value (2) is returned for every function; 'func_ea' is
        not consulted.
        """
        retaddr_size = 2
        return retaddr_size

    def notify_get_autocmt(self, insn):
        """
        Look up the automatic comment for the instruction described by 'insn'.
        @return: the 'cmt' string of the matching 'instruc' entry, or None
                 when that entry carries no comment
        """
        descriptor = self.instruc[insn.itype]
        return descriptor.get('cmt')

    # ----------------------------------------------------------------------
    def notify_is_sane_insn(self, insn, no_crefs):
        """
        Tell the kernel whether the instruction in 'insn' is plausible for
        the current file type.

        no_crefs:
          1 - nothing references the instruction; ida is only trying to turn
              unexplored bytes into code
          0 - the instruction is being created because of a code reference,
              a user request or another weighty reason

        returns: 1-ok, <=0-no. This implementation answers -1 for every
        instruction, whatever 'insn' and 'no_crefs' are.
        """
        # A word-based check (rejecting 0x0000 and 0xFFFF) is disabled here;
        # the verdict is unconditional.
        verdict = -1
        return verdict

    # ----------------------------------------------------------------------
    def handle_operand(self, insn, op, isRead):
      """
      Create the cross-references implied by one operand of 'insn'.

      o_imm  - for ncall/call: code xref to op.value, and the operand is
               turned into an offset unless it is already a defined
               argument; offset data xrefs are added whenever the operand
               is an offset
      o_near - code xref to op.addr (call type for ncall/call, jump type
               otherwise)
      o_mem  - data item created at op.addr plus a read or write data xref,
               chosen by 'isRead'
      Other operand types produce no xrefs.
      """
      ea_flags = ida_bytes.get_flags(insn.ea)
      has_offset = ida_bytes.is_off(ea_flags, op.n)
      arg_defined = ida_bytes.is_defarg(ea_flags, op.n)
      is_call = insn.itype in (self.itype_ncall, self.itype_call)
      kind = op.type

      # immediate operand: code xrefs for call targets, offset drefs
      if kind == ida_ua.o_imm:
        if is_call:
          insn.add_cref(op.value, op.offb, ida_xref.fl_CN)
          if not arg_defined:
            reftype = ida_offset.get_default_reftype(insn.ea)
            ida_offset.op_offset(insn.ea, op.n, reftype, ida_idaapi.BADADDR, insn.cs)
            has_offset = True
        if has_offset:
          insn.add_off_drefs(op, ida_xref.dr_O, 0)
        return

      # near address: code xref only
      if kind == ida_ua.o_near:
        xref_kind = ida_xref.fl_CN if is_call else ida_xref.fl_JN
        insn.add_cref(op.addr, op.offb, xref_kind)
        return

      # memory operand: data xref only
      if kind == ida_ua.o_mem:
        insn.create_op_data(op.addr, op.offb, op.dtype)
        access = ida_xref.dr_R if isRead else ida_xref.dr_W
        insn.add_dref(op.addr, op.offb, access)

    # ----------------------------------------------------------------------
    # The following callbacks are mandatory
    #
    def notify_emu(self, insn):
      """
      Emulate the instruction in 'insn': create the cross-references of its
      operands and, unless execution stops here, the flow xref to the next
      instruction.
      Always returns 1 (a zero return would make the kernel delete the
      instruction).
      """
      self.get_auxpref(insn)  # result is not used
      features = insn.get_canon_feature()

      # (feature bit, operand, read access?) in the order they are processed
      operand_passes = (
        (CF_USE1, insn.Op1, 1),
        (CF_CHG1, insn.Op1, 0),
        (CF_USE2, insn.Op2, 1),
        (CF_CHG2, insn.Op2, 0),
      )
      for feature_bit, operand, is_read in operand_passes:
        if features & feature_bit:
          self.handle_operand(insn, operand, is_read)

      if features & CF_JUMP:
        ida_problems.remember_problem(ida_problems.PR_JUMP, insn.ea)

      # Flow ends at CF_STOP instructions and at the unconditional jump.
      # NOTE(review): 'ret' has no CF_STOP in the instruc table, so a flow
      # xref is also added after it -- confirm this is intended.
      stops_here = bool(features & CF_STOP) or insn.itype == self.itype_jmp
      if not stops_here:
        insn.add_cref(insn.ea + insn.size, 0, ida_xref.fl_F)

      return 1

    # ----------------------------------------------------------------------
    def notify_out_operand(self, ctx, op):
      """
        Generate the text representation of an instruction operand.
        This function does not change the database; it only emits text
        through the ctx.out_...() helpers.

        o_reg    -> optional 'byte '/'word ' keyword, then the register name
        o_phrase -> optional '<size> ptr ' keyword, then '[reg]'
        o_imm    -> '#' followed by the value
        o_near / o_mem -> name expression for op.addr, or the raw address
                    in error color (and a PR_NONAME problem) when no name
                    expression could be generated
        Returns: True when the operand was printed, False for any other
        operand type.
      """
      reg_prefixes = {
        ida_ua.dt_byte: 'byte ',
        ida_ua.dt_word: 'word ',
      }
      ptr_prefixes = {
        ida_ua.dt_dword: 'dword ptr ',
        ida_ua.dt_byte: 'byte ptr ',
        ida_ua.dt_word: 'word ptr ',
      }
      kind = op.type

      if kind == ida_ua.o_reg:
        size_kw = reg_prefixes.get(op.dtype)
        if size_kw is not None:
          ctx.out_keyword(size_kw)
        ctx.out_register(self.reg_names[op.reg])
        return True

      if kind == ida_ua.o_phrase:
        size_kw = ptr_prefixes.get(op.dtype)
        if size_kw is not None:
          ctx.out_keyword(size_kw)
        ctx.out_symbol('[')
        ctx.out_register(self.reg_names[op.reg])
        ctx.out_symbol(']')
        return True

      if kind == ida_ua.o_imm:
        ctx.out_symbol('#')
        # no signedness flag is OR-ed in
        ctx.out_value(op, ida_ua.OOFW_IMM)
        return True

      if kind in (ida_ua.o_near, ida_ua.o_mem):
        named = ctx.out_name_expr(op, op.addr, ida_idaapi.BADADDR)
        if not named:
          ctx.out_tagon(ida_lines.COLOR_ERROR)
          ctx.out_long(op.addr, 16)
          ctx.out_tagoff(ida_lines.COLOR_ERROR)
          ida_problems.remember_problem(ida_problems.PR_NONAME, ctx.insn.ea)
        return True

      return False

    # ----------------------------------------------------------------------
    def notify_out_insn(self, ctx):
        """
        Generate the text representation of the instruction in 'ctx.insn'.
        This function does not change the database; it only emits text.

        The mnemonic is printed first, then operand 0 (if present), then
        operands 1 and 2 separated by ', ' until the first void operand.
        Returns: nothing
        """
        ctx.out_mnemonic()

        # output first operand
        # kernel will call outop()
        if ctx.insn.Op1.type != ida_ua.o_void:
            ctx.out_one_operand(0)

        # output the rest of operands separated by commas
        # (range, not xrange: xrange does not exist on Python 3)
        for i in range(1, 3):
            if ctx.insn[i].type == ida_ua.o_void:
                break
            ctx.out_symbol(',')
            ctx.out_char(' ')
            ctx.out_one_operand(i)

        ctx.set_gen_cmt() # generate comment at the next call to MakeLine()
        ctx.flush_outbuf()

    def fill_reg(self, op, dtype, regno):
      """Make 'op' a register operand (o_reg) for register 'regno' with data type 'dtype'."""
      op.reg = regno
      op.dtype = dtype
      op.type = ida_ua.o_reg

    def fill_phrase(self, op, dtype, regno):
      """Make 'op' a phrase operand (o_phrase) based on register 'regno' with data type 'dtype'."""
      op.phrase = regno
      op.dtype = dtype
      op.type = ida_ua.o_phrase

    def fill_imm(self, op, dtype, val):
      """Make 'op' an immediate operand (o_imm) holding 'val' with data type 'dtype'."""
      op.value = val
      op.dtype = dtype
      op.type = ida_ua.o_imm

    def fill_near(self, op, dtype, addr):
      """Make 'op' a near-address operand (o_near) pointing at 'addr' with data type 'dtype'."""
      op.addr = addr
      op.dtype = dtype
      op.type = ida_ua.o_near

    def fill_mem(self, op, dtype, addr):
      """
      Make 'op' a memory operand (o_mem) with data type 'dtype'.

      'addr' is an offset inside the data segment: the start address of the
      segment named 'VM_DATA' is added to it to obtain op.addr.
      """
      op.type, op.dtype = ida_ua.o_mem, dtype
      # rebase the offset onto the data segment
      data_seg = ida_segment.get_segm_by_name('VM_DATA')
      op.addr = data_seg.start_ea + addr

    def get_next_bytes(self, insn, dtype):
      """
      Read the next byte, word or dword of the instruction stream, selected
      by 'dtype' (dt_byte / dt_word / dt_dword).
      Returns the value read, or None for any other 'dtype'.
      """
      readers = {
        ida_ua.dt_byte: insn.get_next_byte,
        ida_ua.dt_word: insn.get_next_word,
        ida_ua.dt_dword: insn.get_next_dword,
      }
      read = readers.get(dtype)
      return read() if read is not None else None

    def set_operand(self, insn, op, type_, regno, dtype):
      """
      Fill 'op' according to the VM operand kind 'type_'.

      type_ - one of the TYPE_* constants (enum_vm_type)
      regno - register number, used by the two register kinds
      dtype - size code; it is passed to IDA unchanged because the IDA data
              type enum is matched with enum_vm_size of the idb
      Immediate and data-offset operands consume further bytes of 'insn'.
      Returns -1 when dtype is greater than 2, otherwise 0.
      """
      # check dtype
      if dtype > 2:
        return -1

      if type_ == TYPE_IMM_VAL:
        # the immediate has the operand's own size
        self.fill_imm(op, dtype, self.get_next_bytes(insn, dtype))
      elif type_ == TYPE_DATA_OFF:
        # the data offset is always a dword
        self.fill_mem(op, dtype, insn.get_next_dword())
      elif type_ == TYPE_REG_PTR:
        self.fill_phrase(op, dtype, regno)
      elif type_ == TYPE_REG_VAL:
        self.fill_reg(op, dtype, regno)
      return 0
    
    # ----------------------------------------------------------------------
    def notify_ana(self, insn):
      """
      Decodes an instruction into 'insn'.

      The first byte is the opcode. It is mapped to an index of 'instruc'
      (insn.itype), then the operand bytes that follow are decoded according
      to the instruction.
      Returns: insn.size (=the size of the decoded instruction) or zero
      (zero is returned for an invalid jump flag, size code or compare
      operation)
      """      
      opc = insn.get_next_byte()        
      # cmp (0x6), jmp (0x7),  arithmetic operation (0xa): multiple instructions
      # 0xe - 0xff: nop
      # Each of those opcodes owns several consecutive 'instruc' entries
      # (3 compares, 2 jumps, 8 arithmetic operations), so the opcodes that
      # come after them are shifted by the extra entries: +2, +1, +7.
      if opc > 0xd:
        insn.itype = self.itype_nop
      elif opc > 0xa:
        insn.itype = self.itype_hlt + opc + 2 + 1 + 7
      elif opc > 7:
        insn.itype = self.itype_hlt + opc + 2 + 1
      elif opc > 6:
        insn.itype = self.itype_hlt + opc + 2
      else:
        insn.itype = self.itype_hlt + opc

      # hlt, ret and nop have no operand bytes
      if insn.itype not in [self.itype_hlt, self.itype_ret, self.itype_nop]:
        if insn.itype in [self.itype_call, self.itype_jmp]:
          # jmp: one flag byte (0 = jmp, 1 = cjmp); both jmp and call are
          # followed by a dword target address
          if insn.itype == self.itype_jmp:
            cflag = insn.get_next_byte() # check conditional flag
            if cflag > 1:
              return 0 # invalid flag value
            insn.itype += cflag
          addr = insn.get_next_dword()
          self.fill_near(insn.Op1, ida_ua.dt_dword, addr)
        elif insn.itype == self.itype_pop:
          # one byte: low nibble = destination register
          regno = insn.get_next_byte() & 0xf
          self.fill_reg(insn.Op1, ida_ua.dt_dword,  regno)
        elif insn.itype in [self.itype_push, self.itype_alloc, self.itype_free, self.itype_ncall]:
          # one byte: bits 0-3 register, bits 4-5 operand kind, bits 6-7 size
          # (ncall always uses dword)
          b1 = insn.get_next_byte()
          dtype = ida_ua.dt_dword if insn.itype == self.itype_ncall else b1 >> 6
          if self.set_operand(insn, insn.Op1, (b1 >> 4) & 3, b1 & 0xf, dtype):
            return 0 # invalid dtype
        elif insn.itype in [self.itype_lcall, self.itype_loadVM]:
          # b1: low nibble = Op1 register, high nibble = Op2 register
          # b2: bits 0-1 Op1 kind, bits 2-3 Op2 kind, bits 4-5 Op2 size
          #     (lcall always uses dword for Op2)
          b1 = insn.get_next_byte()
          b2 = insn.get_next_byte()
          if self.set_operand(insn, insn.Op1, b2 & 3, b1 & 0xf, ida_ua.dt_dword):
            return 0 # invalid dtype
          dtype = ida_ua.dt_dword if insn.itype == self.itype_lcall else (b2 >> 4) & 3
          if self.set_operand(insn, insn.Op2, (b2 >> 2) & 3, b1 >> 4, dtype):
            return 0 # invalid dtype
        elif insn.itype == self.itype_mov:
          # b1: low nibble = destination register, high nibble = source register
          # b2: bits 0-1 source kind, bits 2-3 non-zero when the destination
          #     register is used as a pointer, bits 4-5 size
          # The source (Op2) is decoded first because it may consume bytes.
          b1 = insn.get_next_byte()
          b2 = insn.get_next_byte()            
          dtype = (b2 >> 4) & 3
          if self.set_operand(insn, insn.Op2, b2 & 3, b1 >> 4, dtype):
            return 0 # invalid dtype
          dst_regno = b1 & 0xf
          if (b2 >> 2) & 3: # used as pointer
            self.fill_phrase(insn.Op1, dtype, dst_regno)
            #insn.Op2.specflag1 = 1
          else:
            self.fill_reg(insn.Op1, dtype, dst_regno)
        elif insn.itype in [self.itype_cmpeq, self.itype_xor]:
          # compare / arithmetic families
          # b1: low nibble = Op1 register, high nibble = Op2 register
          # b2: bits 0-1 Op2 kind, bits 2-3 Op2 size, bits 4-6 sub-operation
          b1 = insn.get_next_byte()
          b2 = insn.get_next_byte()
          self.fill_reg(insn.Op1, ida_ua.dt_dword, b1 & 0xf)            
          if self.set_operand(insn, insn.Op2, b2 & 3, b1 >> 4, (b2 >> 2) & 3):
            return 0 # invalid dtype
          # update itype: the sub-operation index selects the entry inside
          # the family (only 3 compare entries exist)
          itype_idx = (b2 >> 4) & 7
          if insn.itype == self.itype_cmpeq and itype_idx > 2:
            return 0 # invalid cmp operation
          else:
            insn.itype += itype_idx                          

      # Return decoded instruction size or zero
      return insn.size if insn.itype != self.itype_null else 0

    # ----------------------------------------------------------------------
    def init_instructions(self):
        """
        Create one itype_<name> attribute per entry of 'instruc', holding the
        entry's index; the entry with an empty name becomes itype_null.
        Also sets instruc_end on the instance.
        """
        for icode, entry in enumerate(self.instruc):
            suffix = entry['name'] if entry['name'] != '' else 'null'
            setattr(self, 'itype_' + suffix, icode)

        # icode of the last instruction + 1
        self.instruc_end = len(self.instruc) + 1

    # ----------------------------------------------------------------------
    def init_registers(self):
      """
      Define the register name table and create the corresponding
      ireg_XXX constants (ireg_<name> = index of <name> in reg_names).

      Also records which registers act as segment registers: the fake CS
      and DS entries at the end of the table.
      """

      # Registers definition
      self.reg_names = [
        # General purpose registers
        "r0",
        "r1",
        "r2",
        "r3",
        "r4",
        "r5",
        # SP
        "r6",
        # VM pointer
        "r7",
        # VM size
        "r8",
        # ntoskrnl_base
        "r9",
        # arg registers
        "r10",
        "r11",
        "r12",
        "r13",
        "r14",
        "r15",
        # Fake segment registers
        "CS",
        "DS",
      ]

      # Create the ireg_XXXX constants
      # (enumerate instead of xrange: xrange does not exist on Python 3)
      for regno, regname in enumerate(self.reg_names):
        setattr(self, 'ireg_' + regname, regno)

      # Segment register information (use virtual CS and DS registers if your
      # processor doesn't have segment registers):
      self.reg_first_sreg = self.ireg_CS
      self.reg_last_sreg  = self.ireg_DS

      # number of CS register
      self.reg_code_sreg = self.ireg_CS

      # number of DS register
      self.reg_data_sreg = self.ireg_DS

    # ----------------------------------------------------------------------
    def __init__(self):
        """Initialise the base processor, then build the ireg_* and itype_* tables."""
        ida_idp.processor_t.__init__(self)
        # the two tables are built independently of each other
        self.init_registers()
        self.init_instructions()

# ----------------------------------------------------------------------
# Every processor module script must provide this function.
# It should return a new instance of a class derived from ida_idp.processor_t
def PROCESSOR_ENTRY():
    """Return a new instance of the ESET VM processor module class."""
    processor = eset_drv_vm_processor_t()
    return processor


================================================
FILE: fn_fuzzy/README.org
================================================
#+OPTIONS: ^:{}

#+TITLE: fn_fuzzy.py - IDAPython script for fast multiple binary diffing triage

* Motivation

See the [[https://conference.hitb.org/hitbsecconf2019ams/sessions/fn_fuzzy-fast-multiple-binary-diffing-triage-with-ida/][conference information]] or [[https://www.carbonblack.com/2019/05/09/fn_fuzzy-fast-multiple-binary-diffing-triage-with-ida/][blog]] post.

* how to use

- fn_fuzzy.py :: IDAPython script to export/compare fuzzy hashes of the sample
- cli_export.py :: python wrapper script to export fuzzy hashes of multiple samples

The typical usage is to run cli_export.py to make a database for large idbs then compare on IDA by executing fn_fuzzy.py.

[[./img/fn_fuzzy.png]]

[[./img/res_summary.png]]

[[./img/res_funcs.png]]

* supported IDB version

IDBs generated by IDA 6.9 or later are supported, because the script relies on the SHA256 API.

* required python packages

- mmh3
- [[https://github.com/williballenthin/python-idb][python-idb]]


================================================
FILE: fn_fuzzy/cli_export.py
================================================
# cli_export.py - batch export script for fn_fuzzy
# Takahiro Haruyama (@cci_forensics)

import argparse, subprocess, os, sqlite3, time, sys
import idb # python-idb
import logging
logging.basicConfig(level=logging.ERROR) # to suppress python-idb warning

# Machine-specific paths: please edit them to match your environment.
g_ida_dir = r'C:\analysisw\tool\IDA' # folder that holds ida.exe / ida64.exe (see export())
g_db_path = r'C:\analysisw\tics\fn_fuzzy.sqlite' # default value of --outdb
g_fn_fuzzy_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fn_fuzzy.py') # fn_fuzzy.py next to this script, handed to IDA with -S

g_min_bytes = 0x10 # minimum number of extracted code bytes per function
g_analyzed_prefix = r'fn_' # analyzed function name prefix (regex)

class LocalError(Exception):
    """Base class of the errors raised by this script."""


class ProcExportError(LocalError):
    """Raised when the IDA export process ends with an unexpected return code."""

def info(msg):
    """Print *msg* on stdout as an informational line (``[*]`` prefix)."""
    line = "[*] {}".format(msg)
    print(line)

def success(msg):
    """Print *msg* on stdout as a success line (``[+]`` prefix)."""
    line = "[+] {}".format(msg)
    print(line)

def error(msg):
    """Print *msg* on stdout as an error line (``[!]`` prefix)."""
    line = "[!] {}".format(msg)
    print(line)

def init_db(cur):
    """Create the ``sample`` and ``function`` tables and their indexes.

    Nothing is done when the database already contains at least one table.

    cur -- sqlite3 cursor of the export database
    """
    cur.execute("SELECT * FROM sqlite_master WHERE type='table'")
    if cur.fetchone() is not None:
        return

    info('DB initialized')
    schema = (
        "CREATE TABLE IF NOT EXISTS sample(sha256 UNIQUE, path)",
        "CREATE INDEX path_index ON sample(path)",
        "CREATE TABLE IF NOT EXISTS function(sha256, fva, fname, fhd, fhm, f_ana, bsize, ptype, UNIQUE(sha256, fva))",
        "CREATE INDEX f_ana_index ON function(f_ana)",
        "CREATE INDEX bsize_index ON function(bsize)",
    )
    for statement in schema:
        cur.execute(statement)

def existed(cur, sha256):
    """Return True when the ``sample`` table has a row for *sha256*, else False."""
    cur.execute("SELECT * FROM sample WHERE sha256 = ?", (sha256,))
    return cur.fetchone() is not None

def remove(cur, sha256):
    """Delete every ``sample`` and ``function`` row recorded for *sha256*.

    The caller is responsible for committing.
    """
    params = (sha256,)
    statements = (
        "DELETE FROM sample WHERE sha256 = ?",
        "DELETE FROM function WHERE sha256 = ?",
    )
    for sql in statements:
        cur.execute(sql, params)
    
def export(f_debug, idb_path, outdb, min_, f_ex_libthunk, f_update, f_ana_exp, ana_pre, f_remove):
    """Export the fuzzy hashes of one idb into the sqlite database *outdb*.

    The file is skipped unless it has an .idb/.i64 extension and an
    IDA1/IDA2 signature. Its SHA256 is read with python-idb, then either the
    existing records are removed (*f_remove*) or IDA is launched with
    fn_fuzzy.py to do the export.

    f_debug       -- when false, IDA is started with -A
    idb_path      -- path of the idb to process
    outdb         -- path of the sqlite database
    min_, f_ex_libthunk, f_update, f_ana_exp, ana_pre
                  -- forwarded to fn_fuzzy.py through the -Ofn_fuzzy option;
                     f_update also allows re-exporting a known sample
    f_remove      -- remove the sample's records instead of exporting

    Returns 1 when IDA exported the sample, 0 when the file was skipped,
    removed, or not exported.
    Raises ProcExportError when IDA exits with an unexpected return code.
    """
    # check the ext and signature
    ext = os.path.splitext(idb_path)[1]
    if ext not in ('.idb', '.i64'):
        return 0
    with open(idb_path, 'rb') as f:
        sig = f.read(4)
    if sig not in (b'IDA1', b'IDA2'):
        return 0

    # check the database record for the idb
    conn = sqlite3.connect(outdb)
    try:
        cur = conn.cursor()
        init_db(cur)
        # Original author's note on this step: "Fix: Cause NameError. need to
        # rewrite in IDA batch mode to calculate SHA256"
        with idb.from_file(idb_path) as db:
            api = idb.IDAPython(db)
            try:
                sha256 = api.ida_nalt.retrieve_input_file_sha256()
            except KeyError:
                error('{}: ida_nalt.retrieve_input_file_sha256() failed. The API is supported in 6.9 or later idb version. Check the API on IDA for validation.'.format(idb_path))
                return 0
            sha256 = sha256.lower()
        if f_remove:
            remove(cur, sha256)
            success('{}: the records successfully removed (SHA256={})'.format(idb_path, sha256))
            conn.commit()
            return 0
        if existed(cur, sha256) and not f_update:
            info('{}: The sample records are present in DB (SHA256={}). Skipped.'.format(idb_path, sha256))
            return 0
        conn.commit()
    finally:
        # release the database on every path, before IDA opens it
        conn.close()

    # sig was read in binary mode, so it must be compared with a bytes
    # literal; comparing with the str 'IDA1' is always False on Python 3
    # and would select ida64.exe for every file.
    ida = 'ida.exe' if sig == b'IDA1' else 'ida64.exe'
    ida_path = os.path.join(g_ida_dir, ida)
    cmd = [ida_path, '-S{}'.format(g_fn_fuzzy_path), '-Ofn_fuzzy:{}:{}:{}:{}:{}:{}'.format(min_, f_ex_libthunk, f_update, f_ana_exp, ana_pre, outdb), idb_path]
    if not f_debug:
        cmd.insert(1, '-A')
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    proc.communicate()  # wait for IDA; its output is not used
    if proc.returncode == 0:
        success('{}: successfully exported'.format(idb_path))
        return 1
    elif proc.returncode == 2: # skipped
        return 0
    else: # maybe 1
        raise ProcExportError('{}: Something wrong with the IDAPython script (returncode={}). Use -d for debug'.format(idb_path, proc.returncode))

def list_file(d):
    """Yield the path of every regular file located directly in directory *d*."""
    candidates = (os.path.join(d, entry) for entry in os.listdir(d))
    for path in candidates:
        if os.path.isfile(path):
            yield path

def list_file_recursive(d):
    """Yield the path of every file found under directory *d*, walking sub-directories."""
    for folder, _subdirs, names in os.walk(d):
        for name in names:
            path = os.path.join(folder, name)
            yield path

def main():
    """Parse the command line and export the target idb, or every file of the target folder.

    Processing stops at the first LocalError; in that case, and when the
    target is neither a file nor a folder, no summary is printed.
    """
    info('start')
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('target', help="idb file or folder to export")
    parser.add_argument('--outdb', '-o', default=g_db_path, help="export DB path")
    parser.add_argument('--min_', '-m', type=int, default=g_min_bytes, help="minimum number of extracted code bytes per function")
    parser.add_argument('--exclude', '-e', action='store_true', help="exclude library/thunk functions")
    parser.add_argument('--update', '-u', action='store_true', help="update the DB records")
    parser.add_argument('--ana_exp', '-a', action='store_true', help="check analyzed functions")
    parser.add_argument('--ana_pre', '-p', default=g_analyzed_prefix, help="analyzed function name prefix (regex)")
    parser.add_argument('--recursively', '-r', action='store_true', help="export idbs recursively")
    parser.add_argument('--debug', '-d', action='store_true', help="display IDA dialog for debug")
    parser.add_argument('--remove', action='store_true', help="remove records from db")
    args = parser.parse_args()

    start = time.time()
    cnt = 0

    # a single file and a folder go through the same export loop
    if os.path.isfile(args.target):
        targets = [args.target]
    elif os.path.isdir(args.target):
        lister = list_file_recursive if args.recursively else list_file
        targets = lister(args.target)
    else:
        error('the target is not file/dir')
        return

    for path in targets:
        try:
            cnt += export(args.debug, path, args.outdb, args.min_, args.exclude, args.update, args.ana_exp, args.ana_pre, args.remove)
        except LocalError as e:
            error('{} ({})'.format(str(e), type(e)))
            return

    elapsed = time.time() - start
    success('totally {} samples exported'.format(cnt))
    info('elapsed time = {} sec'.format(elapsed))
    info('done')

if __name__ == '__main__':
    main()


================================================
FILE: fn_fuzzy/dump_types.py
================================================
import os

def main():
    """Dump the type information of the open database to an .idc file, then quit IDA.

    The output file sits next to the idb and has the same base name. The
    IDC names used here are not imported by this file; they are expected to
    be present in the namespace IDA runs the script in.
    """
    path = os.path.splitext(get_idb_path())[0] + '.idc'
    gen_file(OFILE_IDC, path, 0, 0, GENFLG_IDCTYPE)
    # qexit is the IDA 7.x name of the old Exit(); the other calls in this
    # function already use 7.x names, and Exit is only defined when the
    # 6.95 compatibility layer is enabled.
    qexit(0)

if ( __name__ == "__main__" ):
    main()


================================================
FILE: fn_fuzzy/fn_fuzzy.py
================================================
# fn_fuzzy.py - IDAPython script for fast multiple binary diffing triage
# Takahiro Haruyama (@cci_forensics)

import os, ctypes, sqlite3, re, time, sys, subprocess
import cProfile
from collections import defaultdict
from pprint import PrettyPrinter
from io import StringIO
from tqdm import tqdm

from idc import *
import idautils, ida_nalt, ida_kernwin, idaapi, ida_expr

import mmh3
import yara_fn # modified version in the same folder

# Default settings; main() copies these into the form fields before the dialog is shown.
g_db_path = r'Z:\haru\analysis\tics\fn_fuzzy.sqlite' # plz edit your path
g_min_bytes = 0x10 # minimum number of extracted code bytes per function
g_analyzed_prefix = r'fn_|func_' # analyzed function name prefix (regex)
g_threshold = 50 # function similarity score threshold without CFG match
g_threshold_cfg = 10 # function similarity score threshold with CFG match
g_max_bytes_for_score = 0x100 # more code bytes are evaluated by only CFG match
g_bsize_ratio = 40 # function binary size correction ratio to compare (40 is enough)

# debug purpose to check one function matching
# (compare() prints extra trace lines only when g_dbg_flag is set and the
#  primary address, secondary function name and sample hash all match)
g_dbg_flag = False
g_dbg_fva = 0x180015978
g_dbg_fname = 'fn_blob_get_word_param_and_seek'
g_dbg_sha256 = ''

# initialization for ssdeep
SPAMSUM_LENGTH = 64
FUZZY_MAX_RESULT = (2 * SPAMSUM_LENGTH + 20) # size of the buffer that receives a hash string
# fuzzy64.dll is looked up next to this script and loaded at import time
dirpath = os.path.dirname(__file__)
_lib_path = os.path.join(dirpath, 'fuzzy64.dll')
fuzzy_lib = ctypes.cdll.LoadLibrary(_lib_path)

# helper script that is run in a separate IDA instance to dump type information
g_dump_types_path = os.path.join(dirpath, 'dump_types.py')

class defaultdictRecurse(defaultdict):
    """A defaultdict whose missing keys are filled with new instances of itself."""
    def __init__(self):
        # Hand the class itself to the base initialiser as the default factory.
        defaultdict.__init__(self, type(self))

class import_handler_t(ida_kernwin.action_handler_t):
    """Popup action that imports names, prototypes and comments for selected rows.

    For every selected chooser row the primary function (row[2], looked up by
    name) is renamed to the secondary function name (row[4]), the stored
    prototype (row[5]) is applied unless it is the string 'None', and a
    function comment carrying the ssdeep score (row[0]) and the machoc match
    flag (row[1]) is set.
    """
    def __init__(self, items, idb_path, title):
        ida_kernwin.action_handler_t.__init__(self)
        self.items = items        # chooser rows
        self.idb_path = idb_path  # path of the secondary IDB
        self.title = title        # chooser title, used to refresh it afterwards

    def import_types(self):
        """Import the secondary IDB's types through an IDC dump.

        If no ``.idc`` file exists next to the secondary IDB, a separate IDA
        process is started with dump_types.py to create it. Returns False
        when that process exits with a non-zero code, True otherwise.
        """
        idc_path = os.path.splitext(self.idb_path)[0] + '.idc'
        # dump type information from the 2nd idb
        if not os.path.exists(idc_path):
            with open(self.idb_path, 'rb') as f:
                sig = f.read(4)
            # The file is read in binary mode, so the signature has to be
            # compared as bytes; a str literal never matches on Python 3.
            ida = 'ida.exe' if sig == b'IDA1' else 'ida64.exe'
            ida_path = os.path.join(idadir(), ida)
            cmd = [ida_path, '-S{}'.format(g_dump_types_path), self.idb_path]
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            proc.communicate()
            if proc.returncode == 0:
                success('{}: type information successfully dumped'.format(self.idb_path))
            else:
                error('{}: type information dumping failed'.format(self.idb_path))
                return False

        # import the type information
        ida_expr.exec_idc_script(None, str(idc_path), "main", None, 0)
        return True

    @staticmethod
    def _apply_prototype(ea, sptype):
        """Parse the declaration *sptype* and apply it at *ea*; return apply_tinfo's result."""
        tinfo = idaapi.tinfo_t()
        idaapi.parse_decl2(idaapi.cvar.idati, sptype, tinfo, 0)
        #idaapi.apply_callee_tinfo(ea, tinfo)
        return idaapi.apply_tinfo(ea, tinfo, 0)

    def activate(self, ctx):
        """Apply name, prototype and comment for every row in ``ctx.chooser_selection``."""
        for idx in ctx.chooser_selection:
            row = self.items[idx]
            # rename the function
            ea = get_name_ea_simple(row[2])
            sfname = str(row[4])
            idaapi.do_name_anyway(ea, sfname)
            success('{:#x}: renamed to {}'.format(ea, sfname))
            # set the function prototype
            sptype = str(row[5])
            if sptype != 'None':
                if self._apply_prototype(ea, sptype):
                    success('{:#x}: function prototype set to {}'.format(ea, sptype))
                else:
                    error('{:#x}: function prototype set FAILED (maybe you should import the types?)'.format(ea))
                    # Offer to import the secondary IDB's types and retry once.
                    if ask_yn(0, 'Do you import types from the secondary idb?') == 1:
                        if self.import_types():
                            if self._apply_prototype(ea, sptype):
                                success('{:#x}: function prototype set to {}'.format(ea, sptype))
                            else:
                                error('{:#x}: function prototype set FAILED again'.format(ea))

            # insert the comment
            score = row[0]
            mmatch = row[1]
            cmt = 'fn_fuzzy: ssdeep={}, machoc={}'.format(score, mmatch)
            set_func_cmt(ea, cmt, 1)
            #set_decomplier_cmt(ea, cmt) # not sure how to avoid orphan comment

        # update the Choose rows
        ida_kernwin.refresh_chooser(self.title)

    def update(self, ctx):
        """Report the action as always enabled."""
        # Alternative kept for reference (enable only for chooser widgets):
        #   return ida_kernwin.AST_ENABLE_FOR_WIDGET \
        #       if ida_kernwin.is_chooser_widget(ctx.widget_type) \
        #     else ida_kernwin.AST_DISABLE_FOR_WIDGET
        return idaapi.AST_ENABLE_ALWAYS

class FnCh(ida_kernwin.Choose):
    """Multi-select chooser listing the matched functions of one secondary sample."""
    def __init__(self, title, mfn, idb_path):
        self.mfn = mfn
        self.idb_path = idb_path
        self.title = title
        col_plain = ida_kernwin.Choose.CHCOL_PLAIN
        col_dec = ida_kernwin.Choose.CHCOL_DEC
        columns = [
            ["ssdeep score", 10 | col_dec],
            ["machoc matched", 10 | col_plain],
            ["primary function", 30 | col_plain],
            ["primary bsize", 10 | col_dec],
            ["secondary analyzed function", 30 | col_plain],
            ["secondary prototype", 40 | col_plain],
        ]
        ida_kernwin.Choose.__init__(self, title, columns, flags=ida_kernwin.Choose.CH_MULTI)

    def OnInit(self):
        """Rebuild the rows from ``self.mfn``, highest score first."""
        ranked = sorted(self.mfn.items(), key=lambda pair: pair[1]['score'], reverse=True)
        rows = []
        for fva, match in ranked:
            # entries without a secondary function name are not listed
            if not match['sfname']:
                continue
            rows.append([
                '{}'.format(match['score']),
                '{}'.format(match['cfg_match']),
                str(get_name(fva)),
                '{}'.format(match['pbsize']),
                str(match['sfname']),
                '{}'.format(match['sptype']),
            ])
        self.items = rows
        return True

    def OnPopup(self, form, popup_handle):
        """Attach the import action to the chooser's context menu."""
        handler = import_handler_t(self.items, self.idb_path, self.title)
        desc = ida_kernwin.action_desc_t("choose:actFnFuzzyImport", 'Import function name and prototype', handler)
        ida_kernwin.attach_dynamic_action_to_popup(form, popup_handle, desc)

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        """Jump to the primary function of the first selected row."""
        first = n[0] # due to CH_MULTI
        primary_name = self.items[first][2]
        idc.Jump(get_name_ea_simple(primary_name))

    def OnRefresh(self, n):
        """Rebuild the rows; the cursor position is not preserved."""
        self.OnInit()
        # try to preserve the cursor
        #return [ida_kernwin.Choose.ALL_CHANGED] + self.adjust_last_item(n)
        #return n
        return None

    def OnClose(self):
        print("closed ", self.title)

class SummaryCh(ida_kernwin.Choose):
    """Chooser summarising, per compared sample, how many similar functions were found.

    ``res`` maps a sample SHA256 to a dict holding 'path', 'mcnt' (with the
    'total' and 'analyzed' counters) and 'mfn' (per-function matches).
    Selecting a row opens an FnCh chooser for that sample.
    """
    def __init__(self, title, res):
        self.res = res
        ida_kernwin.Choose.__init__(
            self,
            title,
            [ ["SHA256", 20 | ida_kernwin.Choose.CHCOL_PLAIN],
              ["total similar functions",   20 | ida_kernwin.Choose.CHCOL_DEC],
              ["analyzed similar functions",   20 | ida_kernwin.Choose.CHCOL_DEC],
              ["IDB path",   80 | ida_kernwin.Choose.CHCOL_PATH] ])
        self.items = []

    def OnInit(self):
        """Build the rows (samples with at least one match), most matches first."""
        # Start from an empty list every time so that a repeated
        # initialisation cannot append the same rows twice.
        self.items = []
        for sha256,v in sorted(self.res.items(), key=lambda x:x[1]['mcnt']['total'], reverse=True):
            if v['mcnt']['total'] > 0:
                self.items.append([str(sha256), '{}'.format(v['mcnt']['total']), '{}'.format(v['mcnt']['analyzed']), str(v['path'])])
        return True

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        """Open the per-function chooser for the selected sample."""
        sha256 = self.items[n][0]
        c = FnCh("similarities with {}(snip)".format(sha256[:8]), self.res[sha256]['mfn'], self.res[sha256]['path'])
        c.Show()

    def OnRefresh(self, n):
        return n

    def OnClose(self):
        print("closed ", self.title)

class FnFuzzyForm(ida_kernwin.Form):
    def __init__(self):
        """Build the fn_fuzzy dialog.

        The raw string below is the form template; each ``{name}`` placeholder
        is bound to the control of the same name in the dictionary that
        follows it. The controls are grouped as general options, the
        Export/Compare command radio buttons, export options and compare
        options.
        """
        ida_kernwin.Form.__init__(self,
r"""BUTTON YES* Run
BUTTON CANCEL Cancel
fn_fuzzy

{FormChangeCb}
General Options
<DB file path:{iDBSave}>
<minimum function code size:{iMinBytes}>
<exclude library/thunk functions:{cLibthunk}>
<enable debug messages:{cDebug}>{cGroup}>

<##Commands##Export:{rExport}>
<Compare:{rCompare}>{rGroup}>

Export Options
<update the DB records:{cUpdate}>
<store flags as analyzed functions:{cAnaExp}>{cEGroup}>
<analyzed function name prefix/suffix (regex):{iPrefix}>

Compare Options
<compare with only analyzed functions:{cAnaCmp}>
<compare with only IDBs in the specified folder:{cFolCmp}>{cCGroup}>
<the folder path:{iFolder}>
<function code size comparison criteria (0-100):{iRatio}>
<function similarity score threshold (0-100) without CFG match:{iSimilarity}>
<function similarity score threshold (0-100) with CFG match:{iSimilarityCFG}>
<function code size threshold evaluated by only CFG match:{iMaxBytesForScore}>
""",
        {
            # callback invoked on form events (see OnFormChange)
            'FormChangeCb': ida_kernwin.Form.FormChangeCb(self.OnFormChange),
            'cGroup': ida_kernwin.Form.ChkGroupControl(("cLibthunk", "cDebug")),
            'iDBSave': ida_kernwin.Form.FileInput(save=True),
            'iMinBytes': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_HEX),
            # NOTE(review): the tuple order (rCompare, rExport) differs from the
            # order in the template (Export, Compare) -- confirm this is intended
            'rGroup': ida_kernwin.Form.RadGroupControl(("rCompare", "rExport")),
            'cEGroup': ida_kernwin.Form.ChkGroupControl(("cUpdate", "cAnaExp")),
            'iPrefix': ida_kernwin.Form.StringInput(),
            'cCGroup': ida_kernwin.Form.ChkGroupControl(("cAnaCmp", "cFolCmp")),
            'iFolder': ida_kernwin.Form.DirInput(),
            'iRatio': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_DEC),
            'iSimilarity': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_DEC),
            'iSimilarityCFG': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_DEC),
            'iMaxBytesForScore': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_HEX),            
        })

    def OnFormChange(self, fid):
        """Form event callback: set defaults on init and toggle fields per command.

        ``fid == -1`` is handled as the initialisation event: the default
        checkboxes and the Compare command are selected. Selecting Export or
        Compare enables that command's option fields and disables the other's.
        Always returns 1.
        """
        def switch_fields(exporting):
            # Export options and compare options are mutually exclusive.
            comparing = not exporting
            self.EnableField(self.cEGroup, exporting)
            self.EnableField(self.iPrefix, exporting)
            self.EnableField(self.cCGroup, comparing)
            self.EnableField(self.iSimilarity, comparing)
            self.EnableField(self.iSimilarityCFG, comparing)
            self.EnableField(self.iMaxBytesForScore, comparing)
            self.EnableField(self.iRatio, comparing)

        if fid == -1:
            for ctrl in (self.cLibthunk, self.cAnaExp, self.cAnaCmp, self.rCompare):
                self.SetControlValue(ctrl, True)
            switch_fields(False)

        if fid == self.rExport.id:
            switch_fields(True)
        elif fid == self.rCompare.id:
            switch_fields(False)
        return 1

class FnFuzzy(object):
    def __init__(self, f_debug, db_path, min_bytes, f_ex_libthunk, f_update, f_ana_exp, ana_pre, f_ana_cmp = False, f_fol_cmp = False, ana_fol='', threshold = None, threshold_cfg = None, max_bytes_for_score = None, ratio = 0):
        """Open the database and record the export/compare settings.

        Connects to the SQLite file at *db_path*, creates the schema when the
        file has no tables, and loads a copy into an in-memory database that
        is used for SELECT queries. The current IDB path and the input file
        hashes are read from IDA. When the SHA256 is unavailable an error is
        printed and ``self.sha256`` is left as None.
        """
        self.f_debug = f_debug
        self.conn = sqlite3.connect(db_path)
        self.cur = self.conn.cursor()
        self.init_db()
        self.in_memory_db()
        self.min_bytes = min_bytes
        self.f_ex_libthunk = f_ex_libthunk
        # for export
        self.f_update = f_update
        self.f_ana_exp = f_ana_exp
        self.ana_pre = ana_pre
        # Always define the attribute; the pattern is compiled only when
        # analyzed-function flags are going to be stored.
        self.ana_pat = re.compile(self.ana_pre) if f_ana_exp else None
        # for compare
        self.f_ana_cmp = f_ana_cmp
        self.f_fol_cmp = f_fol_cmp
        self.ana_fol = ana_fol
        self.threshold = threshold
        self.threshold_cfg = threshold_cfg
        self.max_bytes_for_score = max_bytes_for_score
        self.ratio = float(ratio)

        self.idb_path = get_idb_path()
        self.sha256 = ida_nalt.retrieve_input_file_sha256()
        self.md5 = None
        try:
            #self.sha256 = self.sha256.lower()
            self.sha256 = self.sha256.hex()
            # Convert the md5 digest the same way as the sha256 one;
            # lower() on a raw digest would alter its bytes.
            self.md5 = ida_nalt.retrieve_input_file_md5().hex()
        except AttributeError:
            message = 'ida_nalt.retrieve_input_file_sha256() returned None. Probably the IDB was generated by old IDA (<6.9). Check the version by ida_netnode.cvar.root_node.supstr(ida_nalt.RIDX_IDA_VERSION)'
            error(message)
            #ida_kernwin.warning(message)

    def debug(self, msg):
        if self.f_debug:
            print("[D] {}".format(msg))

    def init_db(self):
        self.cur.execute("SELECT * FROM sqlite_master WHERE type='table'")
        if self.cur.fetchone() is None:
            info('DB initialized')
            self.cur.execute("CREATE TABLE IF NOT EXISTS sample(sha256 UNIQUE, path)")
            #self.cur.execute("CREATE INDEX sha256_index ON sample(sha256)")
            self.cur.execute("CREATE INDEX path_index ON sample(path)")
            self.cur.execute("CREATE TABLE IF NOT EXISTS function(sha256, fva, fname, fhd, fhm, f_ana, bsize, ptype, UNIQUE(sha256, fva))")
            self.cur.execute("CREATE INDEX f_ana_index ON function(f_ana)") 
            self.cur.execute("CREATE INDEX bsize_index ON function(bsize)")

    def in_memory_db(self): # for SELECT
        tempfile = StringIO()
        for line in self.conn.iterdump():
            tempfile.write("{}\n".format(line))
        tempfile.seek(0)
        self.mconn = sqlite3.connect(":memory:")
        self.mconn.cursor().executescript(tempfile.read())
        self.mconn.commit()
        self.mconn.row_factory=sqlite3.Row
        self.mcur = self.mconn.cursor()

    def calc_fn_machoc(self, fva, fname): # based on Machoc hash implementation (https://github.com/0x00ach/idadiff)
        """Compute the Machoc (CFG) hash of the function at *fva*.

        A textual description of the flow chart is built from block start
        addresses, "c," markers for call instructions (each call also starts
        a new numbered block) and sorted successor addresses. Addresses are
        then replaced by their sequence numbers and the text is hashed with
        MurmurHash3.

        Returns ``(hash, number_of_blocks)``, or ``(None, None)`` when IDA has
        no function object for *fva*.
        """
        func = idaapi.get_func(fva)
        if func is None:
            self.debug('{}: ignored due to lack of function object'.format(fname))
            return None, None

        flow = idaapi.FlowChart(f=func)
        cfg_text = ""
        addr_ids = []
        next_id = 1
        for index in range(flow.size):
            block = flow[index]
            start_hex = shex(block.start_ea)
            cfg_text += start_hex + ":"
            addr_ids.append((start_hex, str(next_id)))
            next_id += 1

            addr = block.start_ea
            block_end = block.end_ea
            mnem = GetMnem(addr)
            while mnem != "":
                after_call = mnem == "call" # should be separated into 2 blocks by call
                if after_call:
                    cfg_text += "c,"
                addr = NextHead(addr, block_end)
                mnem = GetMnem(addr)
                if after_call and addr != BADADDR:
                    addr_hex = shex(addr)
                    cfg_text += addr_hex + ";" + addr_hex + ":"
                    addr_ids.append((addr_hex, str(next_id)))
                    next_id += 1

            successors = sorted(suc.start_ea for suc in block.succs())
            cfg_text += ",".join(shex(ref) for ref in successors) + ";"

        # change addr to index
        for addr_text, id_text in addr_ids:
            cfg_text = cfg_text.replace(addr_text, id_text)
        # calculate machoc hash value
        self.debug('{}: CFG = {}'.format(fname, cfg_text))
        return mmh3.hash(cfg_text) & 0xFFFFFFFF, next_id - 1

    def calc_fn_ssdeep(self, fva, fname):
        """Compute the ssdeep hash over the function's extracted code bytes.

        The bytes come from the basic-block rules produced by yara_fn; blocks
        without a rule or with fewer than ``yara_fn.MIN_BB_BYTE_COUNT`` bytes
        are skipped.

        Returns ``(hash_string, byte_count)``, or ``(None, None)`` when fewer
        than ``self.min_bytes`` bytes were collected.
        """
        chunks = []
        for bb in yara_fn.get_basic_blocks(fva):
            rule = yara_fn.get_basic_block_rule(bb)
            if not rule:
                continue
            chk = rule.cut_bytes_for_hash
            if len(chk) < yara_fn.MIN_BB_BYTE_COUNT:
                continue
            chunks.append(chk.encode())
            #self.debug('chunk at {:#x}: {}'.format(bb.va, get_hex_pat(chk)))
        d2h = b''.join(chunks)

        #self.debug('total func seq at {:#x}: {}'.format(fva, get_hex_pat(d2h)))
        total = len(d2h)
        if total < self.min_bytes:
            self.debug('{}: ignored because of the number of extracted code bytes {}'.format(fname, total))
            return None, None

        result_buffer = ctypes.create_string_buffer(FUZZY_MAX_RESULT)
        file_buffer = ctypes.create_string_buffer(d2h)
        # the ctypes buffer carries one extra terminating byte that is not hashed
        fuzzy_lib.fuzzy_hash_buf(file_buffer, len(file_buffer) - 1, result_buffer)
        return result_buffer.value.decode("ascii"), total

    def existed(self):
        self.mcur.execute("SELECT sha256 FROM sample WHERE sha256 = ?", (self.sha256,))
        if self.mcur.fetchone() is None:
            return False
        else:
            return True

    def exclude_libthunk(self, fva, fname):
        if self.f_ex_libthunk:
            flags = get_func_attr(fva, FUNCATTR_FLAGS)
            if flags & FUNC_LIB:
                self.debug('{}: ignored because of library function'.format(fname))
                return True
            if flags & FUNC_THUNK:
                self.debug('{}: ignored because of thunk function'.format(fname))
                return True
        return False

    def export(self):
        """Store the current sample and its function hashes in the database.

        Returns False (writing nothing) when the sample is already recorded
        and updating is disabled; otherwise writes one ``sample`` row plus one
        ``function`` row per function that yields both an ssdeep and a machoc
        hash, and returns True.
        """
        if self.existed() and not self.f_update:
            info('{}: The sample records are present in DB. skipped.'.format(self.sha256))
            return False

        self.cur.execute("REPLACE INTO sample values(?, ?)", (self.sha256, self.idb_path))

        total_count = 0
        rows = []
        for fva in idautils.Functions():
            total_count += 1
            fname = get_func_name(fva)
            if self.exclude_libthunk(fva, fname):
                continue
            fhd, bsize = self.calc_fn_ssdeep(fva, fname)
            fhm, cfgnum = self.calc_fn_machoc(fva, fname)
            if not (fhd and fhm):
                continue

            if self.f_ana_exp:
                f_ana = bool(self.ana_pat.search(fname))
            else:
                f_ana = False
            tinfo = idaapi.tinfo_t()
            idaapi.get_tinfo(fva, tinfo)
            ptype = idaapi.print_tinfo('', 0, 0, idaapi.PRTYPE_1LINE, tinfo, fname, '')
            if ptype is not None:
                ptype = ptype + ';'
            # fva is 64-bit int causing OverflowError
            rows.append((self.sha256, '{:#x}'.format(fva), fname, fhd, fhm, f_ana, bsize, ptype))
            self.debug('EXPORT {} at {:#x}: ssdeep={} (size={}), machoc={} (num of CFG={})'.format(fname, fva, fhd, bsize, fhm, cfgnum))

        self.cur.executemany("REPLACE INTO function values (?, ?, ?, ?, ?, ?, ?, ?)", rows)
        success ('{} of {} functions exported'.format(len(rows), total_count))
        return True

    def compare(self):
        """Compare every function of the current IDB with the functions stored in the DB.

        For each local function that yields both hashes, the stored functions
        whose code size lies within +/- ``self.ratio`` percent are fetched
        (optionally limited to analyzed functions and/or to samples whose path
        starts with ``self.ana_fol``) and scored with ssdeep. A stored function
        counts as similar when the score reaches ``self.threshold``, or the
        machoc hashes are equal and either the score reaches
        ``self.threshold_cfg`` or the local function is larger than
        ``self.max_bytes_for_score``. The results are shown in a SummaryCh
        chooser.
        """
        res = defaultdictRecurse()
        if self.f_fol_cmp:
            self.mcur.execute("SELECT sha256,path FROM sample WHERE path LIKE ?", (self.ana_fol+'%',))
        else:
            self.mcur.execute("SELECT sha256,path FROM sample")
        frows = self.mcur.fetchall()
        num_of_samples = len(frows)
        for sha256, path in frows:
            res[sha256]['path'] = path
            # counters ('total', 'analyzed') start at 0 instead of a nested dict
            res[sha256]['mcnt'].default_factory = lambda: 0
        
        #sql = "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE f_ana == 1 AND bsize BETWEEN ? AND ?" if self.f_ana_cmp else "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE bsize BETWEEN ? AND ?"
        # the query is assembled once; only its parameters change per function
        sql = "SELECT function.sha256,fname,fhd,fhm,f_ana,ptype FROM function INNER JOIN sample on function.sha256 == sample.sha256 WHERE path LIKE ? AND " if self.f_fol_cmp else "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE "
        sql += "f_ana == 1 AND bsize BETWEEN ? AND ?" if self.f_ana_cmp else "bsize BETWEEN ? AND ?"
        fns = list(idautils.Functions())
        for fva in tqdm(fns, desc='comparing functions'):
            fname = get_func_name(fva)
            if self.exclude_libthunk(fva, fname) or not num_of_samples:
                continue
            pfhd, pbsize = self.calc_fn_ssdeep(fva, fname)
            pfhm, pcfgnum = self.calc_fn_machoc(fva, fname)
            if pfhd and pfhm:
                pbuf = ctypes.create_string_buffer(pfhd.encode())                
                self.debug('COMPARE {}: ssdeep={} (size={}), machoc={} (num of bb={})'.format(fname, pfhd, pbsize, pfhm, pcfgnum))                
                # size window of candidate functions around the primary size
                min_ = pbsize * (1 - (self.ratio / 100))
                max_ = pbsize * (1 + (self.ratio / 100))
                self.debug('min={}, max={}'.format(min_, max_))
                if self.f_fol_cmp:
                    self.mcur.execute(sql, (self.ana_fol+'%', min_, max_))
                else:
                    self.mcur.execute(sql, (min_, max_))
                frows = self.mcur.fetchall()
                self.debug('targeted {} records'.format(len(frows)))                
                for sha256, sfname, sfhd, sfhm, sf_ana, sptype in frows:
                    if sha256 == self.sha256: # skip the self
                        continue
                    # missing fields of a match record read as 0 (e.g. 'score', 'sfname')
                    res[sha256]['mfn'][fva].default_factory = lambda: 0
                    sbuf = ctypes.create_string_buffer(sfhd.encode())
                    score = fuzzy_lib.fuzzy_compare(pbuf, sbuf)

                    dbg_cond = g_dbg_flag and fva == g_dbg_fva and sfname == g_dbg_fname and sha256 == g_dbg_sha256
                    if dbg_cond:
                        print(('{:#x}: compared with {} in {} score = {} machoc match = {}'.format(fva, sfname, sha256, score, bool(pfhm == sfhm))))
                        
                    if (score >= self.threshold) or (score >= self.threshold_cfg and pfhm == sfhm) or (pbsize > self.max_bytes_for_score and pfhm == sfhm):
                        if dbg_cond:
                            print(('{:#x}: counting {} in {} for total number'.format(fva, sfname, sha256)))
                        res[sha256]['mcnt']['total'] += 1
                        if sf_ana:
                            res[sha256]['mcnt']['analyzed'] += 1
                            # keep the best-scoring analyzed match per primary function;
                            # a zero-score match is still recorded for large functions
                            if score > res[sha256]['mfn'][fva]['score'] or (res[sha256]['mfn'][fva]['score'] == 0 and pbsize > self.max_bytes_for_score):
                                res[sha256]['mfn'][fva]['score'] = score
                                res[sha256]['mfn'][fva]['cfg_match'] = bool(pfhm == sfhm)
                                res[sha256]['mfn'][fva]['sfname'] = sfname
                                res[sha256]['mfn'][fva]['sptype'] = sptype
                                res[sha256]['mfn'][fva]['pbsize'] = pbsize
                                if dbg_cond:
                                    print(('{:#x}: appended record = {} in {}'.format(fva, sfname, sha256)))

        
        c = SummaryCh("fn_fuzzy summary", res)
        c.Show()
        success('totally {} samples compared'.format(num_of_samples))

    def close(self):
        self.conn.commit()
        self.cur.close()

def info(msg):
    """Print an informational message prefixed with ``[*]``."""
    line = "[*] {}".format(msg)
    print(line)

def success(msg):
    """Print a success message prefixed with ``[+]``."""
    line = "[+] {}".format(msg)
    print(line)

def error(msg):
    """Print an error message prefixed with ``[!]``."""
    line = "[!] {}".format(msg)
    print(line)

def get_hex_pat(buf):
    """Return *buf* as space-separated two-digit hex values.

    *buf* may be a ``str`` (each character's code point is formatted) or a
    bytes-like object (each byte is formatted).
    """
    if isinstance(buf, str):
        values = (ord(ch) for ch in buf)
    else:
        # Iterating bytes yields ints on Python 3, so ord() must not be used.
        values = iter(bytearray(buf))
    return ' '.join('{:02x}'.format(v) for v in values)

def shex(a):
    """Return ``hex(a)`` with any trailing ``L`` characters removed."""
    text = hex(a)
    return text.rstrip("L")

def set_decomplier_cmt(ea, cmt):
    """Attach *cmt* as a decompiler (pseudocode) comment at *ea* and save it.

    Prints an error when ``idaapi.decompile`` returns no function object.
    """
    cfunc = idaapi.decompile(ea)
    if not cfunc:
        # fixed: the original called the non-existent str method "formart",
        # which raised AttributeError instead of reporting the failure
        error("Decompile failed: {:#x}".format(ea))
        return
    tl = idaapi.treeloc_t()
    tl.ea = ea
    tl.itp = idaapi.ITP_SEMI
    cfunc.set_user_cmt(tl, cmt)
    cfunc.save_user_cmts()

def main():
    """Entry point: batch export when plugin options are given, GUI otherwise.

    CLI mode: the "fn_fuzzy" plugin options are a ':'-separated list of
    min_bytes, exclude-lib/thunk flag, update flag, analyzed-export flag,
    analyzed-name regex and the DB path (which may itself contain ':').
    IDA is terminated with exit code 0 after an export or 2 when the sample
    was skipped.

    GUI mode: the form is shown with the module-level defaults and, on Run,
    either an export or a comparison is performed.
    """
    info('start')

    cli_options = idaapi.get_plugin_options("fn_fuzzy")
    if cli_options: # CLI (export only)
        # not change the database to maintain the window setting
        process_config_line("ABANDON_DATABASE=YES")

        start = time.time()
        options = cli_options.split(':')
        min_bytes = int(options[0])
        # NOTE(review): eval() executes whatever text arrives in the plugin
        # options. The flags are presumably 'True'/'False' strings; consider
        # ast.literal_eval once the exact format sent by the caller is confirmed.
        f_ex_libthunk = eval(options[1])
        f_update = eval(options[2])
        f_ana_exp = eval(options[3])
        ana_pre = options[4]
        db_path = ':'.join(options[5:])
        ff = FnFuzzy(False, db_path, min_bytes, f_ex_libthunk, f_update, f_ana_exp, ana_pre)
        res = ff.export()
        ff.close()
        elapsed = time.time() - start
        info('done (CLI)')
        if res: # return code 1 is reserved for error
            qexit(0)
        else:
            qexit(2) # already exported (skipped)
    else:
        f = FnFuzzyForm()
        f.Compile()
        try:
            f.iDBSave.value = g_db_path
            f.iMinBytes.value = g_min_bytes
            f.iPrefix.value = g_analyzed_prefix
            f.iFolder.value = os.path.dirname(get_idb_path())
            f.iSimilarity.value = g_threshold
            f.iSimilarityCFG.value = g_threshold_cfg
            f.iMaxBytesForScore.value = g_max_bytes_for_score
            f.iRatio.value = g_bsize_ratio
            r = f.Execute()
            if r != 1: # anything but Run
                print('canceled')
                return

            start = time.time()
            ff = FnFuzzy(f.cDebug.checked, f.iDBSave.value, f.iMinBytes.value, f.cLibthunk.checked, f.cUpdate.checked, f.cAnaExp.checked, f.iPrefix.value, f.cAnaCmp.checked, f.cFolCmp.checked, f.iFolder.value, f.iSimilarity.value, f.iSimilarityCFG.value, f.iMaxBytesForScore.value, f.iRatio.value)
            if f.rExport.selected:
                if ff.sha256 is None:
                    # release the database before giving up
                    ff.close()
                    print('aborted')
                    return
                ff.export()
                #cProfile.runctx('ff.export()', None, locals())
            else:
                ff.compare()
                #cProfile.runctx('ff.compare()', None, locals())
            ff.close()
            elapsed = time.time() - start
        finally:
            # free the compiled form once its values are no longer needed
            f.Free()

    info('elapsed time = {} sec'.format(elapsed))
    info('done')

if __name__ == '__main__':
    main()


================================================
FILE: fn_fuzzy/fn_fuzzy_7x.py
================================================
# fn_fuzzy.py - IDAPython script for fast multiple binary diffing triage
# Takahiro Haruyama (@cci_forensics)

import os, ctypes, sqlite3, re, time, sys, subprocess
import cProfile
from collections import defaultdict
from pprint import PrettyPrinter
from io import StringIO
from tqdm import tqdm

from idc import *
import idautils, ida_nalt, ida_kernwin, idaapi, ida_expr, ida_typeinf

import mmh3
import yara_fn_7x # modified version in the same folder

# Default settings used by this script.
g_db_path = r'C:\analysisw\tics\fn_fuzzy.sqlite' # plz edit your path
g_min_bytes = 0x10 # minimum number of extracted code bytes per function
g_analyzed_prefix = r'fn_|func_' # analyzed function name prefix (regex)
g_threshold = 50 # function similarity score threshold without CFG match
g_threshold_cfg = 10 # function similarity score threshold with CFG match
g_max_bytes_for_score = 0x100 # more code bytes are evaluated by only CFG match
g_bsize_ratio = 40 # function binary size correction ratio to compare (40 is enough)

# debug purpose to check one function matching
g_dbg_flag = False
g_dbg_fva = 0x180015978
g_dbg_fname = 'fn_blob_get_word_param_and_seek'
g_dbg_sha256 = ''

# initialization for ssdeep
SPAMSUM_LENGTH = 64
FUZZY_MAX_RESULT = (2 * SPAMSUM_LENGTH + 20)
# fuzzy64.dll is looked up next to this script and loaded at import time
dirpath = os.path.dirname(__file__)
_lib_path = os.path.join(dirpath, 'fuzzy64.dll')
fuzzy_lib = ctypes.cdll.LoadLibrary(_lib_path)

# helper script that is run in a separate IDA instance to dump type information
g_dump_types_path = os.path.join(dirpath, 'dump_types.py')

class defaultdictRecurse(defaultdict):
    """A defaultdict whose missing keys are filled with new instances of itself."""
    def __init__(self):
        # Hand the class itself to the base initialiser as the default factory.
        defaultdict.__init__(self, type(self))

class import_handler_t(ida_kernwin.action_handler_t):
    """Popup action that imports names, prototypes and comments for selected rows.

    For every selected chooser row the primary function (row[2], looked up by
    name) is renamed to the secondary function name (row[4]), the stored
    prototype (row[5]) is applied unless it is the string 'None', and a
    function comment carrying the ssdeep score (row[0]) and the machoc match
    flag (row[1]) is set.
    """
    def __init__(self, items, idb_path, title):
        ida_kernwin.action_handler_t.__init__(self)
        self.items = items        # chooser rows
        self.idb_path = idb_path  # path of the secondary IDB
        self.title = title        # chooser title, used to refresh it afterwards

    def import_types(self):
        """Import the secondary IDB's types through an IDC dump.

        If no ``.idc`` file exists next to the secondary IDB, a separate IDA
        process is started with dump_types.py to create it. Returns False
        when that process exits with a non-zero code, True otherwise.
        """
        idc_path = os.path.splitext(self.idb_path)[0] + '.idc'
        # dump type information from the 2nd idb
        if not os.path.exists(idc_path):
            with open(self.idb_path, 'rb') as f:
                sig = f.read(4)
            # The file is read in binary mode, so the signature has to be
            # compared as bytes; a str literal never matches on Python 3.
            ida = 'ida.exe' if sig == b'IDA1' else 'ida64.exe'
            ida_path = os.path.join(idadir(), ida)
            cmd = [ida_path, '-S{}'.format(g_dump_types_path), self.idb_path]
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            proc.communicate()
            if proc.returncode == 0:
                success('{}: type information successfully dumped'.format(self.idb_path))
            else:
                error('{}: type information dumping failed'.format(self.idb_path))
                return False

        # import the type information
        ida_expr.exec_idc_script(None, str(idc_path), "main", None, 0)
        return True

    @staticmethod
    def _apply_prototype(ea, sptype):
        """Parse the declaration *sptype* and apply it at *ea*; return apply_tinfo's result."""
        tinfo = idaapi.tinfo_t()
        idaapi.parse_decl2(idaapi.cvar.idati, sptype, tinfo, 0)
        #idaapi.apply_callee_tinfo(ea, tinfo)
        return idaapi.apply_tinfo(ea, tinfo, 0)

    def activate(self, ctx):
        """Apply name, prototype and comment for every row in ``ctx.chooser_selection``."""
        # ida_name is not among this file's explicit imports; import it here
        # so the rename does not depend on what a star-import happens to expose.
        import ida_name
        for idx in ctx.chooser_selection:
            row = self.items[idx]
            # rename the function
            ea = get_name_ea_simple(row[2])
            sfname = str(row[4])
            ida_name.force_name(ea, sfname)
            success('{:#x}: renamed to {}'.format(ea, sfname))
            # set the function prototype
            sptype = str(row[5])
            if sptype != 'None':
                if self._apply_prototype(ea, sptype):
                    success('{:#x}: function prototype set to {}'.format(ea, sptype))
                else:
                    error('{:#x}: function prototype set FAILED (maybe you should import the types?)'.format(ea))
                    # Offer to import the secondary IDB's types and retry once.
                    if ask_yn(0, 'Do you import types from the secondary idb?') == 1:
                        if self.import_types():
                            if self._apply_prototype(ea, sptype):
                                success('{:#x}: function prototype set to {}'.format(ea, sptype))
                            else:
                                error('{:#x}: function prototype set FAILED again'.format(ea))

            # insert the comment
            score = row[0]
            mmatch = row[1]
            cmt = 'fn_fuzzy: ssdeep={}, machoc={}'.format(score, mmatch)
            set_func_cmt(ea, cmt, 1)
            #set_decomplier_cmt(ea, cmt) # not sure how to avoid orphan comment

        # update the Choose rows
        ida_kernwin.refresh_chooser(self.title)

    def update(self, ctx):
        """Report the action as always enabled."""
        # Alternative kept for reference (enable only for chooser widgets):
        #   return ida_kernwin.AST_ENABLE_FOR_WIDGET \
        #       if ida_kernwin.is_chooser_widget(ctx.widget_type) \
        #     else ida_kernwin.AST_DISABLE_FOR_WIDGET
        return idaapi.AST_ENABLE_ALWAYS

class FnCh(ida_kernwin.Choose):
    '''
    Multi-select chooser listing the primary functions that matched an
    analyzed function of one secondary IDB.

    `mfn` maps a primary function address to its best-match record
    (keys read here: score, cfg_match, pbsize, sfname, sptype);
    `idb_path` is handed to the import action attached to the popup menu.
    '''
    def __init__(self, title, mfn, idb_path):
        self.mfn = mfn
        self.idb_path = idb_path
        self.title = title
        plain = ida_kernwin.Choose.CHCOL_PLAIN
        decimal = ida_kernwin.Choose.CHCOL_DEC
        columns = [
            ["ssdeep score", 10 | decimal],
            ["machoc matched", 10 | plain],
            ["primary function", 30 | plain],
            ["primary bsize", 10 | decimal],
            ["secondary analyzed function", 30 | plain],
            ["secondary prototype", 40 | plain],
        ]
        ida_kernwin.Choose.__init__(self, title, columns, flags = ida_kernwin.Choose.CH_MULTI)

    def OnInit(self):
        '''(Re)build the rows, best ssdeep score first; records without a secondary name are dropped.'''
        ranked = sorted(list(self.mfn.items()), key=lambda x:x[1]['score'], reverse=True)
        self.items = [
            ['{}'.format(v['score']),
             '{}'.format(v['cfg_match']),
             str(get_name(fva)),
             '{}'.format(v['pbsize']),
             str(v['sfname']),
             '{}'.format(v['sptype'])]
            for fva, v in ranked if v['sfname']
        ]
        return True

    def OnPopup(self, form, popup_handle):
        '''Attach the "import name and prototype" action to the context menu.'''
        handler = import_handler_t(self.items, self.idb_path, self.title)
        desc = ida_kernwin.action_desc_t("choose:actFnFuzzyImport", 'Import function name and prototype', handler)
        ida_kernwin.attach_dynamic_action_to_popup(form, popup_handle, desc)

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        '''Jump to the primary function of the first selected row.'''
        # with CH_MULTI the selection arrives as a list of row indexes
        first_row = self.items[n[0]]
        ida_kernwin.jumpto(get_name_ea_simple(first_row[2]))

    def OnRefresh(self, n):
        '''Rebuild the rows from `mfn`; no cursor position is returned.'''
        self.OnInit()
        return None

    def OnClose(self):
        print("closed ", self.title)

class SummaryCh(ida_kernwin.Choose):
    '''
    Chooser summarizing, per secondary sample, how many similar functions
    were found.

    `res` maps a sample SHA256 to a record with the keys read here:
    'mcnt' (counters 'total' / 'analyzed'), 'path' (IDB path) and
    'mfn' (per-function matches, passed on to FnCh).
    '''
    def __init__(self, title, res):
        self.res = res
        ida_kernwin.Choose.__init__(
            self,
            title,
            [ ["SHA256", 20 | ida_kernwin.Choose.CHCOL_PLAIN],
              ["total similar functions",   20 | ida_kernwin.Choose.CHCOL_DEC],
              ["analyzed similar functions",   20 | ida_kernwin.Choose.CHCOL_DEC],
              ["IDB path",   80 | ida_kernwin.Choose.CHCOL_PATH] ])
        self.items = []

    def OnInit(self):
        '''Build the rows, samples with the most similar functions first.'''
        # start from scratch (as FnCh.OnInit does) so that a repeated
        # initialization cannot append the same samples twice
        self.items = []
        ranked = sorted(self.res.items(), key=lambda x:x[1]['mcnt']['total'], reverse=True)
        for sha256, v in ranked:
            if v['mcnt']['total'] > 0:
                self.items.append([str(sha256), '{}'.format(v['mcnt']['total']), '{}'.format(v['mcnt']['analyzed']), str(v['path'])])
        return True

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        '''Open the per-function chooser for the selected sample.'''
        sha256 = self.items[n][0]
        c = FnCh("similarities with {}(snip)".format(sha256[:8]), self.res[sha256]['mfn'], self.res[sha256]['path'])
        c.Show()

    def OnRefresh(self, n):
        return n

    def OnClose(self):
        print("closed ", self.title)

class FnFuzzyForm(ida_kernwin.Form):
    '''
    Option dialog of fn_fuzzy: general options, the Export/Compare command
    selector and the option groups specific to each command.
    '''
    def __init__(self):
        layout = r"""BUTTON YES* Run
BUTTON CANCEL Cancel
fn_fuzzy

{FormChangeCb}
General Options
<DB file path:{iDBSave}>
<minimum function code size:{iMinBytes}>
<exclude library/thunk functions:{cLibthunk}>
<enable debug messages:{cDebug}>{cGroup}>

<##Commands##Export:{rExport}>
<Compare:{rCompare}>{rGroup}>

Export Options
<update the DB records:{cUpdate}>
<store flags as analyzed functions:{cAnaExp}>{cEGroup}>
<analyzed function name prefix/suffix (regex):{iPrefix}>

Compare Options
<compare with only analyzed functions:{cAnaCmp}>
<compare with only IDBs in the specified folder:{cFolCmp}>{cCGroup}>
<the folder path:{iFolder}>
<function code size comparison criteria (0-100):{iRatio}>
<function similarity score threshold (0-100) without CFG match:{iSimilarity}>
<function similarity score threshold (0-100) with CFG match:{iSimilarityCFG}>
<function code size threshold evaluated by only CFG match:{iMaxBytesForScore}>
"""
        F = ida_kernwin.Form
        controls = {
            'FormChangeCb': F.FormChangeCb(self.OnFormChange),
            'cGroup': F.ChkGroupControl(("cLibthunk", "cDebug")),
            'iDBSave': F.FileInput(save=True),
            'iMinBytes': F.NumericInput(tp=F.FT_HEX),
            'rGroup': F.RadGroupControl(("rCompare", "rExport")),
            'cEGroup': F.ChkGroupControl(("cUpdate", "cAnaExp")),
            'iPrefix': F.StringInput(),
            'cCGroup': F.ChkGroupControl(("cAnaCmp", "cFolCmp")),
            'iFolder': F.DirInput(),
            'iRatio': F.NumericInput(tp=F.FT_DEC),
            'iSimilarity': F.NumericInput(tp=F.FT_DEC),
            'iSimilarityCFG': F.NumericInput(tp=F.FT_DEC),
            'iMaxBytesForScore': F.NumericInput(tp=F.FT_HEX),
        }
        ida_kernwin.Form.__init__(self, layout, controls)

    def _enable_for(self, exporting):
        '''Enable the export-only fields when `exporting`, the compare-only fields otherwise.'''
        comparing = not exporting
        self.EnableField(self.cEGroup, exporting)
        self.EnableField(self.iPrefix, exporting)
        self.EnableField(self.cCGroup, comparing)
        self.EnableField(self.iSimilarity, comparing)
        self.EnableField(self.iSimilarityCFG, comparing)
        self.EnableField(self.iMaxBytesForScore, comparing)
        self.EnableField(self.iRatio, comparing)

    def OnFormChange(self, fid):
        '''
        Form callback: fid == -1 is handled as initialization (defaults are
        set and the Compare layout is applied); a change of either radio
        button switches the enabled option groups.
        '''
        if fid == -1:
            for checked_by_default in (self.cLibthunk, self.cAnaExp, self.cAnaCmp, self.rCompare):
                self.SetControlValue(checked_by_default, True)
            self._enable_for(False)
        if fid == self.rExport.id:
            self._enable_for(True)
        elif fid == self.rCompare.id:
            self._enable_for(False)
        return 1

class FnFuzzy(object):
    '''
    Function-level fuzzy hashing engine backed by an SQLite DB.

    export() stores one record per function of the current IDB (ssdeep hash
    of the extracted code bytes plus a Machoc-style CFG hash); compare()
    matches the functions of the current IDB against the stored records and
    shows the result in choosers. Call close() when done.
    '''
    def __init__(self, f_debug, db_path, min_bytes, f_ex_libthunk, f_update, f_ana_exp, ana_pre, f_ana_cmp = False, f_fol_cmp = False, ana_fol='', threshold = None, threshold_cfg = None, max_bytes_for_score = None, ratio = 0):
        '''
        Open the DB at `db_path` (creating the schema when it has no tables),
        mirror it into an in-memory DB used for SELECTs and store the options.

        f_debug             -- print debug messages
        min_bytes           -- minimum number of extracted code bytes per function
        f_ex_libthunk       -- skip library/thunk functions
        f_update, f_ana_exp, ana_pre -- export options (overwrite existing
                               records, flag analyzed functions, regex that
                               identifies an analyzed function name)
        f_ana_cmp, f_fol_cmp, ana_fol, threshold, threshold_cfg,
        max_bytes_for_score, ratio -- compare options (see compare())

        If the input file SHA256 cannot be retrieved, an error is printed and
        `self.sha256` / `self.md5` stay None.
        '''
        self.f_debug = f_debug
        self.conn = sqlite3.connect(db_path)
        self.cur = self.conn.cursor()
        self.init_db()
        self.in_memory_db()
        self.min_bytes = min_bytes
        self.f_ex_libthunk = f_ex_libthunk
        # for export
        self.f_update = f_update
        self.f_ana_exp = f_ana_exp
        self.ana_pre = ana_pre
        if f_ana_exp:
            self.ana_pat = re.compile(self.ana_pre)
        # for compare
        self.f_ana_cmp = f_ana_cmp
        self.f_fol_cmp = f_fol_cmp
        self.ana_fol = ana_fol
        self.threshold = threshold
        self.threshold_cfg = threshold_cfg
        self.max_bytes_for_score = max_bytes_for_score
        self.ratio = float(ratio)

        self.idb_path = get_idb_path()
        self.sha256 = ida_nalt.retrieve_input_file_sha256()
        # defined up front so the attribute exists even when the lookup below fails
        self.md5 = None
        try:
            self.sha256 = self.sha256.hex()
            self.md5 = ida_nalt.retrieve_input_file_md5().lower()
        except AttributeError:
            message = 'ida_nalt.retrieve_input_file_sha256() returned None. Probably the IDB was generated by old IDA (<6.9). Check the version by ida_netnode.cvar.root_node.supstr(ida_nalt.RIDX_IDA_VERSION)'
            error(message)

    def debug(self, msg):
        '''Print `msg` with a [D] prefix when debug messages are enabled.'''
        if self.f_debug:
            print("[D] {}".format(msg))

    def init_db(self):
        '''Create the `sample` and `function` tables and their indexes if the DB has no table yet.'''
        self.cur.execute("SELECT * FROM sqlite_master WHERE type='table'")
        if self.cur.fetchone() is None:
            info('DB initialized')
            self.cur.execute("CREATE TABLE IF NOT EXISTS sample(sha256 UNIQUE, path)")
            self.cur.execute("CREATE INDEX path_index ON sample(path)")
            self.cur.execute("CREATE TABLE IF NOT EXISTS function(sha256, fva, fname, fhd, fhm, f_ana, bsize, ptype, UNIQUE(sha256, fva))")
            self.cur.execute("CREATE INDEX f_ana_index ON function(f_ana)")
            self.cur.execute("CREATE INDEX bsize_index ON function(bsize)")

    def in_memory_db(self): # for SELECT
        '''Copy the on-disk DB into an in-memory DB (`self.mconn` / `self.mcur`) used by the queries.'''
        script = "\n".join(self.conn.iterdump())
        self.mconn = sqlite3.connect(":memory:")
        self.mconn.cursor().executescript(script)
        self.mconn.commit()
        self.mconn.row_factory=sqlite3.Row
        self.mcur = self.mconn.cursor()

    def calc_fn_machoc(self, fva, fname): # based on Machoc hash implementation (https://github.com/0x00ach/idadiff)
        '''
        Hash the control-flow graph of the function at `fva`.

        Returns (32-bit murmur3 hash of the CFG description, number of CFG
        nodes), or (None, None) when IDA has no function object for `fva`.
        '''
        func = idaapi.get_func(fva)
        if func is None:
            self.debug('{}: ignored due to lack of function object'.format(fname))
            return None, None

        flow = idaapi.FlowChart(f=func)
        cur_hash_rev = ""
        addrIds = []
        cur_id = 1
        for c in range(0,flow.size):
            cur_basic = flow[c]
            cur_hash_rev += shex(cur_basic.start_ea)+":"
            addrIds.append((shex(cur_basic.start_ea),str(cur_id)))
            cur_id += 1
            addr = cur_basic.start_ea
            blockEnd = cur_basic.end_ea
            mnem = idc.print_insn_mnem(addr)
            while mnem != "":
                is_call = (mnem == "call")
                if is_call: # should be separated into 2 blocks by call
                    cur_hash_rev += "c,"
                addr = idc.next_head(addr,blockEnd)
                mnem = idc.print_insn_mnem(addr)
                if is_call and addr != BADADDR:
                    # the instruction following the call starts a new node
                    cur_hash_rev += shex(addr)+";"+shex(addr)+":"
                    addrIds.append((shex(addr),str(cur_id)))
                    cur_id += 1
            # successors are listed in ascending address order, comma separated
            refs = sorted(suc.start_ea for suc in cur_basic.succs())
            cur_hash_rev += ",".join(shex(ref) for ref in refs)+";"

        # change addr to index
        for aid in addrIds:
            cur_hash_rev = cur_hash_rev.replace(aid[0],aid[1])
        # calculate machoc hash value
        self.debug('{}: CFG = {}'.format(fname, cur_hash_rev))
        return mmh3.hash(cur_hash_rev) & 0xFFFFFFFF, cur_id-1

    def calc_fn_ssdeep(self, fva, fname):
        '''
        Compute the ssdeep hash of the code bytes extracted from the function.

        Basic blocks contributing fewer than yara_fn_7x.MIN_BB_BYTE_COUNT
        characters are skipped. Returns (hash string, number of hashed bytes),
        or (None, None) when fewer than `self.min_bytes` bytes were collected
        or the hashing call reports an error.
        '''
        d2h = b''
        for bb in yara_fn_7x.get_basic_blocks(fva):
            rule = yara_fn_7x.get_basic_block_rule(bb)
            if rule:
                chk = rule.cut_bytes_for_hash
                if len(chk) < yara_fn_7x.MIN_BB_BYTE_COUNT:
                    continue
                # NOTE(review): str.encode() defaults to UTF-8, so if `chk` holds one
                # character per code byte, values >= 0x80 become two bytes. Left as is
                # because the hashes already stored in existing DBs depend on it.
                d2h += chk.encode()

        if len(d2h) < self.min_bytes:
            self.debug('{}: ignored because of the number of extracted code bytes {}'.format(fname, len(d2h)))
            return None, None

        result_buffer = ctypes.create_string_buffer(FUZZY_MAX_RESULT)
        file_buffer = ctypes.create_string_buffer(d2h)
        # create_string_buffer() appends a NUL terminator, hence the "- 1"
        hash_result = fuzzy_lib.fuzzy_hash_buf(file_buffer, len(file_buffer) - 1, result_buffer)
        if hash_result != 0:
            # ssdeep documents a non-zero return value as an error
            self.debug('{}: ignored because fuzzy_hash_buf failed ({})'.format(fname, hash_result))
            return None, None
        hash_value = result_buffer.value.decode("ascii")
        return hash_value, len(d2h)

    def existed(self):
        '''Return True when the current sample (by SHA256) already has a record in the DB.'''
        self.mcur.execute("SELECT sha256 FROM sample WHERE sha256 = ?", (self.sha256,))
        return self.mcur.fetchone() is not None

    def exclude_libthunk(self, fva, fname):
        '''Return True when library/thunk exclusion is enabled and `fva` is such a function.'''
        if not self.f_ex_libthunk:
            return False
        flags = get_func_attr(fva, FUNCATTR_FLAGS)
        if flags & FUNC_LIB:
            self.debug('{}: ignored because of library function'.format(fname))
            return True
        if flags & FUNC_THUNK:
            self.debug('{}: ignored because of thunk function'.format(fname))
            return True
        return False

    def export(self):
        '''
        Store the hashes of every eligible function of the current IDB.

        Returns False (nothing written) when the sample is already in the DB
        and updating is disabled, True otherwise. The records are committed
        by close().
        '''
        if self.existed() and not self.f_update:
            info('{}: The sample records are present in DB. skipped.'.format(self.sha256))
            return False

        self.cur.execute("REPLACE INTO sample values(?, ?)", (self.sha256, self.idb_path))

        pnum = tnum = 0
        records = []
        for fva in idautils.Functions():
            fname = get_func_name(fva)
            tnum += 1
            if self.exclude_libthunk(fva, fname):
                continue
            fhd, bsize = self.calc_fn_ssdeep(fva, fname)
            fhm, cfgnum = self.calc_fn_machoc(fva, fname)
            # the machoc hash is an integer and may legitimately be 0
            if fhd and fhm is not None:
                pnum += 1
                f_ana = bool(self.ana_pat.search(fname)) if self.f_ana_exp else False
                ptype = ida_typeinf.idc_get_type(fva)
                ptype = ptype + ';' if ptype is not None else ptype
                # fva is 64-bit int causing OverflowError
                records.append((self.sha256, '{:#x}'.format(fva), fname, fhd, fhm, f_ana, bsize, ptype))
                self.debug('EXPORT {} at {:#x}: ssdeep={} (size={}), machoc={} (num of CFG={})'.format(fname, fva, fhd, bsize, fhm, cfgnum))

        self.cur.executemany("REPLACE INTO function values (?, ?, ?, ?, ?, ?, ?, ?)", records)
        success ('{} of {} functions exported'.format(pnum, tnum))
        return True

    def compare(self):
        '''
        Compare every eligible function of the current IDB with the DB records
        and show the summary chooser.

        Candidates are the records whose byte size is within +/- `ratio`
        percent of the function's size (optionally restricted to analyzed
        functions and/or to samples whose IDB path starts with `ana_fol`).
        A candidate counts as similar when
          - its ssdeep score is >= `threshold`, or
          - its score is >= `threshold_cfg` and the machoc hashes are equal, or
          - the function is larger than `max_bytes_for_score` and the machoc
            hashes are equal.
        For analyzed candidates the best-scoring match per function is kept.
        '''
        res = defaultdictRecurse()
        if self.f_fol_cmp:
            self.mcur.execute("SELECT sha256,path FROM sample WHERE path LIKE ?", (self.ana_fol+'%',))
        else:
            self.mcur.execute("SELECT sha256,path FROM sample")
        frows = self.mcur.fetchall()
        num_of_samples = len(frows)
        for sha256, path in frows:
            res[sha256]['path'] = path
            res[sha256]['mcnt'].default_factory = lambda: 0

        sql = "SELECT function.sha256,fname,fhd,fhm,f_ana,ptype FROM function INNER JOIN sample on function.sha256 == sample.sha256 WHERE path LIKE ? AND " if self.f_fol_cmp else "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE "
        sql += "f_ana == 1 AND bsize BETWEEN ? AND ?" if self.f_ana_cmp else "bsize BETWEEN ? AND ?"
        fns = list(idautils.Functions())
        for fva in tqdm(fns, desc='comparing functions'):
            fname = get_func_name(fva)
            if self.exclude_libthunk(fva, fname) or not num_of_samples:
                continue
            pfhd, pbsize = self.calc_fn_ssdeep(fva, fname)
            pfhm, pcfgnum = self.calc_fn_machoc(fva, fname)
            # the machoc hash is an integer and may legitimately be 0
            if pfhd and pfhm is not None:
                pbuf = ctypes.create_string_buffer(pfhd.encode())
                self.debug('COMPARE {}: ssdeep={} (size={}), machoc={} (num of bb={})'.format(fname, pfhd, pbsize, pfhm, pcfgnum))
                min_ = pbsize * (1 - (self.ratio / 100))
                max_ = pbsize * (1 + (self.ratio / 100))
                self.debug('min={}, max={}'.format(min_, max_))
                if self.f_fol_cmp:
                    self.mcur.execute(sql, (self.ana_fol+'%', min_, max_))
                else:
                    self.mcur.execute(sql, (min_, max_))
                frows = self.mcur.fetchall()
                self.debug('targeted {} records'.format(len(frows)))
                for sha256, sfname, sfhd, sfhm, sf_ana, sptype in frows:
                    if sha256 == self.sha256: # skip the self
                        continue
                    res[sha256]['mfn'][fva].default_factory = lambda: 0
                    sbuf = ctypes.create_string_buffer(sfhd.encode())
                    score = fuzzy_lib.fuzzy_compare(pbuf, sbuf)
                    cfg_match = bool(pfhm == sfhm)

                    dbg_cond = g_dbg_flag and fva == g_dbg_fva and sfname == g_dbg_fname and sha256 == g_dbg_sha256
                    if dbg_cond:
                        print(('{:#x}: compared with {} in {} score = {} machoc match = {}'.format(fva, sfname, sha256, score, cfg_match)))

                    if (score >= self.threshold) or (score >= self.threshold_cfg and cfg_match) or (pbsize > self.max_bytes_for_score and cfg_match):
                        if dbg_cond:
                            print(('{:#x}: counting {} in {} for total number'.format(fva, sfname, sha256)))
                        res[sha256]['mcnt']['total'] += 1
                        if sf_ana:
                            res[sha256]['mcnt']['analyzed'] += 1
                            best = res[sha256]['mfn'][fva]
                            if score > best['score'] or (best['score'] == 0 and pbsize > self.max_bytes_for_score):
                                best['score'] = score
                                best['cfg_match'] = cfg_match
                                best['sfname'] = sfname
                                best['sptype'] = sptype
                                best['pbsize'] = pbsize
                                if dbg_cond:
                                    print(('{:#x}: appended record = {} in {}'.format(fva, sfname, sha256)))

        c = SummaryCh("fn_fuzzy summary", res)
        c.Show()
        success('totally {} samples compared'.format(num_of_samples))

    def close(self):
        '''Commit pending writes and release both the on-disk and the in-memory DB.'''
        self.conn.commit()
        self.cur.close()
        # closing only the cursor would keep the DB file open for as long as
        # the hosting process lives
        self.conn.close()
        self.mcur.close()
        self.mconn.close()

def info(msg):
    '''Print an informational message, prefixed with "[*] ".'''
    line = "[*] " + str(msg)
    print(line)

def success(msg):
    '''Print a success message, prefixed with "[+] ".'''
    line = "[+] " + str(msg)
    print(line)

def error(msg):
    '''Print an error message, prefixed with "[!] ".'''
    line = "[!] " + str(msg)
    print(line)

def get_hex_pat(buf):
    '''
    Return `buf` as space-separated two-digit lowercase hex values.

    Accepts both text (one character per byte) and bytes-like input:
    iterating bytes on Python 3 yields ints, on which ord() would raise
    TypeError, so ints are formatted directly.
    '''
    return ' '.join('{:02x}'.format(x if isinstance(x, int) else ord(x)) for x in buf)

def shex(a):
    '''Return hex(a) with any trailing "L" characters removed.'''
    text = hex(a)
    return text.rstrip("L")

def set_decomplier_cmt(ea, cmt):
    '''
    Attach `cmt` as a user comment (ITP_SEMI position) at `ea` in the
    decompiled function containing `ea`, and save the user comments.
    Prints an error when decompilation yields no function object.
    '''
    cfunc = idaapi.decompile(ea)
    if not cfunc:
        # was "formart", which raised AttributeError instead of reporting the failure
        error("Decompile failed: {:#x}".format(ea))
        return
    tl = idaapi.treeloc_t()
    tl.ea = ea
    tl.itp = idaapi.ITP_SEMI
    cfunc.set_user_cmt(tl, cmt)
    cfunc.save_user_cmts()

def main():
    '''
    Entry point of fn_fuzzy.

    When plugin options are given for "fn_fuzzy" the script runs in CLI mode:
    it exports the current IDB and exits IDA with code 0 (exported) or
    2 (already exported, skipped). The option string is
    "min_bytes:f_ex_libthunk:f_update:f_ana_exp:ana_pre:db_path".
    Otherwise the option form is shown and Export or Compare is executed
    according to the selected command.
    '''
    info('start')

    cli_options = idaapi.get_plugin_options("fn_fuzzy")
    if cli_options: # CLI (export only)
        # not change the database to maintain the window setting
        process_config_line("ABANDON_DATABASE=YES")

        start = time.time()
        options = cli_options.split(':')
        min_bytes = int(options[0])
        # NOTE(review): eval() executes whatever expression the option string
        # carries; the flags are presumably the literals True/False, in which
        # case ast.literal_eval would be a safer parser -- confirm with the
        # script that builds the option string before changing it.
        f_ex_libthunk = eval(options[1])
        f_update = eval(options[2])
        f_ana_exp = eval(options[3])
        ana_pre = options[4]
        # the DB path is the remainder and may itself contain ':'
        db_path = ':'.join(options[5:])
        ff = FnFuzzy(False, db_path, min_bytes, f_ex_libthunk, f_update, f_ana_exp, ana_pre)
        res = ff.export()
        ff.close()
        elapsed = time.time() - start
        info('done (CLI)')
        if res: # return code 1 is reserved for error
            qexit(0)
        else:
            qexit(2) # already exported (skipped)
    else:
        f = FnFuzzyForm()
        f.Compile()
        f.iDBSave.value = g_db_path
        f.iMinBytes.value = g_min_bytes
        f.iPrefix.value = g_analyzed_prefix
        f.iFolder.value = os.path.dirname(get_idb_path())
        f.iSimilarity.value = g_threshold
        f.iSimilarityCFG.value = g_threshold_cfg
        f.iMaxBytesForScore.value = g_max_bytes_for_score
        f.iRatio.value = g_bsize_ratio
        r = f.Execute()
        if r != 1: # anything but Run
            print('canceled')
            return

        start = time.time()
        ff = FnFuzzy(f.cDebug.checked, f.iDBSave.value, f.iMinBytes.value, f.cLibthunk.checked, f.cUpdate.checked, f.cAnaExp.checked, f.iPrefix.value, f.cAnaCmp.checked, f.cFolCmp.checked, f.iFolder.value, f.iSimilarity.value, f.iSimilarityCFG.value, f.iMaxBytesForScore.value, f.iRatio.value)
        if f.rExport.selected:
            if ff.sha256 is None:
                # release the DB before bailing out instead of leaking the connection
                ff.close()
                print('aborted')
                return
            ff.export()
        else:
            ff.compare()
        ff.close()
        elapsed = time.time() - start

    info('elapsed time = {} sec'.format(elapsed))
    info('done')

# run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()


================================================
FILE: fn_fuzzy/yara_fn.py
================================================
'''
IDAPython script that generates a YARA rule to match against the
basic blocks of the current function. It masks out relocation bytes
and ignores jump instructions (given that we're already trying to
match compiler-specific bytes, this is of arguable benefit).

If python-yara is installed, the IDAPython script also validates that
the generated rule matches at least one segment in the current file.

author: Willi Ballenthin <william.ballenthin@fireeye.com>
'''
# 2018/8/6 Takahiro Haruyama modified to calculate fixup (relocation) size correctly
# and exclude direct memory reference data and other ignorable variable code

import logging
from collections import namedtuple

from idc import *
import idaapi
import idautils
import ida_ua, ida_kernwin

logger = logging.getLogger(__name__)

BasicBlock = namedtuple('BasicBlock', ['va', 'size'])


# each rule must have at least this many non-masked bytes
MIN_BB_BYTE_COUNT = 4

def get_basic_blocks(fva):
    '''
    list the basic blocks of the function at `fva` as `BasicBlock` tuples
    (start address, size in bytes).
    the list is empty when IDA has no function object for `fva`.
    '''
    func = idaapi.get_func(fva)
    if func is None:
        return []

    return [BasicBlock(va=block.start_ea, size=block.end_ea - block.start_ea)
            for block in idaapi.FlowChart(func)]


def get_function(va):
    '''
    return the start address of the function that contains `va`.
    '''
    containing = idaapi.get_func(va)
    return containing.start_ea


Rule = namedtuple('Rule', ['name', 'bytes', 'masked_bytes', 'cut_bytes_for_hash'])


def is_jump(va):
    '''
    heuristic jump test: True when the mnemonic of the instruction
    at `va` begins with the letter 'j'.
    '''
    mnemonic = print_insn_mnem(va)
    return mnemonic[:1] == 'j'

def get_fixup_va_and_size(va):
    '''
    look up the next fixup that `idaapi.get_next_fixup_ea(va)` reports and
    return its address together with the size calculated from its fixup type.
    '''
    fixup_ea = idaapi.get_next_fixup_ea(va)
    fixup_size = ida_fixup.calc_fixup_size(get_fixup_target_type(fixup_ea))
    return fixup_ea, fixup_size

def get_basic_block_rule(bb):
    '''
    create and format a YARA rule for a single basic block.
    The following bytes are ignored:
        - relocation bytes
        - the last jump instruction
        - direct memory references / immediate values and other ignorable data

    `bb` is a `BasicBlock` (start address `va`, byte `size`).
    returns a `Rule` whose fields are:
        - name: '$0x<block address>'
        - bytes: every raw byte value of the kept instructions
        - masked_bytes: one entry per byte, 'XX' hex or '??' when masked
        - cut_bytes_for_hash: the unmasked bytes only, as a str of chr(byte)
    returns None as soon as the bytes of an instruction cannot be read.
    '''
    # fetch the instruction start addresses
    insns = []
    va = bb.va
    while va < bb.va + bb.size:
        insns.append(va)
        va = next_head(va)

    # drop the last instruction if its a jump
    if insns and is_jump(insns[-1]):
        insns = insns[:-1]

    _bytes = []
    # `masked_bytes` is the list of formatted bytes,
    #   not yet join'd for performance.
    masked_bytes = []
    cut_bytes_for_hash = ''
    for va in insns:
        insn = ida_ua.insn_t()
        size = ida_ua.decode_insn(insn, va)
        mnem = insn.get_canon_mnem()
        op1 = insn.Op1
        op2 = insn.Op2

        # addresses of the bytes of this instruction that are covered by a fixup
        fixup_byte_addrs = set([])
        if idaapi.contains_fixups(va, size): # not work for x64 binaries? (e.g., idaapi.contains_fixups(here(), 0x2d000) -> False)
            logging.debug('ea = {:#x}, fixups'.format(va))
            # fetch the fixup locations and sizes within this one instruction.
            fixups = []
            fva, fsize = get_fixup_va_and_size(va)
            fixups.append((fva, fsize))
            fva += fsize
            # keep walking while the end of the previous fixup is still inside the instruction
            while fva < va + size:
                fva, fsize = get_fixup_va_and_size(fva - 1) # to detect consecutive fixups
                fixups.append((fva, fsize))
                fva += fsize
            logging.debug('fixups: {}'.format(fixups))
            # compute the addresses of each component byte.
            for fva, fsize in fixups:
                for i in range(fva, fva+fsize):
                    fixup_byte_addrs.add(i)

        # fetch and format each byte of the instruction,
        #  possibly masking it into an unknown byte if its a fixup or several operand types like direct mem ref.
        masked_types = [o_mem, o_imm, o_displ, o_near, o_far]
        bytes_ = get_bytes(va, size)
        if bytes_ is None:
            return None
        for i, byte in enumerate(bytes_):
            _bytes.append(byte)
            byte_addr = i + va
            if byte_addr in fixup_byte_addrs:
                logging.debug('{:#x}: fixup byte (masked)'.format(byte_addr))
                masked_bytes.append('??')
            # operand 1 bytes: from its offset up to the offset of operand 2
            # (or to the end of the instruction when op2.offb is 0)
            elif op1.type in masked_types and i >= op1.offb and (i < op2.offb or op2.offb == 0):
                logging.debug('{:#x}: Op1 masked byte'.format(byte_addr))
                masked_bytes.append('??')
            # operand 2 bytes: everything from its offset onwards
            elif op2.type in masked_types and i >= op2.offb:
                logging.debug('{:#x}: Op2 masked byte'.format(byte_addr))
                masked_bytes.append('??')
            else:
                # only the bytes left unmasked take part in the hash input
                masked_bytes.append('%02X' % (byte)) # for Python3
                cut_bytes_for_hash += chr(byte)

    return Rule('$0x%x' % (bb.va), _bytes, masked_bytes, cut_bytes_for_hash)


def format_rules(fva, rules):
    '''
    given the address of a function, and the byte signatures for basic blocks in
     the function, format a complete YARA rule that matches all of the
     basic block signatures.
    returns the rule text, or None when `rules` is empty.
    '''
    # get_func_name is the IDA 7.x spelling of GetFunctionName, in line with
    #  the other 7.x names used in this file.
    name = get_func_name(fva)
    if not rules:
        logging.info('no rules for {}'.format(name))
        return None

    # some characters aren't valid for YARA rule names
    BAD_CHARS = '@ /\\!@#$%^&*()[]{};:\'",./<>?'
    safe_name = ''.join(c for c in name if c not in BAD_CHARS)

    md5_hex = idautils.GetInputFileMD5().hex()
    ret = []
    ret.append('rule a_{hash:s}_{name:s} {{'.format(
        hash=md5_hex,
        name=safe_name))
    ret.append('  meta:')
    ret.append('    sample_md5 = "{md5:s}"'.format(md5=md5_hex))
    ret.append('    function_address = "0x{fva:x}"'.format(fva=fva))
    ret.append('    function_name = "{name:s}"'.format(name=name))
    ret.append('  strings:')
    for rule in rules:
        # rstrip() takes a character set: this trims every trailing '?' and ' ',
        #  i.e. the wildcard bytes at the end of the pattern.
        formatted_rule = ' '.join(rule.masked_bytes).rstrip('?? ')
        ret.append('    {name:s} = {{ {hex:s} }}'.format(
            name=rule.name,
            hex=formatted_rule))
    ret.append('  condition:')
    ret.append('    all of them')
    ret.append('}')
    return '\n'.join(ret)


def create_yara_rule_for_function(fva):
    '''
    build the per-basic-block signatures of the function at `fva` and
     hand them to `format_rules` to produce the complete YARA rule.
    '''
    kept = []
    for block in get_basic_blocks(fva):
        candidate = get_basic_block_rule(block)
        if not candidate:
            continue

        # a signature needs at least MIN_BB_BYTE_COUNT concrete
        #  (non-wildcard) bytes; shorter ones are dropped to cut down
        #  on many very small matches.
        concrete = sum(1 for token in candidate.masked_bytes if token != '??')
        if concrete >= MIN_BB_BYTE_COUNT:
            kept.append(candidate)

    return format_rules(fva, kept)


def get_segment_buffer(segstart):
    '''
    fetch the bytes of the section that starts at the given address.
    if the entire section cannot be accessed, try smaller regions until it works.
    returns None when no region starting at `segstart` can be read.
    '''
    segend = idaapi.getseg(segstart).end_ea
    buf = None
    segsize = segend - segstart
    while buf is None and segsize > 0:
        # get_bytes is the IDA 7.x spelling of GetManyBytes, in line with
        #  the other 7.x names used in this file.
        buf = get_bytes(segstart, segsize)
        if buf is None:
            # retry with the region shortened by 0x1000 bytes
            segsize -= 0x1000
    return buf


Segment = namedtuple('Segment', ['start', 'size', 'name', 'buf'])


def get_segments():
    '''
    fetch the segments in the current executable.
    yields one `Segment` (start, size, name, buf) per segment; `buf` is
    whatever `get_segment_buffer` could read and may be None.
    '''
    for segstart in idautils.Segments():
        segend = idaapi.getseg(segstart).end_ea
        segsize = segend - segstart
        # get_segm_name is the IDA 7.x spelling of SegName
        segname = str(get_segm_name(segstart)).rstrip('\x00')
        segbuf = get_segment_buffer(segstart)
        # the second field is `size`: pass the size, not the end address
        yield Segment(segstart, segsize, segname, segbuf)


class TestDidntRunError(Exception):
    '''raised by `test_yara_rule` when the rule could not be tested at all.'''


def test_yara_rule(rule):
    '''
    compile `rule` with python-yara and run it over the buffer of every
    segment of the current executable.
    returns True on the first segment with a match, False when none matches.
    raises TestDidntRunError when python-yara cannot be imported.
    '''
    try:
        import yara
    except ImportError:
        logger.warning("can't test rule: failed to import python-yara")
        raise TestDidntRunError('python-yara not available')

    compiled = yara.compile(source=rule)

    for segment in get_segments():
        # segments whose bytes could not be read have nothing to scan
        if segment.buf is None:
            continue
        hits = compiled.match(data=segment.buf)
        if len(hits) > 0:
            logger.info('generated rule matches section: {:s}'.format(segment.name))
            return True
    return False


def main():
    '''
    ask whether to process only the function under the cursor or every
    function, then print the generated YARA rule(s).
    answering Cancel prints 'Canceled' and generates nothing.
    '''
    print('Start')
    ans = ida_kernwin.ask_yn(0, 'define only selected function?')
    # ask_yn returns 1 for Yes, 0 for No and -1 for Cancel; a plain truth
    #  test would treat Cancel like Yes.
    if ans == 1:
        # get_screen_ea is the IDA 7.x spelling of ScreenEA
        va = get_screen_ea()
        try:
            fva = get_function(va)
        except AttributeError:
            # get_function() dereferences the result of idaapi.get_func(),
            #  which is None when the cursor is outside of any function.
            print('no function at {:#x}'.format(va))
            return
        print(('-' * 80))
        rule = create_yara_rule_for_function(fva)
        if rule:
            print(rule)
            # validating the rule with test_yara_rule() is left disabled here.
    elif ans == 0:
        for fva in idautils.Functions():
            print(('-' * 80))
            rule = create_yara_rule_for_function(fva)
            if rule:
                print(rule)
    else:
        print('Canceled')
        return
    print('Done')

# script entry point: configure INFO-level logging, then run main().
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)
    # swap in the two lines below to see the logging.debug() output as well
    #logging.basicConfig(level=logging.DEBUG)
    #logging.getLogger().setLevel(logging.DEBUG)
    main()


================================================
FILE: fn_fuzzy/yara_fn_7x.py
================================================
'''
IDAPython script that generates a YARA rule to match against the
basic blocks of the current function. It masks out relocation bytes
and ignores jump instructions (given that we're already trying to
match compiler-specific bytes, this is of arguable benefit).

If python-yara is installed, the IDAPython script also validates that
the generated rule matches at least one segment in the current file.

author: Willi Ballenthin <william.ballenthin@fireeye.com>
'''
# 2018/8/6 Takahiro Haruyama modified to calculate fixup (relocation) size correctly
# and exclude direct memory reference data and other ignorable variable code

import logging
from collections import namedtuple

from idc import *
import idaapi
import idautils
import ida_ua, ida_kernwin

# logger named after this module
logger = logging.getLogger(__name__)

# start address (`va`) and length in bytes (`size`) of one basic block
BasicBlock = namedtuple('BasicBlock', 'va size')


# a basic block with fewer non-masked bytes than this is left out of the rule
MIN_BB_BYTE_COUNT = 4

def get_basic_blocks(fva):
    '''
    build one `BasicBlock` (start address, byte length) per node of the
    flow chart of the function at `fva`.
    an empty list is returned when there is no function at `fva`.
    '''
    func = idaapi.get_func(fva)
    if func is None:
        return []

    return [BasicBlock(va=node.start_ea, size=node.end_ea - node.start_ea)
            for node in idaapi.FlowChart(func)]


def get_function(va):
    '''
    look up the function that contains `va` and return its start address.
    '''
    containing_func = idaapi.get_func(va)
    return containing_func.start_ea


# signature of one basic block: YARA string name, raw byte values,
#  formatted hex tokens ('??' where masked) and the unmasked bytes as a string
Rule = namedtuple('Rule', 'name bytes masked_bytes cut_bytes_for_hash')


def is_jump(va):
    '''
    tell whether the instruction at `va` looks like a jump,
    i.e. whether its mnemonic begins with the letter 'j'.
    '''
    mnemonic = print_insn_mnem(va)
    return mnemonic.startswith('j')

def get_fixup_va_and_size(va):
    '''
    find the next fixup after `va` and return the tuple
    (fixup address, fixup size as computed from its target type).
    '''
    fixup_ea = idaapi.get_next_fixup_ea(va)
    fixup_type = get_fixup_target_type(fixup_ea)
    return fixup_ea, ida_fixup.calc_fixup_size(fixup_type)

def _get_fixup_byte_addrs(va, size):
    '''
    return the set of addresses of every byte covered by a fixup that is
    reported for the `size`-byte instruction starting at `va`.
    '''
    fixup_byte_addrs = set()
    # not work for x64 binaries? (e.g., idaapi.contains_fixups(here(), 0x2d000) -> False)
    if not idaapi.contains_fixups(va, size):
        return fixup_byte_addrs

    logging.debug('ea = {:#x}, fixups'.format(va))
    # fetch the fixup locations and sizes within this one instruction.
    fixups = []
    fva, fsize = get_fixup_va_and_size(va)
    fixups.append((fva, fsize))
    # always move forward by at least one byte: a fixup size of 0 or less
    #  would otherwise make the loop below find the same fixup forever.
    fva += max(fsize, 1)
    while fva < va + size:
        fva, fsize = get_fixup_va_and_size(fva - 1) # to detect consecutive fixups
        fixups.append((fva, fsize))
        fva += max(fsize, 1)
    logging.debug('fixups: {}'.format(fixups))
    # compute the addresses of each component byte.
    for fva, fsize in fixups:
        fixup_byte_addrs.update(range(fva, fva + fsize))
    return fixup_byte_addrs


def get_basic_block_rule(bb):
    '''
    create and format a YARA rule for a single basic block.
    The following bytes are ignored:
        - relocation bytes
        - the last jump instruction
        - direct memory references / immediate values and other ignorable data

    returns a `Rule` whose fields are the YARA string name ('$0x<va>'),
    the raw byte values, the formatted tokens ('??' for masked bytes) and
    the non-masked bytes joined into one string.
    returns None when the bytes of an instruction cannot be read.
    '''
    # fetch the instruction start addresses
    insns = []
    va = bb.va
    while va < bb.va + bb.size:
        insns.append(va)
        va = next_head(va)

    # drop the last instruction if its a jump
    if insns and is_jump(insns[-1]):
        insns = insns[:-1]

    # operand types whose encoded bytes get masked
    masked_types = (o_mem, o_imm, o_displ, o_near, o_far)
    #masked_types = (o_mem, o_imm, o_near, o_far)

    _bytes = []
    # `masked_bytes` is the list of formatted bytes,
    #   not yet join'd for performance.
    masked_bytes = []
    # non-masked bytes as single characters, join'd once at the end
    cut_chars = []
    for va in insns:
        insn = ida_ua.insn_t()
        size = ida_ua.decode_insn(insn, va)
        op1 = insn.Op1
        op2 = insn.Op2

        fixup_byte_addrs = _get_fixup_byte_addrs(va, size)

        # fetch and format each byte of the instruction,
        #  possibly masking it into an unknown byte if its a fixup or several operand types like direct mem ref.
        bytes_ = get_bytes(va, size)
        if bytes_ is None:
            return None
        for i, byte in enumerate(bytes_):
            _bytes.append(byte)
            byte_addr = i + va
            if byte_addr in fixup_byte_addrs:
                logging.debug('{:#x}: fixup byte (masked)'.format(byte_addr))
                masked_bytes.append('??')
            elif op1.type in masked_types and i >= op1.offb and (i < op2.offb or op2.offb == 0):
                # bytes from the start of Op1 up to the start of Op2
                #  (or to the end of the instruction when Op2 has no offset)
                logging.debug('{:#x}: Op1 masked byte'.format(byte_addr))
                masked_bytes.append('??')
            elif op2.type in masked_types and i >= op2.offb:
                logging.debug('{:#x}: Op2 masked byte'.format(byte_addr))
                masked_bytes.append('??')
            else:
                masked_bytes.append('%02X' % (byte)) # for Python3
                cut_chars.append(chr(byte))

    return Rule('$0x%x' % (bb.va), _bytes, masked_bytes, ''.join(cut_chars))


def format_rules(fva, rules):
    '''
    render the basic block signatures in `rules` as one YARA rule for the
     function at `fva`; the rule requires all of the signatures to match.
    returns the rule text, or None when `rules` is empty.
    '''
    name = idc.get_func_name(fva)
    if not rules:
        logging.info('no rules for {}'.format(name))
        return None

    # some characters aren't valid for YARA rule names
    BAD_CHARS = '@ /\\!@#$%^&*()[]{};:\'",./<>?'
    safe_name = ''.join(ch for ch in name if ch not in BAD_CHARS)

    md5_hex = idautils.GetInputFileMD5().hex()
    lines = [
        'rule a_{hash:s}_{name:s} {{'.format(hash=md5_hex, name=safe_name),
        '  meta:',
        '    sample_md5 = "{md5:s}"'.format(md5=md5_hex),
        '    function_address = "0x{fva:x}"'.format(fva=fva),
        '    function_name = "{name:s}"'.format(name=name),
        '  strings:',
    ]
    for rule in rules:
        # trailing wildcard tokens are stripped from each hex string
        hex_string = ' '.join(rule.masked_bytes).rstrip('?? ')
        lines.append('    {name:s} = {{ {hex:s} }}'.format(name=rule.name,
                                                          hex=hex_string))
    lines.extend(['  condition:', '    all of them', '}'])
    return '\n'.join(lines)


def create_yara_rule_for_function(fva):
    '''
    build the signatures of all basic blocks of the function at `fva`
     and format them as one YARA rule.
    '''
    kept = []
    for block in get_basic_blocks(fva):
        candidate = get_basic_block_rule(block)
        if not candidate:
            continue

        # keep only signatures with at least MIN_BB_BYTE_COUNT non-masked
        #  bytes; this will reduce the incidence of many very small matches.
        concrete = sum(1 for token in candidate.masked_bytes if token != '??')
        if concrete >= MIN_BB_BYTE_COUNT:
            kept.append(candidate)

    return format_rules(fva, kept)


def get_segment_buffer(segstart):
    '''
    read the bytes of the segment that starts at `segstart`.
    when the whole segment cannot be read, retry with a length shortened
    by 0x1000 bytes each time; returns None if no length works.
    '''
    remaining = idaapi.getseg(segstart).end_ea - segstart
    while remaining > 0:
        data = idc.get_bytes(segstart, remaining)
        if data is not None:
            return data
        remaining -= 0x1000
    return None


# one segment of the input file; `buf` holds its bytes (None if unreadable)
Segment = namedtuple('Segment', 'start size name buf')


def get_segments():
    '''
    fetch the segments in the current executable.
    yields one `Segment` (start, size, name, buf) per segment; `buf` is
    None when the segment bytes could not be read.
    '''
    for segstart in idautils.Segments():
        segend = idaapi.getseg(segstart).end_ea
        segsize = segend - segstart
        segname = str(idc.get_segm_name(segstart)).rstrip('\x00')
        segbuf = get_segment_buffer(segstart)
        # the second field of `Segment` is `size`, so pass the segment
        #  length rather than its end address.
        yield Segment(segstart, segsize, segname, segbuf)


class TestDidntRunError(Exception):
    '''raised by test_yara_rule when python-yara cannot be imported.'''


def test_yara_rule(rule):
    '''
    compile `rule` with python-yara and match it against the bytes of each
    segment in the current executable.
    raises TestDidntRunError when the YARA library cannot be imported.
    returns True as soon as one segment matches, False if none does.
    '''
    try:
        import yara
    except ImportError:
        logger.warning("can't test rule: failed to import python-yara")
        raise TestDidntRunError('python-yara not available')

    compiled = yara.compile(source=rule)

    for seg in get_segments():
        # segments whose bytes could not be read are skipped
        if seg.buf is None:
            continue
        if len(compiled.match(data=seg.buf)) > 0:
            logger.info('generated rule matches section: {:s}'.format(seg.name))
            return True
    return False


def main():
    '''
    ask whether to handle only the function under the cursor or every
    function in the database, then print the generated YARA rule(s).
    cancelling the dialog aborts without generating anything.
    '''
    print('Start')
    ans = ida_kernwin.ask_yn(0, 'define only selected function?')
    if ans == ida_kernwin.ASKBTN_CANCEL:
        # ask_yn() reports Cancel as -1, which is truthy: without this check
        #  a cancelled dialog would be handled exactly like "Yes".
        print('Cancelled')
        return

    if ans:
        va = ida_kernwin.get_screen_ea()
        if idaapi.get_func(va) is None:
            # get_function() would fail with AttributeError for an address
            #  that does not belong to any function.
            print('no function at {:#x}'.format(va))
        else:
            fva = get_function(va)
            print(('-' * 80))
            rule = create_yara_rule_for_function(fva)
            if rule:
                print(rule)
                # validation of the generated rule is currently disabled:
                #if test_yara_rule(rule):
                #    logging.info('success: validated the generated rule')
                #else:
                #    logging.error('error: failed to validate generated rule')
    else:
        for fva in idautils.Functions():
            print(('-' * 80))
            rule = create_yara_rule_for_function(fva)
            if rule:
                print(rule)
    print('Done')

# script entry point: configure INFO-level logging, then run main().
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # the root logger level is also set explicitly
    #  (presumably because basicConfig() is a no-op once handlers exist).
    logging.getLogger().setLevel(logging.INFO)
    # use the two lines below instead for verbose (DEBUG) output
    #logging.basicConfig(level=logging.DEBUG)
    #logging.getLogger().setLevel(logging.DEBUG)
    main()


================================================
FILE: stackstring_static/README.org
================================================
* stackstring_static.py - IDAPython script statically-recovering strings constructed in stack

The motivation is the same as that of the [[https://www.fireeye.com/blog/threat-research/2014/08/flare-ida-pro-script-series-automatic-recovery-of-constructed-strings-in-malware.html][FireEye FLARE script]], but I implemented it statically, without Vivisect, a few years ago.

Note: the script internally renames the stack variables, so any manually renamed stack variable names in the function will be lost.

ASCII case:

[[./img/sss_asc_after.jpg]]

Unicode case:

[[./img/sss_uni_after.jpg]]

xor-decoding case:

[[./img/sss_xor_after.jpg]]


================================================
FILE: stackstring_static/stackstring_static.py
================================================
# stackstring_static.py - IDAPython script statically-recovering strings constructed in stack
# Takahiro Haruyama (@cci_forensics)
# Note: the script internally renames the stack variables so manually-renamed info will be lost

import struct

from ida_ua import *
from ida_allins import *
from idautils import *
#from ida_funcs import *
from idc import *
import ida_kernwin

def extract_unicode(data):
    # Match a run of two or more (printable ASCII byte, NUL byte) pairs
    # anchored at the start of data and return the distinct matches
    # decoded as UTF-16LE.
    wide_printable = re.compile(r'^(?:[\x20-\x7E][\x00]){2,}')
    decoded = set()
    for raw in wide_printable.findall(data):
        decoded.add(raw.decode('utf-16le'))
    return list(decoded)

def extract_ascii(data):
    # Match a run of two or more printable ASCII bytes anchored at the
    # start of data and return the distinct matches decoded as ASCII.
    printable = re.compile(r'^(?:[\x20-\x7E]){2,}')
    decoded = set()
    for raw in printable.findall(data):
        decoded.add(raw.decode('ascii'))
    return list(decoded)

# Walks the instructions of one function, records byte-sized values written
# to registers and to 'var_' stack variables, reassembles the recorded stack
# bytes into strings and stores them as stack member / function comments.
class StackString(object):

    # start, end: address range passed to Heads() in traverse()
    # debug: when true, dprint() prints trace messages
    # do_xor: when true, recovered strings are xor-decoded before being commented
    # static_xor_key: xor key used for variables that have no entry in xor_vars
    def __init__ (self, start, end, debug, do_xor, static_xor_key):
        self.start = start
        self.end = end
        self.debug = debug
        self.do_xor = do_xor
        # register number -> last tracked value (raw 16-byte data for xmm registers)
        self.regs_w_value = {}
        # hex number of a 'var_' stack variable -> byte value stored there
        self.stack_chars = {}
        # stack variable name ('var_XX') -> xor key applied to it
        self.xor_vars = {}
        # immediate of the last tracked 'push imm', consumed by the next 'pop reg'
        self.stack_imm = None
        self.static_xor_key = static_xor_key

    # Initialize stack_chars for every local variable offset of the frame and
    # rename frame members whose name does not contain 'var_' to 'var_<hex>'.
    def rename_vars(self):
        stack = GetFrame(self.start)
        stack_size = GetStrucSize(stack)
        args_and_ret_size = stack_size - GetFrameLvarSize(self.start)

        for offset, name, size in StructMembers(stack):
            # equals (local variable area size - member offset);
            # negative for members outside the local variable area
            postfix = stack_size - offset - args_and_ret_size
            if postfix >= 0:
                self.stack_chars[postfix] = 0 # initialize vars
                if name.find('var_') == -1:
                    #postfix = stack_size - offset - args_and_ret_size
                    SetMemberName(stack, offset, 'var_{:X}'.format(postfix))

    # Record value b for register r. sp/bp are never tracked; xmm registers
    # keep b as-is; other registers are tracked only for 0 <= b < 0x100.
    def store_bytes_to_reg(self, r, b):
        if r == procregs.sp.reg or r == procregs.bp.reg:
            return
        elif procregs.xmm0.reg <= r and r <= procregs.xmm15.reg:
            self.dprint('reg enum {} = {}'.format(r, repr(b)))
            self.regs_w_value[r] = b
        #if (0x1f < b and b < 0x7f) or b == 0:
        elif 0 <= b and b < 0x100:
            self.dprint('reg enum {} = {:#x}'.format(r, b))
            self.regs_w_value[r] = b
            if procregs.ax.reg <= r and r <= procregs.bx.reg:
                # ax = eax = rax = 0 but al = 16 / ah = 20
                # so the value is mirrored to the 8-bit aliases at r+16 and r+20
                self.regs_w_value[r+16] = b
                self.regs_w_value[r+20] = b

    # Copy the tracked value of register src (if any) to register dst;
    # sp/bp destinations are ignored.
    def store_reg_to_reg(self, dst, src):
        if dst == procregs.sp.reg or dst == procregs.bp.reg:
            return
        if src in self.regs_w_value:
            self.dprint('reg enum {} = reg enum {} ({:#x})'.format(dst, src, self.regs_w_value[src]))
            self.regs_w_value[dst] = self.regs_w_value[src]

    # Convert the operand text of a stack reference into the numeric key used
    # by stack_chars: the hex number after 'var_', minus an optional '+NNh'
    # addend, minus the tracked value of an index register if one is present.
    # Only SyntaxError is handled here; other exceptions propagate to the
    # caller (traverse() catches AttributeError, IndexError and ValueError).
    def parse_and_get_var_hex(self, vstr):
        # e.g., mov     [ebp+68h+var_18+0Ch], 61h
        # text after the first '_', without the last character and trailing 'h'
        var_off = vstr.split('_')[1][:-1].rstrip('h').split('+') # '18+0C'
        if len(var_off) == 2:
            res = int(var_off[0], 16) - int(var_off[1], 16)
        else:
            res = int(var_off[0], 16)

        # handle base+index registers (e.g., mov     [rsp+rax+258h+var_C0], 6Fh)
        # NOTE(review): eval() is applied to operand text taken from the disassembly;
        # a non-register token such as '258h' is expected to raise SyntaxError
        try:
            the_reg = eval('procregs.{}.reg'.format(vstr.split('+')[1]))
            if the_reg in self.regs_w_value:
                res = res - self.regs_w_value[the_reg]
        except SyntaxError:
            pass
        return res
        #return eval('0x{}'.format(var_num)) # '18-4' = 20

    # Store byte b for stack variable number v. Values outside 0..0xff are
    # ignored and an already stored non-zero byte is never overwritten.
    def store_byte_to_var(self, v, b):
        #if (0x1f < b and b < 0x7f) or b == 0:
        if 0 <= b and b < 0x100:
            #'''
            try:
                if self.stack_chars[v] != 0: # should not be overwritten
                    return
            except KeyError: # when not initialized (to handle the bytes one by one)
                #print 'keyerror var_{:X} = {}'.format(v, b)
                pass
            #'''
            self.dprint('var_{:X} = {:#x}'.format(v, b))
            self.stack_chars[v] = b

    # Store a multi-byte value starting at stack variable number v: byte i of
    # the value goes to variable number v - i.
    def store_bytes_to_vars(self, v, bs):
        if isinstance(bs, str): # binary sequence for xmm registers
            blist = [ord(x) for x in bs]
        else: # int or long
            blist = self.int_to_bytes_list(bs)

        for i, b in enumerate(blist):
            #self.store_byte_to_var(v - i, blist[i])
            self.store_byte_to_var(v - i, b)

    # Remember xor key b (0..0xff only) for the stack variable named v.
    def store_key_to_name(self, v, b):
        #if (0x1f < b and b < 0x7f) or b == 0:
        if 0 <= b and b < 0x100:
            self.dprint('{} ^ {:#x}'.format(v, b))
            self.xor_vars[v] = b

    # Split integer v into its bytes, least significant first. The list is
    # returned once v is exhausted and the list length is 2, 4 or 8, so shorter
    # values are padded with zero bytes up to the next of those lengths.
    # NOTE(review): b is always within 0..0xff, so the else/break branch looks
    # unreachable, and a value needing more than 8 bytes would never return - confirm.
    def int_to_bytes_list(self, v):
        if v == 0:
            return [0]
        res = []
        while(1):
            b = v & 0xff
            v = v >> 8
            #if 0x1f < b and b < 0x7f or b == 0:
            if 0 <= b and b < 0x100:
                res.append(b)
                #if v == 0 and (len(res) == 1 or len(res) == 2 or len(res) == 4 or len(res) == 8):
                if v == 0 and (len(res) == 2 or len(res) == 4 or len(res) == 8):
                    # e.g., mov     [rsp+3A8h+var_290], 6E0069h
                    return res
            else:
                break
        return []

    # Remember a pushed immediate (0..0xff only) until the next tracked pop.
    def store_byte_to_stack(self, b):
        if 0 <= b and b < 0x100:
            self.stack_imm = b

    # Print s only when debug messages are enabled.
    def dprint(self, s):
        if self.debug:
            print s

    # Main routine: rename the stack variables, emulate the tracked
    # instructions between start and end, rebuild strings from stack_chars,
    # then write the results as stack member and function comments.
    def traverse(self):
        print '----------------------------------------------'
        print '{:#x}:'.format(self.start)

        # replace analyzed names with 'var_*' in stack for calculation
        # NOTE(review): the bare except silently skips the function on any error
        try:
            self.rename_vars()
        #except TypeError: # caused by StructMembers()
        except:
            return

        # pass 1: track register / stack variable values per instruction
        for head in Heads(self.start, self.end):
            self.dprint('{:#x}'.format(head))
            insn = insn_t()
            inslen = decode_insn(insn, head)

            if insn.itype == NN_mov or insn.itype == NN_movsxd:
                if insn.Op1.type == o_reg and insn.Op2.type == o_imm: # e.g., mov     cl/cx/ecx, 6Ch
                    self.store_bytes_to_reg(insn.Op1.reg, insn.Op2.value)

                elif insn.Op1.type == o_reg and insn.Op2.type == o_reg: # e.g., mov     cl/cx/ecx, al/ax/eax
                    self.store_reg_to_reg(insn.Op1.reg, insn.Op2.reg)

                elif insn.Op1.type == o_reg and insn.Op2.dtype == dt_byte and insn.Op2.type == o_mem: # e.g., mov     al, ds:byte_100040F8
                    self.store_bytes_to_reg(insn.Op1.reg, Byte(insn.Op2.addr))

                elif insn.Op1.type == o_displ and GetOpnd(head, 0).find('var_') != -1 and insn.Op2.type == o_reg and (insn.Op2.dtype == dt_byte or insn.Op2.dtype == dt_word): # e.g., mov     [esp+180h+var_127], cl
                #elif insn.Op1.type == o_displ and GetOpnd(head, 0).find('var_') != -1 and insn.Op2.type == o_reg: # e.g., mov [rsp+258h+var_1F0], eax (index register)
                    try:
                        var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0))
                    except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un
                        continue
                    if insn.Op2.reg in self.regs_w_value:
                        self.store_bytes_to_vars(var_hex, self.regs_w_value[insn.Op2.reg])

                elif insn.Op1.type == o_displ and insn.Op2.type == o_imm: # e.g., mov     [esp+188h+var_130], 6Ah/2E32h/3362646Fh
                    #print 'o_displ = o_imm'
                    try:
                        var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0))
                    except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un
                        continue
                    self.store_bytes_to_vars(var_hex, insn.Op2.value)
                elif insn.Op1.type == o_reg and insn.Op2.type == o_displ: # e.g., mov     eax, [rsp+258h+var_1F0]
                    try:
                        var_hex = self.parse_and_get_var_hex(GetOpnd(head, 1))
                    except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un
                        continue
                    if var_hex in self.stack_chars:
                        self.store_bytes_to_reg(insn.Op1.reg, self.stack_chars[var_hex])

            elif insn.itype == NN_xor:
                if insn.Op1.type == o_reg and insn.Op2.type == o_reg and insn.Op1.reg == insn.Op2.reg:
                    # e.g., xor ebx, ebx
                    self.store_bytes_to_reg(insn.Op1.reg, 0)
                elif insn.Op1.type == o_displ:
                    # e.g., xor     [esp+eax+384h+var_2A4], bl
                    # remember the key so the variable can be decoded later
                    try:
                        var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0))
                    except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un
                        continue
                    str_var_hex = 'var_{:X}'.format(var_hex)
                    if insn.Op2.type == o_reg and insn.Op2.reg in self.regs_w_value:
                        self.store_key_to_name(str_var_hex, self.regs_w_value[insn.Op2.reg])
                    elif insn.Op2.type == o_imm:
                        self.store_key_to_name(str_var_hex, insn.Op2.value)

            elif insn.itype == NN_and:
                if insn.Op1.type == o_displ and GetOpnd(head, 0).find('var_') != -1 and insn.Op2.value == 0:
                    # e.g., and     [ebp+var_24], 0
                    try:
                        var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0))
                    except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un
                        continue
                    self.store_byte_to_var(var_hex, 0)

            # e.g., push    7; pop     edx
            elif insn.itype == NN_push and insn.Op1.type == o_imm:
                self.store_byte_to_stack(insn.Op1.value)
            # NOTE(review): the truthiness test also skips a pushed value of 0 - confirm intended
            elif insn.itype == NN_pop and insn.Op1.type == o_reg and self.stack_imm:
                    self.store_bytes_to_reg(insn.Op1.reg, self.stack_imm)
                    self.stack_imm = None

            # for SSE registers
            elif (insn.itype == NN_movdqa or insn.itype == NN_movaps) and insn.Op1.type == o_reg:
                # e.g., movdqa  xmm1, ds:xmmword_155680
                self.store_bytes_to_reg(insn.Op1.reg, GetManyBytes(insn.Op2.addr, 0x10))
            elif (insn.itype == NN_movdqu or insn.itype == NN_movups) and insn.Op1.type == o_displ:
                # e.g., movdqu  [ebp+var_27C], xmm1
                try:
                    var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0))
                except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un
                    continue
                if insn.Op2.reg in self.regs_w_value:
                    self.store_bytes_to_vars(var_hex, self.regs_w_value[insn.Op2.reg])

            # for o_displ operand with base+index registers (increment index)
            elif insn.itype == NN_inc and insn.Op1.type == o_reg and insn.Op1.reg in self.regs_w_value:
                self.dprint('{}: incremented {}->{}'.format(GetOpnd(head, 0), self.regs_w_value[insn.Op1.reg], self.regs_w_value[insn.Op1.reg]+1))
                self.regs_w_value[insn.Op1.reg] += 1

        # pass 2: rebuild strings from the recorded stack bytes.
        # strings: variable name of the first byte -> recovered string
        # result/prev/len_: pending chars, previous key and length of the single-byte run
        # uresult/uprev/ulen: the same state for the candidate wide-char (unicode) run
        strings = {}
        result = []
        prev = 0
        len_ = 0
        uresult = []
        uprev = 0
        ulen = 0
        # keys are visited from the largest variable number down to the smallest
        for k in sorted(self.stack_chars.keys(), reverse=True):
            self.dprint('{:x}: prev={:x}, uprev={:x}'.format(k, prev, uprev))

            # detect discontinuous chars
            if prev != 0 and prev != k + 1:
                self.dprint('discontinuous chars detected')
                stack_var = 'var_{:X}'.format(prev - 1  + len_)
                strings[stack_var] = ''.join(result)
                if strings[stack_var][0] != '\x00':
                    print '{} = {}'.format(stack_var, repr(strings[stack_var]))
                result = []
                prev = 0
                len_ = 0
                uresult = []
                uprev = 0
                ulen = 0
            elif uprev != 0 and uprev != k + 1:
            #elif uprev != 0 and uprev != k + 1 and uresult[1] == 0: # tiny check for unicode
                self.dprint('discontinuous chars detected (unicode)')
                stack_var = 'var_{:X}'.format(uprev - 1  + ulen)
                try:
                    #strings[stack_var] = ''.join(uresult).decode('utf-16')
                    self.dprint('data: {}'.format(repr(''.join(uresult))))
                    if extract_unicode(''.join(uresult)):
                        strings[stack_var] = extract_unicode(''.join(uresult))[0]
                        if strings[stack_var][0] != '\x00':
                            print '{} = {}'.format(stack_var, repr(strings[stack_var]))
                #except UnicodeDecodeError:
                except (TypeError, IndexError):
                    self.dprint('exception: {}'.format(stack_var))
                    #strings[stack_var] = ''.join(uresult)
                    pass
                uresult = []
                uprev = 0
                ulen = 0
                result = []
                prev = 0
                len_ = 0

            # append the current byte to both pending runs
            self.dprint('{:x}: {} (len={}, ulen={})'.format(k, repr(chr(self.stack_chars[k])), len_, ulen))
            result.append(chr(self.stack_chars[k]))
            uresult.append(chr(self.stack_chars[k]))

            # detect null-terminated chars
            #'''
            if self.stack_chars[k] == 0:
            #if self.stack_chars[k] == 0 and (prev != 0 and self.stack_chars[prev] == 0):
                #stack_var = 'var_{:X}'.format(k + len_)
                #if uprev != 0 and self.stack_chars[uprev] == 0:
                # NOTE(review): uresult holds chr() strings, so 'uresult[1] == 0'
                # compares a string with an integer - confirm this branch can be taken
                if uprev != 0 and self.stack_chars[uprev] == 0 and uresult[1] == 0: # tiny check for unicode
                    self.dprint('null-terminated chars detected (unicode)')
                    stack_var = 'var_{:X}'.format(k + ulen)
                    try:
                        #print ''.join(uresult)
                        #strings[stack_var] = ''.join(uresult)[:-1].decode('utf-16')
                        if extract_unicode(''.join(uresult)):
                            strings[stack_var] = extract_unicode(''.join(uresult))[0]
                            if strings[stack_var][0] != '\x00':
                                print '{} = {}'.format(stack_var, repr(strings[stack_var]))
                    #except UnicodeDecodeError:
                    except (TypeError, IndexError):
                        #strings[stack_var] = ''.join(uresult)
                        pass
                    uresult = []
                    uprev = 0
                    ulen = 0
                    prev = k
                    len_ += 1
                else:
                    self.dprint('null-terminated chars detected')
                    stack_var = 'var_{:X}'.format(k + len_)
                    strings[stack_var] = ''.join(result)
                    if strings[stack_var][0] != '\x00':
                        print '{} = {}'.format(stack_var, repr(strings[stack_var]))
                    result = []
                    prev = 0
                    len_ = 0
                    uprev = k
                    ulen += 1
            else:
            #'''
                prev = k
                len_ += 1
                uprev = k
                ulen += 1

        # bytes still pending here were never closed by a zero byte
        if len(result) > 0:
            print('the string is not null-terminated: {}'.format(repr(''.join(result))))

        # pass 3: attach the recovered strings as comments on the frame members
        stack = GetFrame(self.start)
        results = []
        for offset, name, size in StructMembers(stack):
            if name in strings:
                if self.do_xor:
                    # prefer the key recorded for this variable, else the default key
                    if name in self.xor_vars:
                        k = self.xor_vars[name]
                    else:
                        k = self.static_xor_key
                    # the last character of the string is left out of the decoding
                    res = ''.join([chr(ord(x) ^ k) for x in strings[name][:-1]])
                    #print k
                    print '{} (xor-decoded): {} ({})'.format(name, repr(res), repr(strings[name]))
                    res = res + ' (decoded)'
                else:
                    res = strings[name]
                if res[0] != '\x00':
                    SetMemberComment(stack, offset, repr(res.rstrip('\x00')), 1)
                    results.append(repr(res.rstrip('\x00')))

        # set comment at the function start ea
        if results:
            cmt = ', '.join(results)
            if len(cmt) < 128:
                set_func_cmt(self.start, cmt, True)
            else:
                set_func_cmt(self.start, 'a lot of stack strings recovered (need to be checked)', True)

        # restore analyzed names in stack
        AnalyzeArea(self.start, self.end)

# Option dialog of the script: three checkboxes (current function only,
# debug messages, xor decoding) and a hex input for the default xor key.
class SSSForm(ida_kernwin.Form):
    def __init__(self):
        Form = ida_kernwin.Form
        # form template; the {names} refer to the controls defined below
        layout = r"""BUTTON YES* Run
BUTTON CANCEL Cancel
stackstring_static

{FormChangeCb}
<current function only:{cCurrentOnly}>
<enable debug messages:{cDebug}>
<enable xor decoding:{cDecode}>{cGroup}>
<default xor value in hex (single byte):{iXorValue}>
"""
        controls = {
            'FormChangeCb': Form.FormChangeCb(self.OnFormChange),
            'cGroup': Form.ChkGroupControl(("cCurrentOnly", "cDebug", "cDecode")),
            'iXorValue': Form.NumericInput(tp=Form.FT_HEX),
        }
        Form.__init__(self, layout, controls)

    # Form change callback; fid is the id of the changed control, -1 is
    # handled as the initial call that sets up the default state.
    def OnFormChange(self, fid):
        if fid == -1:
            # defaults: current function only, xor value input disabled
            self.SetControlValue(self.cCurrentOnly, True)
            self.EnableField(self.iXorValue, False)
        if fid == self.cDecode.id:
            # the xor value input is enabled on any change of the xor checkbox
            self.EnableField(self.iXorValue, True)
        return 1

def main():
    # Show the option form, then run StackString.traverse() on the current
    # function or on every function, depending on the selected options.
    print('start')

    form = SSSForm()
    form.Compile()
    form.iXorValue.value = 0x55
    if form.Execute() != 1: # Cancel
        print('cancel')
    else: # Run
        if form.cCurrentOnly.checked:
            targets = [(GetFunctionAttr(here(), FUNCATTR_START),
                        GetFunctionAttr(here(), FUNCATTR_END))]
        else:
            # generator: each function end is looked up right before its traversal
            targets = ((ea, GetFunctionAttr(ea, FUNCATTR_END)) for ea in Functions())
        for start, end in targets:
            ss = StackString(start, end, form.cDebug.checked, form.cDecode.checked,
                             form.iXorValue.value)
            ss.traverse()

    Refresh()
    print('----------------------------------------------')
    print('done')

# script entry point: run main() when executed as a script
if __name__ == '__main__':
    main()