Repository: TakahiroHaruyama/ida_haru Branch: master Commit: 29bd253294c3 Files: 27 Total size: 210.7 KB Directory structure: gitextract_b2x18g5t/ ├── .gitignore ├── ADVobfuscator/ │ ├── README.org │ └── idapy3_ADVobfuscator_deob.py ├── LICENSE ├── README.org ├── bindiff/ │ ├── README.org │ ├── bindiff.py │ ├── bindiff_export.idc │ ├── save_func_names.py │ └── save_func_names_7x.py ├── callstrings/ │ ├── README.org │ ├── hexrays_utils.py │ ├── ida_callstrings_dbg.py │ ├── ida_callstrings_flare_emu.py │ └── ida_callstrings_static.py ├── eset_crackme/ │ ├── README.org │ ├── loaders/ │ │ └── ida_loader_drv_vm.py │ └── procs/ │ └── ida_processor_drv_vm.py ├── fn_fuzzy/ │ ├── README.org │ ├── cli_export.py │ ├── dump_types.py │ ├── fn_fuzzy.py │ ├── fn_fuzzy_7x.py │ ├── yara_fn.py │ └── yara_fn_7x.py └── stackstring_static/ ├── README.org └── stackstring_static.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: ADVobfuscator/README.org ================================================ * IDAPython script deobfuscating ADVobfuscator strings, applied to a TrickBoot sample The script requires [[https://github.com/fireeye/flare-emu][flare-emu]]. The tested sample is [[https://www.virustotal.com/gui/file/c1f1bc58456cff7413d7234e348d47a8acfdc9d019ae7a4aba1afc1b3ed55ffa/details][491115422a6b94dc952982e6914adc39]] (TrickBot's UEFI firmware reconnaissance module called "TrickBoot"). Note: We may not be able to reuse it for a different sample that was compiled with a different compiler or with different flags but I think the same approach (decoder function pattern matching + emulation) can be applied. 
# idapy3_ADVobfuscator_deob.py - IDAPython script deobfuscating ADVobfuscator strings, applied to a TrickBoot sample
# Takahiro Haruyama (@cci_forensics)

from idc import *
from idautils import *
import idaapi
import idc  # main() uses qualified idc.* names; do not rely on IDA pre-importing the module

try:
    import flare_emu
except ImportError as e:
    # Python 3 exceptions have no ``message`` attribute; format the exception itself.
    print("Could not import flare_emu: {}\nExiting.".format(e))
    raise

import re, unicorn

# Byte signatures of the decoder stubs. Each pattern must match a whole function
# body. The capture groups are immediate operands read back in main():
# 'sub' captures (key, size); the other three capture (size) only.
g_pat_sub = re.compile(rb'^\x33\xD2\x8A\x04\x0A\x0F\xBE\xC0\x83\xE8(.)\x88\x04\x0A\x42\x83\xFA(.)\x72\xEE\x8B\xC1\xC3$', re.DOTALL)
g_pat_xor1 = re.compile(rb'^\x53\x55\x56\x57\x8b\xf9\x6a(.)\x5d\x8d\x47\x04\x8a\x10\x0f\xbe\x37\x0f\xbe\xca\x33\xce\x88\x08\x40\x83\xed\x01\x75\xee\xc6\x47.\x00\x8d\x47\x04\x5f\x5e\x5d\x5b\xc3$', re.DOTALL)
g_pat_xor2 = re.compile(rb'^\x53\x56\x57\x8b\xf1\x33\xdb\x8a\x54\x1e\x04\x8b\x06\x02\xc3\x0f\xbe\xca\x33\xc1\x88\x44\x1e\x04\x43\x83\xfb(.)\x72\xe9\x5f\xc6\x46.\x00\x8d\x46\x04\x5e\x5b\xc3$', re.DOTALL)
g_pat_dec = re.compile(rb'^\x33\xd2\x8a\x04\x0a\x0f\xbe\xc0\x48\x88\x04\x0a\x42\x83\xfa(.)\x72\xf0\x8b\xc1\xc3$', re.DOTALL)
g_pats = {
    'sub': g_pat_sub,
    'xor1': g_pat_xor1,
    'xor2': g_pat_xor2,
    'dec': g_pat_dec,
}


def info(msg):
    """Print an informational message."""
    print("[*] {}".format(msg))


def success(msg):
    """Print a success message."""
    print("[+] {}".format(msg))


def error(msg):
    """Print an error message."""
    print("[!] {}".format(msg))


def set_decomplier_cmt(ea, cmt):
    """Attach *cmt* to the pseudocode location *ea* and save it.

    Best effort: any failure is reported through error() and never raised.
    """
    try:
        cfunc = idaapi.decompile(ea)
        if not cfunc:
            error("Decompile failed: {:#x}".format(ea))
            return
        tl = idaapi.treeloc_t()
        tl.ea = ea
        tl.itp = idaapi.ITP_SEMI
        cfunc.set_user_cmt(tl, cmt)
        cfunc.save_user_cmts()
    except Exception:
        # Deliberately broad, but no longer swallows KeyboardInterrupt/SystemExit.
        error("Decompile failed: {:#x}".format(ea))


def add_bookmark(ea, comment):
    """Bookmark *ea* with *comment*.

    Uses the first slot (of 1024 probed) that is empty or already points at
    *ea*. Returns True when a slot was written, False when none was available.
    """
    last_free_idx = -1
    for i in range(0, 1024):
        slot_ea = get_bookmark(i)
        if slot_ea == BADADDR or slot_ea == ea:
            # empty slot found or overwrite existing one
            last_free_idx = i
            break
    if last_free_idx < 0:
        return False
    put_bookmark(ea, 0, 0, 0, last_free_idx, comment)
    return True


def get_emu_range(ea):
    """Return (start, end) for emulating up to the instruction at *ea*.

    *start* is the start of the basic block containing *ea* and *end* is *ea*
    itself. Returns (None, None) when *ea* is in no function or no block.
    """
    func = idaapi.get_func(ea)
    if func is None:
        return None, None
    for bb in idaapi.FlowChart(func):
        # end_ea is exclusive. The previous '<=' let an address that starts a
        # block match the block before it, giving a wrong emulation start.
        if bb.start_ea <= ea < bb.end_ea:
            return bb.start_ea, ea
    return None, None


# enable a step into emulation for the decoder (disabled: not passed to emulateRange)
def call_hook(address, argv, funcName, userData):
    """Let the emulator step into the decoder function and skip every other call."""
    userData['skipCalls'] = funcName != userData["dec_fn_name"]


# validate the emulation result, based on the encoded buf ptr (disabled: not passed to emulateRange)
def inst_hook(uc, address, size, userData):
    """Record ecx at the call site, then dump the decoded buffer at the range end."""
    if address == userData['ref']:
        eh = userData["EmuHelper"]
        try:
            pc = uc.reg_read(eh.regs["pc"])
            enc_ea = uc.reg_read(eh.regs["ecx"])
            info('pc = {:#x}, address = {:#x}, enc_ea = {:#x}'.format(pc, address, enc_ea))
            userData["enc_ea"] = enc_ea
        except unicorn.UcError as e:
            error("emulation error: {}".format(str(e)))
    elif address == userData['end'] and userData.get('enc_ea'):
        try:
            if userData["dec_fn_name"].find('sub') != -1:
                dec = uc.mem_read(userData["enc_ea"], userData['size'])
            else:  # xor: the payload follows a 4-byte key
                dec = uc.mem_read(userData["enc_ea"] + 4, userData['size'])
            success('{:#x}: {}'.format(userData['ref'], dec))
        except unicorn.UcError as e:
            error("emulation error: {}".format(str(e)))


def _decode_from_emulator(pname, eh, ea, size, key):
    """Read the encoded buffer at *ea* from emulator memory and return the decoded text.

    Raises unicorn.UcError when the memory is unmapped and UnicodeDecodeError
    when the decoded bytes are not valid UTF-8.
    """
    uc = eh.uc
    if pname == 'sub':
        raw = bytes((x - key) & 0xff for x in uc.mem_read(ea, size))
    elif pname == 'dec':
        raw = bytes((x - 1) & 0xff for x in uc.mem_read(ea, size))
    else:
        # xor variants: the first byte of the object is the key, payload at +4
        key = uc.mem_read(ea, 4)[0]
        enc = uc.mem_read(ea + 4, size)
        if pname == 'xor1':
            raw = bytes(x ^ key for x in enc)
        else:  # xor2
            # The stub stores only the low byte (al). Without the mask,
            # key + i > 0xff made bytes() raise ValueError.
            raw = bytes((x ^ (key + i)) & 0xff for i, x in enumerate(enc))
    return raw.decode()


def emulate(pname, eh, dec_fn, size, key):
    """Decode the string passed at every call site of the decoder *dec_fn*.

    For each call the enclosing basic block is emulated up to the call, ecx is
    taken as the pointer to the encoded object, and the bytes are decoded in
    Python according to *pname*. Each decoded string is written as a
    disassembly comment, a decompiler comment and a bookmark.

    Returns the number of strings decoded.
    """
    cnt = 0
    for ref in CodeRefsTo(dec_fn, False):
        if print_insn_mnem(ref) != 'call':
            continue
        start, end = get_emu_range(ref)
        if not (start and end):
            continue
        info('{:#x}: emulating from {:#x} to {:#x}'.format(ref, start, end))
        ea = None  # reset per call site so the error path never sees a stale/unbound value
        try:
            eh.emulateRange(start, endAddr=end)
            ea = eh.uc.reg_read(eh.regs["ecx"])
            dec = _decode_from_emulator(pname, eh, ea, size, key)
        except unicorn.UcError as e:
            pc = eh.uc.reg_read(eh.regs["pc"])
            where = '{:#x}'.format(ea) if ea is not None else '(ecx not read)'
            error("{:#x}: {} when reading {}".format(pc, str(e), where))
        except UnicodeDecodeError as e:
            # One undecodable buffer should not abort the whole run.
            error("{:#x}: decoded bytes are not valid text ({})".format(ref, e))
        else:
            success('{:#x}: {}'.format(ref, dec))
            set_cmt(ref, dec, 0)
            set_decomplier_cmt(ref, dec)
            add_bookmark(ref, 'decoded: {}'.format(dec))
            cnt += 1
        finally:
            eh.resetEmulatorHeapAndStack()
    return cnt


def main():
    """Find decoder functions by byte pattern, rename them and decode their callers."""
    info('start')
    eh = flare_emu.EmuHelper()
    cnts = {}

    # search the decoding functions
    for fva in Functions():
        if idc.get_func_flags(fva) & (idc.FUNC_LIB | idc.FUNC_THUNK):
            continue
        fn_bytes = idc.get_bytes(fva, get_func_attr(fva, FUNCATTR_END) - fva)
        if not fn_bytes:
            continue  # bytes not available in the database
        for pname, pat in g_pats.items():
            m = pat.search(fn_bytes)
            if not m:
                continue
            if pname == 'sub':
                key = int.from_bytes(m.group(1), 'little')
                size = int.from_bytes(m.group(2), 'little')
            else:
                key = None  # the xor key is read from memory at decode time
                size = int.from_bytes(m.group(1), 'little')
            print('\n')
            info('{:#x}: {}-encoded function detected (size = {:#x})'.format(fva, pname, size))
            idaapi.force_name(fva, 'fn_ADVobfuscator_decode_{}_len{}'.format(pname, size))
            cnts[pname] = cnts.get(pname, 0) + emulate(pname, eh, fva, size, key)
            break

    info('number of decoded strings: {}'.format(cnts))
    info('done')


if __name__ == '__main__':
    main()
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.org ================================================ #+OPTIONS: ^:{} * ida_haru Scripts/plugins for IDA Pro Note: Old scripts don't work for IDA 8.x, but I leave them just for reference. ** eset_crackme IDA Pro loader/processor modules for ESET CrackMe driver VM ** stackstring_static IDAPython script statically-recovering strings constructed in stack ** fn_fuzzy IDAPython script for fast multiple binary diffing triage ** bindiff python script for multiple binary diffing by BinDiff ** ADVobfuscator IDAPython script deobfuscating ADVobfuscator strings, applied to a TrickBoot sample ** HexRaysDeob modified version for defeating APT10 ANEL's code obfuscations (located in a [[https://github.com/carbonblack/HexRaysDeob][corporate github repository]]) ** callstrings scripts for defeating "polymorphic stack strings" obfuscation used by Hodur sample ================================================ FILE: bindiff/README.org ================================================ #+OPTIONS: ^:{} #+TITLE: BinDiff wrapper script for multiple binary diffing * Purpose multiple binary diffing up to 100 samples ([[https://github.com/TakahiroHaruyama/ida_haru/tree/master/fn_fuzzy][fn_fuzzy]] is better for more samples) * Requirements - IDA 7.6 and BinDiff 6 - python packages: pefile macholib pyelftools python-idb prettytable * How to Use Before using it, you have to edit the 
paths for executables/scripts in bindiff.py. #+BEGIN_SRC # paths (should be edited) g_out_dir = r'Z:\haru\analysis\tics\bindiff_db' g_ida_dir = r'C:\work\tool\IDAx64' g_exp_path = r'Z:\cloud\gd\python\IDAPython\ida_haru\bindiff\bindiff_export.idc' g_differ_path = r"C:\Program Files\BinDiff\bin\bindiff.exe" #g_differ_path = r'C:\Program Files (x86)\zynamics\BinDiff 4.2\bin\differ64.exe' g_save_fname_path = r'Z:\cloud\gd\python\IDAPython\ida_haru\bindiff\save_func_names.py' #+END_SRC You can check the command line options by -h or --help. #+BEGIN_EXAMPLE Z:\cloud\gd\work\python\IDAPython\bindiff>python bindiff.py -h usage: bindiff.py [-h] [--out_dir OUT_DIR] [--ws_th WS_TH] [--fs_th FS_TH] [--ins_th INS_TH] [--bb_th BB_TH] [--size_th SIZE_TH] [--func_regex FUNC_REGEX] [--debug] [--clear] [--noidb] [--use_pyidb] primary {1,m} ... positional arguments: primary primary binary to compare {1,m} mode: 1, m 1 BinDiff 1 to 1 m BinDiff 1 to many optional arguments: -h, --help show this help message and exit --out_dir OUT_DIR, -o OUT_DIR output directory including .BinExport/.BinDiff (default: Z:\haru\analysis\tics\bindiff_db) --ws_th WS_TH, -w WS_TH whole binary similarity threshold (default: 0.2) --fs_th FS_TH, -f FS_TH function similarity threshold (default: 0.8) --ins_th INS_TH, -i INS_TH instruction threshold (default: 30) --bb_th BB_TH, -b BB_TH basic block threshold (default: 1) --size_th SIZE_TH, -s SIZE_TH file size threshold (MB) (default: 10) --func_regex FUNC_REGEX, -e FUNC_REGEX function name regex to reduce noise (default: sub_|fn_|chg_) --debug, -d print debug output (default: False) --clear, -c clear .BinExport, .BinDiff and function name cache (default: False) --noidb, -n skip a secondary binary without idb (default: False) --use_pyidb use python-idb (default: False) #+END_EXAMPLE There are 2 modes. One is "1 to 1" mode, the other is "1 to many" mode. 
** "1 to 1" mode example In "1 to 1" mode, we should specify executable file paths for primary and secondary targets. #+BEGIN_EXAMPLE Z:\cloud\gd\work\python\IDAPython\bindiff>python bindiff.py Z:\haru\analysis\tics\hoge\[redacted]_worker_fixed 1 Z:\haru\analysis\tics\hoge\samples\checked\[redacted]c2f05 --------------------------------------------- [*] BinDiff result [*] elapsed time = 0.390000104904 sec, number of diffing = 1 [*] primary binary: (([redacted]_worker_fixed)) ============== 1 high similar binaries (>0.2) ================ +----------------+--------------------------------------+ | similarity | secondary binary | +----------------+--------------------------------------+ | 0.211967127395 | [redacted]c2f05 | +----------------+--------------------------------------+ --------------------------------------------- #+END_EXAMPLE "high similar binaries" means some binaries are found with whole binary similarities. You can adjust the similarity by -w option. ** "1 to many" mode example In "1 to many" mode, we should specify an executable file path for a primary target and a folder path for secondary targets. We can specify to compare secondary binaries recursively (-r option). 
#+BEGIN_EXAMPLE Z:\cloud\gd\work\python\IDAPython\bindiff>python bindiff.py Z:\haru\analysis\tics\hoge\samples\attacker\[redacted]_worker_fixed m Z:\haru\analysis\tics\hoge\samples\tmp --------------------------------------------- [*] BinDiff result [*] elapsed time = 6.71900010109 sec, number of diffing = 3 [*] primary binary: (([redacted]_worker_fixed)) ============== 10 high similar functions (>0.8), except high similar binaries ================ +----------------+--------------+--------------------------------+----------------+----------------------------------+-----------------+ | similarity | primary addr | primary name | secondary addr | secondary name |secondary binary | +----------------+--------------+--------------------------------+----------------+----------------------------------+-----------------+ | 1.0 | 0x180067720 | Virt_sub_180067720 | 0x180004c30 | sub_180004c30 | [redacted]e6504 | | 1.0 | 0x1800674b0 | sub_1800674b0 | 0x180004930 | sub_180004930 | [redacted]e6504 | | 1.0 | 0x1800673a0 | chg_peparse_Virt_sub_1800673A0 | 0x180004820 | sub_180004820 | [redacted]e6504 | | 1.0 | 0x1800672b0 | Virt_sub_1800672B0 | 0x180004730 | sub_180004730 | [redacted]e6504 | | 1.0 | 0x18005fd84 | sub_18005fd84 | 0x13f69af94 | sub_13f69af94 | [redacted]fb841 | | 1.0 | 0x18005fd84 | sub_18005fd84 | 0x180012648 | __crtMessageBoxW | [redacted]e6504 | | 1.0 | 0x180050f30 | sub_180050f30 | 0x1800019f0 | ?erase@?$basic_string@DU?$char_t | [redacted]e6504 | | 0.98987073046 | 0x1800677e0 | chg_peparse_Virt_sub_1800677E0 | 0x180004cf0 | sub_180004cf0 | [redacted]e6504 | | 0.963708558784 | 0x180067560 | sub_180067560 | 0x1800049e0 | sub_1800049e0 | [redacted]e6504 | | 0.946399194338 | 0x180018780 | chg_rotate_sub_180018780 | 0x140004360 | sub_140004360 | [redacted]92023 | +----------------+--------------+--------------------------------+----------------+----------------------------------+-----------------+ --------------------------------------------- #+END_EXAMPLE "high 
similar functions" means that functions above the function similarity threshold were found in binaries whose whole binary similarity is lower than the threshold. You can adjust the function similarity threshold with the -f option. The function similarity result is very noisy, so library/thunk functions are filtered out by the script. Additionally, we can specify the number of instructions/basic blocks, file size, and so on to reduce the noise. By default, the script creates new idbs for the target binaries if they are not found. If you want to compare only existing idbs, please specify -n. * Notes - If you can't get the function similarities correctly, adjust the function similarity threshold (--fs_th), instruction threshold (--ins_th), basic block threshold (--bb_th) and function name filter rule (--func_regex) options. The script excludes matches of small functions because function similarity results across multiple binaries are noisy. - BinDiff 5.0 and later contain a [[https://issuetracker.google.com/issues/129600738][bug]]: existing .BinDiff files can't be loaded and symbols/comments can't be imported due to missing .BinExport files. I hope it will be fixed someday. - python-idb doesn't work for IDA 7.6 IDBs, so it is not used by default (enable the --use_pyidb option if needed). 
================================================ FILE: bindiff/bindiff.py ================================================ # bindiff.py - BinDiff wrapper script for multiple binary diffing # Takahiro Haruyama (@cci_forensics) import argparse, subprocess, os, sqlite3, time, pickle, re, multiprocessing, sys, struct, logging from prettytable import PrettyTable import pefile from macholib.MachO import MachO from macholib.mach_o import * from elftools.elf.elffile import ELFFile import idb logging.basicConfig(level=logging.ERROR) # to suppress python-idb warning # paths (should be edited) # Windows #g_out_dir = r'C:\analysisw\tmp\bindiff' #g_ida_dir = r'C:\analysisw\tool\IDA' #g_differ_path = r"C:\Program Files\BinDiff\bin\bindiff.exe" # MacOS g_out_dir = r'/Users/haru/analysis/tmp/bindiff' #g_ida_dir = r'/Applications/IDA/ida.app/Contents/MacOS' g_ida32_path = r'/Applications/IDA/ida.app/Contents/MacOS/ida' g_ida64_path = r'/Applications/IDA/ida64.app/Contents/MacOS/ida64' g_differ_path = r"/Applications/BinDiff/BinDiff.app/Contents/MacOS/bin/bindiff" g_exp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bindiff_export.idc') g_save_fname_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'save_func_names_7x.py') # parameters g_ws_th = 0.15 # whole binary similarity threshold g_fs_th = 0.70 # function similarity threshold g_ins_th = 10 # instruction threshold g_bb_th = 0 # basic block threshold g_size_th = 10 # file size threshold (MB) #g_func_regex = r'sub_|fn_|chg_' # function name filter rule g_func_regex = r'.*' # function name filter rule class LocalError(Exception): pass class ProcExportError(LocalError): pass class ProcDiffError(LocalError): pass class LoadFuncNamesError(LocalError): pass class FileNotFoundError(LocalError): pass class ChildProcessError(LocalError): pass class BinDiff(object): def __init__ (self, primary, out_dir, ws_th, fs_th, ins_th, bb_th, size_th, func_regex, debug=False, clear=False, newidb=False, 
use_pyidb=False): #def __init__ (self, primary, out_dir, ws_th, fs_th, ins_th, bb_th, size_th, debug=False, clear=False, noidb=False, use_pyidb=False): self._debug = debug self._clear = clear self._newidb = newidb self._lock = multiprocessing.Lock() self._primary = primary self._ws_th = ws_th self._fs_th = fs_th self._ins_th = ins_th self._bb_th = bb_th self._size_th = size_th self._out_dir = out_dir self.use_pyidb = use_pyidb self._format, self._arch = self._get_machine_type(primary) if self._format is None: raise ProcExportError('primary binary should be PE/Mach-O/ELF'.format(primary)) self._dprint('primary binary format: {}'.format(self._format)) self._dprint('primary binary architecture: {}'.format(self._arch)) self._ida_path = self._get_ida_path(self._arch) res = self._files_not_found() if res is not None: raise FileNotFoundError('file is not found: {}'.format(res)) self._dprint('IDA binary path for primary: {}'.format(self._ida_path)) if self._make_BinExport(self._primary, self._ida_path) != 0: raise ProcExportError('primary BinExport failed: {}'.format(primary)) if self.use_pyidb: idb_path = self._get_idb_path(primary, self._arch) self._func_names = self._load_func_names_pyidb(idb_path) else: self._func_p = re.compile(func_regex) self._func_regex = func_regex self._func_names = self._load_func_names_default(func_regex, primary, self._ida_path) self._high_ws = {} self._high_fs = {} self._diff_cnt = 0 def _dprint(self, msg): if self._debug: self._lock.acquire() print('[+] [{}]: {}'.format(os.getpid(), msg)) self._lock.release() def _get_machine_type(self, path): try: pe = pefile.PE(path) format_ = 'PE' if pefile.MACHINE_TYPE[pe.FILE_HEADER.Machine].find('I386') != -1: arch = '32-bit' else: arch = '64-bit' except (pefile.PEFormatError,KeyError) as detail: try: self._dprint(detail) m = MachO(path) format_ = 'Mach-O' for header in m.headers: if CPU_TYPE_NAMES.get(header.header.cputype,header.header.cputype) == 'x86_64': #if header.MH_MAGIC == MH_MAGIC_64: arch = 
# default function without python-idb
def _load_func_names_default(self, func_regex, path, ida_path):
    """Load the {address: name} function map of ``path`` via an IDA batch run.

    The map is cached as ``<out_dir>/<basename without ext>_func_names.pickle``.
    When the cache is missing (or ``self._clear`` is set), IDA is launched in
    autonomous mode with the ``g_save_fname_path`` script, which is expected
    to write the pickle.

    Args:
        func_regex: regex passed to the script to select function names.
        path: binary (or database) handed to IDA.
        ida_path: IDA executable to launch.

    Returns:
        The unpickled object written by the script.

    Raises:
        LoadFuncNamesError: IDA exited with a non-zero code, or the pickle
            could not be opened/decoded afterwards.
    """
    pickle_path = os.path.splitext(os.path.join(self._out_dir, os.path.basename(path)))[0] + '_func_names.pickle'
    if self._clear or not os.path.exists(pickle_path):
        cmd = [ida_path, '-A', '-S{}'.format(g_save_fname_path), '-Osave_func_names:{}:{}'.format(func_regex, pickle_path), path]
        self._dprint('saving function names for {}'.format(path))
        self._dprint(' '.join(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Drain both pipes so IDA cannot block on a full pipe buffer
        proc.communicate()
        if proc.returncode != 0:
            raise LoadFuncNamesError('function names saving failed: {}'.format(path))

    # IDA may exit with 0 without producing a usable pickle; report that as
    # the project error instead of leaking a raw I/O or unpickling exception.
    try:
        with open(pickle_path, 'rb') as f:
            func_names = pickle.load(f)
    except (IOError, OSError, EOFError, pickle.UnpicklingError):
        raise LoadFuncNamesError('function names loading failed: {}'.format(path))
    self._dprint('function names loaded: {}'.format(path))
    return func_names
def check_similarity(self, secondary, q=None):
    """Diff the primary binary against ``secondary`` and collect the hits.

    Exports ``secondary`` with BinExport, runs the differ, then reads the
    resulting .BinDiff SQLite database. A whole-binary hit is recorded when
    the overall similarity exceeds ``self._ws_th``; otherwise function-level
    hits above the function/instruction/basic-block thresholds are recorded
    for functions whose names are known on both sides.

    Args:
        secondary: path of the binary to compare with the primary.
        q: optional queue. When given, exactly one ``(high_ws, high_fs)``
            tuple is put on it (``(None, None)`` right before an export/diff
            error is raised). When omitted, the results are stored in
            ``self._high_ws`` / ``self._high_fs``.

    Raises:
        ProcExportError: the BinExport step returned a non-zero code.
        ProcDiffError: the differ returned a non-zero code.
    """
    format_, arch = self._get_machine_type(secondary)
    ida_path = self._get_ida_path(arch)
    self._dprint('IDA binary path for secondary: {}'.format(ida_path))

    if self._make_BinExport(secondary, ida_path) != 0:
        if q is not None:
            q.put((None, None))
        raise ProcExportError('secondary BinExport failed: {}'.format(secondary))

    retcode, cmd = self._make_BinDiff(secondary)
    if retcode != 0:
        if q is not None:
            q.put((None, None))
        raise ProcDiffError('BinDiff failed: {}'.format(cmd))

    bindiff_path = self._get_BinDiff_path(secondary)
    conn = sqlite3.connect(bindiff_path)
    try:
        c = conn.cursor()
        try:
            c.execute("SELECT similarity,confidence FROM metadata")
        except sqlite3.OperationalError as detail:
            print('[!] .BinDiff database ({}) is something wrong: {}'.format(bindiff_path, detail))
            # A consumer blocked in q.get() must always receive a result,
            # otherwise it waits forever; report "nothing found".
            if q is not None:
                q.put(({}, {}))
            return
        ws, wc = c.fetchone()
        self._dprint('whole binary similarity={} confidence={}'.format(ws, wc))
        c.execute("SELECT address1,address2,similarity,confidence FROM function WHERE similarity > ? and instructions > ? and basicblocks > ?", (self._fs_th, self._ins_th, self._bb_th))
        frows = c.fetchall()
        self._dprint('{} similar functions detected'.format(len(frows)))
    finally:
        # Close the database on every path so the .BinDiff file can be removed below
        conn.close()

    c_high_ws = {}
    c_high_fs = {}
    if ws > self._ws_th:
        c_high_ws[secondary] = {'similarity':ws, 'confidence':wc}
    elif frows:
        if self.use_pyidb:
            idb_path = self._get_idb_path(secondary, arch)
            func_names = self._load_func_names_pyidb(idb_path)
        else:
            func_names = self._load_func_names_default(self._func_regex, secondary, ida_path)
        for row in frows:
            addr1, addr2, fs, fc = row
            self._dprint('addr1={:#x}, addr2={:#x}, similarity={}, confidence={}'.format(addr1, addr2, fs, fc))
            if addr1 in self._func_names and addr2 in func_names:
                c_high_fs[(addr1, self._func_names[addr1], addr2, func_names[addr2], secondary)] = {'similarity':fs, 'confidence':fc}
        # Keep the .BinDiff only when it produced a hit (or when debugging)
        if not c_high_fs and not self._debug:
            os.remove(bindiff_path)
    else:
        if not self._debug:
            os.remove(bindiff_path)

    if q is None:
        self._high_ws = c_high_ws
        self._high_fs = c_high_fs
    else:
        q.put((c_high_ws, c_high_fs))
help="create an idb for the secondary binary") parser.add_argument('--use_pyidb', action='store_true', help="use python-idb") subparsers = parser.add_subparsers(dest='mode', help='mode: 1, m') parser_1 = subparsers.add_parser('1', help='BinDiff 1 to 1') parser_1.add_argument('secondary', help="secondary binary to compare") parser_m = subparsers.add_parser('m', help='BinDiff 1 to many') parser_m.add_argument('secondary_dir', help="secondary directory including binaries to compare") parser_m.add_argument('--recursively', '-r', action='store_true', help="getting binaries recursively") args = parser.parse_args() high_ws = high_fs = None if os.path.isfile(args.primary): start = time.time() try: bd = BinDiff(args.primary, args.out_dir, args.ws_th, args.fs_th, args.ins_th, args.bb_th, args.size_th, args.func_regex, args.debug, args.clear, args.newidb, args.use_pyidb) #bd = BinDiff(args.primary, args.out_dir, args.ws_th, args.fs_th, args.ins_th, args.bb_th, args.size_th, args.debug, args.clear, args.noidb, args.use_pyidb) if args.mode == '1' and os.path.isfile(args.secondary): if not bd.is_skipped(args.secondary): bd.check_similarity(args.secondary) bd.increment_count() elif args.mode == 'm' and os.path.isdir(args.secondary_dir): bd.check_similarities(args.secondary_dir, args.recursively) high_ws, high_fs, cnt = bd.get_result() except LocalError as e: print('[!] 
{} ({})'.format(str(e), type(e))) return elapsed = time.time() - start print('---------------------------------------------') print('[*] BinDiff result') print('[*] elapsed time = {} sec, number of diffing = {}'.format(elapsed, cnt)) print('[*] primary binary: (({}))'.format(os.path.basename(args.primary))) if high_ws: print('\n============== {} high similar binaries (>{}) ================'.format(len(high_ws), args.ws_th)) table = PrettyTable(['similarity', 'secondary binary']) for path,res in sorted(list(high_ws.items()), key=lambda x:x[1]['similarity'], reverse=True): table.add_row([res['similarity'], '(({}))'.format(os.path.basename(path))]) print(table) if high_fs: print('\n============== {} high similar functions (>{}), except high similar binaries ================'.format(len(high_fs), args.fs_th)) table = PrettyTable(['similarity', 'primary addr', 'primary name', 'secondary addr', 'secondary name', 'secondary binary']) for key,res in sorted(list(high_fs.items()), key=lambda x:(x[1]['similarity'], x[0][0]), reverse=True): addr1, func_name1, addr2, func_name2, path = key table.add_row([res['similarity'], '{:#x}'.format(addr1), func_name1[:0x20], '{:#x}'.format(addr2), func_name2[:0x20], '{}'.format(os.path.basename(path))]) print(table) if (not high_ws) and (not high_fs): print('\nno similar binaries/functions found') print('---------------------------------------------') if ( __name__ == "__main__" ): main() ================================================ FILE: bindiff/bindiff_export.idc ================================================ #include static main() { ChangeConfig("ABANDON_DATABASE=YES"); Batch(0); Wait(); //RunPlugin("binexport11", 2 ); //Exit( 1 - RunPlugin("zynamics_binexport_9", 2 )); //Exit( 1 - RunPlugin("zynamics_binexport_8", 2 )); //Exit( 1 - RunPlugin("binexport10", 2 )); //Exit( 1 - RunPlugin("binexport11", 2 )); //RunPlugin("binexport12_ida", 2 ); Exit( 1 - RunPlugin("binexport12_ida", 2 )); } 
def get_pfuncs(ea, track_th):
    """Return the start addresses of functions holding code references to ``ea``.

    ``track_th`` is the number of parent levels to follow: after the direct
    referrers are collected, the referrers of each of them are appended
    recursively while the remaining level count stays above zero. The list
    may contain duplicates.
    """
    callers = []
    for ref in CodeRefsTo(ea, False):
        callers.append(GetFunctionAttr(ref, FUNCATTR_START))

    remaining = track_th - 1
    if remaining > 0:
        # Gather the next level first, then append, so only the direct
        # referrers found above are expanded here
        ancestors = []
        for caller in callers:
            ancestors.extend(get_pfuncs(caller, remaining))
        callers.extend(ancestors)
    return callers
def main():
    """Pickle the {address: name} map of the functions matching the regex.

    Options are read from the "save_func_names" plugin option string
    (``-Osave_func_names:<regex>:<pickle path>``). Library and thunk
    functions are skipped, as are functions that have one of the addresses in
    ``g_pfe_list`` among their tracked parent functions. IDA is terminated
    with exit code 0 afterwards.
    """
    # Do not save changes to the database, to keep the window settings
    process_config_line("ABANDON_DATABASE=YES")

    opts = idaapi.get_plugin_options("save_func_names").split(':')
    # Everything after the first ':' is the path (re-joined because the path
    # itself may contain ':')
    out_path = ':'.join(opts[1:])
    pattern = re.compile(opts[0])

    collected = {}
    with open(out_path, 'wb') as out:
        for ea in Functions(idaapi.cvar.inf.minEA, idaapi.cvar.inf.maxEA):
            name = idc.get_func_name(ea)
            if not pattern.search(name):
                continue
            flags = idc.get_func_attr(ea, FUNCATTR_FLAGS)
            if flags & FUNC_LIB or flags & FUNC_THUNK:
                continue
            parents = get_pfuncs(ea, g_track_parent_th)
            if set(parents) & set(g_pfe_list):
                continue
            collected[ea] = name
        pickle.dump(collected, out)
    ida_pro.qexit(0)
def extract_unicode(data):
    """Return the printable UTF-16LE string located at the start of ``data``.

    ``data`` is a bytes-like buffer. A match requires at least two
    consecutive (printable ASCII byte, 0x00) pairs anchored at offset 0, so
    the result is a list with at most one decoded string (empty when the
    buffer does not start with such a sequence).
    """
    # The pattern must be a bytes pattern: the input is raw memory and the
    # matches are decoded below (a str pattern raises TypeError on bytes).
    pat = re.compile(rb'^(?:[\x20-\x7E][\x00]){2,}')
    return list(set([w.decode('utf-16le') for w in pat.findall(data)]))
# Detect assignments when inserting comments
class asg_parent_finder_t(ctree_visitor_t):
    """Find the assignment whose right-hand side is the call at ``call_ea``.

    After the visitor has been applied, ``asg_ea`` holds the address of the
    matching assignment expression, or BADADDR when none was seen.
    """

    def __init__(self, call_ea):
        ctree_visitor_t.__init__(self, CV_PARENTS)
        self.call_ea = call_ea
        self.asg_ea = BADADDR

    def visit_expr(self, expr):
        """Record the assignment address and return 1 on a match, else return 0."""
        if expr.op != cot_asg:
            return 0

        # The call may be the right-hand side itself or sit under one cast
        rhs = expr.y
        if rhs.op == cot_cast:
            rhs = rhs.x
        if rhs.op != cot_call or rhs.ea != self.call_ea:
            return 0

        self.asg_ea = expr.ea
        info(f'{self.call_ea:#x}: assignment detected, replaced with the ea {self.asg_ea:#x}')
        return 1
def set_decomplier_cmt(self, cfunc, ea, cmt):
    """Set ``cmt`` as a user comment in the pseudocode of ``cfunc`` for ``ea``.

    When the call at ``ea`` is the right-hand side of an assignment, the
    comment is placed at the assignment's address instead (to prevent orphan
    comment issues). The comments are saved and the pseudocode text refreshed.
    """
    visitor = asg_parent_finder_t(ea)
    visitor.apply_to_exprs(cfunc.body, None)

    if visitor.asg_ea != BADADDR:
        target_ea = visitor.asg_ea
    else:
        target_ea = ea

    loc = idaapi.treeloc_t()
    loc.ea = target_ea
    loc.itp = idaapi.ITP_SEMI
    cfunc.set_user_cmt(loc, cmt)
    cfunc.save_user_cmts()
    cfunc.refresh_func_ctext()
# IDA decompiler has no API forcing lvar name
def force_rename_lvar(self, ea, var, new_name):
    """Rename local variable ``var`` of the function containing ``ea``.

    ``new_name`` is tried first; if that fails, ``new_name_1`` up to
    ``new_name_<g_RENAME_RETRY_CNT>`` are tried in order. On success
    ``var.name`` is updated as well so the change is visible immediately;
    if every attempt fails, an error is logged and ``var`` is left untouched.
    """
    func_ea = get_func_attr(ea, FUNCATTR_START)
    debug('force_rename_lvar: function ea = {:#x}'.format(func_ea))
    old_name = var.name

    candidates = [new_name]
    candidates.extend(new_name + '_{}'.format(n) for n in range(1, g_RENAME_RETRY_CNT + 1))

    for candidate in candidates:
        if rename_lvar(func_ea, var.name, candidate):
            info('force_rename_lvar {:#x}: lvar name changed "{}" -> "{}"'.format(ea, old_name, candidate))
            var.name = candidate # to refresh immediately
            return

    error('{:#x}: renaming {} failed (rename_lvar, {} times)'.format(ea, var.name, g_RENAME_RETRY_CNT))
tif): success(f'{expr.ea:#x}: Force call type "{str(tif)}" to the operand "{callee_name}"') else: error(f'{expr.ea:#x}: Force call type failed') else: callee_name = 'UNRESOLVED' else: callee_name = get_name(expr.x.obj_ea) info(f'{address:#x} ({self.get_fn_offset(address)}): call {callee_name} ({expr.x.obj_ea:#x})') debug(f'{str(expr.x.type)}') debug(f'argc = {expr.a.size()}') arg_strs = [] for i in range(expr.a.size()): #breakpoint() arg = expr.a.at(i) # Sometimes the arg type in stubs is int * if str(arg.type).upper() in g_STR_TYPES or callee_name.find(g_stub_GetProcAddress) != -1: debug(f'arg{i} = {str(arg.type)}') ea = 0 if str(expr.x.type).find('__thiscall') != -1: debug('thiscall') if i == 0: ea = self.get_reg_value("ECX") else: ea = self.get_ptr_value(self.get_reg_value("ESP") + (i - 1) * 4) elif str(expr.x.type).find('__fastcall') != -1: debug('fastcall') if i == 0: ea = self.get_reg_value("RCX") elif i == 1: ea = self.get_reg_value("RDX") elif i == 2: ea = self.get_reg_value("R8") elif i == 3: ea = self.get_reg_value("R9") else: ea = self.get_ptr_value(self.get_reg_value("RSP") + (i - 4) * 4) else: # __stdcall, __cdecl, etc. 
def decode(self, enc, cst_val):
    """Decode ``enc`` with the position-dependent XOR scheme.

    Byte ``i`` of the output is ``enc[i] ^ ((i + cst_val) & 0xff) ^ cst_val``.
    The operation is its own inverse, and an empty input yields ``b''``.
    """
    out = bytearray()
    for idx, byte in enumerate(enc):
        key = ((idx + cst_val) & 0xff) ^ cst_val
        out.append(byte ^ key)
    return bytes(out)
* # Global options/variables g_DEBUG = False g_MAX_INSTRUCTIONS = 0 # 0 = disabled def info(msg): print("\033[34m\033[1m[*]\033[0m {}".format(msg)) def success(msg): print("\033[32m\033[1m[+]\033[0m {}".format(msg)) def error(msg): print("\033[31m\033[1m[!]\033[0m {}".format(msg)) def debug(msg): if g_DEBUG: print("\033[33m\033[1m[D]\033[0m {}".format(msg)) class TraceHook(DBG_Hooks, HexRaysUtils): def __init__(self, target_ea): DBG_Hooks.__init__(self) HexRaysUtils.__init__(self) self.traces = 0 self.target_ea = target_ea #self.current_tid = get_current_thread() def get_reg_value(self, reg_name): return get_reg_val(reg_name) def get_ptr_value(self, ptr): if idaapi.get_inf_structure().is_64bit(): return get_qword(ptr) else: return get_wide_dword(ptr) def get_string(self, ea, is_unicode=False): res = get_strlit_contents(ea, strtype=STRTYPE_C_16) if is_unicode else get_strlit_contents(ea) return res.decode() if res else None def dbg_trace(self, tid, ea): debug("[tid %X] trace %08X" % (tid, ea)) if ea < ida_ida.inf_get_min_ea() or ea > ida_ida.inf_get_max_ea(): raise Exception( "Received a trace callback for an address outside this database!" 
) insn = ida_ua.insn_t() insnlen = ida_ua.decode_insn(insn, ea) fn_name = get_name(get_func_attr(ea, FUNCATTR_START)) if insnlen > 0 and insn.itype in [NN_callni, NN_call, NN_callfi] and fn_name.find(g_stub_GetProcAddress) == -1: refresh_debugger_memory() self.get_arg_strings(ea) self.traces += 1 if g_MAX_INSTRUCTIONS and self.traces >= g_MAX_INSTRUCTIONS: request_disable_step_trace() request_suspend_process() if run_requests(): info('Requests suspending the process executed (g_MAX_INSTRUCTIONS)') else: error('Requests suspending the process failed (g_MAX_INSTRUCTIONS)') #return 1 return 0 # log it def dbg_thread_start(self, pid, tid, ea): info(f'[Thread {tid:#x}] {ea:#x}: New thread started') ''' add_bpt(ea) select_thread(tid) request_suspend_process() #if tid != self.current_tid: if not self.unhook(): error("Error uninstalling hooks!") else: info('Hooks uninstalled') #self.current_tid = tid end = prev_head(get_func_attr(ea, FUNCATTR_END)) self.target_ea = end info(f'Selecting the new thread to trace until {end:#x}') #dbg_del_thread(self.current_tid) #suspend_thread(self.current_tid) select_thread(tid) set_trace_base_address(ea) dbg_add_thread(tid) self.hook() enable_step_trace(1) # needed per thread? set_step_trace_options(ST_OPTIONS_MASK) request_enable_step_trace(1) request_run_to(end) #request_continue_process() if run_requests(): info('Requests successful') else: error('Requests failed') ''' def dbg_thread_exit(self, pid, tid, ea, exit_code): info(f'[Thread {tid:#x}] {ea:#x}: Thread exited with {exit_code:#x}') def dbg_run_to(self, pid, tid=0, ea=0): if ea == self.target_ea: info(f'[Thread {tid:#x}] Reached to the target {self.get_fn_offset(ea)}') elif pid != 0: error(f'[Thread {tid:#x}] The suspended address {self.get_fn_offset(ea)} is different from the target {self.get_fn_offset(self.target_ea)}. 
Probably another breakpoint set?') else: error(f'[Thread {tid:#x}] The suspended address {self.get_fn_offset(ea)} is different from the target {self.get_fn_offset(self.target_ea)}. Probably suspended by users manually?') info(f"Traced {self.traces} instructions") refresh_debugger_memory() self.print_summary() def dbg_process_exit(self, pid, tid, ea, code): error(f"[Thread {tid:#x}] Process exited with {code:#x} before reaching to the target") info(f"Traced {self.traces} instructions") self.print_summary() return 0 ''' def dbg_suspend_process(self): self.dbg_run_to(0, ea=get_ip_val()) ''' def main(): info('start') if not is_debugger_on(): error("Please run the process first!") return end = prev_head(get_func_attr(get_reg_val("EIP"), FUNCATTR_END)) info(f"Tracing to the end of function {end:#x}") debugHook = TraceHook(end) debugHook.hook() enable_step_trace(1) # Only the same thread works #set_step_trace_options(ST_OVER_DEBUG_SEG | ST_OVER_LIB_FUNC | ST_SKIP_LOOPS | ST_ALREADY_LOGGED | ST_DIFFERENTIAL) #set_step_trace_options(ST_OVER_DEBUG_SEG | ST_OVER_LIB_FUNC) set_step_trace_options(ST_OPTIONS_MASK) # all included run_to(end) while get_process_state() == DSTATE_RUN: #while get_process_state() != DSTATE_NOTASK: # as long as process is currently debugged wait_for_next_event(WFNE_ANY, 0) if not debugHook.unhook(): error("Error uninstalling hooks!") else: info('Hooks uninstalled') del debugHook info('done') if __name__ == '__main__': main() ================================================ FILE: callstrings/ida_callstrings_flare_emu.py ================================================ ''' ida_callstrings_flare_emu.py - string deobfuscation using flare-emu Takahiro Haruyama (@cci_forensics) ''' import idaapi #idaapi.require('logging') # <- This suppresses the flare-emu debug messages! 
class HexRaysEmu(HexRaysUtils):
    """
    HexRaysUtils backend that reads registers and memory from a flare-emu
    EmuHelper instead of a live debugger.
    """

    def __init__(self, eh):
        """eh -- the flare_emu.EmuHelper instance used for all emulator reads."""
        HexRaysUtils.__init__(self)
        self.eh = eh

    def get_reg_value(self, reg_name):
        """Return the emulated value of register reg_name (looked up in lower case)."""
        return self.eh.getRegVal(reg_name.lower())

    def get_ptr_value(self, ptr):
        """Return the pointer-sized value stored at emulated address ptr."""
        return self.eh.getEmuPtr(ptr)

    def get_string(self, ea, is_unicode=False):
        """
        Return the string at emulated address ea as a Python str.

        is_unicode -- True to read a wide string, False to read a byte string
        """
        if is_unicode:
            # Explicit little-endian, like the wide-string decoding in main():
            # plain 'utf-16' would consume a leading FF FE as a byte-order mark
            # and falls back to the host byte order when there is none.
            return self.eh.getEmuWideString(ea).decode('utf-16-le')
        return self.eh.getEmuString(ea).decode()

    def get_bytes(self, ea, size=0x20):
        """Return size raw bytes (0x20 by default) read from emulated address ea."""
        return self.eh.getEmuBytes(ea, size)
def is_high_entropy(v):
    """
    Heuristically tell whether v looks like a random 32-bit constant.

    Only the low 32 bits of v are inspected, so negative (sign-extended)
    values and values wider than 32 bits are accepted instead of raising
    OverflowError.

    Returns False when any of the four bytes is zero (e.g., 0, 1, 0x10000000)
    or when all four bytes are identical (e.g., 0x11111111, 0xffffffff),
    True otherwise.
    """
    vbytes = (v & 0xffffffff).to_bytes(4, 'little')
    if 0 in vbytes:
        return False
    return len(set(vbytes)) > 1


def inst_hook_cff(unicornObject, address, instructionSize, userData):
    """
    Instruction hook that breaks out of endless dispatcher loops.

    Two instruction shapes are watched:
      * cmp reg, imm (imm passing is_high_entropy) followed by jz/jnz
      * cmovz reg, imm/reg where reg currently holds a high-entropy value
    Each time the register differs from the compared value, a counter keyed by
    (address, register value) is incremented. Once a counter reaches
    g_MAX_SAME_STATE_VAR, the register is overwritten with the compared value,
    the address is excluded from further counting and all counters are reset.

    userData must provide "EmuHelper", "state_var_cnt" (dict) and
    "state_excluded" (list).
    """
    eh = userData["EmuHelper"]
    state_var_cnt = userData["state_var_cnt"]
    state_excluded = userData["state_excluded"]
    mismatch = False
    mnem = print_insn_mnem(address)

    if mnem == 'cmp' and get_operand_type(address, 0) == o_reg and get_operand_type(address, 1) == o_imm and \
       is_high_entropy(get_operand_value(address, 1)) and print_insn_mnem(next_head(address)) in ['jz', 'jnz']:
        reg_name = print_operand(address, 0)
        state_var = eh.getRegVal(reg_name)
        cmp_var = get_operand_value(address, 1)
        mismatch = state_var != cmp_var
    elif mnem in ['cmovz'] and get_operand_type(address, 0) == o_reg:
        reg_name = print_operand(address, 0)
        state_var = eh.getRegVal(reg_name)
        cmp_var = None
        if is_high_entropy(state_var):
            op1type = get_operand_type(address, 1)
            if op1type == o_imm:
                cmp_var = get_operand_value(address, 1)
            elif op1type == o_reg:
                cmp_var = eh.getRegVal(print_operand(address, 1))
        if cmp_var and state_var != cmp_var:
            mismatch = True

    if not mismatch or address in state_excluded:
        return

    uid = (address, state_var)
    state_var_cnt[uid] = state_var_cnt.get(uid, 0) + 1
    if state_var_cnt[uid] < g_MAX_SAME_STATE_VAR:
        return

    error(f'{address:#x}: CFF infinite loop detected. Update the state variable {state_var:#x} with the new one {cmp_var:#x}')
    debug([f'{ea:#x}: {var=:#x}, {cnt=}' for (ea, var), cnt in state_var_cnt.items()])
    debug(f'excluded: {[f"{e:#x}" for e in state_excluded]}')
    eh.uc.reg_write(eh.regs[reg_name], cmp_var)
    state_excluded.append(address)
    # Reset the counts of the external loops. The dict must be cleared in
    # place: rebinding the local name would leave the counters stored in
    # userData untouched.
    state_var_cnt.clear()
hremu.decode(enc, cnt_val) debug_bin('dec', dec) # Extract the ascii strings (no null termination) head = eh.getRegVal('esp') + offset ascs = extract_ascii(dec) if ascs: keys = [k for k, v in enc_heads.items() if v == head] if len(keys) == 1: success(f'{keys[0]:#x}: string decoded "{ascs[0]}"') hremu.set_decomplier_cmt(cfunc, keys[0], ascs[0]) else: success(f'string decoded "{ascs[0]}"') else: error(f'A constant value for decoding is not found') else: ans = ida_kernwin.ask_yn(0, 'only decode the selected function?') if ans == ida_kernwin.ASKBTN_YES: fvas = [get_func_attr(get_screen_ea(), FUNCATTR_START)] elif ans == ida_kernwin.ASKBTN_NO: fvas = idautils.Functions() else: info('canceled') return for fva in fvas: if get_func_flags(fva) & (FUNC_LIB | FUNC_THUNK): debug(f"{fva:#x}: skipping library or thunk function") continue fn_name = get_name(get_func_attr(fva, FUNCATTR_START)) if fn_name.find(g_stub_GetProcAddress) != -1: debug(f"{fva:#x}: skipping GetProcAddress stub function") continue print('-' * 100) info(f'{get_name(fva)} ({fva:#x})') ''' state_var_cnt = {} state_excluded = [] userData = { 'hremu': hremu, 'state_var_cnt': state_var_cnt, 'state_excluded': state_excluded, } eh.emulateRange(fva, callHook=call_hook, instructionHook=inst_hook_cff, hookData=userData, count=g_MAX_EMU_INSN) ''' inst_visit_cnt = {} userData = { 'hremu': hremu, 'inst_visit_cnt': inst_visit_cnt, } try: if g_FLAG_ALL_PATHS: info('The mode is iterateAllPaths') eh.iterateAllPaths(fva, noop, hookData=userData, callHook=call_hook) else: info('The mode is emulateRange') eh.emulateRange(fva, callHook=call_hook, instructionHook=inst_hook, hookData=userData) except unicorn.unicorn.UcError as e: error(f'{fva:#x}: unicorn error ({e})') refresh_idaview_anyway() eh.resetEmulatorHeapAndStack() print('-' * 100) hremu.print_summary() info('done') if __name__ == '__main__': main() ================================================ FILE: callstrings/ida_callstrings_static.py 
class static_decoder_t(ctree_visitor_t, HexRaysUtils):
    """
    Ctree visitor that decodes string literals copied by the helper calls
    listed in g_memcpy_names and records the result as a decompiler comment.
    """

    def __init__(self, cst_val, cfunc):
        """
        cst_val -- constant value passed to self.decode() for every string
        cfunc   -- decompiled function that receives the comments
        """
        ctree_visitor_t.__init__(self, CV_PARENTS | CV_POST | CV_RESTART)
        HexRaysUtils.__init__(self)
        self.cst_val = cst_val
        self.cfunc = cfunc

    def visit_expr(self, expr):
        """
        Decode the source string of a matching helper call.

        Always returns 0 so that the traversal continues.
        """
        if expr.op != cot_call or expr.x.op != cot_helper or expr.x.helper not in g_memcpy_names:
            return 0

        info(f'{expr.ea:#x}: target helper function "{expr.x.helper}" is called')
        arg_src = expr.a.at(1)

        # The source must be a string literal, optionally wrapped in a cast
        if arg_src.op == cot_str:
            enc = arg_src.string
        elif arg_src.op == cot_cast and arg_src.x.op == cot_str:
            enc = arg_src.x.string
        else:
            return 0

        enc = enc.encode('utf-16-le') if expr.x.helper == 'wmemcpy' else enc.encode()
        info(f'{expr.ea:#x}: src bytes = {enc}')

        try:
            dec = self.decode(enc, self.cst_val).decode()
        except UnicodeDecodeError:
            # Undecodable output is a failed decoding of this one string;
            # it must not abort the traversal of the remaining expressions.
            dec = None

        if dec:
            success(f'{expr.ea:#x}: string decoded "{dec}"')
            self.set_decomplier_cmt(self.cfunc, expr.ea, dec)
        else:
            error(f'{expr.ea:#x}: string decoding failed using a constant value ({self.cst_val:#x})')
        return 0
[get_func_attr(get_screen_ea(), FUNCATTR_START)] elif ans == ida_kernwin.ASKBTN_NO: fvas = idautils.Functions() else: info('canceled') return for fva in fvas: if get_func_flags(fva) & (FUNC_LIB | FUNC_THUNK): debug(f"{fva:#x}: skipping library or thunk function") continue fn_name = get_name(get_func_attr(fva, FUNCATTR_START)) if fn_name.find(g_stub_GetProcAddress) != -1: debug(f"{fva:#x}: skipping GetProcAddress stub function") continue print('-' * 100) info(f'{get_name(fva)} ({fva:#x})') cfunc = get_ctree_root(fva, cache=g_CACHE) cvf = cnt_val_finder_t() cvf.apply_to_exprs(cfunc.body, None) cnt_val = cvf.get_cnt_val() if cnt_val: sd = static_decoder_t(cnt_val, cfunc) sd.apply_to_exprs(cfunc.body, None) else: error(f'{fva:#x}: A constant value for decoding is not found') refresh_idaview_anyway() print('-' * 100) info('done') if __name__ == '__main__': main() ================================================ FILE: eset_crackme/README.org ================================================ * IDA Pro loader/processor modules for ESET CrackMe driver VM You can download the initial sample for the CrackMe challenge from [[https://join.eset.com/en/challenges/crack-me][here]]. 
before: [[./img/eset_before.png]] after: [[./img/eset_after.png]] ** Reference - https://quequero.org/2016/01/eset-crackme-challenge-2015-walkthrough/ - http://mshetta.blogspot.jp/2016/11/join-eset-crackme-2015-solution.html ================================================ FILE: eset_crackme/loaders/ida_loader_drv_vm.py ================================================ import idaapi import ida_segment from idc import * from struct import * DATA_SEG_START = 0x10000 # may be changed def accept_file(li, filename): sig = int16(li.read(2)) if sig in [0x3713, 0x481c, 0x1337]: return {'format': "ESET Crackme driver VM program"} else: return 0 def int16(b): return unpack('> 4, 1, 'VM_DATA', "DATA") # segmentation (base should be in paragraphs 16-bits) li.file2base(li.tell(), DATA_SEG_START, DATA_SEG_START + size - data_off, 1) ''' myAddSeg(code_off, data_off, 0, 1, 'VM_CODE', "CODE") li.file2base(li.tell(), code_off, data_off, 1) myAddSeg(data_off, size, 0, 1, 'VM_DATA', "DATA") li.file2base(li.tell(), data_off, size, 1) ''' # initialize set_inf_attr(INF_START_EA, 0) set_inf_attr(INF_START_IP, 0) set_inf_attr(INF_START_CS, 0) #add_entry(0, ep, "start", 1) add_entry(0, 0, "start", 1) # should return 1 or terminate immediately return 1 ================================================ FILE: eset_crackme/procs/ida_processor_drv_vm.py ================================================ import sys import copy import ida_idaapi import ida_idp import ida_ua import ida_bytes import ida_xref import ida_offset import ida_problems import ida_lines import ida_segment from ida_idp import CF_USE1, CF_USE2, CF_CHG1, CF_CHG2, CF_STOP, CF_JUMP, CF_SHFT, CF_CALL # enum definitions from VM engine idb # enum_vm_size SIZE_BYTE = 0 SIZE_WORD = 1 SIZE_DWORD = 2 # enum_vm_type TYPE_REG_VAL = 0 TYPE_REG_PTR = 1 TYPE_IMM_VAL = 2 TYPE_DATA_OFF = 3 # enum_vm_cmp CMP_EQUAL = 0 CMP_NOT_EQUAL = 1 CMP_LESS_THAN = 2 # enum_vm_arith ARITH_XOR = 0 ARITH_ADD = 1 ARITH_SUB = 2 ARITH_SHL = 3 ARITH_SHR = 4 ARITH_ROL 
= 5 ARITH_ROR = 6 ARITH_MOD = 7 # ---------------------------------------------------------------------- class eset_drv_vm_processor_t(ida_idp.processor_t): """ Processor module classes must derive from ida_idp.processor_t """ # IDP id ( Numbers above 0x8000 are reserved for the third-party modules) id = 0x8fff # Processor features flag = ida_idp.PRN_HEX | ida_idp.PR_RNAMESOK # Number of bits in a byte for code segments (usually 8) # IDA supports values up to 32 bits cnbits = 8 # Number of bits in a byte for non-code segments (usually 8) # IDA supports values up to 32 bits dnbits = 8 # short processor names # Each name should be shorter than 9 characters psnames = ['eset_vm'] # long processor names # No restriction on name lengthes. plnames = ['ESET Crackme driver VM processor'] # size of a segment register in bytes segreg_size = 0 # Array of instructions instruc = [ {'name': '', 'feature': 0}, # placeholder for "not an instruction" {'name': 'hlt', 'feature': CF_STOP, 'cmt': "halt CPU"}, {'name': 'mov', 'feature': CF_USE1 | CF_USE2 | CF_CHG1, 'cmt': "move"}, {'name': 'ncall', 'feature': CF_USE1 | CF_CALL, 'cmt': "call native function"}, {'name': 'lcall', 'feature': CF_USE1 | CF_USE2 | CF_CALL, 'cmt': "call library function"}, {'name': 'push', 'feature': CF_USE1, 'cmt': "push to stack"}, {'name': 'pop', 'feature': CF_USE1 | CF_CHG1, 'cmt': "pop from stack"}, {'name': 'cmpeq', 'feature': CF_USE1 | CF_USE2, 'cmt': "compare #0 (equal)"}, {'name': 'cmpne', 'feature': CF_USE1 | CF_USE2, 'cmt': "compare #1 (not equal)"}, {'name': 'cmpb', 'feature': CF_USE1 | CF_USE2, 'cmt': "compare #2 (less than)"}, {'name': 'jmp', 'feature': CF_USE1 | CF_JUMP | CF_STOP, 'cmt': "jump #0 (unconditional)"}, {'name': 'cjmp', 'feature': CF_USE1 | CF_JUMP, 'cmt': "jump #1 (conditional)"}, {'name': 'call', 'feature': CF_USE1 | CF_CALL, 'cmt': "call VM function"}, {'name': 'ret', 'feature': 0, 'cmt': "return"}, {'name': 'xor', 'feature': CF_USE1 | CF_USE2 | CF_CHG1, 'cmt': "arithmetic operation 
#0 (xor)"}, {'name': 'add', 'feature': CF_USE1 | CF_USE2 | CF_CHG1, 'cmt': "arithmetic operation #1 (add)"}, {'name': 'sub', 'feature': CF_USE1 | CF_USE2 | CF_CHG1, 'cmt': "arithmetic operation #2 (sub)"}, {'name': 'shl', 'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT, 'cmt': "arithmetic operation #3 (shift left)"}, {'name': 'shr', 'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT, 'cmt': "arithmetic operation #4 (shift right)"}, {'name': 'rol', 'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT, 'cmt': "arithmetic operation #5 (rotation left)"}, {'name': 'ror', 'feature': CF_USE1 | CF_USE2 | CF_CHG1 | CF_SHFT, 'cmt': "arithmetic operation #6 (rotation right)"}, {'name': 'mod', 'feature': CF_USE1 | CF_USE2 | CF_CHG1, 'cmt': "arithmetic operation #7 (modulo)"}, {'name': 'alloc', 'feature': CF_USE1, 'cmt': "allocate buffer"}, {'name': 'free', 'feature': CF_USE1, 'cmt': "free buffer"}, {'name': 'loadVM','feature': CF_USE1 | CF_USE2, 'cmt': "load another VM"}, {'name': 'nop', 'feature': 0, 'cmt': "nop"}, ] # icode of the first instruction instruc_start = 0 # icode of the last instruction + 1 instruc_end = len(instruc) + 1 # Size of long double (tbyte) for this processor (meaningful only if ash.a_tbyte != NULL) (optional) # tbyte_size = 0 # # Number of digits in floating numbers after the decimal point. # If an element of this array equals 0, then the corresponding # floating point data is not used for the processor. # This array is used to align numbers in the output. 
# real_width[0] - number of digits for short floats (only PDP-11 has them) # real_width[1] - number of digits for "float" # real_width[2] - number of digits for "double" # real_width[3] - number of digits for "long double" # Example: IBM PC module has { 0,7,15,19 } # # (optional) #real_width = (0, 7, 0, 0) # only one assembler is supported assembler = { # flag (mostly for the format) 'flag' : ida_idp.ASH_HEXF3 | ida_idp.ASD_DECF0 | ida_idp.ASO_OCTF5 | ida_idp.ASB_BINF0 | ida_idp.AS_N2CHR, # user defined flags (local only for IDP) (optional) #'uflag' : 0, # Assembler name (displayed in menus) 'name': "ESET Crackme driver VM assembler", # array of automatically generated header lines they appear at the start of disassembled text (optional) 'header': [".esetvm"], # array of unsupported instructions (array of insn.itype) (optional) #'badworks': [], # org directive 'origin': ".org", # end directive 'end': ".end", # comment string (see also cmnt2) 'cmnt': ";", # ASCII string delimiter 'ascsep': "\"", # ASCII char constant delimiter 'accsep': "'", # ASCII special chars (they can't appear in character and ascii constants) 'esccodes': "\"'", # # Data representation (db,dw,...): # # ASCII string directive 'a_ascii': ".char", # byte directive 'a_byte': "db", # word directive 'a_word': "dw", # remove if not allowed 'a_dword': "dd", # remove if not allowed # 'a_qword': "dq", # float; 4bytes; remove if not allowed #'a_float': ".float", # uninitialized data directive (should include '%s' for the size of data) 'a_bss': ".space %s", # 'equ' Used if AS_UNEQU is set (optional) #'a_equ': ".equ", # 'seg ' prefix (example: push seg seg001) 'a_seg': "seg", # current IP (instruction pointer) symbol in assembler 'a_curip': "$", # "public" name keyword. NULL-gen default, ""-do not generate 'a_public': ".def", # "weak" name keyword. 
def notify_get_autocmt(self, insn):
    """
    Get the automatic comment of an instruction.

    insn -- the instruction in question; its itype indexes self.instruc

    @return: the 'cmt' entry of the instruction description, or None when the
             description has no comment
    """
    description = self.instruc[insn.itype]
    return description.get('cmt')
def notify_out_operand(self, ctx, op):
    """
    Generate text representation of an instruction operand.
    This function shouldn't change the database, flags or anything else.
    All these actions should be performed only by the emu() function.

    Registers are prefixed with 'byte '/'word ' for sub-dword sizes, register
    pointers are printed as '<size> ptr [reg]', immediates as '#value', and
    near/memory operands as a name expression (or a highlighted raw address
    when no name can be produced).

    Returns: True-ok, False-operand is hidden (unknown operand type).
    """
    optype = op.type
    dtype = op.dtype
    signed = 0

    if optype == ida_ua.o_reg:
        if dtype == ida_ua.dt_byte:
            ctx.out_keyword('byte ')
        elif dtype == ida_ua.dt_word:
            ctx.out_keyword('word ')
        ctx.out_register(self.reg_names[op.reg])

    elif optype == ida_ua.o_phrase:
        if dtype == ida_ua.dt_dword:
            ctx.out_keyword('dword ptr ')
        elif dtype == ida_ua.dt_byte:
            ctx.out_keyword('byte ptr ')
        elif dtype == ida_ua.dt_word:
            ctx.out_keyword('word ptr ')
        ctx.out_symbol('[')
        ctx.out_register(self.reg_names[op.reg])
        ctx.out_symbol(']')

    elif optype == ida_ua.o_imm:
        ctx.out_symbol('#')
        ctx.out_value(op, ida_ua.OOFW_IMM | signed)

    elif optype in [ida_ua.o_near, ida_ua.o_mem]:
        r = ctx.out_name_expr(op, op.addr, ida_idaapi.BADADDR)
        if not r:
            # No name available: print the raw address in the error color
            ctx.out_tagon(ida_lines.COLOR_ERROR)
            ctx.out_long(op.addr, 16)
            ctx.out_tagoff(ida_lines.COLOR_ERROR)
            ida_problems.remember_problem(ida_problems.PR_NONAME, ctx.insn.ea)

    else:
        return False

    return True


def notify_out_insn(self, ctx):
    """
    Generate text representation of an instruction in 'ctx.insn' structure.
    This function shouldn't change the database, flags or anything else.
    All these actions should be performed only by emu() function.

    Returns: nothing
    """
    ctx.out_mnemonic()

    # output first operand
    # kernel will call outop()
    if ctx.insn.Op1.type != ida_ua.o_void:
        ctx.out_one_operand(0)

    # output the rest of operands separated by commas
    # (range, not xrange: xrange does not exist on Python 3)
    for i in range(1, 3):
        if ctx.insn[i].type == ida_ua.o_void:
            break
        ctx.out_symbol(',')
        ctx.out_char(' ')
        ctx.out_one_operand(i)

    ctx.set_gen_cmt()  # generate comment at the next call to MakeLine()
    ctx.flush_outbuf()
Returns: insn.size (=the size of the decoded instruction) or zero """ opc = insn.get_next_byte() # cmp (0x6), jmp (0x7), arithmetic operation (0xa): multiple instructions # 0xe - 0xff: nop if opc > 0xd: insn.itype = self.itype_nop elif opc > 0xa: insn.itype = self.itype_hlt + opc + 2 + 1 + 7 elif opc > 7: insn.itype = self.itype_hlt + opc + 2 + 1 elif opc > 6: insn.itype = self.itype_hlt + opc + 2 else: insn.itype = self.itype_hlt + opc if insn.itype not in [self.itype_hlt, self.itype_ret, self.itype_nop]: if insn.itype in [self.itype_call, self.itype_jmp]: if insn.itype == self.itype_jmp: cflag = insn.get_next_byte() # check conditional flag if cflag > 1: return 0 # invalid flag value insn.itype += cflag addr = insn.get_next_dword() self.fill_near(insn.Op1, ida_ua.dt_dword, addr) elif insn.itype == self.itype_pop: regno = insn.get_next_byte() & 0xf self.fill_reg(insn.Op1, ida_ua.dt_dword, regno) elif insn.itype in [self.itype_push, self.itype_alloc, self.itype_free, self.itype_ncall]: b1 = insn.get_next_byte() dtype = ida_ua.dt_dword if insn.itype == self.itype_ncall else b1 >> 6 if self.set_operand(insn, insn.Op1, (b1 >> 4) & 3, b1 & 0xf, dtype): return 0 # invalid dtype elif insn.itype in [self.itype_lcall, self.itype_loadVM]: b1 = insn.get_next_byte() b2 = insn.get_next_byte() if self.set_operand(insn, insn.Op1, b2 & 3, b1 & 0xf, ida_ua.dt_dword): return 0 # invalid dtype dtype = ida_ua.dt_dword if insn.itype == self.itype_lcall else (b2 >> 4) & 3 if self.set_operand(insn, insn.Op2, (b2 >> 2) & 3, b1 >> 4, dtype): return 0 # invalid dtype elif insn.itype == self.itype_mov: b1 = insn.get_next_byte() b2 = insn.get_next_byte() dtype = (b2 >> 4) & 3 if self.set_operand(insn, insn.Op2, b2 & 3, b1 >> 4, dtype): return 0 # invalid dtype dst_regno = b1 & 0xf if (b2 >> 2) & 3: # used as pointer self.fill_phrase(insn.Op1, dtype, dst_regno) #insn.Op2.specflag1 = 1 else: self.fill_reg(insn.Op1, dtype, dst_regno) elif insn.itype in [self.itype_cmpeq, self.itype_xor]: b1 = 
def init_instructions(self):
    """
    Create the itype_XXX constants from the instruction table.

    Every entry of self.instruc gets an attribute 'itype_<name>' holding its
    index; the entry with the empty name becomes 'itype_null'.
    """
    for icode, entry in enumerate(self.instruc):
        name = entry['name']
        setattr(self, 'itype_' + name if name != '' else 'itype_null', icode)

    # icode of the last instruction + 1
    self.instruc_end = len(self.instruc) + 1


def init_registers(self):
    """
    This function parses the register table and creates corresponding
    ireg_XXX constants
    """
    # Registers definition
    self.reg_names = [
        # General purpose registers
        "r0", "r1", "r2", "r3", "r4", "r5",
        # SP
        "r6",
        # VM pointer
        "r7",
        # VM size
        "r8",
        # ntoskrnl_base
        "r9",
        # arg registers
        "r10", "r11", "r12", "r13", "r14", "r15",
        # Fake segment registers
        "CS", "DS",
    ]

    # Create the ireg_XXXX constants
    # (enumerate instead of xrange, which does not exist on Python 3)
    for regno, reg_name in enumerate(self.reg_names):
        setattr(self, 'ireg_' + reg_name, regno)

    # Segment register information (use virtual CS and DS registers if your
    # processor doesn't have segment registers):
    self.reg_first_sreg = self.ireg_CS
    self.reg_last_sreg = self.ireg_DS

    # number of CS register
    self.reg_code_sreg = self.ireg_CS
    # number of DS register
    self.reg_data_sreg = self.ireg_DS
# It should return a new instance of a class derived from ida_idp.processor_t def PROCESSOR_ENTRY(): return eset_drv_vm_processor_t() ================================================ FILE: fn_fuzzy/README.org ================================================ #+OPTIONS: ^:{} #+TITLE: fn_fuzzy.py - IDAPython script for fast multiple binary diffing triage * Motivation See the [[https://conference.hitb.org/hitbsecconf2019ams/sessions/fn_fuzzy-fast-multiple-binary-diffing-triage-with-ida/][conference information]] or [[https://www.carbonblack.com/2019/05/09/fn_fuzzy-fast-multiple-binary-diffing-triage-with-ida/][blog]] post. * how to use - fn_fuzzy.py :: IDAPython script to export/compare fuzzy hashes of the sample - cli_export.py :: python wrapper script to export fuzzy hashes of multiple samples The typical usage is to run cli_export.py to make a database for large idbs then compare on IDA by executing fn_fuzzy.py. [[./img/fn_fuzzy.png]] [[./img/res_summary.png]] [[./img/res_funcs.png]] * supported IDB version IDBs generated by IDA 6.9 or later due to SHA256 API * required python packages - mmh3 - [[https://github.com/williballenthin/python-idb%0A][python-idb]] ================================================ FILE: fn_fuzzy/cli_export.py ================================================ # cli_export.py - batch export script for fn_fuzzy # Takahiro Haruyama (@cci_forensics) import argparse, subprocess, os, sqlite3, time, sys import idb # python-idb import logging logging.basicConfig(level=logging.ERROR) # to suppress python-idb warning # plz edit the following paths g_ida_dir = r'C:\analysisw\tool\IDA' g_db_path = r'C:\analysisw\tics\fn_fuzzy.sqlite' g_fn_fuzzy_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fn_fuzzy.py') g_min_bytes = 0x10 # minimum number of extracted code bytes per function g_analyzed_prefix = r'fn_' # analyzed function name prefix (regex) class LocalError(Exception): pass class ProcExportError(LocalError): pass def info(msg): 
def info(msg):
    """Print *msg* with the ``[*]`` (informational) prefix."""
    print("[*] {}".format(msg))


def success(msg):
    """Print *msg* with the ``[+]`` (success) prefix."""
    print("[+] {}".format(msg))


def error(msg):
    """Print *msg* with the ``[!]`` (error) prefix."""
    print("[!] {}".format(msg))


def init_db(cur):
    """Create the ``sample`` / ``function`` tables and their indexes.

    Nothing is done when the database behind *cur* already contains at least
    one table.
    """
    cur.execute("SELECT * FROM sqlite_master WHERE type='table'")
    if cur.fetchone() is not None:
        return
    info('DB initialized')
    cur.execute("CREATE TABLE IF NOT EXISTS sample(sha256 UNIQUE, path)")
    #cur.execute("CREATE INDEX sha256_index ON sample(sha256)")
    cur.execute("CREATE INDEX path_index ON sample(path)")
    cur.execute("CREATE TABLE IF NOT EXISTS function(sha256, fva, fname, fhd, fhm, f_ana, bsize, ptype, UNIQUE(sha256, fva))")
    cur.execute("CREATE INDEX f_ana_index ON function(f_ana)")
    cur.execute("CREATE INDEX bsize_index ON function(bsize)")


def existed(cur, sha256):
    """Return True when a ``sample`` row with this *sha256* exists."""
    cur.execute("SELECT * FROM sample WHERE sha256 = ?", (sha256,))
    return cur.fetchone() is not None


def remove(cur, sha256):
    """Delete the ``sample`` row and all ``function`` rows of *sha256*."""
    cur.execute("DELETE FROM sample WHERE sha256 = ?", (sha256,))
    cur.execute("DELETE FROM function WHERE sha256 = ?", (sha256,))


def select_ida_executable(sig):
    """Map the 4-byte IDB signature to the IDA binary that opens it.

    *sig* is read from a file opened in binary mode, so it is ``bytes``.
    Comparing it with the str ``'IDA1'`` is always False on Python 3, which
    made every database (including 32-bit ``.idb`` files) go to ``ida64.exe``.
    """
    return 'ida.exe' if sig == b'IDA1' else 'ida64.exe'


def export(f_debug, idb_path, outdb, min_, f_ex_libthunk, f_update, f_ana_exp, ana_pre, f_remove):
    """Export the fuzzy hashes of one IDB into *outdb* by running IDA on it.

    Returns 1 when the IDA subprocess reports a successful export, and 0 when
    the file is skipped: wrong extension or signature, SHA256 unavailable,
    records removed because of *f_remove*, records already present without
    *f_update*, or the subprocess exiting with code 2.

    Raises ProcExportError for any other subprocess return code.
    """
    # check the ext and signature
    ext = os.path.splitext(idb_path)[1]
    if ext not in ('.idb', '.i64'):
        return 0
    with open(idb_path, 'rb') as f:
        sig = f.read(4)
    if sig not in (b'IDA1', b'IDA2'):
        return 0

    # check the database record for the idb
    conn = sqlite3.connect(outdb)
    try:
        cur = conn.cursor()
        init_db(cur)
        with idb.from_file(idb_path) as db:
            # Fix: Cause NameError. need to rewrite in IDA batch mode to calculate SHA256
            api = idb.IDAPython(db)
            try:
                sha256 = api.ida_nalt.retrieve_input_file_sha256()
            except KeyError:
                error('{}: ida_nalt.retrieve_input_file_sha256() failed. The API is supported in 6.9 or later idb version. Check the API on IDA for validation.'.format(idb_path))
                return 0
            sha256 = sha256.lower()
            if f_remove:
                remove(cur, sha256)
                success('{}: the records successfully removed (SHA256={})'.format(idb_path, sha256))
                conn.commit()
                return 0
            if existed(cur, sha256) and not f_update:
                info('{}: The sample records are present in DB (SHA256={}). Skipped.'.format(idb_path, sha256))
                return 0
        conn.commit()
    finally:
        # Release the sqlite handle on every path, and before IDA is started:
        # the IDAPython script run by IDA writes to the same database file.
        conn.close()

    ida_path = os.path.join(g_ida_dir, select_ida_executable(sig))
    #cmd = [ida_path, '-L{}'.format(os.path.join(g_ida_dir, 'debug.log')), '-S{}'.format(g_fn_fuzzy_path), '-Ofn_fuzzy:{}:{}:{}:{}:{}:{}'.format(min_, f_ex_libthunk, f_update, f_ana_exp, ana_pre, outdb), idb_path]
    cmd = [ida_path, '-S{}'.format(g_fn_fuzzy_path), '-Ofn_fuzzy:{}:{}:{}:{}:{}:{}'.format(min_, f_ex_libthunk, f_update, f_ana_exp, ana_pre, outdb), idb_path]
    if not f_debug:
        # -A is inserted unless debugging; the --debug flag exists to display the IDA dialog.
        cmd.insert(1, '-A')

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    proc.communicate()
    if proc.returncode == 0:
        success('{}: successfully exported'.format(idb_path))
        return 1
    if proc.returncode == 2:  # skipped
        return 0
    # maybe 1
    raise ProcExportError('{}: Something wrong with the IDAPython script (returncode={}). Use -d for debug'.format(idb_path, proc.returncode))


def list_file(d):
    """Yield the path of every regular file directly inside directory *d*."""
    for entry in os.listdir(d):
        path = os.path.join(d, entry)
        if os.path.isfile(path):
            yield path


def list_file_recursive(d):
    """Yield the path of every file found while walking the tree under *d*."""
    for root, _dirs, files in os.walk(d):
        for file_ in files:
            yield os.path.join(root, file_)
def main():
    """Dump the type information of the open IDB next to it and quit IDA.

    The output file is the IDB path with its extension replaced by ``.idc``.
    """
    path = os.path.splitext(get_idb_path())[0] + '.idc'
    gen_file(OFILE_IDC, path, 0, 0, GENFLG_IDCTYPE)
    # Exit() is the old IDC name; every other call in this script, and the
    # exit calls in fn_fuzzy.py, use the current names.  The script is
    # launched through subprocess and the caller waits for IDA to terminate,
    # so failing to exit here would block the caller.
    qexit(0)


if __name__ == "__main__":
    main()
class defaultdictRecurse(defaultdict):
    """``defaultdict`` whose default factory is its own class.

    Looking up a missing key therefore creates another ``defaultdictRecurse``,
    which allows assignments such as ``d[a][b][c] = v`` without preparing the
    intermediate levels.
    """

    def __init__(self):
        self.default_factory = type(self)


class import_handler_t(ida_kernwin.action_handler_t):
    """Popup action that imports names and prototypes of the selected rows."""

    def __init__(self, items, idb_path, title):
        ida_kernwin.action_handler_t.__init__(self)
        self.items = items        # chooser rows (lists of strings)
        self.idb_path = idb_path  # path of the secondary IDB
        self.title = title        # title of the chooser to refresh

    def import_types(self):
        """Import the secondary IDB's types through a generated ``.idc`` file.

        When ``<idb basename>.idc`` does not exist yet, IDA is started on the
        secondary IDB with dump_types.py to create it.  Returns False when
        that subprocess fails, True after the IDC script has been executed.
        """
        idc_path = os.path.splitext(self.idb_path)[0] + '.idc'

        # dump type information from the 2nd idb
        if not os.path.exists(idc_path):
            with open(self.idb_path, 'rb') as f:
                sig = f.read(4)
            # sig is bytes (binary read); comparing it with the str 'IDA1'
            # is always False on Python 3 and always selected ida64.exe.
            ida = 'ida.exe' if sig == b'IDA1' else 'ida64.exe'
            ida_path = os.path.join(idadir(), ida)
            cmd = [ida_path, '-S{}'.format(g_dump_types_path), self.idb_path]
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            proc.communicate()
            if proc.returncode != 0:
                error('{}: type information dumping failed'.format(self.idb_path))
                return False
            success('{}: type information successfully dumped'.format(self.idb_path))

        # import the type information
        ida_expr.exec_idc_script(None, str(idc_path), "main", None, 0)
        return True

    def _apply_prototype(self, ea, sptype):
        """Parse *sptype* and apply it at *ea*; return what apply_tinfo returns."""
        tinfo = idaapi.tinfo_t()
        idaapi.parse_decl2(idaapi.cvar.idati, sptype, tinfo, 0)
        #idaapi.apply_callee_tinfo(ea, tinfo)
        return idaapi.apply_tinfo(ea, tinfo, 0)

    def activate(self, ctx):
        """Rename, re-type and comment the primary function of each selected row."""
        for idx in ctx.chooser_selection:
            row = self.items[idx]

            # rename the function
            ea = get_name_ea_simple(row[2])
            sfname = str(row[4])
            #set_name(ea, sfname)
            idaapi.do_name_anyway(ea, sfname)
            success('{:#x}: renamed to {}'.format(ea, sfname))

            # set the function prototype
            sptype = str(row[5])
            if sptype != 'None':
                if self._apply_prototype(ea, sptype):
                    success('{:#x}: function prototype set to {}'.format(ea, sptype))
                else:
                    error('{:#x}: function prototype set FAILED (maybe you should import the types?)'.format(ea))
                    if ask_yn(0, 'Do you import types from the secondary idb?') == 1:
                        if self.import_types():
                            # retry once now that the types are available
                            if self._apply_prototype(ea, sptype):
                                success('{:#x}: function prototype set to {}'.format(ea, sptype))
                            else:
                                error('{:#x}: function prototype set FAILED again'.format(ea))

            # insert the comment
            score = row[0]
            mmatch = row[1]
            cmt = 'fn_fuzzy: ssdeep={}, machoc={}'.format(score, mmatch)
            set_func_cmt(ea, cmt, 1)
            #set_decomplier_cmt(ea, cmt) # not sure how to avoid orphan comment

        # update the Choose rows
        ida_kernwin.refresh_chooser(self.title)

    def update(self, ctx):
        """Keep the action always enabled."""
        # Alternative kept for reference: enable only for chooser widgets via
        # AST_ENABLE_FOR_WIDGET / AST_DISABLE_FOR_WIDGET and
        # ida_kernwin.is_chooser_widget(ctx.widget_type).
        return idaapi.AST_ENABLE_ALWAYS


class FnCh(ida_kernwin.Choose):
    """Multi-select chooser listing the matched functions of one sample."""

    def __init__(self, title, mfn, idb_path):
        self.mfn = mfn            # {primary fva: match record}
        self.idb_path = idb_path  # path of the secondary IDB
        self.title = title
        ida_kernwin.Choose.__init__(
            self,
            title,
            [
                ["ssdeep score", 10 | ida_kernwin.Choose.CHCOL_DEC],
                ["machoc matched", 10 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["primary function", 30 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["primary bsize", 10 | ida_kernwin.Choose.CHCOL_DEC],
                ["secondary analyzed function", 30 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["secondary prototype", 40 | ida_kernwin.Choose.CHCOL_PLAIN],
            ],
            flags=ida_kernwin.Choose.CH_MULTI)

    def OnInit(self):
        """Rebuild the rows, highest score first, skipping records without a name."""
        self.items = []
        ranked = sorted(list(self.mfn.items()), key=lambda x: x[1]['score'], reverse=True)
        for fva, v in ranked:
            if v['sfname']:
                self.items.append(['{}'.format(v['score']),
                                   '{}'.format(v['cfg_match']),
                                   str(get_name(fva)),
                                   '{}'.format(v['pbsize']),
                                   str(v['sfname']),
                                   '{}'.format(v['sptype'])])
        return True

    def OnPopup(self, form, popup_handle):
        """Attach the import action to the popup menu."""
        actname = "choose:actFnFuzzyImport"
        desc = ida_kernwin.action_desc_t(actname, 'Import function name and prototype',
                                         import_handler_t(self.items, self.idb_path, self.title))
        ida_kernwin.attach_dynamic_action_to_popup(form, popup_handle, desc)

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        """Jump to the primary function of the first selected row."""
        idx = n[0]  # due to CH_MULTI
        idc.Jump(get_name_ea_simple(self.items[idx][2]))

    def OnRefresh(self, n):
        self.OnInit()
        # try to preserve the cursor
        #return [ida_kernwin.Choose.ALL_CHANGED] + self.adjust_last_item(n)
        #return n
        return None

    def OnClose(self):
        print("closed ", self.title)


class SummaryCh(ida_kernwin.Choose):
    """Chooser summarising, per compared sample, how many functions matched."""

    def __init__(self, title, res):
        self.res = res  # {sha256: {'path': ..., 'mcnt': {...}, 'mfn': {...}}}
        ida_kernwin.Choose.__init__(
            self,
            title,
            [
                ["SHA256", 20 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["total similar functions", 20 | ida_kernwin.Choose.CHCOL_DEC],
                ["analyzed similar functions", 20 | ida_kernwin.Choose.CHCOL_DEC],
                ["IDB path", 80 | ida_kernwin.Choose.CHCOL_PATH],
            ])
        self.items = []

    def OnInit(self):
        """Build one row per sample with at least one match, most matches first."""
        # Start from an empty list (as FnCh.OnInit does) so that running the
        # initialisation again cannot append duplicate rows.
        self.items = []
        ranked = sorted(list(self.res.items()), key=lambda x: x[1]['mcnt']['total'], reverse=True)
        for sha256, v in ranked:
            if v['mcnt']['total'] > 0:
                self.items.append([str(sha256),
                                   '{}'.format(v['mcnt']['total']),
                                   '{}'.format(v['mcnt']['analyzed']),
                                   str(v['path'])])
        return True

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        """Open the per-function chooser for the selected sample."""
        sha256 = self.items[n][0]
        c = FnCh("similarities with {}(snip)".format(sha256[:8]),
                 self.res[sha256]['mfn'], self.res[sha256]['path'])
        c.Show()

    def OnRefresh(self, n):
        return n

    def OnClose(self):
        print("closed ", self.title)
'cCGroup': ida_kernwin.Form.ChkGroupControl(("cAnaCmp", "cFolCmp")), 'iFolder': ida_kernwin.Form.DirInput(), 'iRatio': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_DEC), 'iSimilarity': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_DEC), 'iSimilarityCFG': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_DEC), 'iMaxBytesForScore': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_HEX), }) def OnFormChange(self, fid): if fid == -1: self.SetControlValue(self.cLibthunk, True) self.SetControlValue(self.cAnaExp, True) self.SetControlValue(self.cAnaCmp, True) self.SetControlValue(self.rCompare, True) self.EnableField(self.cEGroup, False) self.EnableField(self.iPrefix, False) self.EnableField(self.cCGroup, True) self.EnableField(self.iSimilarity, True) self.EnableField(self.iSimilarityCFG, True) self.EnableField(self.iMaxBytesForScore, True) self.EnableField(self.iRatio, True) if fid == self.rExport.id: self.EnableField(self.cEGroup, True) self.EnableField(self.iPrefix, True) self.EnableField(self.cCGroup, False) self.EnableField(self.iSimilarity, False) self.EnableField(self.iSimilarityCFG, False) self.EnableField(self.iMaxBytesForScore, False) self.EnableField(self.iRatio, False) elif fid == self.rCompare.id: self.EnableField(self.cEGroup, False) self.EnableField(self.iPrefix, False) self.EnableField(self.cCGroup, True) self.EnableField(self.iSimilarity, True) self.EnableField(self.iSimilarityCFG, True) self.EnableField(self.iMaxBytesForScore, True) self.EnableField(self.iRatio, True) return 1 class FnFuzzy(object): def __init__(self, f_debug, db_path, min_bytes, f_ex_libthunk, f_update, f_ana_exp, ana_pre, f_ana_cmp = False, f_fol_cmp = False, ana_fol='', threshold = None, threshold_cfg = None, max_bytes_for_score = None, ratio = 0): self.f_debug = f_debug self.conn = sqlite3.connect(db_path) self.cur = self.conn.cursor() self.init_db() self.in_memory_db() self.min_bytes = min_bytes self.f_ex_libthunk = f_ex_libthunk # for export self.f_update = 
f_update self.f_ana_exp = f_ana_exp self.ana_pre = ana_pre if f_ana_exp: self.ana_pat = re.compile(self.ana_pre) # for compare self.f_ana_cmp = f_ana_cmp self.f_fol_cmp = f_fol_cmp self.ana_fol = ana_fol self.threshold = threshold self.threshold_cfg = threshold_cfg self.max_bytes_for_score = max_bytes_for_score self.ratio = float(ratio) self.idb_path = get_idb_path() self.sha256 = ida_nalt.retrieve_input_file_sha256() try: #self.sha256 = self.sha256.lower() self.sha256 = self.sha256.hex() self.md5 = ida_nalt.retrieve_input_file_md5().lower() except AttributeError: message = 'ida_nalt.retrieve_input_file_sha256() returned None. Probably the IDB was generated by old IDA (<6.9). Check the version by ida_netnode.cvar.root_node.supstr(ida_nalt.RIDX_IDA_VERSION)' error(message) #ida_kernwin.warning(message) def debug(self, msg): if self.f_debug: print("[D] {}".format(msg)) def init_db(self): self.cur.execute("SELECT * FROM sqlite_master WHERE type='table'") if self.cur.fetchone() is None: info('DB initialized') self.cur.execute("CREATE TABLE IF NOT EXISTS sample(sha256 UNIQUE, path)") #self.cur.execute("CREATE INDEX sha256_index ON sample(sha256)") self.cur.execute("CREATE INDEX path_index ON sample(path)") self.cur.execute("CREATE TABLE IF NOT EXISTS function(sha256, fva, fname, fhd, fhm, f_ana, bsize, ptype, UNIQUE(sha256, fva))") self.cur.execute("CREATE INDEX f_ana_index ON function(f_ana)") self.cur.execute("CREATE INDEX bsize_index ON function(bsize)") def in_memory_db(self): # for SELECT tempfile = StringIO() for line in self.conn.iterdump(): tempfile.write("{}\n".format(line)) tempfile.seek(0) self.mconn = sqlite3.connect(":memory:") self.mconn.cursor().executescript(tempfile.read()) self.mconn.commit() self.mconn.row_factory=sqlite3.Row self.mcur = self.mconn.cursor() def calc_fn_machoc(self, fva, fname): # based on Machoc hash implementation (https://github.com/0x00ach/idadiff) func = idaapi.get_func(fva) if type(func) == type(None): self.debug('{}: ignored due 
to lack of function object'.format(fname)) return None, None flow = idaapi.FlowChart(f=func) cur_hash_rev = "" addrIds = [] cur_id = 1 for c in range(0,flow.size): cur_basic = flow.__getitem__(c) cur_hash_rev += shex(cur_basic.start_ea)+":" addrIds.append((shex(cur_basic.start_ea),str(cur_id))) cur_id += 1 addr = cur_basic.start_ea blockEnd = cur_basic.end_ea mnem = GetMnem(addr) while mnem != "": if mnem == "call": # should be separated into 2 blocks by call cur_hash_rev += "c," addr = NextHead(addr,blockEnd) mnem = GetMnem(addr) if addr != BADADDR: cur_hash_rev += shex(addr)+";"+shex(addr)+":" addrIds.append((shex(addr),str(cur_id))) cur_id += 1 else: addr = NextHead(addr,blockEnd) mnem = GetMnem(addr) refs = [] for suc in cur_basic.succs(): refs.append(suc.start_ea) refs.sort() refsrev = "" for ref in refs: refsrev += shex(ref)+"," if refsrev != "": refsrev = refsrev[:-1] cur_hash_rev += refsrev+";" # change addr to index for aid in addrIds: #cur_hash_rev = string.replace(cur_hash_rev,aid[0],aid[1]) cur_hash_rev = cur_hash_rev.replace(aid[0],aid[1]) # calculate machoc hash value self.debug('{}: CFG = {}'.format(fname, cur_hash_rev)) return mmh3.hash(cur_hash_rev) & 0xFFFFFFFF, cur_id-1 def calc_fn_ssdeep(self, fva, fname): d2h = b'' for bb in yara_fn.get_basic_blocks(fva): rule = yara_fn.get_basic_block_rule(bb) if rule: chk = rule.cut_bytes_for_hash if len(chk) < yara_fn.MIN_BB_BYTE_COUNT: continue d2h += chk.encode() #self.debug('chunk at {:#x}: {}'.format(bb.va, get_hex_pat(chk))) #self.debug('total func seq at {:#x}: {}'.format(fva, get_hex_pat(d2h))) if len(d2h) < self.min_bytes: self.debug('{}: ignored because of the number of extracted code bytes {}'.format(fname, len(d2h))) return None, None result_buffer = ctypes.create_string_buffer(FUZZY_MAX_RESULT) file_buffer = ctypes.create_string_buffer(d2h) hash_result = fuzzy_lib.fuzzy_hash_buf(file_buffer, len(file_buffer) - 1, result_buffer) hash_value = result_buffer.value.decode("ascii") return hash_value, 
len(d2h) def existed(self): self.mcur.execute("SELECT sha256 FROM sample WHERE sha256 = ?", (self.sha256,)) if self.mcur.fetchone() is None: return False else: return True def exclude_libthunk(self, fva, fname): if self.f_ex_libthunk: flags = get_func_attr(fva, FUNCATTR_FLAGS) if flags & FUNC_LIB: self.debug('{}: ignored because of library function'.format(fname)) return True if flags & FUNC_THUNK: self.debug('{}: ignored because of thunk function'.format(fname)) return True return False def export(self): if self.existed() and not self.f_update: info('{}: The sample records are present in DB. skipped.'.format(self.sha256)) return False self.cur.execute("REPLACE INTO sample values(?, ?)", (self.sha256, self.idb_path)) pnum = tnum = 0 records = [] for fva in idautils.Functions(): fname = get_func_name(fva) tnum += 1 if self.exclude_libthunk(fva, fname): continue fhd, bsize = self.calc_fn_ssdeep(fva, fname) fhm, cfgnum = self.calc_fn_machoc(fva, fname) if fhd and fhm: pnum += 1 f_ana = bool(self.ana_pat.search(fname)) if self.f_ana_exp else False tinfo = idaapi.tinfo_t() idaapi.get_tinfo(fva, tinfo) ptype = idaapi.print_tinfo('', 0, 0, idaapi.PRTYPE_1LINE, tinfo, fname, '') ptype = ptype + ';' if ptype is not None else ptype # fva is 64-bit int causing OverflowError records.append((self.sha256, '{:#x}'.format(fva), fname, fhd, fhm, f_ana, bsize, ptype)) self.debug('EXPORT {} at {:#x}: ssdeep={} (size={}), machoc={} (num of CFG={})'.format(fname, fva, fhd, bsize, fhm, cfgnum)) self.cur.executemany("REPLACE INTO function values (?, ?, ?, ?, ?, ?, ?, ?)", records) success ('{} of {} functions exported'.format(pnum, tnum)) return True def compare(self): res = defaultdictRecurse() if self.f_fol_cmp: self.mcur.execute("SELECT sha256,path FROM sample WHERE path LIKE ?", (self.ana_fol+'%',)) else: self.mcur.execute("SELECT sha256,path FROM sample") frows = self.mcur.fetchall() num_of_samples = len(frows) for sha256, path in frows: res[sha256]['path'] = path 
res[sha256]['mcnt'].default_factory = lambda: 0 #sql = "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE f_ana == 1 AND bsize BETWEEN ? AND ?" if self.f_ana_cmp else "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE bsize BETWEEN ? AND ?" sql = "SELECT function.sha256,fname,fhd,fhm,f_ana,ptype FROM function INNER JOIN sample on function.sha256 == sample.sha256 WHERE path LIKE ? AND " if self.f_fol_cmp else "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE " sql += "f_ana == 1 AND bsize BETWEEN ? AND ?" if self.f_ana_cmp else "bsize BETWEEN ? AND ?" fns = list(idautils.Functions()) for fva in tqdm(fns, desc='comparing functions'): fname = get_func_name(fva) if self.exclude_libthunk(fva, fname) or not num_of_samples: continue pfhd, pbsize = self.calc_fn_ssdeep(fva, fname) pfhm, pcfgnum = self.calc_fn_machoc(fva, fname) if pfhd and pfhm: pbuf = ctypes.create_string_buffer(pfhd.encode()) self.debug('COMPARE {}: ssdeep={} (size={}), machoc={} (num of bb={})'.format(fname, pfhd, pbsize, pfhm, pcfgnum)) min_ = pbsize * (1 - (self.ratio / 100)) max_ = pbsize * (1 + (self.ratio / 100)) self.debug('min={}, max={}'.format(min_, max_)) if self.f_fol_cmp: self.mcur.execute(sql, (self.ana_fol+'%', min_, max_)) else: self.mcur.execute(sql, (min_, max_)) frows = self.mcur.fetchall() self.debug('targeted {} records'.format(len(frows))) for sha256, sfname, sfhd, sfhm, sf_ana, sptype in frows: if sha256 == self.sha256: # skip the self continue res[sha256]['mfn'][fva].default_factory = lambda: 0 sbuf = ctypes.create_string_buffer(sfhd.encode()) score = fuzzy_lib.fuzzy_compare(pbuf, sbuf) dbg_cond = g_dbg_flag and fva == g_dbg_fva and sfname == g_dbg_fname and sha256 == g_dbg_sha256 if dbg_cond: print(('{:#x}: compared with {} in {} score = {} machoc match = {}'.format(fva, sfname, sha256, score, bool(pfhm == sfhm)))) if (score >= self.threshold) or (score >= self.threshold_cfg and pfhm == sfhm) or (pbsize > self.max_bytes_for_score and pfhm == sfhm): 
def info(msg):
    """Print *msg* with the ``[*]`` (informational) prefix."""
    print("[*] {}".format(msg))


def success(msg):
    """Print *msg* with the ``[+]`` (success) prefix."""
    print("[+] {}".format(msg))


def error(msg):
    """Print *msg* with the ``[!]`` (error) prefix."""
    print("[!] {}".format(msg))


def get_hex_pat(buf):
    """Return *buf* as space-separated two-digit lowercase hex values.

    Accepts both ``bytes``-like input (iteration yields ints on Python 3,
    where calling ``ord()`` on them raised TypeError) and ``str`` input
    (iteration yields one-character strings).
    """
    # get hex pattern
    return ' '.join('{:02x}'.format(x if isinstance(x, int) else ord(x)) for x in buf)


def shex(a):
    """Return ``hex(a)`` without a trailing ``L`` (the Python 2 long suffix)."""
    return hex(a).rstrip("L")


def set_decomplier_cmt(ea, cmt):
    """Attach *cmt* to the decompiled code at *ea* and save the user comments.

    Prints an error instead when ``idaapi.decompile`` returns a false value.
    """
    cfunc = idaapi.decompile(ea)
    if not cfunc:
        # was ".formart(ea)", which raised AttributeError instead of reporting
        error("Decompile failed: {:#x}".format(ea))
        return
    tl = idaapi.treeloc_t()
    tl.ea = ea
    tl.itp = idaapi.ITP_SEMI
    cfunc.set_user_cmt(tl, cmt)
    cfunc.save_user_cmts()
else: f = FnFuzzyForm() f.Compile() f.iDBSave.value = g_db_path f.iMinBytes.value = g_min_bytes f.iPrefix.value = g_analyzed_prefix f.iFolder.value = os.path.dirname(get_idb_path()) f.iSimilarity.value = g_threshold f.iSimilarityCFG.value = g_threshold_cfg f.iMaxBytesForScore.value = g_max_bytes_for_score f.iRatio.value = g_bsize_ratio r = f.Execute() if r == 1: # Run start = time.time() ff = FnFuzzy(f.cDebug.checked, f.iDBSave.value, f.iMinBytes.value, f.cLibthunk.checked, f.cUpdate.checked, f.cAnaExp.checked, f.iPrefix.value, f.cAnaCmp.checked, f.cFolCmp.checked, f.iFolder.value, f.iSimilarity.value, f.iSimilarityCFG.value, f.iMaxBytesForScore.value, f.iRatio.value) if f.rExport.selected: if ff.sha256 is None: print('aborted') return ff.export() #cProfile.runctx('ff.export()', None, locals()) else: ff.compare() #cProfile.runctx('ff.compare()', None, locals()) ff.close() elapsed = time.time() - start else: print('canceled') return info('elapsed time = {} sec'.format(elapsed)) info('done') if __name__ == '__main__': main() ================================================ FILE: fn_fuzzy/fn_fuzzy_7x.py ================================================ # fn_fuzzy.py - IDAPython script for fast multiple binary diffing triage # Takahiro Haruyama (@cci_forensics) import os, ctypes, sqlite3, re, time, sys, subprocess import cProfile from collections import defaultdict from pprint import PrettyPrinter from io import StringIO from tqdm import tqdm from idc import * import idautils, ida_nalt, ida_kernwin, idaapi, ida_expr, ida_typeinf import mmh3 import yara_fn_7x # modified version in the same folder g_db_path = r'C:\analysisw\tics\fn_fuzzy.sqlite' # plz edit your path g_min_bytes = 0x10 # minimum number of extracted code bytes per function g_analyzed_prefix = r'fn_|func_' # analyzed function name prefix (regex) g_threshold = 50 # function similarity score threshold without CFG match g_threshold_cfg = 10 # function similarity score threshold with CFG match 
class defaultdictRecurse(defaultdict):
    """``defaultdict`` whose default factory is its own class.

    Looking up a missing key therefore creates another ``defaultdictRecurse``,
    which allows assignments such as ``d[a][b][c] = v`` without preparing the
    intermediate levels.
    """

    def __init__(self):
        self.default_factory = type(self)


class import_handler_t(ida_kernwin.action_handler_t):
    """Popup action that imports names and prototypes of the selected rows."""

    def __init__(self, items, idb_path, title):
        ida_kernwin.action_handler_t.__init__(self)
        self.items = items        # chooser rows (lists of strings)
        self.idb_path = idb_path  # path of the secondary IDB
        self.title = title        # title of the chooser to refresh

    def import_types(self):
        """Import the secondary IDB's types through a generated ``.idc`` file.

        When ``<idb basename>.idc`` does not exist yet, IDA is started on the
        secondary IDB with dump_types.py to create it.  Returns False when
        that subprocess fails, True after the IDC script has been executed.
        """
        idc_path = os.path.splitext(self.idb_path)[0] + '.idc'

        # dump type information from the 2nd idb
        if not os.path.exists(idc_path):
            with open(self.idb_path, 'rb') as f:
                sig = f.read(4)
            # sig is bytes (binary read); comparing it with the str 'IDA1'
            # is always False on Python 3 and always selected ida64.exe.
            ida = 'ida.exe' if sig == b'IDA1' else 'ida64.exe'
            ida_path = os.path.join(idadir(), ida)
            cmd = [ida_path, '-S{}'.format(g_dump_types_path), self.idb_path]
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            proc.communicate()
            if proc.returncode != 0:
                error('{}: type information dumping failed'.format(self.idb_path))
                return False
            success('{}: type information successfully dumped'.format(self.idb_path))

        # import the type information
        ida_expr.exec_idc_script(None, str(idc_path), "main", None, 0)
        return True

    def _apply_prototype(self, ea, sptype):
        """Parse *sptype* and apply it at *ea*; return what apply_tinfo returns."""
        tinfo = idaapi.tinfo_t()
        idaapi.parse_decl2(idaapi.cvar.idati, sptype, tinfo, 0)
        #idaapi.apply_callee_tinfo(ea, tinfo)
        return idaapi.apply_tinfo(ea, tinfo, 0)

    def activate(self, ctx):
        """Rename, re-type and comment the primary function of each selected row."""
        for idx in ctx.chooser_selection:
            row = self.items[idx]

            # rename the function
            ea = get_name_ea_simple(row[2])
            sfname = str(row[4])
            #set_name(ea, sfname)
            ida_name.force_name(ea, sfname)
            success('{:#x}: renamed to {}'.format(ea, sfname))

            # set the function prototype
            sptype = str(row[5])
            if sptype != 'None':
                if self._apply_prototype(ea, sptype):
                    success('{:#x}: function prototype set to {}'.format(ea, sptype))
                else:
                    error('{:#x}: function prototype set FAILED (maybe you should import the types?)'.format(ea))
                    if ask_yn(0, 'Do you import types from the secondary idb?') == 1:
                        if self.import_types():
                            # retry once now that the types are available
                            if self._apply_prototype(ea, sptype):
                                success('{:#x}: function prototype set to {}'.format(ea, sptype))
                            else:
                                error('{:#x}: function prototype set FAILED again'.format(ea))

            # insert the comment
            score = row[0]
            mmatch = row[1]
            cmt = 'fn_fuzzy: ssdeep={}, machoc={}'.format(score, mmatch)
            set_func_cmt(ea, cmt, 1)
            #set_decomplier_cmt(ea, cmt) # not sure how to avoid orphan comment

        # update the Choose rows
        ida_kernwin.refresh_chooser(self.title)

    def update(self, ctx):
        """Keep the action always enabled."""
        # Alternative kept for reference: enable only for chooser widgets via
        # AST_ENABLE_FOR_WIDGET / AST_DISABLE_FOR_WIDGET and
        # ida_kernwin.is_chooser_widget(ctx.widget_type).
        return idaapi.AST_ENABLE_ALWAYS


class FnCh(ida_kernwin.Choose):
    """Multi-select chooser listing the matched functions of one sample."""

    def __init__(self, title, mfn, idb_path):
        self.mfn = mfn            # {primary fva: match record}
        self.idb_path = idb_path  # path of the secondary IDB
        self.title = title
        ida_kernwin.Choose.__init__(
            self,
            title,
            [
                ["ssdeep score", 10 | ida_kernwin.Choose.CHCOL_DEC],
                ["machoc matched", 10 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["primary function", 30 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["primary bsize", 10 | ida_kernwin.Choose.CHCOL_DEC],
                ["secondary analyzed function", 30 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["secondary prototype", 40 | ida_kernwin.Choose.CHCOL_PLAIN],
            ],
            flags=ida_kernwin.Choose.CH_MULTI)

    def OnInit(self):
        """Rebuild the rows, highest score first, skipping records without a name."""
        self.items = []
        ranked = sorted(list(self.mfn.items()), key=lambda x: x[1]['score'], reverse=True)
        for fva, v in ranked:
            if v['sfname']:
                self.items.append(['{}'.format(v['score']),
                                   '{}'.format(v['cfg_match']),
                                   str(get_name(fva)),
                                   '{}'.format(v['pbsize']),
                                   str(v['sfname']),
                                   '{}'.format(v['sptype'])])
        return True

    def OnPopup(self, form, popup_handle):
        """Attach the import action to the popup menu."""
        actname = "choose:actFnFuzzyImport"
        desc = ida_kernwin.action_desc_t(actname, 'Import function name and prototype',
                                         import_handler_t(self.items, self.idb_path, self.title))
        ida_kernwin.attach_dynamic_action_to_popup(form, popup_handle, desc)

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        """Jump to the primary function of the first selected row."""
        idx = n[0]  # due to CH_MULTI
        ida_kernwin.jumpto(get_name_ea_simple(self.items[idx][2]))

    def OnRefresh(self, n):
        self.OnInit()
        # try to preserve the cursor
        #return [ida_kernwin.Choose.ALL_CHANGED] + self.adjust_last_item(n)
        #return n
        return None

    def OnClose(self):
        print("closed ", self.title)


class SummaryCh(ida_kernwin.Choose):
    """Chooser summarising, per compared sample, how many functions matched."""

    def __init__(self, title, res):
        self.res = res  # {sha256: {'path': ..., 'mcnt': {...}, 'mfn': {...}}}
        ida_kernwin.Choose.__init__(
            self,
            title,
            [
                ["SHA256", 20 | ida_kernwin.Choose.CHCOL_PLAIN],
                ["total similar functions", 20 | ida_kernwin.Choose.CHCOL_DEC],
                ["analyzed similar functions", 20 | ida_kernwin.Choose.CHCOL_DEC],
                ["IDB path", 80 | ida_kernwin.Choose.CHCOL_PATH],
            ])
        self.items = []

    def OnInit(self):
        """Build one row per sample with at least one match, most matches first."""
        # Start from an empty list (as FnCh.OnInit does) so that running the
        # initialisation again cannot append duplicate rows.
        self.items = []
        ranked = sorted(list(self.res.items()), key=lambda x: x[1]['mcnt']['total'], reverse=True)
        for sha256, v in ranked:
            if v['mcnt']['total'] > 0:
                self.items.append([str(sha256),
                                   '{}'.format(v['mcnt']['total']),
                                   '{}'.format(v['mcnt']['analyzed']),
                                   str(v['path'])])
        return True

    def OnGetSize(self):
        return len(self.items)

    def OnGetLine(self, n):
        return self.items[n]

    def OnSelectLine(self, n):
        """Open the per-function chooser for the selected sample."""
        sha256 = self.items[n][0]
        c = FnCh("similarities with {}(snip)".format(sha256[:8]),
                 self.res[sha256]['mfn'], self.res[sha256]['path'])
        c.Show()

    def OnRefresh(self, n):
        return n

    def OnClose(self):
        print("closed ", self.title)
def OnFormChange(self, fid):
    """Form-change callback that keeps the export/compare option groups in sync.

    fid == -1 applies the default control values and starts in compare mode
    (presumably the form-initialisation event -- confirm against the IDA
    Form documentation).  A change of the "Export" / "Compare" radio button
    enables the option fields of the selected command and disables those of
    the other one.  Always returns 1.
    """
    def select_mode(exporting):
        # export-only inputs follow `exporting`, compare-only inputs get the opposite state
        for ctl in (self.cEGroup, self.iPrefix):
            self.EnableField(ctl, exporting)
        for ctl in (self.cCGroup, self.iSimilarity, self.iSimilarityCFG,
                    self.iMaxBytesForScore, self.iRatio):
            self.EnableField(ctl, not exporting)

    if fid == -1:
        for ctl in (self.cLibthunk, self.cAnaExp, self.cAnaCmp, self.rCompare):
            self.SetControlValue(ctl, True)
        select_mode(False)

    if fid == self.rExport.id:
        select_mode(True)
    elif fid == self.rCompare.id:
        select_mode(False)
    return 1
def in_memory_db(self):
    """Clone the on-disk database into a private in-memory copy used for SELECTs.

    Sets ``self.mconn`` (the ":memory:" connection, with ``sqlite3.Row`` as
    row factory) and ``self.mcur`` (a cursor on it).  The copy is a snapshot
    of ``self.conn`` taken at call time; later writes through ``self.conn``
    are not reflected in it.
    """
    self.mconn = sqlite3.connect(":memory:")
    # iterdump() yields the SQL statements that recreate the source database;
    # replay them directly instead of buffering them in a file-like object first.
    self.mconn.executescript("\n".join(self.conn.iterdump()))
    self.mconn.commit()
    self.mconn.row_factory = sqlite3.Row
    self.mcur = self.mconn.cursor()
def calc_fn_ssdeep(self, fva, fname):
    """Compute the ssdeep hash of the unmasked code bytes of the function at ``fva``.

    The input is the concatenation of ``rule.cut_bytes_for_hash`` for every
    basic block that yields a rule with at least
    ``yara_fn_7x.MIN_BB_BYTE_COUNT`` characters.

    Returns ``(hash_value, byte_count)``.  Returns ``(None, None)`` when
    fewer than ``self.min_bytes`` bytes were collected or when the ssdeep
    library reports an error; ``fname`` is only used in debug messages.
    """
    chunks = []
    for bb in yara_fn_7x.get_basic_blocks(fva):
        rule = yara_fn_7x.get_basic_block_rule(bb)
        if not rule:
            continue
        chk = rule.cut_bytes_for_hash
        if len(chk) < yara_fn_7x.MIN_BB_BYTE_COUNT:
            continue
        # NOTE(review): cut_bytes_for_hash is a str built with chr(byte), and
        # encode() defaults to UTF-8, so byte values >= 0x80 become two bytes.
        # Left unchanged because hashes already stored in the database were
        # produced this way -- confirm before switching to 'latin-1'.
        chunks.append(chk.encode())
    d2h = b''.join(chunks)

    if len(d2h) < self.min_bytes:
        self.debug('{}: ignored because of the number of extracted code bytes {}'.format(fname, len(d2h)))
        return None, None

    result_buffer = ctypes.create_string_buffer(FUZZY_MAX_RESULT)
    file_buffer = ctypes.create_string_buffer(d2h)
    # create_string_buffer() appends a NUL terminator, hence the "- 1"
    status = fuzzy_lib.fuzzy_hash_buf(file_buffer, len(file_buffer) - 1, result_buffer)
    if status != 0:
        # callers skip the function when the hash is falsy
        self.debug('{}: ignored because fuzzy_hash_buf failed ({})'.format(fname, status))
        return None, None
    hash_value = result_buffer.value.decode("ascii")
    return hash_value, len(d2h)
skipped.'.format(self.sha256)) return False self.cur.execute("REPLACE INTO sample values(?, ?)", (self.sha256, self.idb_path)) pnum = tnum = 0 records = [] for fva in idautils.Functions(): fname = get_func_name(fva) tnum += 1 if self.exclude_libthunk(fva, fname): continue fhd, bsize = self.calc_fn_ssdeep(fva, fname) fhm, cfgnum = self.calc_fn_machoc(fva, fname) if fhd and fhm: pnum += 1 f_ana = bool(self.ana_pat.search(fname)) if self.f_ana_exp else False #tinfo = idaapi.tinfo_t() #idc.get_tinfo(fva, tinfo) #tif = ida_typeinf.tinfo_t() #tinfo = idc.get_tinfo(fva) #ptype = idaapi.print_tinfo('', 0, 0, idaapi.PRTYPE_1LINE, tinfo, fname, '') ptype = ida_typeinf.idc_get_type(fva) ptype = ptype + ';' if ptype is not None else ptype # fva is 64-bit int causing OverflowError records.append((self.sha256, '{:#x}'.format(fva), fname, fhd, fhm, f_ana, bsize, ptype)) self.debug('EXPORT {} at {:#x}: ssdeep={} (size={}), machoc={} (num of CFG={})'.format(fname, fva, fhd, bsize, fhm, cfgnum)) self.cur.executemany("REPLACE INTO function values (?, ?, ?, ?, ?, ?, ?, ?)", records) success ('{} of {} functions exported'.format(pnum, tnum)) return True def compare(self): res = defaultdictRecurse() if self.f_fol_cmp: self.mcur.execute("SELECT sha256,path FROM sample WHERE path LIKE ?", (self.ana_fol+'%',)) else: self.mcur.execute("SELECT sha256,path FROM sample") frows = self.mcur.fetchall() num_of_samples = len(frows) for sha256, path in frows: res[sha256]['path'] = path res[sha256]['mcnt'].default_factory = lambda: 0 #sql = "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE f_ana == 1 AND bsize BETWEEN ? AND ?" if self.f_ana_cmp else "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE bsize BETWEEN ? AND ?" sql = "SELECT function.sha256,fname,fhd,fhm,f_ana,ptype FROM function INNER JOIN sample on function.sha256 == sample.sha256 WHERE path LIKE ? 
AND " if self.f_fol_cmp else "SELECT sha256,fname,fhd,fhm,f_ana,ptype FROM function WHERE " sql += "f_ana == 1 AND bsize BETWEEN ? AND ?" if self.f_ana_cmp else "bsize BETWEEN ? AND ?" fns = list(idautils.Functions()) for fva in tqdm(fns, desc='comparing functions'): fname = get_func_name(fva) if self.exclude_libthunk(fva, fname) or not num_of_samples: continue pfhd, pbsize = self.calc_fn_ssdeep(fva, fname) pfhm, pcfgnum = self.calc_fn_machoc(fva, fname) if pfhd and pfhm: pbuf = ctypes.create_string_buffer(pfhd.encode()) self.debug('COMPARE {}: ssdeep={} (size={}), machoc={} (num of bb={})'.format(fname, pfhd, pbsize, pfhm, pcfgnum)) min_ = pbsize * (1 - (self.ratio / 100)) max_ = pbsize * (1 + (self.ratio / 100)) self.debug('min={}, max={}'.format(min_, max_)) if self.f_fol_cmp: self.mcur.execute(sql, (self.ana_fol+'%', min_, max_)) else: self.mcur.execute(sql, (min_, max_)) frows = self.mcur.fetchall() self.debug('targeted {} records'.format(len(frows))) for sha256, sfname, sfhd, sfhm, sf_ana, sptype in frows: if sha256 == self.sha256: # skip the self continue res[sha256]['mfn'][fva].default_factory = lambda: 0 sbuf = ctypes.create_string_buffer(sfhd.encode()) score = fuzzy_lib.fuzzy_compare(pbuf, sbuf) dbg_cond = g_dbg_flag and fva == g_dbg_fva and sfname == g_dbg_fname and sha256 == g_dbg_sha256 if dbg_cond: print(('{:#x}: compared with {} in {} score = {} machoc match = {}'.format(fva, sfname, sha256, score, bool(pfhm == sfhm)))) if (score >= self.threshold) or (score >= self.threshold_cfg and pfhm == sfhm) or (pbsize > self.max_bytes_for_score and pfhm == sfhm): if dbg_cond: print(('{:#x}: counting {} in {} for total number'.format(fva, sfname, sha256))) res[sha256]['mcnt']['total'] += 1 if sf_ana: res[sha256]['mcnt']['analyzed'] += 1 if score > res[sha256]['mfn'][fva]['score'] or (res[sha256]['mfn'][fva]['score'] == 0 and pbsize > self.max_bytes_for_score): res[sha256]['mfn'][fva]['score'] = score res[sha256]['mfn'][fva]['cfg_match'] = bool(pfhm == sfhm) 
def info(msg):
    """Print an informational message with the "[*]" prefix."""
    print("[*] {}".format(msg))


def success(msg):
    """Print a success message with the "[+]" prefix."""
    print("[+] {}".format(msg))


def error(msg):
    """Print an error message with the "[!]" prefix."""
    print("[!] {}".format(msg))


def get_hex_pat(buf):
    """Return ``buf`` as space-separated two-digit lowercase hex values.

    Accepts bytes/bytearray (whose items are ints on Python 3) as well as
    str (whose items are characters and are converted with ord()).
    """
    return ' '.join('{:02x}'.format(x if isinstance(x, int) else ord(x)) for x in buf)


def shex(a):
    """Return hex(a) without the trailing "L" that Python 2 adds to long values."""
    return hex(a).rstrip("L")


def set_decomplier_cmt(ea, cmt):
    """Attach ``cmt`` as a decompiler user comment at ``ea`` and save it.

    The comment is placed with item position ``ITP_SEMI``.  If
    ``idaapi.decompile`` returns a falsy result an error is printed.
    """
    cfunc = idaapi.decompile(ea)
    tl = idaapi.treeloc_t()
    tl.ea = ea
    tl.itp = idaapi.ITP_SEMI
    if cfunc:
        cfunc.set_user_cmt(tl, cmt)
        cfunc.save_user_cmts()
    else:
        # was ".formart(ea)", which raised AttributeError instead of reporting the failure
        error("Decompile failed: {:#x}".format(ea))
FnFuzzy(f.cDebug.checked, f.iDBSave.value, f.iMinBytes.value, f.cLibthunk.checked, f.cUpdate.checked, f.cAnaExp.checked, f.iPrefix.value, f.cAnaCmp.checked, f.cFolCmp.checked, f.iFolder.value, f.iSimilarity.value, f.iSimilarityCFG.value, f.iMaxBytesForScore.value, f.iRatio.value) if f.rExport.selected: if ff.sha256 is None: print('aborted') return ff.export() #cProfile.runctx('ff.export()', None, locals()) else: ff.compare() #cProfile.runctx('ff.compare()', None, locals()) ff.close() elapsed = time.time() - start else: print('canceled') return info('elapsed time = {} sec'.format(elapsed)) info('done') if __name__ == '__main__': main() ================================================ FILE: fn_fuzzy/yara_fn.py ================================================ ''' IDAPython script that generates a YARA rule to match against the basic blocks of the current function. It masks out relocation bytes and ignores jump instructions (given that we're already trying to match compiler-specific bytes, this is of arguable benefit). If python-yara is installed, the IDAPython script also validates that the generated rule matches at least one segment in the current file. author: Willi Ballenthin ''' # 2018/8/6 Takahiro Haruyama modified to calculate fixup (relocation) size correctly # and exclude direct memory reference data and other ignorable variable code import logging from collections import namedtuple from idc import * import idaapi import idautils import ida_ua, ida_kernwin logger = logging.getLogger(__name__) BasicBlock = namedtuple('BasicBlock', ['va', 'size']) # each rule must have at least this many non-masked bytes MIN_BB_BYTE_COUNT = 4 def get_basic_blocks(fva): ''' return sequence of `BasicBlock` instances for given function. 
def get_basic_block_rule(bb):
    '''
    create and format a YARA rule for a single basic block.
    The following bytes are ignored:
      - relocation bytes
      - the last jump instruction
      - direct memory references / immediate values and other ignorable data

    `bb` is a `BasicBlock` (fields `va`, `size`).
    returns a `Rule` whose fields are:
      - name: '$0x<va>' of the block
      - bytes: every byte of the kept instructions
      - masked_bytes: one entry per byte, either '??' or two upper-case hex digits
      - cut_bytes_for_hash: str holding chr() of each byte that was not masked
    returns None as soon as the bytes of one instruction cannot be read.
    '''
    # fetch the instruction start addresses
    insns = []
    va = bb.va
    while va < bb.va + bb.size:
        insns.append(va)
        va = next_head(va)

    # drop the last instruction if its a jump
    if insns and is_jump(insns[-1]):
        insns = insns[:-1]

    _bytes = []
    # `masked_bytes` is the list of formatted bytes,
    #  not yet join'd for performance.
    masked_bytes = []
    cut_bytes_for_hash = ''
    for va in insns:
        insn = ida_ua.insn_t()
        # decode_insn() fills `insn`; its return value is used as the instruction length
        size = ida_ua.decode_insn(insn, va)
        mnem = insn.get_canon_mnem()  # not used below
        op1 = insn.Op1
        op2 = insn.Op2

        fixup_byte_addrs = set([])
        if idaapi.contains_fixups(va, size): # not work for x64 binaries? (e.g., idaapi.contains_fixups(here(), 0x2d000) -> False)
            logging.debug('ea = {:#x}, fixups'.format(va))
            # fetch the fixup locations and sizes within this one instruction.
            # get_fixup_va_and_size() looks the fixup up with get_next_fixup_ea();
            # presumably that is the first fixup located after the given address -- TODO confirm
            fixups = []
            fva, fsize = get_fixup_va_and_size(va)
            fixups.append((fva, fsize))
            fva += fsize
            while fva < va + size:
                # a fixup is appended before the loop condition is re-checked, so the
                # list may end with one that starts past the instruction; it only adds
                # addresses that never match a byte of this instruction
                fva, fsize = get_fixup_va_and_size(fva - 1) # to detect consecutive fixups
                fixups.append((fva, fsize))
                fva += fsize
            logging.debug('fixups: {}'.format(fixups))

            # compute the addresses of each component byte.
            for fva, fsize in fixups:
                for i in range(fva, fva+fsize):
                    fixup_byte_addrs.add(i)

        # fetch and format each byte of the instruction,
        #  possibly masking it into an unknown byte if its a fixup or several operand types like direct mem ref.
        masked_types = [o_mem, o_imm, o_displ, o_near, o_far]
        #masked_types = [o_mem, o_imm, o_near, o_far]
        bytes_ = get_bytes(va, size)
        if bytes_ is None:
            return None
        for i, byte in enumerate(bytes_):
            _bytes.append(byte)
            byte_addr = i + va
            if byte_addr in fixup_byte_addrs:
                logging.debug('{:#x}: fixup byte (masked)'.format(byte_addr))
                masked_bytes.append('??')
            # Op1 data runs from op1.offb up to (not including) op2.offb,
            # or to the end of the instruction when op2.offb is 0
            elif op1.type in masked_types and i >= op1.offb and (i < op2.offb or op2.offb == 0):
                logging.debug('{:#x}: Op1 masked byte'.format(byte_addr))
                masked_bytes.append('??')
            # Op2 data runs from op2.offb to the end of the instruction
            elif op2.type in masked_types and i >= op2.offb:
                logging.debug('{:#x}: Op2 masked byte'.format(byte_addr))
                masked_bytes.append('??')
            else:
                masked_bytes.append('%02X' % (byte))
                # for Python3
                # only bytes that were not masked are collected for hashing
                cut_bytes_for_hash += chr(byte)

    return Rule('$0x%x' % (bb.va), _bytes, masked_bytes, cut_bytes_for_hash)
def get_segments():
    '''
    fetch the segments in the current executable.

    yields one `Segment` (start, size, name, buf) per segment; `buf` is
    whatever `get_segment_buffer` could read and may be None.
    '''
    for segstart in idautils.Segments():
        segend = idaapi.getseg(segstart).end_ea
        segsize = segend - segstart
        segname = str(SegName(segstart)).rstrip('\x00')
        segbuf = get_segment_buffer(segstart)
        # the second field of Segment is `size`; the end address used to be stored there
        yield Segment(segstart, segsize, segname, segbuf)
def get_function(va):
    '''
    return va for first instruction in function that contains given va.

    raises ValueError when va does not belong to any function
    (idaapi.get_func returns None in that case).
    '''
    func = idaapi.get_func(va)
    if func is None:
        # fail with a readable message instead of an AttributeError on None
        raise ValueError('{:#x} is not inside a function'.format(va))
    return func.start_ea
The following bytes are ignored: - relocation bytes - the last jump instruction - direct memory references / immediate values and other igorable data ''' # fetch the instruction start addresses insns = [] va = bb.va while va < bb.va + bb.size: insns.append(va) va = next_head(va) # drop the last instruction if its a jump if insns and is_jump(insns[-1]): insns = insns[:-1] _bytes = [] # `masked_bytes` is the list of formatted bytes, # not yet join'd for performance. masked_bytes = [] cut_bytes_for_hash = '' for va in insns: insn = ida_ua.insn_t() size = ida_ua.decode_insn(insn, va) mnem = insn.get_canon_mnem() op1 = insn.Op1 op2 = insn.Op2 fixup_byte_addrs = set([]) if idaapi.contains_fixups(va, size): # not work for x64 binaries? (e.g., idaapi.contains_fixups(here(), 0x2d000) -> False) logging.debug('ea = {:#x}, fixups'.format(va)) # fetch the fixup locations and sizes within this one instruction. fixups = [] fva, fsize = get_fixup_va_and_size(va) fixups.append((fva, fsize)) fva += fsize while fva < va + size: fva, fsize = get_fixup_va_and_size(fva - 1) # to detect consecutive fixups fixups.append((fva, fsize)) fva += fsize logging.debug('fixups: {}'.format(fixups)) # compute the addresses of each component byte. for fva, fsize in fixups: for i in range(fva, fva+fsize): fixup_byte_addrs.add(i) # fetch and format each byte of the instruction, # possibly masking it into an unknown byte if its a fixup or several operand types like direct mem ref. 
masked_types = [o_mem, o_imm, o_displ, o_near, o_far] #masked_types = [o_mem, o_imm, o_near, o_far] bytes_ = get_bytes(va, size) if bytes_ is None: return None for i, byte in enumerate(bytes_): _bytes.append(byte) byte_addr = i + va if byte_addr in fixup_byte_addrs: logging.debug('{:#x}: fixup byte (masked)'.format(byte_addr)) masked_bytes.append('??') elif op1.type in masked_types and i >= op1.offb and (i < op2.offb or op2.offb == 0): logging.debug('{:#x}: Op1 masked byte'.format(byte_addr)) masked_bytes.append('??') elif op2.type in masked_types and i >= op2.offb: logging.debug('{:#x}: Op2 masked byte'.format(byte_addr)) masked_bytes.append('??') else: masked_bytes.append('%02X' % (byte)) # for Python3 cut_bytes_for_hash += chr(byte) return Rule('$0x%x' % (bb.va), _bytes, masked_bytes, cut_bytes_for_hash) def format_rules(fva, rules): ''' given the address of a function, and the byte signatures for basic blocks in the function, format a complete YARA rule that matches all of the basic block signatures. ''' name = idc.get_func_name(fva) if not rules: logging.info('no rules for {}'.format(name)) return None # some characters aren't valid for YARA rule names safe_name = name BAD_CHARS = '@ /\\!@#$%^&*()[]{};:\'",./<>?' for c in BAD_CHARS: safe_name = safe_name.replace(c, '') md5 = idautils.GetInputFileMD5() ret = [] ret.append('rule a_{hash:s}_{name:s} {{'.format( hash=md5.hex(), name=safe_name)) ret.append(' meta:') ret.append(' sample_md5 = "{md5:s}"'.format(md5=md5.hex())) ret.append(' function_address = "0x{fva:x}"'.format(fva=fva)) ret.append(' function_name = "{name:s}"'.format(name=name)) ret.append(' strings:') for rule in rules: formatted_rule = ' '.join(rule.masked_bytes).rstrip('?? 
def get_segments():
    '''
    fetch the segments in the current executable.

    yields one `Segment` (start, size, name, buf) per segment; `buf` is
    whatever `get_segment_buffer` could read and may be None.
    '''
    for segstart in idautils.Segments():
        segend = idaapi.getseg(segstart).end_ea
        segsize = segend - segstart
        segname = str(idc.get_segm_name(segstart)).rstrip('\x00')
        segbuf = get_segment_buffer(segstart)
        # the second field of Segment is `size`; the end address used to be stored there
        yield Segment(segstart, segsize, segname, segbuf)
def main():
    '''
    ask whether to handle only the function under the cursor or every
    function in the database, then print the generated YARA rule(s).
    cancelling the dialog aborts without generating anything.
    '''
    print('Start')
    ans = ida_kernwin.ask_yn(0, 'define only selected function?')
    if ans == ida_kernwin.ASKBTN_CANCEL:
        # Cancel is reported as a non-zero value, so a plain truth test
        # would treat it like "Yes"
        print('Canceled')
        return
    if ans:
        va = ida_kernwin.get_screen_ea()
        targets = [get_function(va)]
    else:
        targets = idautils.Functions()

    for fva in targets:
        print(('-' * 80))
        rule = create_yara_rule_for_function(fva)
        if rule:
            print(rule)
            # optional validation of the generated rule (disabled):
            #   if test_yara_rule(rule):
            #       logging.info('success: validated the generated rule')
            #   else:
            #       logging.error('error: failed to validate generated rule')
    print('Done')
def extract_unicode(data):
    """Return the printable UTF-16LE string found at the very start of ``data``.

    A match needs at least two printable ASCII characters, each followed by
    a NUL byte.  The pattern is anchored with ``^``, so the result is a list
    with zero or one element.  ``data`` may be a byte string or text; text
    input is matched as-is and the NUL characters are dropped.
    """
    if isinstance(data, bytes):
        pat = re.compile(br'^(?:[\x20-\x7E][\x00]){2,}')
        return list(set([w.decode('utf-16le') for w in pat.findall(data)]))
    pat = re.compile(r'^(?:[\x20-\x7E][\x00]){2,}')
    return list(set([w.replace('\x00', '') for w in pat.findall(data)]))


def extract_ascii(data):
    """Return the printable ASCII string found at the very start of ``data``.

    A match needs at least two printable characters.  The pattern is
    anchored with ``^``, so the result is a list with zero or one element.
    ``data`` may be a byte string or text.
    """
    if isinstance(data, bytes):
        pat = re.compile(br'^(?:[\x20-\x7E]){2,}')
        return list(set([w.decode('ascii') for w in pat.findall(data)]))
    pat = re.compile(r'^(?:[\x20-\x7E]){2,}')
    return list(set(pat.findall(data)))
< b and b < 0x7f) or b == 0: elif 0 <= b and b < 0x100: self.dprint('reg enum {} = {:#x}'.format(r, b)) self.regs_w_value[r] = b if procregs.ax.reg <= r and r <= procregs.bx.reg: # ax = eax = rax = 0 but al = 16 / ah = 20 self.regs_w_value[r+16] = b self.regs_w_value[r+20] = b def store_reg_to_reg(self, dst, src): if dst == procregs.sp.reg or dst == procregs.bp.reg: return if src in self.regs_w_value: self.dprint('reg enum {} = reg enum {} ({:#x})'.format(dst, src, self.regs_w_value[src])) self.regs_w_value[dst] = self.regs_w_value[src] def parse_and_get_var_hex(self, vstr): # e.g., mov [ebp+68h+var_18+0Ch], 61h var_off = vstr.split('_')[1][:-1].rstrip('h').split('+') # '18+0C' if len(var_off) == 2: res = int(var_off[0], 16) - int(var_off[1], 16) else: res = int(var_off[0], 16) # handle base+index registers (e.g., mov [rsp+rax+258h+var_C0], 6Fh) try: the_reg = eval('procregs.{}.reg'.format(vstr.split('+')[1])) if the_reg in self.regs_w_value: res = res - self.regs_w_value[the_reg] except SyntaxError: pass return res #return eval('0x{}'.format(var_num)) # '18-4' = 20 def store_byte_to_var(self, v, b): #if (0x1f < b and b < 0x7f) or b == 0: if 0 <= b and b < 0x100: #''' try: if self.stack_chars[v] != 0: # should not be overwritten return except KeyError: # when not initialized (to handle the bytes one by one) #print 'keyerror var_{:X} = {}'.format(v, b) pass #''' self.dprint('var_{:X} = {:#x}'.format(v, b)) self.stack_chars[v] = b def store_bytes_to_vars(self, v, bs): if isinstance(bs, str): # binary sequence for xmm registers blist = [ord(x) for x in bs] else: # int or long blist = self.int_to_bytes_list(bs) for i, b in enumerate(blist): #self.store_byte_to_var(v - i, blist[i]) self.store_byte_to_var(v - i, b) def store_key_to_name(self, v, b): #if (0x1f < b and b < 0x7f) or b == 0: if 0 <= b and b < 0x100: self.dprint('{} ^ {:#x}'.format(v, b)) self.xor_vars[v] = b def int_to_bytes_list(self, v): if v == 0: return [0] res = [] while(1): b = v & 0xff v = v >> 8 #if 
0x1f < b and b < 0x7f or b == 0: if 0 <= b and b < 0x100: res.append(b) #if v == 0 and (len(res) == 1 or len(res) == 2 or len(res) == 4 or len(res) == 8): if v == 0 and (len(res) == 2 or len(res) == 4 or len(res) == 8): # e.g., mov [rsp+3A8h+var_290], 6E0069h return res else: break return [] def store_byte_to_stack(self, b): if 0 <= b and b < 0x100: self.stack_imm = b def dprint(self, s): if self.debug: print s def traverse(self): print '----------------------------------------------' print '{:#x}:'.format(self.start) # replace analyzed names with 'var_*' in stack for calculation try: self.rename_vars() #except TypeError: # caused by StructMembers() except: return for head in Heads(self.start, self.end): self.dprint('{:#x}'.format(head)) insn = insn_t() inslen = decode_insn(insn, head) if insn.itype == NN_mov or insn.itype == NN_movsxd: if insn.Op1.type == o_reg and insn.Op2.type == o_imm: # e.g., mov cl/cx/ecx, 6Ch self.store_bytes_to_reg(insn.Op1.reg, insn.Op2.value) elif insn.Op1.type == o_reg and insn.Op2.type == o_reg: # e.g., mov cl/cx/ecx, al/ax/eax self.store_reg_to_reg(insn.Op1.reg, insn.Op2.reg) elif insn.Op1.type == o_reg and insn.Op2.dtype == dt_byte and insn.Op2.type == o_mem: # e.g., mov al, ds:byte_100040F8 self.store_bytes_to_reg(insn.Op1.reg, Byte(insn.Op2.addr)) elif insn.Op1.type == o_displ and GetOpnd(head, 0).find('var_') != -1 and insn.Op2.type == o_reg and (insn.Op2.dtype == dt_byte or insn.Op2.dtype == dt_word): # e.g., mov [esp+180h+var_127], cl #elif insn.Op1.type == o_displ and GetOpnd(head, 0).find('var_') != -1 and insn.Op2.type == o_reg: # e.g., mov [rsp+258h+var_1F0], eax (index register) try: var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0)) except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un continue if insn.Op2.reg in self.regs_w_value: self.store_bytes_to_vars(var_hex, self.regs_w_value[insn.Op2.reg]) elif insn.Op1.type == o_displ and insn.Op2.type == o_imm: # e.g., mov [esp+188h+var_130], 6Ah/2E32h/3362646Fh 
#print 'o_displ = o_imm' try: var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0)) except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un continue self.store_bytes_to_vars(var_hex, insn.Op2.value) elif insn.Op1.type == o_reg and insn.Op2.type == o_displ: # e.g., mov eax, [rsp+258h+var_1F0] try: var_hex = self.parse_and_get_var_hex(GetOpnd(head, 1)) except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un continue if var_hex in self.stack_chars: self.store_bytes_to_reg(insn.Op1.reg, self.stack_chars[var_hex]) elif insn.itype == NN_xor: if insn.Op1.type == o_reg and insn.Op2.type == o_reg and insn.Op1.reg == insn.Op2.reg: # e.g., xor ebx, ebx self.store_bytes_to_reg(insn.Op1.reg, 0) elif insn.Op1.type == o_displ: # e.g., xor [esp+eax+384h+var_2A4], bl try: var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0)) except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un continue str_var_hex = 'var_{:X}'.format(var_hex) if insn.Op2.type == o_reg and insn.Op2.reg in self.regs_w_value: self.store_key_to_name(str_var_hex, self.regs_w_value[insn.Op2.reg]) elif insn.Op2.type == o_imm: self.store_key_to_name(str_var_hex, insn.Op2.value) elif insn.itype == NN_and: if insn.Op1.type == o_displ and GetOpnd(head, 0).find('var_') != -1 and insn.Op2.value == 0: # e.g., and [ebp+var_24], 0 try: var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0)) except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un continue self.store_byte_to_var(var_hex, 0) # e.g., push 7; pop edx elif insn.itype == NN_push and insn.Op1.type == o_imm: self.store_byte_to_stack(insn.Op1.value) elif insn.itype == NN_pop and insn.Op1.type == o_reg and self.stack_imm: self.store_bytes_to_reg(insn.Op1.reg, self.stack_imm) self.stack_imm = None # for SSE registers elif (insn.itype == NN_movdqa or insn.itype == NN_movaps) and insn.Op1.type == o_reg: # e.g., movdqa xmm1, ds:xmmword_155680 self.store_bytes_to_reg(insn.Op1.reg, GetManyBytes(insn.Op2.addr, 0x10)) elif 
(insn.itype == NN_movdqu or insn.itype == NN_movups) and insn.Op1.type == o_displ: # e.g., movdqu [ebp+var_27C], xmm1 try: var_hex = self.parse_and_get_var_hex(GetOpnd(head, 0)) except (AttributeError, IndexError, ValueError): # e.g., var_10.S_un continue if insn.Op2.reg in self.regs_w_value: self.store_bytes_to_vars(var_hex, self.regs_w_value[insn.Op2.reg]) # for o_displ operand with base+index registers (increment index) elif insn.itype == NN_inc and insn.Op1.type == o_reg and insn.Op1.reg in self.regs_w_value: self.dprint('{}: incremented {}->{}'.format(GetOpnd(head, 0), self.regs_w_value[insn.Op1.reg], self.regs_w_value[insn.Op1.reg]+1)) self.regs_w_value[insn.Op1.reg] += 1 strings = {} result = [] prev = 0 len_ = 0 uresult = [] uprev = 0 ulen = 0 for k in sorted(self.stack_chars.keys(), reverse=True): self.dprint('{:x}: prev={:x}, uprev={:x}'.format(k, prev, uprev)) # detect discontinuous chars if prev != 0 and prev != k + 1: self.dprint('discontinuous chars detected') stack_var = 'var_{:X}'.format(prev - 1 + len_) strings[stack_var] = ''.join(result) if strings[stack_var][0] != '\x00': print '{} = {}'.format(stack_var, repr(strings[stack_var])) result = [] prev = 0 len_ = 0 uresult = [] uprev = 0 ulen = 0 elif uprev != 0 and uprev != k + 1: #elif uprev != 0 and uprev != k + 1 and uresult[1] == 0: # tiny check for unicode self.dprint('discontinuous chars detected (unicode)') stack_var = 'var_{:X}'.format(uprev - 1 + ulen) try: #strings[stack_var] = ''.join(uresult).decode('utf-16') self.dprint('data: {}'.format(repr(''.join(uresult)))) if extract_unicode(''.join(uresult)): strings[stack_var] = extract_unicode(''.join(uresult))[0] if strings[stack_var][0] != '\x00': print '{} = {}'.format(stack_var, repr(strings[stack_var])) #except UnicodeDecodeError: except (TypeError, IndexError): self.dprint('exception: {}'.format(stack_var)) #strings[stack_var] = ''.join(uresult) pass uresult = [] uprev = 0 ulen = 0 result = [] prev = 0 len_ = 0 self.dprint('{:x}: {} 
(len={}, ulen={})'.format(k, repr(chr(self.stack_chars[k])), len_, ulen)) result.append(chr(self.stack_chars[k])) uresult.append(chr(self.stack_chars[k])) # detect null-terminated chars #''' if self.stack_chars[k] == 0: #if self.stack_chars[k] == 0 and (prev != 0 and self.stack_chars[prev] == 0): #stack_var = 'var_{:X}'.format(k + len_) #if uprev != 0 and self.stack_chars[uprev] == 0: if uprev != 0 and self.stack_chars[uprev] == 0 and uresult[1] == 0: # tiny check for unicode self.dprint('null-terminated chars detected (unicode)') stack_var = 'var_{:X}'.format(k + ulen) try: #print ''.join(uresult) #strings[stack_var] = ''.join(uresult)[:-1].decode('utf-16') if extract_unicode(''.join(uresult)): strings[stack_var] = extract_unicode(''.join(uresult))[0] if strings[stack_var][0] != '\x00': print '{} = {}'.format(stack_var, repr(strings[stack_var])) #except UnicodeDecodeError: except (TypeError, IndexError): #strings[stack_var] = ''.join(uresult) pass uresult = [] uprev = 0 ulen = 0 prev = k len_ += 1 else: self.dprint('null-terminated chars detected') stack_var = 'var_{:X}'.format(k + len_) strings[stack_var] = ''.join(result) if strings[stack_var][0] != '\x00': print '{} = {}'.format(stack_var, repr(strings[stack_var])) result = [] prev = 0 len_ = 0 uprev = k ulen += 1 else: #''' prev = k len_ += 1 uprev = k ulen += 1 if len(result) > 0: print('the string is not null-terminated: {}'.format(repr(''.join(result)))) stack = GetFrame(self.start) results = [] for offset, name, size in StructMembers(stack): if name in strings: if self.do_xor: if name in self.xor_vars: k = self.xor_vars[name] else: k = self.static_xor_key res = ''.join([chr(ord(x) ^ k) for x in strings[name][:-1]]) #print k print '{} (xor-decoded): {} ({})'.format(name, repr(res), repr(strings[name])) res = res + ' (decoded)' else: res = strings[name] if res[0] != '\x00': SetMemberComment(stack, offset, repr(res.rstrip('\x00')), 1) results.append(repr(res.rstrip('\x00'))) # set comment at the function 
start ea if results: cmt = ', '.join(results) if len(cmt) < 128: set_func_cmt(self.start, cmt, True) else: set_func_cmt(self.start, 'a lot of stack strings recovered (need to be checked)', True) # restore analyzed names in stack AnalyzeArea(self.start, self.end) class SSSForm(ida_kernwin.Form): def __init__(self): ida_kernwin.Form.__init__(self, r"""BUTTON YES* Run BUTTON CANCEL Cancel stackstring_static {FormChangeCb} {cGroup}> """, { 'FormChangeCb': ida_kernwin.Form.FormChangeCb(self.OnFormChange), 'cGroup': ida_kernwin.Form.ChkGroupControl(("cCurrentOnly", "cDebug", "cDecode")), 'iXorValue': ida_kernwin.Form.NumericInput(tp=ida_kernwin.Form.FT_HEX), }) def OnFormChange(self, fid): if fid == -1: self.SetControlValue(self.cCurrentOnly, True) self.EnableField(self.iXorValue, False) if fid == self.cDecode.id: #print('cDecode changed: {}'.format(self.cDecode.checked)) #if self.cDecode.checked: self.EnableField(self.iXorValue, True) #else: #self.EnableField(self.iXorValue, False) return 1 def main(): print 'start' f = SSSForm() f.Compile() f.iXorValue.value = 0x55 r = f.Execute() if r == 1: # Run if f.cCurrentOnly.checked: start = GetFunctionAttr(here(), FUNCATTR_START) end = GetFunctionAttr(here(), FUNCATTR_END) ss = StackString(start, end, f.cDebug.checked, f.cDecode.checked, f.iXorValue.value) ss.traverse() else: for start in Functions(): end = GetFunctionAttr(start, FUNCATTR_END) ss = StackString(start, end, f.cDebug.checked, f.cDecode.checked, f.iXorValue.value) ss.traverse() else: # Cancel print 'cancel' Refresh() print '----------------------------------------------' print 'done' if __name__ == '__main__': main()