Repository: williballenthin/EVTXtract
Branch: master
Commit: 0895be4c2512
Files: 16
Total size: 65.6 KB

Directory structure:
gitextract_2o9c9iwa/

├── .gitignore
├── .travis.yml
├── LICENSE.TXT
├── README.md
├── evtxtract/
│   ├── __init__.py
│   ├── carvers.py
│   ├── main.py
│   ├── templates.py
│   ├── utils.py
│   └── version.py
├── evtxtract.spec
├── setup.py
└── tests/
    ├── .gitignore
    ├── fixtures.py
    ├── readmd.txt
    └── test_all.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# Project-specific files
extract_valid_evtx_records_and_templates.py~
find_evtx_chunks.py~
*~
*.evtx
workspace/*.xml
workspace/*.txt
.idea/*
.idea
*_templates.txt
*_chunks.txt


================================================
FILE: .travis.yml
================================================
env:
    global:
        - secure: "j89gGCxDhMdJ9vP/dUhu06XUqYMeqMjxIx8/s8KdVOhE0BxOddU7dIQE5SvcGYMoW+W4NV+7/Pio/eIkY3SUXGOLlPLMLwMDmvg9nA6HwrcSs6zPGreCYhqf7RlCNEyHHoWZ6syHjx1cEL2c1FyXLelQ8r5ONAzWsTeDn7ctnFcGzr7EDhKEjC9LGZchjRMYVrWkOruskSwnJYkPCstNqcwLh7qPgAXktTxx0YMPIr0sTbwLTnZRiNCE0egFSoT6QLBggrM3Nv0DbZO7luyFEgozgp99CACDdJMeMsKqgkedk0E+nz2BV26EpqjjyIRJVMiwXZVLexkB8vSw9PhCGY36REwMIhJz6KCzVQMZyoNkSbrMWBpa3LqdfZGyMxBtLlBw/Yvv+pzB+OnbCIjooy1nTfOlyLlk6QzESnzEW/A/DsFVEnNXQjoAOZEZjj6SlaHl+r/Uw6sXAP4FjP8umI++E9+MnuI0T1bevX8ZMdJ6Qz7gPWnIuOGJqkvFqC7MYt3SY4O4O0DS1pHXQGoQOgoRYExY2VRqeJSEnRTvbr8S5uVWOOz5PRb4psM6gnl6eueN8uNNmqj+BDaZp8qu/uLJ31zvAq0q1+rxxaqB5OkVebi65Q7cN6IjnpCTCK5risooGNZRGco2thnRiuysQ3kEQjaWH/f15cfY8YYnuTA="
        - secure: "xe37YTz7uegptreK36MVYl8+c9FzMqVsNr+/WMPBiZSmQ2T9mHoEP7QWL3AA1JSi7q2A3qWKNK15fUpFmu+u7+/15lFYUqihPEMZTBmmUYx6/I6bxP6d/sdu+mro13kflDzqTOdmkDU9X/Olympc5kI8qqheH6OTqwgjU0ypl/V53/3mXTgOre4LYvy+p0nkYdFCzipp+stuZyvn6tag57nvdnH8j8OLLR886ZV7KtB7RlOeaVK0NbyZ5XFBHJL1GXwV01lDfMsMokDHgkDfvjKVo63p1rYFScOUx7BBwpnSM1zr1hpHli562wQbs2eS4F8oYWOzXxhcRmvwAUxE/iqQvNyis+PD42xlhhOP1ubKJazqc9/AlKIHLNIvfsAVuFLt1eGI/g5/K+cpNUEs38+CXpqy2vTng4bFF4IabpxKlZxqpuCniKiDs5WExmhp2/fwXpNhdQmlh/WM2Mv4+vD8XyardIWZzmeR9EVFRAe+cGoejM+seGNKyfYxdOnj0fqmT2IORK9UKrmqisa9eBUOOg5kLirbnhfbax2J/FAcvAaMS8c5ZQHMPdswyaOdSpoJPPHULpI3uoPvYDFJcbuuZWlE0tuD/Qm5/4ABOeUfInOPAWtOBKYFl+YYIWUzCMEy6QxzJpSXqqXtNlR5y1l4M+PQzVY463+DXK3XBXE="
        - ARTIFACTS_BUCKET=build-artifacts.floss.flare.fireeye.com

language: python

matrix:
    include:
        - os: linux
          sudo: required
          python: 2.7

        - os: linux
          sudo: required
          python: 3.5

        # travis doesn't have py2.7 available on OSX workers, so we have to install it ourselves
        # ref: https://github.com/travis-ci/travis-ci/issues/2312
        - os: osx
          language: generic

before_install:
    # fix erroring OSX job because of rvm issue
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then rvm get stable --auto-dotfiles; fi
    # travis doesn't have py2.7 available on OSX workers, so we have to install it ourselves
    # ref: https://github.com/travis-ci/travis-ci/issues/2312
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then git clone https://github.com/MacPython/terryfy ../terryfy; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then source ../terryfy/travis_tools.sh;                         fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then get_python_environment macpython 2.7.10;                   fi

install:
    - pip install pyinstaller pep8
    - echo "__version__ = '$(git describe --tags)'" > evtxtract/version.py
    - pip install -e .
    - pyinstaller evtxtract.spec && rm -r './dist/evtxtract-dat/'

script:
    - find . -name \*.py -exec pep8 --ignore=E501 {} \;
    - pushd ./tests && wget "https://dl.dropboxusercontent.com/u/55819714/joshua1.zip" && unzip joshua1.zip && popd
    - py.test tests/ -v

addons:
    artifacts:
        debug: true
        paths:
            - $(find . -type f | grep -e '/bin/' -e 'dist/evtxtract' | awk 1 ORS=':')
        target_paths: travis/$TRAVIS_OS_NAME/


================================================
FILE: LICENSE.TXT
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================

Purpose
-------
EVTXtract recovers and reconstructs fragments of EVTX log files from raw binary data, including unallocated space and memory images.

Quick Run
---------

Install EVTXtract via `pip`:

    pip install evtxtract

Now the tool is ready to go!

    C:/Python27/Scripts/evtxtract.exe   Z:/evidence/1/image.dd   >   Z:/work/1/evtx.xml


Quicker Run
-----------

Download standalone executable nightly builds of EVTXtract here:

  - [Linux](https://s3.amazonaws.com/build-artifacts.floss.flare.fireeye.com/travis/linux/dist/evtxtract)
  - [MacOS](https://s3.amazonaws.com/build-artifacts.floss.flare.fireeye.com/travis/osx/dist/evtxtract)

Then you can do:

    ./evtxtract    /path/to/evidence    >   /path/to/output.xml


Background
----------

EVTX records are XML fragments encoded using a Microsoft-specific binary XML representation.
Despite the convenient format, it is not easy to recover EVTX event log records from a corrupted file or unallocated space.
This is because the complete representation of a record often depends on other records found nearby.
The event log service recognizes similarities among records and refactors commonalities into "templates".
A template is a fixed structure with placeholders that reserve space for variable content.
The on-disk event log record structure is a reference to a template, and a list of substitutions (the variable content that replaces a placeholder in a template).
To decode a record into XML, the event log service resolves the template and replaces its placeholders with the entries of the substitution array.
Therefore, template corruption renders many records unrecoverable within the local 64KB "chunk".
However, the substitution array for the remaining records may still be intact.
If so, it may be possible to produce XML fragments that match the original records if the damaged template can be reconstructed.
For many common events, such as process creation or account logon, empirical testing demonstrates the relevant templates remain mostly constant.
In these cases, recovering event log records boils down to identifying appropriate templates found in other EVTX chunks.


Algorithm
---------

1. Scan for chunk signatures ("ElfChnk")
   - check header for sane values (0x80 <= size <= 0x200)
   - verify checksums (header, data)
2. Extract records from valid chunks found in (1)
3. Extract templates from valid chunks found in (1)
4. Scan for record signatures
   - check header for sane values
   - extract timestamp
   - attempt to parse substitutions
   - attempt to decode substitutions into EID, other fields
5. Reconstruct records by reusing old templates with recovered substitutions


Usage
-----

EVTXtract is a pure Python script.
This means it easily runs on Windows, Linux, and MacOS.
Simply invoke the script, providing the path to a binary image, and EVTXtract writes its results to the standard out stream.
The binary file can be any data: a raw image, memory dump, etc.

Example command line:

    C:/Python27/Scripts/evtxtract.exe   Z:/evidence/1/image.dd   >   Z:/work/1/evtx.xml

Below are some example results from the above command.
It shows two records: one complete and one incomplete.
The first record is completely reconstructed,
  and is formatted just like it would be in event viewer.
However, EVTXtract was unable to completely reconstruct the second record,
 since some critical template data was missing.
So, it's been formatted with as much data as was recovered.
EVTXtract uses a schema that allows you to continue processing despite incomplete data.

    <Event xmlns="http://schemas.microsoft.com/win/2004/08/events/event">
        <System>
            <Provider Name="Microsoft-Windows-PrintService" Guid="{747ef6fd-e535-4d16-b510-42c90f6873a1}"></Provider>
            <EventID Qualifiers="">823</EventID>
            <Version>0</Version>
            <Level>4</Level>
            <Task>49</Task>
            <Opcode>11</Opcode>
            <Keywords>0x80000000000200</Keywords>
            <TimeCreated SystemTime="2013-03-23 02:05:57.848455"></TimeCreated>
            <EventRecordID>1</EventRecordID>
            <Correlation ActivityID="" RelatedActivityID=""></Correlation>
            <Execution ProcessID="1204" ThreadID="1208"></Execution>
            <Channel>Microsoft-Windows-PrintService/Admin</Channel>
            <Computer>JOSHUA</Computer>
            <Security UserID="S-1-5-21-3454551831-629247693-1078506759-1000"></Security>
        </System>
        <UserData>
            <ChangingDefaultPrinter xmlns:auto-ns3="http://schemas.microsoft.com/win/2004/08/events" xmlns="http://manifests.microsoft.com/win/2005/08/windows/printing/spooler/core/events">
                <DefaultPrinterSelectedBySpooler>1</DefaultPrinterSelectedBySpooler>
                <OldDefaultPrinter></OldDefaultPrinter>
                <NewDefaultPrinter>Microsoft XPS Document Writer,winspool,Ne00:</NewDefaultPrinter>
                <Status>0x000000</Status>
                <Module>spoolsv.exe</Module>
            </ChangingDefaultPrinter>
        </UserData>
    </Event>

    ...

    <Record>
    <Offset>0x317198</Offset>
    <EventID>1531</EventID>
    <Substitutions>
      <Substitution index="0">
        <Type>4</Type>
        <Value>4</Value>
      </Substitution>
      <Substitution index="1">
        <Type>4</Type>
        <Value>0</Value>
      </Substitution>
      <Substitution index="2">
        <Type>6</Type>
        <Value>0</Value>
      </Substitution>
      <Substitution index="3">
        <Type>6</Type>
        <Value>1531</Value>
      </Substitution>
      <Substitution index="4">
        <Type>0</Type>
        <Value></Value>
      </Substitution>
      <Substitution index="5">
        <Type>21</Type>
        <Value>0x8000000000000000</Value>
      </Substitution>
      <Substitution index="6">
        <Type>17</Type>
        <Value>2013-03-23 02:02:35.679552</Value>
      </Substitution>
      <Substitution index="7">
        <Type>0</Type>
        <Value></Value>
      </Substitution>
      <Substitution index="8">
        <Type>8</Type>
        <Value>928</Value>
      </Substitution>
      <Substitution index="9">
        <Type>8</Type>
        <Value>1040</Value>
      </Substitution>
      <Substitution index="10">
        <Type>10</Type>
        <Value>132</Value>
      </Substitution>
      <Substitution index="11">
        <Type>4</Type>
        <Value>0</Value>
      </Substitution>
      <Substitution index="12">
        <Type>19</Type>
        <Value>S-1-5-18</Value>
      </Substitution>
      <Substitution index="13">
        <Type>0</Type>
        <Value></Value>
      </Substitution>
      <Substitution index="14">
        <Type>1</Type>
        <Value>Microsoft-Windows-User Profiles Service</Value>
      </Substitution>
      <Substitution index="15">
        <Type>15</Type>
        <Value>0001010f-010c-77e3-bf2f-3ef300001200</Value>
      </Substitution>
      <Substitution index="16">
        <Type>1</Type>
        <Value>Application</Value>
      </Substitution>
    </Substitutions>
    </Record>


================================================
FILE: evtxtract/__init__.py
================================================
import logging
import collections

import evtxtract.utils
import evtxtract.carvers
import evtxtract.templates


logger = logging.getLogger(__name__)

# index of the value within a substitution entry;
# extract() reads the event ID as `record.substitutions[3][VALUE]`.
VALUE = 1


class CompleteRecord(object):
    '''
    A record for which complete XML is available.

    Attributes:
      offset: the offset of the record.
      eid: the event ID of the record.
      xml: the XML of the record.
    '''
    __slots__ = ('offset', 'eid', 'xml')

    def __init__(self, offset, eid, xml):
        super(CompleteRecord, self).__init__()
        self.offset, self.eid, self.xml = offset, eid, xml


class IncompleteRecord(object):
    '''
    A record for which only the raw substitutions are available.

    Attributes:
      offset: the offset of the record.
      eid: the event ID of the record.
      substitutions: the substitutions recovered for the record.
    '''
    __slots__ = ('offset', 'eid', 'substitutions')

    def __init__(self, offset, eid, substitutions):
        super(IncompleteRecord, self).__init__()
        self.offset, self.eid, self.substitutions = offset, eid, substitutions


def extract(buf):
    '''
    Run the EVTXtract algorithm over the given data and reconstruct EVTX records.

    Args:
      buf (buffer): the binary data from which to extract structures.

    Returns:
      iterable[union[CompleteRecord, IncompleteRecord]]: a generator that
        yields CompleteRecord and IncompleteRecord instances. Callers need to
        switch on the type of each item to decide how to handle it.
    '''
    # full scan of the file (#1): locate the valid chunks.
    chunk_offsets = set(evtxtract.carvers.find_evtx_chunks(buf))

    # records parsed from valid chunks are emitted immediately; their offsets
    # are remembered so that the second scan does not report them again.
    seen_offsets = set()
    for chunk_offset in chunk_offsets:
        for parsed in evtxtract.carvers.extract_chunk_records(buf, chunk_offset):
            seen_offsets.add(parsed.offset)
            yield CompleteRecord(parsed.offset, parsed.eid, parsed.xml)

    # eid -> {template id -> template}
    templates_by_eid = collections.defaultdict(dict)
    for chunk_offset in chunk_offsets:
        for tmpl in evtxtract.carvers.extract_chunk_templates(buf, chunk_offset):
            templates_by_eid[tmpl.eid][tmpl.get_id()] = tmpl

    # full scan of the file (#2).
    # this has to be a separate pass: every template must have been
    # collected before any loose record can be matched against them.
    for record_offset in evtxtract.carvers.find_evtx_records(buf):
        if record_offset in seen_offsets:
            continue

        try:
            carved = evtxtract.carvers.extract_record(buf, record_offset)
        except evtxtract.carvers.ParseError as err:
            logger.info('parse error for record at offset: 0x%x: %s', record_offset, str(err))
            continue
        except ValueError as err:
            logger.info('timestamp parse error for record at offset: 0x%x: %s', record_offset, str(err))
            continue
        except Exception as err:
            logger.info('unknown parse error for record at offset: 0x%x: %s', record_offset, str(err))
            continue

        substitutions = carved.substitutions
        if len(substitutions) < 4:
            logger.info('too few substitutions for record at offset: 0x%x', record_offset)
            continue

        # the EID is known to live at substitution index 3
        eid = substitutions[3][VALUE]

        candidates = set(
            tmpl
            for tmpl in templates_by_eid.get(eid, {}).values()
            if tmpl.match_substitutions(substitutions)
        )

        if len(candidates) != 1:
            # zero or several candidates: the record cannot be rebuilt
            # unambiguously, so hand back the raw substitutions instead.
            if not candidates:
                logger.info('no matching templates for record at offset: 0x%x', record_offset)
            else:
                logger.info('too many templates for record at offset: 0x%x', record_offset)
            yield IncompleteRecord(record_offset, eid, substitutions)
            continue

        (tmpl,) = candidates
        rebuilt_xml = tmpl.insert_substitutions(substitutions)
        yield CompleteRecord(record_offset, eid, rebuilt_xml)


================================================
FILE: evtxtract/carvers.py
================================================
import re
import struct
import logging
import binascii
import datetime
import xml.sax.saxutils
from collections import namedtuple

import six
import Evtx.Evtx
import Evtx.Views

import evtxtract.templates


logger = logging.getLogger(__name__)


# TODO: this should be part of python-evtx
# signature found at the start of an EVTX chunk header.
EVTX_HEADER_MAGIC = b"ElfChnk"
# signature found at the start of an EVTX record (0x00002a2a, little-endian).
EVTX_RECORD_MAGIC = b"\x2a\x2a\x00\x00"
# number of bytes is_chunk_header() requires to be available from a chunk's offset.
CHUNK_SIZE = 0x10000
# inclusive bounds accepted for the size field of a chunk header.
MIN_CHUNK_HEADER_SIZE = 0x80
MAX_CHUNK_HEADER_SIZE = 0x200


class ParseError(RuntimeError):
    """Raised when an EVTX structure cannot be parsed."""


def is_chunk_header(buf, offset):
    """
    Return True if the offset appears to be a valid EVTX Chunk header.

    Implementation note: checks the magic and the header size field for
      reasonable values, requires CHUNK_SIZE bytes to be available from the
      offset, and verifies both the header checksum and the data checksum.

    Args:
      buf (buffer): the binary data from which to extract structures.
      offset (int): the address of the potential EVTX chunk header.

    Returns:
      bool: if the offset appears to be an EVTX chunk header.
    """
    if len(buf) < offset + 0x2C:
        # the magic/size reads below would run past the end of the buffer
        return False

    magic = struct.unpack_from("<7s", buf, offset)[0]
    if magic != EVTX_HEADER_MAGIC:
        return False

    size = struct.unpack_from("<I", buf, offset + 0x28)[0]
    if not (MIN_CHUNK_HEADER_SIZE <= size <= MAX_CHUNK_HEADER_SIZE):
        return False

    if len(buf) <= offset + size:
        # the chunk header overruns the buffer end
        return False

    try:
        chunk = Evtx.Evtx.ChunkHeader(buf, offset)
    except Exception:
        # only parse failures are swallowed here. a bare `except:` would also
        # trap KeyboardInterrupt/SystemExit, making a long scan impossible to
        # interrupt.
        logger.debug('failed to parse chunk header', exc_info=True)
        return False

    if len(buf) < offset + CHUNK_SIZE:
        # not enough data for a full chunk
        return False

    if chunk.calculate_header_checksum() != chunk.header_checksum():
        return False

    if chunk.calculate_data_checksum() != chunk.data_checksum():
        return False

    return True


def find_evtx_chunks(buf):
    """
    Scan the given data and generate the offsets of valid EVTX chunks.

    Every occurrence of the chunk magic is a candidate; only candidates
      accepted by is_chunk_header() are reported.

    Args:
      buf (buffer): the binary data from which to extract structures.

    Returns:
      iterable[int]: generator of offsets of chunks
    """
    cursor = buf.find(EVTX_HEADER_MAGIC, 0)
    while cursor != -1:
        if is_chunk_header(buf, cursor):
            yield cursor

        # resume one byte past this hit so overlapping candidates are seen
        cursor = buf.find(EVTX_HEADER_MAGIC, cursor + 1)


def is_record(buf, offset):
    """
    Decide whether the data at the given offset looks like an EVTX record.

    A candidate is accepted when it starts with the record magic, declares a
      size between 0x30 and 0x10000 (inclusive) that fits in the buffer, and
      repeats that same size in its final four bytes.

    Args:
      buf (buffer): the binary data from which to extract structures.
      offset (int): the address of the potential record.

    Returns:
      bool: True if the offset appears to hold a record.
    """
    available = len(buf)

    # need at least the magic and the size field
    if offset + 8 > available:
        return False

    magic, size = struct.unpack_from("<II", buf, offset)
    if magic != 0x00002a2a or size < 0x30 or size > 0x10000:
        return False

    record_end = offset + size
    if record_end > available:
        return False

    # the record length is stored again in the last dword of the record
    (trailing_size,) = struct.unpack_from("<I", buf, record_end - 4)
    return trailing_size == size


def find_evtx_records(buf):
    """
    Scan the given buffer and generate the offsets of apparent EVTX records.

    Every occurrence of the record magic is a candidate; only candidates
      accepted by is_record() are reported.

    Args:
      buf (buffer): the binary data from which to extract structures.

    Returns:
      iterable[int]: the offsets of EVTX records.
    """
    cursor = buf.find(EVTX_RECORD_MAGIC, 0)
    while cursor != -1:
        if is_record(buf, cursor):
            yield cursor

        # resume one byte past this hit so overlapping candidates are seen
        cursor = buf.find(EVTX_RECORD_MAGIC, cursor + 1)


# a record parsed out of a valid chunk: its offset, event ID, and XML.
RecoveredRecord = namedtuple('RecoveredRecord', 'offset eid xml')


def extract_chunk_records(buf, offset):
    """
    Generates EVTX records from the EVTX chunk at the given offset.

    Records that fail to render are logged and skipped.

    Args:
      buf (buffer): the binary data from which to extract structures.
      offset (int): offset to EVTX chunk

    Returns:
      iterable[RecoveredRecord]: the offset, event ID, and XML of each
        record that could be rendered.

    Raises:
      ParseError: if the chunk header cannot be parsed. Because this is a
        generator, the error surfaces when iteration starts.
    """
    try:
        chunk = Evtx.Evtx.ChunkHeader(buf, offset)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not converted into a ParseError.
        raise ParseError('failed to parse chunk header')

    # shared across the records of this chunk by the XML view
    cache = {}
    for record in chunk.records():
        try:
            record_xml = Evtx.Views.evtx_record_xml_view(record, cache=cache)
            eid = evtxtract.utils.get_eid(record_xml)
            yield RecoveredRecord(record.offset(), eid, record_xml)

        except UnicodeEncodeError:
            logger.info("Unicode encoding issue processing record at 0x%X", record.offset())
            continue

        except UnicodeDecodeError:
            logger.info("Unicode decoding issue processing record at 0x%X", record.offset())
            continue

        except Evtx.Evtx.InvalidRecordException:
            logger.info("EVTX parsing issue processing record at 0x%X", record.offset())
            continue

        except Exception:
            # best effort: one bad record must not abort the whole chunk
            logger.info("Unknown exception processing record at 0x%X", record.offset(), exc_info=True)
            continue


def extract_chunk_templates(buf, offset):
    """
    Generates EVTX record templates from the EVTX chunk at the given offset.

    One template is produced per record that can be processed; records that
      fail are logged at INFO level and skipped.

    Args:
      buf (buffer): the binary data from which to extract structures.
      offset (int): offset to EVTX chunk.

    Returns:
      iterable[evtxtract.templates.Template]: the templates of the records
        found in the chunk.

    Raises:
      ParseError: if the chunk header cannot be parsed.
    """
    try:
        chunk = Evtx.Evtx.ChunkHeader(buf, offset)
    except Exception:
        # only translate ordinary errors; KeyboardInterrupt/SystemExit must
        #  not be disguised as a parse failure.
        raise ParseError('failed to parse chunk header')

    for record in chunk.records():
        try:
            yield evtxtract.templates.get_template(record)
        except UnicodeEncodeError:
            logger.info("Unicode encoding issue processing record at 0x%X", record.offset())
            continue

        except UnicodeDecodeError:
            logger.info("Unicode decoding issue processing record at 0x%X", record.offset())
            continue

        except Evtx.Evtx.InvalidRecordException:
            logger.info("EVTX parsing issue processing record at 0x%X", record.offset())
            continue

        except Exception:
            # best-effort carving: log the traceback and move on
            logger.info("Unknown exception processing record at 0x%X", record.offset(), exc_info=True)
            continue


# lookup table indexed by byte value: True when that value is an accepted
#  substitution type.
# the indices correspond to evtx node types: 0-21, 0x21 (33) and 0x81 (129).
VALID_SUBSTITUTION_TYPES = [False] * 256
for i in range(22):
    VALID_SUBSTITUTION_TYPES[i] = True
VALID_SUBSTITUTION_TYPES[0x21] = True
VALID_SUBSTITUTION_TYPES[0x81] = True


class MaxOffsetReached(Exception):
    """Signals that parsing ran past the maximum offset it was allowed to reach."""


def does_root_have_resident_template(buf, offset, max_offset):
    """
    Guess whether a RootNode has a resident template
      from the given buffer and offset, not inspecting candidate
      substitution headers beyond the given max_offset.

    This is a heuristic: the bytes that follow the template offset are
      tested for whether they look like a substitution header list
      (non-resident template) or not (resident template).

    Args:
      buf (buffer): the binary data from which to extract structures.
      offset (int): address of the RootNode (optionally starting with a
        stream-start token).
      max_offset (int): don't parse beyond this address.

    Returns:
      boolean: True if the RootNode appears to have a resident template.
        False if it appears non-resident, and also when there is not
        enough room before max_offset to inspect the candidate
        substitution headers.

    Raises:
      struct.error: if a read runs off the end of buf.
    """
    ofs = offset
    token = struct.unpack_from("<b", buf, ofs)[0]
    if token == 0x0F:  # stream start
        ofs += 4

    ofs += 6  # template offset

    # now, since we don't know where the chunk header is
    #  for this record, we can't use the template offset
    #  to decide if its resident or not
    # instead, we assume that if the template is resident,
    #  then it begins immediately. if this is true, and the
    #  template is resident, then the next fields are:
    #    DWORD next_offset  (range 0-0x10000?, length 0x4)
    #    GUID  template_id (length 0x10, essentially random bytes)
    #    DWORD template_length (range 0-0x10000?, length 0x4)
    # if the template is non-resident, then the fields are:
    #    DWORD num_subs (range 0-100?)
    #    WORD size                            \
    #    BYTE type (value one of 0-21,33,129)  | repeat num_subs times
    #    BYTE zero (value 0)                  /
    # the key takeaway is that, with `p` the address of num_subs, we can test
    #   *(p + 6 + 4i) (with 0 <= i < min(num_subs, 4))
    #  is in the set {0-21, 33, 129}, and that
    #   *(p + 7 + 4i) (0 <= i < min(num_subs, 4))
    #  is 0.  If these conditions hold, then the template is probably
    #  non-resident.
    #
    # TODO(wb): what if num_subs == 1 or 2?

    ofs += 4  # next_offset or num_subs
    maybe_num_subs = struct.unpack_from("<I", buf, ofs)[0]
    if maybe_num_subs > 100:
        return True

    ofs += 4  # template_id or size

    # inspect at most four candidate headers; when the count reads as zero,
    #  still look at two of them.
    num_checks = min(maybe_num_subs or 2, 4)

    if max_offset < ofs + 4 + (4 * num_checks):
        return False

    # the fourth byte of each candidate header must be zero
    for i in range(num_checks):
        byte = struct.unpack_from("<B", buf, ofs + 3 + (i * 4))[0]
        if byte != 0:
            return True

    # the third byte of each candidate header must be a known type
    for i in range(num_checks):
        byte = struct.unpack_from("<B", buf, ofs + 2 + (i * 4))[0]
        if not VALID_SUBSTITUTION_TYPES[byte]:
            return True

    return False


def extract_root_substitutions(buf, offset, max_offset):
    """
    Parse a RootNode into a list of its substitutions, not parsing beyond
      the max offset.

    Args:
      buf (buffer): the binary data from which to extract structures.
      offset (int): address of the RootNode (optionally starting with a
        stream-start token).
      max_offset (int): don't parse beyond this address.

    Returns:
      list[tuple[int, variant]]: list of substitution tuples (type, value).
        The substitutions of nested BXml nodes are flattened into the list.

    Raises:
      ParseError: for various reasons, including invalid timestamps and
        unexpected substitution counts or types.
      MaxOffsetReached: if a substitution value starts beyond max_offset.
      struct.error: if a read runs off the end of buf.
      UnicodeDecodeError: if a string substitution cannot be decoded.
    """
    ofs = offset
    token = struct.unpack_from("<b", buf, ofs)[0]
    if token == 0x0F:  # stream start
        ofs += 4

    ofs += 6  # template offset

    if does_root_have_resident_template(buf, offset, max_offset):
        # have to hope that the template begins immediately
        # template_offset = struct.unpack_from("<I", buf, ofs)[0]
        logger.debug("0x%x: resident template", offset)
        ofs += 4  # next offset
        ofs += 4  # guid
        ofs += 0x10  # template_length
        template_length = struct.unpack_from("<I", buf, ofs)[0]
        ofs += 4
        ofs += template_length  # num_subs
    else:
        logger.debug("0x%x: non-resident template", offset)
        ofs += 4  # num_subs

    num_subs = struct.unpack_from("<I", buf, ofs)[0]
    if num_subs > 100:
        raise ParseError("Unexpected number of substitutions: %d at %s" %
                         (num_subs, hex(ofs)))

    ofs += 4  # begin sub list

    # first pass: the fixed-size (size, type) header of every substitution
    substitutions = []
    for _ in range(num_subs):
        size, type_ = struct.unpack_from("<HB", buf, ofs)
        if not VALID_SUBSTITUTION_TYPES[type_]:
            raise ParseError('Unexpected substitution type: ' + hex(type_))

        substitutions.append((type_, size))
        ofs += 4

    # second pass: the values, laid out back-to-back after the headers
    ret = []
    for type_, size in substitutions:
        if ofs > max_offset:
            raise MaxOffsetReached("Substitutions overran record buffer.")

        value = None
        #[0] = parse_null_type_node,
        if type_ == 0x0:
            value = None
            ret.append((type_, value))

        #[1] = parse_wstring_type_node,
        elif type_ == 0x1:
            s = buf[ofs:ofs + size]
            s = s.decode('utf-16le')
            s = xml.sax.saxutils.escape(s)
            value = s
            ret.append((type_, value))

        #[2] = parse_string_type_node,
        elif type_ == 0x2:
            s = buf[ofs:ofs + size]
            s = s.decode('ascii')
            s = xml.sax.saxutils.escape(s)
            value = s
            ret.append((type_, value))

        #[3] = parse_signed_byte_type_node,
        elif type_ == 0x3:
            value = struct.unpack_from("<b", buf, ofs)[0]
            ret.append((type_, value))

        #[4] = parse_unsigned_byte_type_node,
        elif type_ == 0x4:
            value = struct.unpack_from("<B", buf, ofs)[0]
            ret.append((type_, value))

        #[5] = parse_signed_word_type_node,
        elif type_ == 0x5:
            value = struct.unpack_from("<h", buf, ofs)[0]
            ret.append((type_, value))

        #[6] = parse_unsigned_word_type_node,
        elif type_ == 0x6:
            value = struct.unpack_from("<H", buf, ofs)[0]
            ret.append((type_, value))

        #[7] = parse_signed_dword_type_node,
        elif type_ == 0x7:
            value = struct.unpack_from("<i", buf, ofs)[0]
            ret.append((type_, value))

        #[8] = parse_unsigned_dword_type_node,
        elif type_ == 0x8:
            value = struct.unpack_from("<I", buf, ofs)[0]
            ret.append((type_, value))

        #[9] = parse_signed_qword_type_node,
        elif type_ == 0x9:
            value = struct.unpack_from("<q", buf, ofs)[0]
            ret.append((type_, value))

        #[10] = parse_unsigned_qword_type_node,
        elif type_ == 0xA:
            value = struct.unpack_from("<Q", buf, ofs)[0]
            ret.append((type_, value))

        #[11] = parse_float_type_node,
        elif type_ == 0xB:
            value = struct.unpack_from("<f", buf, ofs)[0]
            ret.append((type_, value))

        #[12] = parse_double_type_node,
        elif type_ == 0xC:
            value = struct.unpack_from("<d", buf, ofs)[0]
            ret.append((type_, value))

        #[13] = parse_boolean_type_node,
        elif type_ == 0xD:
            # any non-zero value is true (the value 1 must not decode as False)
            value = struct.unpack_from("<I", buf, ofs)[0] > 0
            ret.append((type_, value))

        #[14] = parse_binary_type_node,
        elif type_ == 0xE:
            # hexlify() yields bytes on Python 3; decode so the value renders
            #  as plain hex text rather than as "b'...'"
            value = binascii.hexlify(buf[ofs:ofs + size]).decode('ascii')
            ret.append((type_, value))

        #[15] = parse_guid_type_node,
        elif type_ == 0xF:
            # the GUID lives at the current value offset, not at the
            #  start of the root node
            _bin = buf[ofs:ofs + 16]
            if len(_bin) < 16:
                raise ParseError('buffer overrun')

            # Yeah, this is ugly
            h = [six.indexbytes(_bin, i) for i in range(len(_bin))]
            value = """{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}""".format(
                h[3], h[2], h[1], h[0],
                h[5], h[4],
                h[7], h[6],
                h[8], h[9],
                h[10], h[11], h[12], h[13], h[14], h[15])
            ret.append((type_, value))

        #[16] = parse_size_type_node,
        elif type_ == 0x10:
            if size == 0x4:
                value = struct.unpack_from("<I", buf, ofs)[0]
            elif size == 0x8:
                value = struct.unpack_from("<Q", buf, ofs)[0]
            else:
                raise ParseError('unexpected sizetypenode value: ' + hex(size))

            ret.append((type_, value))

        #[17] = parse_filetime_type_node,
        elif type_ == 0x11:
            qword = struct.unpack_from("<Q", buf, ofs)[0]
            try:
                # FILETIME: 100ns ticks since 1601-01-01, rebased to the unix epoch
                value = datetime.datetime.utcfromtimestamp(float(qword) * 1e-7 - 11644473600)
            except (ValueError, OverflowError, OSError):
                # out-of-range values surface as different exception types
                #  depending on the platform
                raise ParseError('invalid timestamp')

            ret.append((type_, value))

        #[18] = parse_systemtime_type_node,
        elif type_ == 0x12:
            # SYSTEMTIME is eight WORDs: year, month, day of week, day,
            #  hour, minute, second, milliseconds
            parts = struct.unpack_from("<8H", buf, ofs)
            try:
                value = datetime.datetime(parts[0], parts[1],
                                          parts[3],  # skip part 2 (day of week)
                                          parts[4], parts[5],
                                          parts[6],
                                          parts[7] * 1000)  # milliseconds -> microseconds
            except ValueError:
                raise ParseError('invalid timestamp')
            ret.append((type_, value))

        #[19] = parse_sid_type_node,  -- SIDTypeNode, 0x13
        elif type_ == 0x13:
            version, num_elements = struct.unpack_from("<BB", buf, ofs)
            id_high, id_low = struct.unpack_from(">IH", buf, ofs + 2)
            value = "S-%d-%d" % (version, (id_high << 16) ^ id_low)
            for element in range(num_elements):
                val = struct.unpack_from("<I", buf, ofs + 8 + (4 * element))[0]
                value += "-%d" % val
            ret.append((type_, value))

        #[20] = parse_hex32_type_node,  -- Hex32TypeNoe, 0x14
        #[21] = parse_hex64_type_node,  -- Hex64TypeNode, 0x15
        elif type_ == 0x14 or type_ == 0x15:
            # stored little-endian; render most significant byte first
            value = "0x"
            for c in buf[ofs:ofs + size][::-1]:
                if not isinstance(c, (int)):
                    c = ord(c)
                value += "%02x" % c
            ret.append((type_, value))

        #[33] = parse_bxml_type_node,  -- BXmlTypeNode, 0x21
        elif type_ == 0x21:
            subs = extract_root_substitutions(buf, ofs, max_offset)
            ret.extend(subs)

        #[129] = WstringArrayTypeNode, 0x81
        elif type_ == 0x81:

            value = []

            remaining = buf[ofs:ofs + size]
            while len(remaining) > 0:
                match = re.search(b"((?:[^\x00].)+)", remaining)
                if match:
                    frag = match.group()
                    s = frag.decode("utf-16le")
                    s = xml.sax.saxutils.escape(s)
                    value.append(s)
                    remaining = remaining[len(frag) + 2:]
                    if len(remaining) == 0:
                        break

                frag = re.search(b"(\x00*)", remaining).group()
                if not match and len(frag) == 0:
                    # neither a string nor a run of NULLs could be consumed:
                    #  bail out rather than loop forever on the same bytes
                    raise ParseError("Error parsing wstring array: unparseable data")

                if len(frag) % 2 == 0:
                    for _ in range(len(frag) // 2):
                        value.append('')

                else:
                    raise ParseError("Error parsing uneven substring of NULLs")

                remaining = remaining[len(frag):]

            # drop a trailing empty entry; an empty array has nothing to drop
            if value and value[-1].strip("\x00") == "":
                value = value[:-1]

            ret.append((type_, value))

        else:
            raise ParseError("Unexpected type encountered: " + hex(type_))

        ofs += size
    return ret


# A record carved directly from raw bytes (see extract_record):
#   offset:        address of the record in the buffer
#   num:           the record number read from the record header
#   timestamp:     datetime converted from the header's 64-bit timestamp
#   substitutions: list of (type, value) tuples parsed from the root node
ExtractedRecord = namedtuple(
    'ExtractedRecord', ['offset', 'num', 'timestamp', 'substitutions'])


def extract_record(buf, offset):
    """
    Parse the EVTX record at the given offset into an ExtractedRecord.

    Args:
      buf (buffer): the binary data from which to extract structures.
      offset (int): address of the EVTX record.

    Returns:
      ExtractedRecord: offset, record number, timestamp and substitutions.

    Raises:
      ValueError: if the data at offset does not pass `is_record`.
      ParseError: for various reasons, including invalid timestamps and overruns.
    """
    if not is_record(buf, offset):
        raise ValueError('not a record')

    # header fields that follow the 4-byte magic: size, record number, timestamp
    header = struct.unpack_from("<IQQ", buf, offset + 0x4)
    record_size, record_num, filetime = header

    # 100ns ticks since 1601-01-01, rebased to the unix epoch
    unix_seconds = float(filetime) * 1e-7 - 11644473600
    timestamp = datetime.datetime.utcfromtimestamp(unix_seconds)

    root_offset = offset + 0x18
    record_end = offset + record_size
    try:
        substitutions = extract_root_substitutions(buf, root_offset, record_end)
    except struct.error:
        raise ParseError('buffer overrun')

    return ExtractedRecord(offset=offset,
                           num=record_num,
                           timestamp=timestamp,
                           substitutions=substitutions)


================================================
FILE: evtxtract/main.py
================================================
import os
import sys
import logging
import os.path
import argparse

import evtxtract
import evtxtract.carvers


logger = logging.getLogger(__name__)


def output_record(args, r):
    """
    Write one recovered record, either to its own file or to stdout.

    With `args.split` set, the record is written to a file in `args.out`
      named `<eid>-<offset>.xml` (complete records) or
      `<eid>-<offset>-incomplete.xml` (incomplete records), wrapped in an
      XML declaration and an <evtxtract> root element. Otherwise the bare
      record is written to stdout.

    Failures are logged as warnings and swallowed, so a single bad record
      does not stop the run. Objects that are neither a CompleteRecord nor
      an IncompleteRecord are ignored.

    Args:
      args (argparse.Namespace): parsed arguments; `split` and `out` are read.
      r (evtxtract.CompleteRecord or evtxtract.IncompleteRecord): the record.
    """
    xmlhead = '<?xml version="1.0" encoding="UTF-8"?>\n<evtxtract>'
    xmlfoot = '</evtxtract>'

    if isinstance(r, evtxtract.CompleteRecord):
        is_complete = True
    elif isinstance(r, evtxtract.IncompleteRecord):
        is_complete = False
    else:
        return

    try:
        if is_complete:
            fname = "{}-{}.xml".format(r.eid, r.offset)
            payload = r.xml.encode('utf-8')
        else:
            fname = "{}-{}-incomplete.xml".format(r.eid, r.offset)
            payload = format_incomplete_record(r).encode('utf-8')

        if args.split:
            fpath = os.path.join(args.out, fname)
            # the file is opened in binary mode, so everything written to
            #  it has to be bytes, including the static header and footer
            with open(fpath, "wb") as f:
                f.write(xmlhead.encode('utf-8'))
                f.write(payload)
                f.write(xmlfoot.encode('utf-8'))
        else:
            # os.write() bypasses sys.stdout's buffer: flush first so that
            #  text already print()ed cannot end up after this record
            sys.stdout.flush()
            os.write(sys.stdout.fileno(), payload)
    except Exception as e:
        logger.warning('failed to output record at offset: 0x%x: %s', r.offset, str(e), exc_info=True)
    else:
        sys.stdout.flush()


def format_incomplete_record(record):
    """
    Render an incomplete record as a small XML fragment.

    The fragment lists the record's offset, event ID, and each substitution
      with its index, type and value; a value of None renders as an empty
      <Value> element.

    Args:
      record: object with `offset`, `eid` and `substitutions` attributes,
        where `substitutions` is a sequence of (type, value) pairs.

    Returns:
      str: the lines of the fragment joined with newlines.
    """
    lines = [
        '<Record>',
        '<Offset>0x%x</Offset>' % (record.offset),
        '<EventID>%d</EventID>' % (record.eid),
        '<Substitutions>',
    ]

    for position, (sub_type, sub_value) in enumerate(record.substitutions):
        lines.append('  <Substitution index="%d">' % (position))
        lines.append('    <Type>%d</Type>' % (sub_type))
        if sub_value is not None:
            lines.append('    <Value>%s</Value>' % (sub_value))
        else:
            lines.append('    <Value></Value>')
        lines.append('  </Substitution>')

    lines.extend(['</Substitutions>', '</Record>'])
    return '\n'.join(lines)


def main(argv=None):
    """
    Command line entry point: carve EVTX records out of a binary file.

    Records are written by `output_record`, either to stdout wrapped in a
      single <evtxtract> document, or (with -s/-o) one file per record.

    Args:
      argv (list[str] or None): command line arguments without the program
        name; defaults to `sys.argv[1:]`.

    Raises:
      SystemExit: with status 1 when -s is given without -o, or when the
        -o argument is not a directory (and by argparse on bad arguments).
      RuntimeError: if the extractor yields an unexpected record type.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(
        description="Reconstruct EVTX event log records from binary data.")
    parser.add_argument("input", type=str,
                        help="Path to binary input file")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable debug logging")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Disable all output but errors")
    parser.add_argument("-s", "--split", action="store_true",
                        help="split each event into its own file")
    parser.add_argument("-o", "--out", metavar='output-directory', action="store",
                        help="output directory to store split files")
    # honor the argv handed to us instead of silently re-reading sys.argv
    args = parser.parse_args(argv)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.quiet:
        logging.basicConfig(level=logging.ERROR)
    else:
        logging.basicConfig(level=logging.INFO)

    # sys.exit rather than the `exit` builtin: the latter is injected by the
    #  `site` module and is not guaranteed to exist (e.g. frozen executables)
    if args.split and not args.out:
        logger.error('Error: the -o argument is required when using -s. please provide an output directory with -o')
        sys.exit(1)

    if args.out and not os.path.isdir(args.out):
        logger.error('Error: {0} is not a directory'.format(args.out))
        sys.exit(1)

    with evtxtract.utils.Mmap(args.input) as mm:
        num_complete = 0
        num_incomplete = 0

        if not args.split:
            print('<?xml version="1.0" encoding="UTF-8"?>')
            print('<evtxtract>')
            # records are written straight to the stdout file descriptor;
            #  flush so the buffered header is guaranteed to come first
            sys.stdout.flush()

        for r in evtxtract.extract(mm):

            output_record(args, r)

            if isinstance(r, evtxtract.CompleteRecord):
                num_complete += 1

            elif isinstance(r, evtxtract.IncompleteRecord):
                num_incomplete += 1

            else:
                raise RuntimeError('unexpected return type')

        if not args.split:
            print('</evtxtract>')

        logging.info('recovered %d complete records', num_complete)
        logging.info('recovered %d incomplete records', num_incomplete)


if __name__ == "__main__":
    sys.exit(main())


================================================
FILE: evtxtract/templates.py
================================================
import re
import sys
import logging

import six
import Evtx.Evtx
import Evtx.Nodes
import Evtx.Views

import evtxtract.utils
import evtxtract.templates


logger = logging.getLogger(__name__)


class Template(object):
    """
    An EVTX record template: an event ID plus the template XML, in which
      substitution slots appear as placeholders of the form
      `[Normal Substitution(index=N, type=T)]` or
      `[Conditional Substitution(index=N, type=T)]`.
    """

    # captures (mode, index, type) of each placeholder.
    # the attribute name keeps its historical spelling.
    substitition_re = re.compile(r"\[(Conditional|Normal) Substitution\(index=(\d+), type=(\d+)\)\]")

    def __init__(self, eid, xml):
        self.eid = eid
        self.xml = xml

        # lazily computed by _get_placeholders() and get_id()
        self._cached_placeholders = None
        self._cached_id = None

    def get_id(self):
        """
        @rtype: str
        @return: A string that can be parsed into constraints describing what
          types of subsitutions this template can accept.
          Short example: 1100-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]
        """
        if self._cached_id is not None:
            return self._cached_id

        ret = [str(self.eid)]
        for index, type_, mode in self._get_placeholders():
            if mode:
                mode_str = "c"
            else:
                mode_str = "n"
            ret.append("[%s|%s|%s]" % (index, type_, mode_str))

        self._cached_id = "-".join(ret)
        return self._cached_id

    def _get_placeholders(self):
        """
        Get descriptors for each of the substitutions required by this
          template, sorted by index.

        Tuple schema: (index, type, is_conditional)

        @rtype: list of (int, int, boolean)
        """
        if self._cached_placeholders is not None:
            return self._cached_placeholders

        ret = []
        for mode, index, type_ in Template.substitition_re.findall(self.xml):
            ret.append((int(index), int(type_), mode == "Conditional"))

        self._cached_placeholders = sorted(ret, key=lambda p: p[0])
        return self._cached_placeholders

    def match_substitutions(self, substitutions):
        """
        Checks to see if the provided set of substitutions match the
          placeholder values required by this template.

        Note, this is only a best guess.  The number of substitutions
          *may* be greater than the number of available slots. So we
          must only check the slot and substitution types.

        A template without any placeholders matches every list of
          substitutions.

        @type substitutions: list of (int, str)
        @param substitutions: Tuple schema (type, value)
        @rtype: boolean
        """
        logger = logging.getLogger("match_substitutions")
        placeholders = self._get_placeholders()
        logger.debug("Substitutions: %s", str(substitutions))
        logger.debug("Constraints: %s", str(placeholders))
        if len(placeholders) > len(substitutions):
            logger.debug("Failing on lens: %d vs %d",
                         len(placeholders), len(substitutions))
            return False

        if not placeholders:
            # nothing to constrain (and max() below would reject an empty list)
            return True

        # every placeholder index must be a valid position in `substitutions`,
        #  so the largest index has to be strictly less than its length
        max_index = max(index for index, _, _ in placeholders)
        if max_index >= len(substitutions):
            logger.debug("Failing on max index: %d vs %d",
                         max_index,
                         len(substitutions))
            return False

        # it seems that some templates request different values than what are subsequently put in them
        #   specifically, a Hex64 might be put into a SizeType field (EID 4624)
        # this maps from the type described in a template, to possible additional types that a
        #   record can provide for a particular substitution
        overrides = {
            16: set([21])
        }

        for index, type_, is_conditional in placeholders:
            sub_type, sub_value = substitutions[index]
            # a conditional slot may be left empty (null type)
            if is_conditional and sub_type == 0:
                continue
            if sub_type != type_:
                if type_ not in overrides or sub_type not in overrides[type_]:
                    logger.debug("Failing on type comparison, index %d: %d vs %d (mode: %s)",
                                 index, sub_type, type_, is_conditional)
                    return False
                else:
                    logger.debug("Overriding template type %d with substitution type %d", type_, sub_type)
                    continue
        return True

    escape_re = re.compile(r"\\\\(\d)")

    @staticmethod
    def _escape(value):
        """
        Escape the static value to be used in a regular expression
          subsititution. This processes any backreferences and
          makes them plain, escaped sequences.

        Kept for backwards compatibility; `insert_substitutions` inserts
          values literally and does not need it.

        @type value: str
        @rtype: str
        """
        return Template.escape_re.sub(r"\\\\\\\\\1", re.escape(value))

    def insert_substitutions(self, substitutions):
        """
        Return a copy of the template with the given substitutions inserted.

        Each value is inserted verbatim: strings as they are, everything
          else via `str()`.

        @type substitutions: list of (int, str)
        @param substitutions: an ordered list of (type:int, value:str)
        @rtype: str
        """
        ret = self.xml
        for index, pair in enumerate(substitutions):
            type_, value = pair
            from_pattern = r"\[(Normal|Conditional) Substitution\(index=%d, type=\d+\)\]" % index
            if not isinstance(value, six.string_types):
                value = str(value)
            # a callable replacement is inserted as-is. passing the text as a
            #  replacement *string* would have `re` interpret its backslashes
            #  (group references, "bad escape" errors), and pre-escaping it with
            #  re.escape() leaves stray backslashes in the output.
            ret = re.sub(from_pattern, lambda _match, _text=value: _text, ret)
        return ret


# precompiled placeholder patterns, keyed by substitution index.
# make_replacement() adds entries for larger indices on demand.
REPLACEMENT_PATTERNS = {
    i: re.compile(
        r"\[(Normal|Conditional) Substitution\(index=%d, type=\d+\)\]" % i)
    for i in range(35)}


def make_replacement(template, index, substitution):
    """
    Makes a substitution given a template as a string.

    Every placeholder with the given index is replaced by `substitution`,
      which is inserted verbatim.

    Implementation is a huge hack that depends on the
    brittle template_format() output.

    @type template: str
    @type index: int
    @type substitution: str
    @rtype: str
    """
    pattern = REPLACEMENT_PATTERNS.get(index)
    if pattern is None:
        pattern = re.compile(r"\[(Normal|Conditional) Substitution\(index=%d, type=\d+\)\]" % index)
        REPLACEMENT_PATTERNS[index] = pattern

    # use a callable so the text is inserted literally; given as a replacement
    #  string, backslashes in it would be interpreted by `re` (group
    #  references, "bad escape" errors).
    return pattern.sub(lambda _match: substitution, template)


def get_complete_template(root, current_index=0):
    """
    Gets the template from a RootNode while resolving any
    nested templates and fixing up their indices.
    Depth first ordering/indexing.

    The readable view of `root` numbers its placeholders locally. This
    function rewrites those numbers so they are offset by `current_index`
    and by the placeholders of nested templates, and splices the text of
    each nested template in place of the placeholder that refers to it.

    Implementation is a huge hack that depends on the
      brittle template_format() output.

    @type root: RootNode
    @type current_index: int
    @param current_index: offset added to this template's own indices;
      recursive calls pass the index of the nested template's slot.
    @rtype: str
    """
    template = Evtx.Views.evtx_template_readable_view(root)  # TODO(wb): make sure this is working

    # walk through each substitution.
    # if its a normal node, continue
    # else its a subtemplate, and we count the number of substitutions _it_ has
    #   so that we can later fixup all the indices
    #
    # `replacements` gets one entry per substitution: an int (the new index
    #   of a plain slot) or a str (the text of a resolved sub-template).
    replacements = []
    for index, substitution in enumerate(root.substitutions()):
        # find all sub-templates
        if not isinstance(substitution, Evtx.Nodes.BXmlTypeNode):
            replacements.append(current_index + index)
            continue
        # TODO(wb): hack here accessing ._root
        subtemplate = get_complete_template(substitution._root,
                                             current_index=current_index + index)
        replacements.append(subtemplate)
        # slots that follow are shifted by the number of placeholders the
        #   sub-template contributed
        current_index += subtemplate.count("Substitution(index=")
    replacements.reverse()

    # now walk through all the indices and fix them up depth-first
    # the list was reversed above, so this visits the original indices from
    #   highest to lowest.
    # NOTE(review): presumably this order keeps an already rewritten
    #   "index=N," from being rewritten a second time -- confirm.
    for i, replacement in enumerate(replacements):
        # recover the original (local) index of this entry
        index = len(replacements) - i - 1
        if isinstance(replacement, int):
            # fixup index
            from_pattern = "index=%d," % index
            to_pattern = "index=%d," % replacement
            template = template.replace(from_pattern, to_pattern)
        if isinstance(replacement, six.string_types):
            # insert sub-template
            template = make_replacement(template, index, replacement)
    return template


def get_template(record):
    """
    Build the Template of a complete Record.

    The event ID is read from the record's XML view; the template text is
      resolved from the record's root node, including nested templates.

    @type record: Record
    @rtype: Template
    """
    eid = evtxtract.utils.get_eid(Evtx.Views.evtx_record_xml_view(record))
    template_xml = get_complete_template(record.root())
    return Template(eid, template_xml)


================================================
FILE: evtxtract/utils.py
================================================
import mmap
import logging
from lxml import etree


logger = logging.getLogger(__name__)


def to_lxml(record_xml):
    """
    Parse an XML string into an Etree element.

    When the string carries no XML declaration, a standalone declaration
      is prepended before parsing.

    @type record_xml: str
    @rtype: etree.Element
    """
    if "<?xml" in record_xml:
        document = record_xml
    else:
        document = "<?xml version=\"1.0\" standalone=\"yes\" ?>%s" % record_xml
    return etree.fromstring(document)


def get_child(node, tag,
              ns="{http://schemas.microsoft.com/win/2004/08/events/event}"):
    """
    Look up the first direct child of an Etree element by tag name.

    The namespace prefix `ns` is prepended to `tag` to form the
      qualified name passed to `node.find`.

    @type node: etree.Element
    @type tag: str
    @type ns: str
    @rtype: etree.Element or None
    """
    qualified_tag = "%s%s" % (ns, tag)
    return node.find(qualified_tag)


def get_eid(record_xml):
    """
    Extract the event ID from the XML of an EVTX record.

    The value is the text of the `EventID` element found under the
      `System` element of the document root.

    Args:
      record_xml (str)

    Returns:
      int: the event ID of the record
    """
    event = to_lxml(record_xml)
    system = get_child(event, "System")
    event_id = get_child(system, "EventID")
    return int(event_id.text)


class Mmap(object):
    """
    Convenience class for opening a read-only memory map for a file path.

    Used as a context manager: entering opens the file and returns the
      `mmap.mmap` object; leaving closes both the map and the file.
    """

    def __init__(self, filename):
        super(Mmap, self).__init__()
        self._filename = filename
        self._f = None
        self._mmap = None

    def __enter__(self):
        self._f = open(self._filename, "rb")
        try:
            self._mmap = mmap.mmap(self._f.fileno(), 0, access=mmap.ACCESS_READ)
        except Exception:
            # __exit__ is not called when __enter__ raises (e.g. mapping an
            #  empty file), so release the file handle here
            self._f.close()
            self._f = None
            raise
        return self._mmap

    def __exit__(self, type, value, traceback):
        # compare against None: truth-testing a mmap asks for its length,
        #  which raises if the map was already closed by the caller
        if self._mmap is not None:
            self._mmap.close()
            self._mmap = None
        if self._f is not None:
            self._f.close()
            self._f = None


================================================
FILE: evtxtract/version.py
================================================
__version__ = '0.2.4'


================================================
FILE: evtxtract.spec
================================================
# -*- mode: python -*-

# PyInstaller build specification for the `evtxtract` command line tool.
# Analysis, TOC, PYZ, EXE and COLLECT are not imported here.
# NOTE(review): presumably PyInstaller injects them when it runs the spec -- confirm.

# NOTE(review): assigned but not referenced below; the calls pass cipher=None directly.
block_cipher = None

# analyze the entry script; the Tk bindings are excluded from the analysis.
a = Analysis(
    ['evtxtract/main.py'],
     pathex=['evtxtract'],
     binaries=None,
     datas=None,
     hiddenimports=[],
     hookspath=None,
     runtime_hooks=None,
     excludes=["tkinter", "_tkinter", "Tkinter"],
     win_no_prefer_redirects=None,
     win_private_assemblies=None,
     cipher=None)

# remove the sqlite, ssl and tcl/tk binaries from the collected binaries.
a.binaries = a.binaries - TOC([
 ('sqlite3.dll', None, None),
 ('tcl85.dll', None, None),
 ('tk85.dll', None, None),
 ('_sqlite3', None, None),
 ('_ssl', None, None),
 ('_tkinter', None, None)])

# archive built from the pure modules and zipped data found by the analysis
pyz = PYZ(a.pure, a.zipped_data, cipher=None)

# console executable named `evtxtract`; the binaries are passed to it directly
#  (exclude_binaries=False)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          exclude_binaries=False,
          name='evtxtract',
          #icon='resources/icon.ico',
          debug=False,
          strip=None,
          upx=True,
          console=True )

# collection named `evtxtract-dat` built from the executable, binaries,
#  zipfiles and data files
coll = COLLECT(exe,
               a.binaries,
               a.zipfiles,
               a.datas,
               strip=None,
               upx=True,
               name='evtxtract-dat')


================================================
FILE: setup.py
================================================
#!/usr/bin/env python

import os
import setuptools


# this sets __version__
# # via: http://stackoverflow.com/a/7071358/87207
# # and: http://stackoverflow.com/a/2073599/87207
# execute evtxtract/version.py in this namespace so that `__version__`
#  is defined without importing the package itself.
with open(os.path.join("evtxtract", "version.py"), "rb") as f:
     exec(f.read())

# package metadata; installs the `evtxtract` console script, which calls
#  evtxtract.main:main.
# NOTE(review): `pytest` is listed as a runtime requirement although only the
#  modules under tests/ visible here import it -- confirm whether it belongs
#  in install_requires.
setuptools.setup(name="evtxtract",
      version=__version__,
      description="EVTXtract recovers and reconstructs fragments of EVTX log files from raw binary data, including unallocated space and memory images.",
      author="Willi Ballenthin",
      author_email="william.ballenthin@fireeye.com",
      url="https://github.com/williballenthin/evtxtract",
      license="Apache 2.0 License",
      packages=setuptools.find_packages(),
      entry_points={
          "console_scripts": [
              "evtxtract=evtxtract.main:main",
          ]
      },
      install_requires=[
          'six',
          'lxml',
          'pytest',
          'python-evtx>=0.5.2',
      ],
)


================================================
FILE: tests/.gitignore
================================================
*memoryevtx/file.None*
*.dat
*.vacb


================================================
FILE: tests/fixtures.py
================================================
import os

import pytest

import evtxtract.utils


CD = os.path.dirname(__file__)
IMAGE_PATH = os.path.join(CD, 'joshua1.vmem')


@pytest.fixture
def image(request):
    """
    Path of the memory image the tests run against.

    Raises RuntimeError when the image file is not present.
    """
    if os.path.exists(IMAGE_PATH):
        return IMAGE_PATH

    raise RuntimeError('required image %s does not exist. see readme.' % (IMAGE_PATH))


@pytest.fixture
def image_file(request):
    """
    The test image opened as a binary file object; closed on teardown.
    """
    # resolve the `image` fixture through pytest: calling a fixture function
    #  directly is rejected by pytest >= 4.
    path = request.getfixturevalue('image')
    with open(path, 'rb') as f:
        yield f


@pytest.fixture
def image_mmap(request):
    """
    The test image as a read-only memory map; unmapped on teardown.
    """
    # resolve the `image` fixture through pytest: calling a fixture function
    #  directly is rejected by pytest >= 4.
    path = request.getfixturevalue('image')
    with evtxtract.utils.Mmap(path) as mm:
        yield mm


================================================
FILE: tests/readmd.txt
================================================
the tests require the image `joshua1.vmem` from:
  - referenced: http://jessekornblum.livejournal.com/293291.html
  - download: https://dl.dropboxusercontent.com/u/55819714/joshua1.zip


================================================
FILE: tests/test_all.py
================================================
import logging

import evtxtract
import evtxtract.carvers

from fixtures import *


#logging.basicConfig(level=logging.DEBUG)
#logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_find_chunks(image_mmap):
    """The chunk carver reports exactly the known chunk offsets of the test image."""
    # offsets gathered empirically from the test image
    known_chunk_offsets = {
        0xc7f000,
        0xf0e000,
        0x1374f20,
        0x70cc000,
        0xd727440,
        0xdfe7000,
        0x18851080,
        0x1c31d000,
        0x20b362c0,
        0x276f8000,
        0x2833e000,
        0x28b4e000,
        0x28b68000,
        0x28d5e000,
        0x28ead000,
        0x2986e000,
        0x2998c000,
        0x29a9c000,
        0x2ff30000,
        0x2ffd0000,
        0x3070f000,
        0x30c1f000,
        0x30c8f000,
        0x30dbf000,
        0x30f2f000,
        0x30fff000,
        0x3126f000,
        0x328eac10,
        0x34b75000,
        0x38835000,
        0x39981910,
        0x39cc07a0,
        0x3b91b000,
    }

    carved_chunk_offsets = set(evtxtract.carvers.find_evtx_chunks(image_mmap))
    assert known_chunk_offsets == carved_chunk_offsets


def first(s):
    """Return the first item produced by iterating `s`, or None if it yields nothing."""
    return next(iter(s), None)


def test_extract_records(image_mmap):
    """Records carved from every chunk match the known offsets and event ids."""
    # record offsets gathered empirically from the test image
    known_offsets = {
        0xf0e200,
        0x70cc200,
        0x70cca30,
        0x1c31d200,
        0x1c31d858,
        0x20b364c0,
        0x20b36b80,
        0x276f8200,
        0x276f88c0,
        0x29a9c200,
        0x30dbf200,
        0x30dbf8c8,
        0x30dbfb68,
        0x30dbfde8,
        0x34b75200,
        0x34b758a0,
        0x3b91b200,
    }

    # event ids gathered empirically from the test image
    known_eids = {
        1,
        2,
        5,
        21,
        22,
        100,
        306,
        823,
        1001,
        1002,
        1006,
        1009,
        1020,
    }

    seen_offsets = set()
    seen_eids = set()
    for chunk_start in evtxtract.carvers.find_evtx_chunks(image_mmap):
        chunk_records = evtxtract.carvers.extract_chunk_records(image_mmap, chunk_start)
        for record in chunk_records:
            seen_offsets.add(record.offset)
            seen_eids.add(record.eid)

    assert known_offsets == seen_offsets
    assert known_eids == seen_eids


def test_extract_templates(image_mmap):
    """Templates carved from every chunk have exactly the known template ids."""
    # template ids gathered empirically from the test image
    known_template_ids = {
        "1-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|8|n]",
        "2-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|8|n]",
        "21-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|1|n]-[18|8|n]-[19|1|n]",
        "22-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|1|n]-[18|8|n]-[19|1|n]",
        "5-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|1|n]-[18|1|n]",
        "100-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|8|n]-[18|1|n]",
        "306-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]",
        "823-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|8|n]-[18|1|n]-[19|1|n]-[20|20|n]-[21|1|n]",
        "1001-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]",
        "1002-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]",
        "1006-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|8|n]-[18|13|n]-[19|13|n]",
        "1009-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|1|n]-[18|8|n]-[19|8|n]",
        "1020-[0|4|c]-[1|4|c]-[2|6|c]-[3|6|c]-[4|6|c]-[5|21|c]-[6|17|c]-[7|15|c]-[8|8|c]-[9|8|c]-[10|10|c]-[11|4|c]-[12|19|c]-[13|15|c]-[14|1|c]-[15|15|c]-[16|1|c]-[17|1|n]",
    }

    # walk every carved chunk and gather the id of each template found in it
    carved_template_ids = {
        tmpl.get_id()
        for chunk_start in evtxtract.carvers.find_evtx_chunks(image_mmap)
        for tmpl in evtxtract.carvers.extract_chunk_templates(image_mmap, chunk_start)
    }

    assert known_template_ids == carved_template_ids


def test_find_records(image_mmap):
    """The record carver yields the known first offset, last offset and record count."""
    record_offsets = list(evtxtract.carvers.find_evtx_records(image_mmap))

    first_offset = record_offsets[0]
    assert first_offset == 0x317198

    last_offset = record_offsets[-1]
    assert last_offset == 0x3D706A88

    total = len(record_offsets)
    assert total == 1674


def test_evtxtract(image_mmap):
    """End-to-end extraction yields the known counts of complete and incomplete records."""
    complete_total = 0
    incomplete_total = 0

    for result in evtxtract.extract(image_mmap):
        if isinstance(result, evtxtract.CompleteRecord):
            complete_total += 1
            continue

        if isinstance(result, evtxtract.IncompleteRecord):
            incomplete_total += 1
            continue

        # anything else is not one of the two result types this test knows about
        raise RuntimeError('unexpected return type')

    assert complete_total == 52
    assert incomplete_total == 1615