Repository: davisjam/safe-regex
Branch: master
Commit: 9041d808830a
Files: 16
Total size: 19.5 KB

Directory structure:
gitextract_snszas4p/

├── .github/
│   └── workflows/
│       └── npm-publish.yml
├── .gitignore
├── .npmignore
├── .travis.yml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── babel.config.json
├── bin/
│   └── cli.js
├── example/
│   └── safe.js
├── package.json
└── src/
    ├── analyzer-family.js
    ├── analyzer.js
    ├── heuristic-analyzer.js
    ├── index.js
    └── test/
        └── regex.spec.js

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/npm-publish.yml
================================================
# This workflow runs the tests using node and then publishes the package to both npm and GitHub Packages when a release is created
# For more information see: https://help.github.com/actions/language-and-framework-guides/publishing-nodejs-packages

name: Node.js Package

on:
  release:
    types: [created]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-node@v1
        with:
          node-version: 12
      - run: npm ci
      - run: npm test

  publish-npm:
    needs: build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-node@v1
        with:
          node-version: 12
          registry-url: https://registry.npmjs.org/
      - run: npm ci
      - run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{secrets.npm_token}}

  publish-gpr:
    needs: build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-node@v1
        with:
          node-version: 12
          registry-url: https://npm.pkg.github.com/
      - run: npm ci
      - run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{secrets.GITHUB_TOKEN}}


================================================
FILE: .gitignore
================================================
.vscode
coverage
node_modules/
package-lock.json
dist/


================================================
FILE: .npmignore
================================================
.vscode
coverage
src/
dist/test/


================================================
FILE: .travis.yml
================================================
sudo: required

language: node_js

node_js:
  - "10"
  - "12"

install:
  - npm install

script:
  - npm test

cache:
  directories:
    - node_modules


================================================
FILE: CHANGELOG.md
================================================
# v2

## v2.0

### v2.0.0

1. Update README.
2. Switch AST library from ret to regexp-tree.
3. Fix incorrect handling of nested quantifiers in disjunctions.
4. Enhance test suite.

Contributors:
- [davisjam](https://github.com/davisjam)

# v1

This is the historic release.

Contributors:
- [substack](https://github.com/substack)


================================================
FILE: LICENSE
================================================
Copyright 2019-present is held by the authors of the safe-regex module.

This software is released under the MIT license:

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Original author: James Halliday @substack
Maintainer: James C. (Jamie) Davis @davisjam


================================================
FILE: README.md
================================================
# safe-regex

Detect potentially
[catastrophic](http://regular-expressions.mobi/catastrophic.html)
[exponential-time](http://perlgeek.de/blog-en/perl-tips/in-search-of-an-exponetial-regexp.html)
regular expressions by limiting the
[star height](https://en.wikipedia.org/wiki/Star_height) to 1.

WARNING: This module has both false positives and false negatives.
Use [vuln-regex-detector](https://github.com/davisjam/vuln-regex-detector) for improved accuracy.

[![Build Status](https://travis-ci.org/davisjam/safe-regex.svg?branch=master)](https://travis-ci.org/davisjam/safe-regex)

## Example

Suppose you have a script named `safe.js`:

``` js
var safe = require('safe-regex');
var regex = process.argv.slice(2).join(' ');
console.log(safe(regex));
```

This is its behavior:

```
$ node safe.js '(x+x+)+y'
false
$ node safe.js '(beep|boop)*'
true
$ node safe.js '(a+){10}'
false
$ node safe.js '\blocation\s*:[^:\n]+\b(Oakland|San Francisco)\b'
true
```

## Methods

``` js
const safe = require('safe-regex')
```

### const ok = safe(re, opts={})

Returns a boolean `ok` indicating whether the regex `re` is considered safe,
i.e. not possibly catastrophic.

`re` can be a `RegExp` object or a string. Any other value is converted to a string first.

If `re` cannot be compiled into a valid regex, `safe` returns `false`.

* `opts.limit` - maximum number of allowed repetitions in the entire regex.
Default: `25`.

## Install

With [npm](https://npmjs.org) do:

```
npm install safe-regex
```

## Resources

### What should I do if my project has a super-linear regex?

1. Confirm that it is *reachable* by untrusted input.
2. If it is, you can consider whether you can prevent worst-case behavior by trimming the input, revising the regex, or replacing the regex with another algorithm like string functions. For examples, see Table 5 in [this article](http://people.cs.vt.edu/davisjam/downloads/publications/DavisCoghlanServantLee-EcosystemREDOS-ESECFSE18.pdf).
3. If none of those solutions looks feasible, you might also consider changing regex engines. The [RE2 bindings](https://www.npmjs.com/package/re2) might work, though test carefully to confirm there are no [semantic portability problems](https://medium.com/@davisjam/why-arent-regexes-a-lingua-franca-esecfse19-a36348df3a2?source=friends_link&sk=d21be7f8f723e2080dc993385c6973d1).

### Further reading

The following documents may be edifying:

- [Research brief on the extent of super-linear regexes in practice](https://medium.com/@davisjam/introduction-987fdc4c7b0?source=friends_link&sk=ceefa4a4ca9617e08ab782c3b1580aea)
- [Research brief on the variability of super-linear regex behavior across programming languages](https://medium.com/@davisjam/why-arent-regexes-a-lingua-franca-esecfse19-a36348df3a2?source=friends_link&sk=d21be7f8f723e2080dc993385c6973d1)
- [Comparing regex matching algorithms](https://swtch.com/~rsc/regexp/regexp1.html)

## Project policies

### Versioning

This project follows [Semantic Versioning 2.0 (semver)](https://semver.org/).

Here are the project-specific meanings of MAJOR, MINOR, and PATCH updates:

- MAJOR: "Incompatible" API changes were introduced. There are two types in this module:
  - Changes that modify the interface
  - Changes that cause any regexes to be marked as unsafe that were formerly marked as safe
- MINOR: Functionality was added in a backwards-compatible manner. There are two types in this module:
  - Refactoring the analyses but not changing their results
  - Modifying the analyses to reduce false positives, without affecting negatives (false or true)
- PATCH: I don't anticipate using PATCH for this module

### License

[MIT](https://github.com/davisjam/safe-regex/blob/master/LICENSE)

================================================
FILE: babel.config.json
================================================
{
    "presets": [
        [
            "@babel/preset-env",
            {
                "targets": {
                    "edge": "17",
                    "firefox": "60",
                    "chrome": "67",
                    "safari": "11.1",
                    "ie": "11"
                }
            }
        ]
    ]
}

================================================
FILE: bin/cli.js
================================================
#!/usr/bin/env node
// CLI for querying safe-regex for safety analysis
// Input: JSON-formatted file with an object with key 'pattern'
// Output: STDOUT: the input object as JSON, with new keys 'canAnalyze' (0 or 1) and 'isSafe' (0, 1, or 'unknown')
//         STDERR: Progress updates

const safe = require('../');
const fs = require('fs');

// Exactly one positional argument (the JSON file) is expected.
if (process.argv.length !== 3) {
  console.error(`Usage: ${process.argv[1]} pattern.json`);
  process.exit(1);
}
const file = process.argv[2];

// Get pattern
const cont = fs.readFileSync(file, 'utf-8');
const pattern = JSON.parse(cont);

// Can analyze? Is safe?
// If the analysis itself throws, report canAnalyze = 0 and isSafe = 'unknown'
// instead of crashing.
let canAnalyze = 0;
let isSafe = 0;
try {
  isSafe = safe(pattern.pattern) ? 1 : 0;
  canAnalyze = 1;
} catch (e) {
  canAnalyze = 0;
  isSafe = 'unknown';
}

// The verdict is attached to the parsed input object so that any other keys
// supplied by the caller are echoed back unchanged.
pattern.canAnalyze = canAnalyze;
pattern.isSafe = isSafe;

// Emit: progress on STDERR, machine-readable result on STDOUT.
console.error(`Pattern /${pattern.pattern}/: canAnalyze ${pattern.canAnalyze} isSafe ${pattern.isSafe}`);
console.log(JSON.stringify(pattern));
process.exit(0);


================================================
FILE: example/safe.js
================================================
// Usage: node safe.js <regex...>
// Joins all command-line arguments with single spaces into one pattern and
// prints the boolean returned by safe-regex for it.
const safeRegex = require('../');

const pattern = process.argv.slice(2).join(' ');
const verdict = safeRegex(pattern);
console.log(verdict);


================================================
FILE: package.json
================================================
{
  "name": "safe-regex",
  "version": "2.1.2",
  "description": "detect possibly catastrophic, exponential-time regular expressions",
  "main": "dist/index.js",
  "dependencies": {
    "babel-jest": "^26.3.0",
    "regexp-tree": "~0.1.1"
  },
  "devDependencies": {
    "@babel/cli": "^7.11.6",
    "@babel/core": "^7.11.6",
    "@babel/preset-env": "^7.11.5",
    "jest": "^24.9.0"
  },
  "scripts": {
    "build": "babel src -d dist",
    "prepublishOnly": "npm run build",
    "test": "jest src/*",
    "test-artifact": "jest dist/*"
  },
  "jest": {
    "moduleFileExtensions": [
      "js"
    ],
    "testRegex": "test.*\\.spec\\.js$",
    "collectCoverage": true,
    "coverageReporters": [
      "text-summary",
      "html",
      "lcov"
    ],
    "collectCoverageFrom": [
      ".js"
    ],
    "coverageThreshold": {
      "global": {
        "statements": 100,
        "branches": 100,
        "functions": 100,
        "lines": 100
      }
    }
  },
  "repository": {
    "type": "git",
    "url": "git://github.com/davisjam/safe-regex.git"
  },
  "homepage": "https://github.com/davisjam/safe-regex",
  "keywords": [
    "catastrophic",
    "exponential",
    "regex",
    "safe",
    "sandbox"
  ],
  "author": {
    "name": "James C. (Jamie) Davis",
    "email": "davisjam@vt.edu",
    "url": "http://people.cs.vt.edu/~davisjam"
  },
  "license": "MIT"
}


================================================
FILE: src/analyzer-family.js
================================================
// The family of analyzer classes, exported as an array.
// Each entry is an Analyzer subclass (a constructor, not an instance).
const HeuristicAnalyzer = require("./heuristic-analyzer");

module.exports = [HeuristicAnalyzer];


================================================
FILE: src/analyzer.js
================================================
/**
 * Generic options handed to every analyzer.
 */
class AnalyzerOptions {
  /**
   * @param {number} repetitionLimit - Stored as `heuristic_replimit`; the
   *   repetition limit consulted by the heuristic analyzer.
   */
  constructor(repetitionLimit) {
    this.heuristic_replimit = repetitionLimit;
  }
}

/**
 * Plain holder for the two parts of an attack string.
 * NOTE(review): the expected shape of `prefixAndPumpList` and `suffix` is not
 * shown anywhere in this file — confirm against the analyzers that produce one.
 */
class AttackString {
  /**
   * @param {*} prefixAndPumpList - Stored as-is on the instance.
   * @param {*} suffix - Stored as-is on the instance.
   */
  constructor(prefixAndPumpList, suffix) {
    this.prefixAndPumpList = prefixAndPumpList;
    this.suffix = suffix;
  }
}

/**
 * Abstract base class for analyzers.
 *
 * Subclasses are expected to override both `isVulnerable` and
 * `genAttackString`; the implementations here are placeholders that report
 * "not vulnerable" and "no attack string".
 */
class Analyzer {
  /**
   * @param {AnalyzerOptions} analyzerOptions - Kept on the instance as `options`.
   */
  constructor(analyzerOptions) {
    this.options = analyzerOptions;
  }

  /**
   * Decide whether a regex is vulnerable. Subclasses must implement this.
   *
   * @param {RegExp} regExp - The regex to inspect (ignored by this base version).
   * @returns {boolean} Always `false` in the base class.
   */
  isVulnerable(regExp) {
    return false;
  }

  /**
   * Produce an attack string for a regex. Subclasses must implement this.
   *
   * @param {RegExp} regExp - The regex to inspect (ignored by this base version).
   * @returns {AttackString|null} Always `null` in the base class.
   */
  genAttackString(regExp) {
    return null;
  }
}

module.exports = function(re, replimit) {
  // Build an AST
  let myRegExp = null;
  let ast = null;
  try {
    // Construct a RegExp object
    if (re instanceof RegExp) {
      myRegExp = re;
    } else if (typeof re === "string") {
      myRegExp = new RegExp(re);
    } else {
      myRegExp = new RegExp(String(re));
    }

    // Build an AST
    ast = regexpTree.parse(myRegExp);
  } catch (err) {
    // Invalid or unparseable input
    return false;
  }

  let currentStarHeight = 0;
  let maxObservedStarHeight = 0;

  let repetitionCount = 0;

  regexpTree.traverse(ast, {
    Repetition: {
      pre({ node }) {
        repetitionCount++;

        currentStarHeight++;
        if (maxObservedStarHeight < currentStarHeight) {
          maxObservedStarHeight = currentStarHeight;
        }
      },

      post({ node }) {
        currentStarHeight--;
      }
    }
  });

  return maxObservedStarHeight <= 1 && repetitionCount <= replimit;
};

module.exports = {
  "AnalyzerOptions": AnalyzerOptions,
  "Analyzer": Analyzer,
};


================================================
FILE: src/heuristic-analyzer.js
================================================
// Exports an Analyzer subclass based on two simple heuristics.

const regexpTree = require("regexp-tree");
const analyzer = require("./analyzer");

/**
 * Flags a regex as vulnerable when either
 *   1. its star height (deepest nesting of Repetition nodes) exceeds 1, or
 *   2. it contains more Repetition nodes than `options.heuristic_replimit`.
 */
class HeuristicAnalyzer extends analyzer.Analyzer {
  /**
   * @param {RegExp} regExp - The regex to inspect.
   * @returns {boolean} `true` if either heuristic fires, otherwise `false`.
   */
  isVulnerable(regExp) {
    // Heuristic #1: Star height > 1
    if (this._measureStarHeight(regExp) > 1) {
      return true;
    }

    // Heuristic #2: # repetitions > limit
    // TODO This is a poor heuristic
    return this._measureRepetitions(regExp) > this.options.heuristic_replimit;
  }

  /**
   * This analyzer does not generate attack strings.
   *
   * @param {RegExp} regExp - Ignored.
   * @returns {null} Always `null`.
   */
  genAttackString(regExp) {
    return null;
  }

  /**
   * Walk the AST and track how deeply Repetition nodes are nested.
   *
   * @param {RegExp} regExp - The regex to parse and measure.
   * @returns {number} The maximum nesting depth observed (0 if there are no
   *   repetitions).
   */
  _measureStarHeight(regExp) {
    let depth = 0;
    let deepest = 0;

    const tree = regexpTree.parse(regExp);

    regexpTree.traverse(tree, {
      Repetition: {
        // Entering a repetition: one level deeper.
        pre() {
          depth += 1;
          deepest = Math.max(deepest, depth);
        },

        // Leaving a repetition: back up one level.
        post() {
          depth -= 1;
        },
      },
    });

    return deepest;
  }

  /**
   * Count every Repetition node in the AST.
   *
   * @param {RegExp} regExp - The regex to parse and measure.
   * @returns {number} The total number of Repetition nodes.
   */
  _measureRepetitions(regExp) {
    let total = 0;

    const tree = regexpTree.parse(regExp);
    regexpTree.traverse(tree, {
      Repetition: {
        pre() {
          total += 1;
        },
      },
    });

    return total;
  }
}

module.exports = HeuristicAnalyzer;


================================================
FILE: src/index.js
================================================
const analyzer = require('./analyzer');
const analyzerFamily = require('./analyzer-family');

// Repetition limit used by buildArgs() when the caller does not set `opts.limit`.
const DEFAULT_SAFE_REP_LIMIT = 25;
// Values returned by safeRegex(): `true` when no analyzer flags the regex,
// `false` when at least one analyzer reports it as vulnerable.
const RET_IS_SAFE = true;
const RET_IS_VULNERABLE = false;

/**
 * Bundle of everything the analyzers need: the compiled regex plus the
 * options to construct each analyzer with.
 */
class Args {
  /**
   * @param {RegExp} regExp - The regex under analysis.
   * @param {AnalyzerOptions} analyzerOptions - Options passed to each analyzer.
   */
  constructor(regExp, analyzerOptions) {
    this.regExp = regExp;
    this.analyzerOptions = analyzerOptions;
  }
}

/**
 * Decide whether a regex is safe according to the analyzer family.
 *
 * @param {RegExp|string|*} re - The regex to check; buildArgs() turns
 *   non-RegExp values into a RegExp.
 * @param {Object} [opts] - Optional settings; `opts.limit` overrides the
 *   default repetition limit.
 * @returns {boolean} `RET_IS_SAFE` if no analyzer flags the regex,
 *   `RET_IS_VULNERABLE` if any analyzer does or if the input cannot be
 *   turned into a regex.
 */
function safeRegex(re, opts) {
  try {
    const verdicts = askAnalyzersIfVulnerable(buildArgs(re, opts));

    // A single "vulnerable" vote is enough to reject the regex.
    const anyVulnerable = verdicts.some((verdict) => Boolean(verdict));
    return anyVulnerable ? RET_IS_VULNERABLE : RET_IS_SAFE;
  } catch (err) {
    // Invalid or unparseable input
    return RET_IS_VULNERABLE;
  }
}

/**
 * Normalise the caller's input into an Args bundle.
 *
 * @param {RegExp|string|*} re - A RegExp is used as-is; anything else is
 *   converted with String() and compiled.
 * @param {Object} [opts] - Optional settings. `opts.limit` is used as the
 *   repetition limit unless it is `undefined`, in which case
 *   DEFAULT_SAFE_REP_LIMIT applies. A falsy `opts` is treated as `{}`.
 * @returns {Args} The compiled regex together with the analyzer options.
 * @throws {SyntaxError} If the pattern cannot be compiled by RegExp.
 */
function buildArgs(re, opts) {
  // Analyzer options come first, exactly as before.
  const settings = opts ? opts : {};
  const repetitionLimit =
    settings.limit !== undefined ? settings.limit : DEFAULT_SAFE_REP_LIMIT;
  const analyzerOptions = new analyzer.AnalyzerOptions(repetitionLimit);

  // String(re) leaves a string untouched, so strings and other non-RegExp
  // values can share one path.
  const regExp = re instanceof RegExp ? re : new RegExp(String(re));

  return new Args(regExp, analyzerOptions);
}

/**
 * Ask every analyzer in the family whether the regex is vulnerable.
 *
 * @param {Args} args - The regex and the options used to build each analyzer.
 * @returns {Array} One answer per analyzer, in family order. An analyzer that
 *   throws contributes `false` (i.e. "not vulnerable").
 */
function askAnalyzersIfVulnerable(args) {
  return analyzerFamily.map((AnalyzerClass) => {
    try {
      const instance = new AnalyzerClass(args.analyzerOptions);
      return instance.isVulnerable(args.regExp);
    } catch (err) {
      /* istanbul ignore next */ // No need to worry about code coverage here.
      return false;
    }
  });
}

// Export

module.exports = safeRegex;

================================================
FILE: src/test/regex.spec.js
================================================
const safeRegex = require("../");

const REPETITION_LIMIT = 25;
const REPETITION_TOO_MUCH = REPETITION_LIMIT + 1;

// TODO Named character classes

test("The full set of JS regex features are supported", () => {
  /**
   * A list of linear-time regexes using a reasonably exhaustive
   * set of the supported JS regex features.
   * cf. https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
   *
   * The purpose of this list is to check for
   * full regex syntax support.
   */
  const diverseLinTimeRegexes = [
    /* Truly Regular Expressions */

    // Concatenation
    /a/,
    /abc/,

    // Simple zero-width assertions
    /^a/,
    /a$/,

    // Quantifiers
    /^a*/,
    /^a+/,
    /a?/,
    /x{5}/,
    /x{5,}/,
    /x{5,10}/,

    // Grouping constructs
    /(x)/,
    /(?:x)/,

    // Disjunction
    /x|y/,

    // Built-in character classes
    /.\./,
    /[\b]/,
    /\b/,
    /\B/,
    /\cA/,
    /\d/,
    /\D/,
    /\f/,
    /\n/,
    /\r/,
    /\s/,
    /\S/,
    /\t/,
    /\v/,
    /\w/,
    /\W/,
    /\0/,
    /\x00/,
    /\u0000/,
    /\u{0000}/u,

    // Custom character classes
    /[xyz]/,
    /[x-z]/,
    /[^xyz]/,
    /[^x-z]/,

    /* Extended features */

    // Backreferences
    /(x) \1/,

    // Lookaround assertions
    /x(?=y)/,
    /x(?!y)/,

    /* Added in ECMAScript 2018 */

    // Lookbehind assertions: positive `(?<=…)` and negative `(?<!…)`
    /(?<=y)x/,
    /(?<!y)x/,

    // Named capture groups
    /(?<year>\d{4})/,
    /(?<year>a)\k<year>/,
    // Tests related to bug #26
    /(?<year>test?)/,
    /(?<year>test*)/,
    /(?<year>test)*/,
  ];

  diverseLinTimeRegexes.forEach(re => {
    expect(safeRegex(re)).toBe(true);
  });
});

test("linear-time regexes are safe", () => {
  // Inputs that must never be flagged: each one is a false positive if rejected.
  const safeInputs = [
    // Standard regex features
    /a/,
    /a*/,
    /^a*/,
    /^a+$/,
    /a+$/,
    /a?/,
    /a{3,5}/,
    /a|b/,
    /(ab)/,
    /(ab)\1/,
    /\bOakland\b/,
    /^((a+)|(b+))/,
    /(a+)|(b+)/,

    // A regex supplied as a string
    "^foo/bar",

    // Neither a RegExp nor a string
    1,
  ];

  for (const input of safeInputs) {
    expect(safeRegex(input)).toBe(true);
  }
});

test("linear-time regexes are safe, under varying repetition limits", () => {
  // Exactly REPETITION_LIMIT quantifiers: right at the default limit, still safe.
  const re1 = RegExp("a?".repeat(REPETITION_LIMIT) + "a".repeat(REPETITION_LIMIT));
  expect(safeRegex(re1)).toBe(true);

  // Exactly LOW_REPETITION_LIMIT quantifiers against a caller-supplied limit.
  // (Array(n).join("a?") would only produce n - 1 of them and miss the boundary.)
  const LOW_REPETITION_LIMIT = 3;
  const re2 = RegExp("a?".repeat(LOW_REPETITION_LIMIT));

  expect(safeRegex(re2, { limit: LOW_REPETITION_LIMIT })).toBe(true);
});

test("poly-time regexes are safe (at least according to our heuristics)", () => {
  // QOA = "quantifier overlapping adjacency". The current heuristics accept
  // all of these.
  const acceptedPolyTime = [
    /^a+a+$/,        // QOA
    /^a+aa+$/,       // QOA with obvious intermediate run
    /^a+aaaa+$/,     // QOA with obvious intermediate run
    /^a+[a-z]a+$/,   // QOA with obvious intermediate run
    /^a+\wa+$/,      // QOA with intermediate character class
    /^a+(\w|\d)a+$/, // QOA with valid path through
    /^a+b?a+$/,      // QOA with valid path through
    /^a+(cde)*a+$/,  // QOA with valid path through
    /^.*a*$/,        // QOA by subset
    /^\w*\d*$/,      // QOA by intersection
    /^\S+@\S+\.\S+$/, // Example from Django
    /a+$/,           // QOA under partial-match
    /abc.*$/,        // QOA under partial-match
    // TODO It would be nice to have one of the regexes that are poly-time even when they match, due to non-greedy quantifiers (p-NFA)
  ];

  for (const pattern of acceptedPolyTime) {
    expect(safeRegex(pattern)).toBe(true);
  }
});

test("exp-time regexes due to star height are unsafe", () => {
  // Every entry nests one quantifier inside another, so it must be rejected.
  const nestedQuantifiers = [
    // Straightforward star height
    /(a*)*$/,
    /(a?)*$/,
    /(a*)?$/,
    /(a*)+$/,
    /(a+)*$/,
    /(\wa+)*$/, // Prefix
    /(\..*)*$/, // Suffix

    // Branching and nesting.
    /(a*|b)+$/,
    /(a|b*)+$/,
    /(((b*)))+$/,
    /(((b*))+)$/,
    /(((b*)+))$/,
    /(((b)*)+)$/,
    /(((b)*))+$/,

    // Misc. more complex cases
    /^(a?){25}(a){25}$/,
    /(x+x+)+y/,
    /foo|(x+x+)+y/,
    /(a+){10}y/,
    /(a+){2}y/,
    /(.*){1,32000}[bc]/,

    // A regex supplied as a string
    "(a+)+",
  ];

  for (const pattern of nestedQuantifiers) {
    expect(safeRegex(pattern)).toBe(false);
  }
});

test("linear-time regexes with star height > 1", () => {
  // TODO These are false positives, Fix once we improve analysis
  const knownFalsePositives = [
    /(ab*)+$/,
    /(b*a)+$/,
  ];

  for (const pattern of knownFalsePositives) {
    expect(safeRegex(pattern)).toBe(false);
  }
});

test("exp-time regexes due to disjunction are safe (according to current heuristics)", () => {
  // TODO These are false negatives. Fix once we improve analysis
  const knownFalseNegatives = [
    /(a|a)*$/,       // QOD: obvious
    /(a|\w)*$/,      // QOD due to overlap
    /([abc]|b)*$/,   // QOD due to overlap
    /(\w\w\w|bab)*$/, // QOD due to overlap, with multi-step internal paths
  ];

  for (const pattern of knownFalseNegatives) {
    expect(safeRegex(pattern)).toBe(true);
  }
});

test("regex that exceeds repetition limit is unsafe", () => {
  // One quantifier more than the default limit allows.
  const optionalRun = "a?".repeat(REPETITION_TOO_MUCH);
  const literalRun = "a".repeat(REPETITION_TOO_MUCH);
  const overDefaultLimit = RegExp(optionalRun + literalRun);
  expect(safeRegex(overDefaultLimit)).toBe(false);

  // One quantifier more than a caller-supplied limit allows.
  const LOW_REPETITION_LIMIT = 3;
  const overCustomLimit = RegExp("a?".repeat(LOW_REPETITION_LIMIT + 1));
  expect(safeRegex(overCustomLimit, { limit: LOW_REPETITION_LIMIT })).toBe(false);
});

test("invalid regexes default to unsafe", () => {
  // Patterns expected to be rejected as invalid; each must be reported unsafe.
  const malformedPatterns = [
    "(a+",
    "[a-z",
    "*Oakland*",
    "hey(yoo) )",
    "abcde(?>hellow)",
    "[abc",
  ];

  for (const pattern of malformedPatterns) {
    expect(safeRegex(pattern)).toBe(false);
  }
});