Repository: netgen/query-translator Branch: master Commit: 9f42602061cf Files: 72 Total size: 334.1 KB Directory structure: gitextract_wpaijre9/ ├── .gitattributes ├── .github/ │ └── workflows/ │ └── tests.yml ├── .gitignore ├── .php_cs.dist ├── LICENSE ├── README.md ├── composer.json ├── lib/ │ ├── Languages/ │ │ └── Galach/ │ │ ├── Generators/ │ │ │ ├── Common/ │ │ │ │ ├── Aggregate.php │ │ │ │ └── Visitor.php │ │ │ ├── ExtendedDisMax.php │ │ │ ├── Lucene/ │ │ │ │ ├── Common/ │ │ │ │ │ ├── Group.php │ │ │ │ │ ├── LogicalAnd.php │ │ │ │ │ ├── LogicalNot.php │ │ │ │ │ ├── LogicalOr.php │ │ │ │ │ ├── Mandatory.php │ │ │ │ │ ├── Phrase.php │ │ │ │ │ ├── Prohibited.php │ │ │ │ │ ├── Query.php │ │ │ │ │ ├── Tag.php │ │ │ │ │ ├── User.php │ │ │ │ │ └── WordBase.php │ │ │ │ ├── ExtendedDisMax/ │ │ │ │ │ └── Word.php │ │ │ │ └── QueryString/ │ │ │ │ └── Word.php │ │ │ ├── Native/ │ │ │ │ ├── BinaryOperator.php │ │ │ │ ├── Group.php │ │ │ │ ├── Phrase.php │ │ │ │ ├── Query.php │ │ │ │ ├── Tag.php │ │ │ │ ├── UnaryOperator.php │ │ │ │ ├── User.php │ │ │ │ └── Word.php │ │ │ ├── Native.php │ │ │ └── QueryString.php │ │ ├── Parser.php │ │ ├── README.md │ │ ├── SYNTAX.md │ │ ├── TokenExtractor/ │ │ │ ├── Full.php │ │ │ └── Text.php │ │ ├── TokenExtractor.php │ │ ├── Tokenizer.php │ │ └── Values/ │ │ ├── Node/ │ │ │ ├── Group.php │ │ │ ├── LogicalAnd.php │ │ │ ├── LogicalNot.php │ │ │ ├── LogicalOr.php │ │ │ ├── Mandatory.php │ │ │ ├── Prohibited.php │ │ │ ├── Query.php │ │ │ └── Term.php │ │ └── Token/ │ │ ├── GroupBegin.php │ │ ├── Phrase.php │ │ ├── Tag.php │ │ ├── User.php │ │ └── Word.php │ ├── Parsing.php │ ├── Tokenizing.php │ └── Values/ │ ├── Correction.php │ ├── Node.php │ ├── SyntaxTree.php │ ├── Token.php │ └── TokenSequence.php ├── phpunit.xml └── tests/ ├── Galach/ │ ├── Generators/ │ │ ├── AggregateVisitorDispatchTest.php │ │ ├── ExtendedDisMaxTest.php │ │ ├── LuceneVisitorDispatchTest.php │ │ ├── NativeVisitorDispatchTest.php │ │ └── QueryStringTest.php │ ├── 
IntegrationTest.php │ ├── Tokenizer/ │ │ ├── FullTokenizerTest.php │ │ ├── TextTokenizerTest.php │ │ └── TokenExtractorTest.php │ └── Values/ │ └── NodeTraversalTest.php └── bootstrap.php ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ /tests export-ignore ================================================ FILE: .github/workflows/tests.yml ================================================ name: Tests on: push: branches: ['master'] pull_request: jobs: tests: name: PHP ${{ matrix.php }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: - php: '7.0' coverage: none - php: '7.1' coverage: none - php: '7.2' coverage: none - php: '7.3' coverage: none - php: '7.4' coverage: none - php: '8.0' coverage: none - php: '8.1' coverage: none - php: '8.2' coverage: xdebug upload_coverage: true - php: '8.3' coverage: none - php: '8.4' coverage: none - php: '8.5' coverage: none steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup PHP uses: shivammathur/setup-php@v2 with: php-version: ${{ matrix.php }} coverage: ${{ matrix.coverage }} - name: Configure Composer (public deps) run: | composer config -g github-protocols https composer config -g use-github-api false - name: Composer version run: composer --version - name: Validate composer.json run: composer validate --strict - name: Install dependencies run: composer update --prefer-dist - name: Run PHPUnit (no coverage) if: ${{ !matrix.upload_coverage }} run: vendor/bin/phpunit -c phpunit.xml --colors=always - name: Run PHPUnit with coverage if: ${{ matrix.upload_coverage }} run: vendor/bin/phpunit -c phpunit.xml --colors=always --coverage-clover=coverage.xml - name: Upload coverage to Codecov if: ${{ matrix.upload_coverage }} uses: codecov/codecov-action@v4 with: files: ./coverage.xml flags: all 
fail_ci_if_error: true env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} ================================================ FILE: .gitignore ================================================ composer.lock .php_cs.cache .phpunit.result.cache /vendor/ ================================================ FILE: .php_cs.dist ================================================ in(__DIR__) ->exclude([]) ->files()->name('*.php') ; return PhpCsFixer\Config::create() ->setRules([ '@Symfony' => true, '@Symfony:risky' => true, 'concat_space' => ['spacing' => 'one'], 'array_syntax' => ['syntax' => 'short'], 'simplified_null_return' => false, 'phpdoc_align' => false, 'phpdoc_separation' => false, 'phpdoc_to_comment' => false, 'no_useless_else' => true, 'no_useless_return' => true, 'ordered_class_elements' => true, 'ordered_imports' => true, 'cast_spaces' => false, 'blank_line_after_opening_tag' => false, 'single_blank_line_before_namespace' => false, 'phpdoc_annotation_without_dot' => false, 'phpdoc_no_alias_tag' => false, 'space_after_semicolon' => false, ]) ->setRiskyAllowed(true) ->setFinder($finder) ; ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 Petar Španja Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Query Translator [![Build Status](https://img.shields.io/github/actions/workflow/status/netgen/query-translator/tests.yml?branch=master&style=flat-square)](https://github.com/netgen/query-translator/actions?query=workflow%3ATests) [![Code Coverage](https://img.shields.io/codecov/c/github/netgen/query-translator.svg?style=flat-square)](https://codecov.io/gh/netgen/query-translator) [![Downloads](https://img.shields.io/packagist/dt/netgen/query-translator.svg?style=flat-square)](https://packagist.org/packages/netgen/query-translator) [![Latest stable](https://img.shields.io/packagist/v/netgen/query-translator.svg?style=flat-square)](https://packagist.org/packages/netgen/query-translator) [![License](https://img.shields.io/packagist/l/netgen/query-translator.svg?style=flat-square)](https://packagist.org/packages/netgen/query-translator) [![PHP](https://img.shields.io/badge/php-%3E%3D%205.6-8892BF.svg?style=flat-square)](https://secure.php.net/) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/netgen/query-translator) Query Translator takes a search string as user input and converts it into something a search backend can understand. Technically, it's a search query [translator](https://en.wikipedia.org/wiki/Translator_(computing)) with [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) representation. 
From the produced syntax tree, translation target can be anything you need. Usually it's a search backend, like Solr and Elasticsearch, or a database abstraction layer. A set of interfaces for implementing a language processor is provided, with a single implemented language named [Galach](lib/Languages/Galach). Galach implements a syntax that is based on what seems to be the unofficial standard for search query as user input. Quick cheat sheet: `word` `"phrase"` `(group)` `+mandatory` `-prohibited` `AND` `&&` `OR` `||` `NOT` `!` `#tag` `@user` `domain:term` ### Error handling User input means you have to expect errors and handle them gracefully. Because of that, the parser is completely resistant to errors. Syntax tree will contain detailed information about corrections applied to make sense of the user input. This can be useful to clean up the input or implement rich input interface, with features like suggestions, syntax highlighting and error feedback. ### Customization The implementation was made with customization in mind. You can change the special characters which will be used as part of the syntax, pick out elements of the language you want to use, implement your own term clauses, or change how the syntax tree is converted to the target output. ### Some use cases - User-level query language on top of your search backend - Common query language on top of different search backends - Control over options of the query language that is already provided by the search backend - Better error handling than provided by the search backend - Analysis and manipulation of the query before sending to the backend - Customized query language (while remaining within the base syntax) - Implementing rich input interface (with suggestions, syntax highlighting, error feedback) Note: This implementation is intended as a [library](https://en.wikipedia.org/wiki/Library_(computing)), meaning it doesn't try to solve specific use cases for query translation. 
Instead, it's meant to be a base that you can use in implementing such a use case. ### How to use First add the library to your project: ``` composer require netgen/query-translator:^1.0 ``` After that, make use of the features provided out of the box. If those are not enough, use extension points to customize various parts of the translator to fit your needs. See [Galach documentation](lib/Languages/Galach) to find out more. ## Run the demo Demo is available as a separate repository at [netgen/query-translator-demo](https://github.com/netgen/query-translator-demo). Steps for running the demo: 1. Create the demo project using composer `composer create-project netgen/query-translator-demo` 2. Position into the demo project directory `cd query-translator-demo` 3. Start the web server with `src` as the document root `php -S localhost:8005 -t src` 4. Open [http://localhost:8005](http://localhost:8005) in your browser ![Query Translator demo](https://raw.githubusercontent.com/netgen/query-translator-demo/master/src/animation.gif) ================================================ FILE: composer.json ================================================ { "name": "netgen/query-translator", "description": "Query Translator is a search query translator with AST representation", "keywords": [ "search", "query", "tokenizer", "parser", "generator", "translator", "ast", "solr", "edismax", "elasticsearch" ], "type": "library", "homepage": "https://github.com/netgen/query-translator", "license": "MIT", "authors": [ { "name": "Petar Španja", "email": "petar@spanja.info" } ], "require": { "php": "^7.0||^8.0" }, "require-dev": { "phpunit/phpunit": "<10", "symfony/phpunit-bridge": "*", "friendsofphp/php-cs-fixer": "^2.11" }, "autoload": { "psr-4": { "QueryTranslator\\": "lib" } }, "autoload-dev": { "psr-4": { "QueryTranslator\\Tests\\": "tests" } }, "scripts": { "test": [ "./vendor/bin/phpunit" ] }, "extra": { "branch-alias": { "dev-master": "1.0.x-dev" } } } 
================================================ FILE: lib/Languages/Galach/Generators/Common/Aggregate.php ================================================ addVisitor($visitor); } } /** * Add a $visitor to the aggregated collection. * * @param \QueryTranslator\Languages\Galach\Generators\Common\Visitor $visitor */ public function addVisitor(Visitor $visitor) { $this->visitors[] = $visitor; } public function accept(Node $node) { return true; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { foreach ($this->visitors as $visitor) { if ($visitor->accept($node)) { return $visitor->visit($node, $this, $options); } } throw new RuntimeException('No visitor available for ' . get_class($node)); } } ================================================ FILE: lib/Languages/Galach/Generators/Common/Visitor.php ================================================ visitor = $visitor; } /** * Generate query string in Solr Extended DisMax format from the given $syntaxTree. * * @param \QueryTranslator\Values\SyntaxTree $syntaxTree * @param mixed $options * * @return string */ public function generate(SyntaxTree $syntaxTree, $options = null) { return $this->visitor->visit($syntaxTree->rootNode, null, $options); } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/Group.php ================================================ domainFieldMap = $domainFieldMap; } $this->defaultFieldName = $defaultFieldName; } public function accept(Node $node) { return $node instanceof GroupNode; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof GroupNode) { throw new LogicException( 'Implementation accepts instance of Group Node' ); } if ($subVisitor === null) { throw new LogicException('Implementation requires sub-visitor'); } $clauses = []; foreach ($node->nodes as $subNode) { $clauses[] = $subVisitor->visit($subNode, $subVisitor, $options); } $fieldPrefix = 
$this->getSolrFieldPrefix($node->tokenLeft); $clauses = implode(' ', $clauses); return "{$fieldPrefix}({$clauses})"; } /** * Return Solr backend field name prefix for the given $token. * * @param \QueryTranslator\Languages\Galach\Values\Token\GroupBegin $token * * @return string */ private function getSolrFieldPrefix(GroupBegin $token) { if ($token->domain === '') { return ''; } if (isset($this->domainFieldMap[$token->domain])) { return $this->domainFieldMap[$token->domain] . ':'; } return $this->defaultFieldName . ':'; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/LogicalAnd.php ================================================ visit($node->leftOperand, $subVisitor, $options), $subVisitor->visit($node->rightOperand, $subVisitor, $options), ]; return implode(' AND ', $clauses); } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/LogicalNot.php ================================================ visit($node->operand, $subVisitor, $options); return "NOT {$clause}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/LogicalOr.php ================================================ visit($node->leftOperand, $subVisitor, $options), $subVisitor->visit($node->rightOperand, $subVisitor, $options), ]; return implode(' OR ', $clauses); } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/Mandatory.php ================================================ visit($node->operand, $subVisitor, $options); return "+{$clause}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/Phrase.php ================================================ domainFieldMap = $domainFieldMap; } $this->defaultFieldName = $defaultFieldName; } public function accept(Node $node) { return $node instanceof Term && $node->token instanceof PhraseToken; } 
public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof PhraseToken) { throw new LogicException( 'Implementation accepts instance of Phrase Token' ); } $fieldPrefix = $this->getSolrFieldPrefix($token); $phraseEscaped = preg_replace("/([\\{$token->quote}])/", '\\\\$1', $token->phrase); return "{$fieldPrefix}\"{$phraseEscaped}\""; } /** * Return Solr backend field name prefix for the given $token. * * @param \QueryTranslator\Languages\Galach\Values\Token\Phrase $token * * @return string */ private function getSolrFieldPrefix(PhraseToken $token) { if ($token->domain === '') { return ''; } if (isset($this->domainFieldMap[$token->domain])) { return $this->domainFieldMap[$token->domain] . ':'; } return $this->defaultFieldName . ':'; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/Prohibited.php ================================================ visit($node->operand, $subVisitor, $options); return "-{$clause}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/Query.php ================================================ nodes as $subNode) { $clauses[] = $subVisitor->visit($subNode, $subVisitor, $options); } return implode(' ', $clauses); } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/Tag.php ================================================ fieldName = $fieldName; } public function accept(Node $node) { return $node instanceof Term && $node->token instanceof TagToken; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof TagToken) { throw new 
LogicException( 'Implementation accepts instance of Tag Token' ); } $fieldPrefix = $this->fieldName === null ? '' : "{$this->fieldName}:"; return "{$fieldPrefix}{$token->tag}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/User.php ================================================ fieldName = $fieldName; } public function accept(Node $node) { return $node instanceof Term && $node->token instanceof UserToken; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof UserToken) { throw new LogicException( 'Implementation accepts instance of User Token' ); } $fieldPrefix = $this->fieldName === null ? '' : "{$this->fieldName}:"; return "{$fieldPrefix}{$token->user}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/Common/WordBase.php ================================================ domainFieldMap = $domainFieldMap; } $this->defaultFieldName = $defaultFieldName; } public function accept(Node $node) { return $node instanceof Term && $node->token instanceof WordToken; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof WordToken) { throw new LogicException( 'Implementation accepts instance of Word Token' ); } $fieldPrefix = $this->getSolrFieldPrefix($token); $wordEscaped = $this->escapeWord($token->word); return "{$fieldPrefix}{$wordEscaped}"; } /** * Escape special characters in the given word $string. * * @param string $string * * @return string */ abstract protected function escapeWord($string); /** * Return backend field name prefix for the given $token. 
* * @param \QueryTranslator\Languages\Galach\Values\Token\Word $token * * @return string */ private function getSolrFieldPrefix(WordToken $token) { if ($token->domain === '') { return ''; } if (isset($this->domainFieldMap[$token->domain])) { return $this->domainFieldMap[$token->domain] . ':'; } return $this->defaultFieldName . ':'; } } ================================================ FILE: lib/Languages/Galach/Generators/Lucene/ExtendedDisMax/Word.php ================================================ |\\<|!|\\(|\\)|\\{|}|\\[|]|\\^|"|~|\\*|\\?|:|\\/|\\\\| )/', '\\\\$1', $string ); } } ================================================ FILE: lib/Languages/Galach/Generators/Native/BinaryOperator.php ================================================ visit($node->leftOperand, $subVisitor, $options), $subVisitor->visit($node->rightOperand, $subVisitor, $options), ]; return implode(" {$node->token->lexeme} ", $clauses); } } ================================================ FILE: lib/Languages/Galach/Generators/Native/Group.php ================================================ nodes as $subNode) { $clauses[] = $subVisitor->visit($subNode, $subVisitor, $options); } $clauses = implode(' ', $clauses); $domainPrefix = $node->tokenLeft->domain === '' ? '' : "{$node->tokenLeft->domain}:"; return "{$domainPrefix}{$node->tokenLeft->delimiter}{$clauses}{$node->tokenRight->lexeme}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Native/Phrase.php ================================================ token instanceof PhraseToken; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof PhraseToken) { throw new LogicException( 'Implementation accepts instance of Phrase Token' ); } $domainPrefix = $token->domain === '' ? 
'' : "{$token->domain}:"; $phraseEscaped = preg_replace("/([\\{$token->quote}])/", '\\\\$1', $token->phrase); return "{$domainPrefix}{$token->quote}{$phraseEscaped}{$token->quote}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Native/Query.php ================================================ nodes as $subNode) { $clauses[] = $subVisitor->visit($subNode, $subVisitor, $options); } return implode(' ', $clauses); } } ================================================ FILE: lib/Languages/Galach/Generators/Native/Tag.php ================================================ token instanceof TagToken; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof TagToken) { throw new LogicException( 'Implementation accepts instance of Tag Token' ); } return "{$token->marker}{$token->tag}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Native/UnaryOperator.php ================================================ visit($node->operand, $subVisitor, $options); $padding = ''; if ($node->token->type === Tokenizer::TOKEN_LOGICAL_NOT) { $padding = ' '; } return "{$node->token->lexeme}{$padding}{$clause}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Native/User.php ================================================ token instanceof UserToken; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof UserToken) { throw new LogicException( 'Implementation accepts instance of User Token' ); } return "{$token->marker}{$token->user}"; } } ================================================ FILE: 
lib/Languages/Galach/Generators/Native/Word.php ================================================ token instanceof WordToken; } public function visit(Node $node, Visitor $subVisitor = null, $options = null) { if (!$node instanceof Term) { throw new LogicException( 'Implementation accepts instance of Term Node' ); } $token = $node->token; if (!$token instanceof WordToken) { throw new LogicException( 'Implementation accepts instance of Word Token' ); } $domainPrefix = $token->domain === '' ? '' : "{$token->domain}:"; $wordEscaped = preg_replace('/([\\\'"+\-!():#@ ])/', '\\\\$1', $token->word); return "{$domainPrefix}{$wordEscaped}"; } } ================================================ FILE: lib/Languages/Galach/Generators/Native.php ================================================ visitor = $visitor; } /** * Generate query string in Galach format from the given $syntaxTree. * * @param \QueryTranslator\Values\SyntaxTree $syntaxTree * * @return string */ public function generate(SyntaxTree $syntaxTree) { return $this->visitor->visit($syntaxTree->rootNode); } } ================================================ FILE: lib/Languages/Galach/Generators/QueryString.php ================================================ visitor = $visitor; } /** * Generate query string in Elasticsearch Query String Query format from the given $syntaxTree. 
* * @param \QueryTranslator\Values\SyntaxTree $syntaxTree * @param mixed $options * * @return string */ public function generate(SyntaxTree $syntaxTree, $options = null) { return $this->visitor->visit($syntaxTree->rootNode, null, $options); } } ================================================ FILE: lib/Languages/Galach/Parser.php ================================================ Tokenizer::TOKEN_LOGICAL_NOT | Tokenizer::TOKEN_LOGICAL_NOT_2, 'operatorPreference' => Tokenizer::TOKEN_MANDATORY | Tokenizer::TOKEN_PROHIBITED, 'operatorPrefix' => Tokenizer::TOKEN_MANDATORY | Tokenizer::TOKEN_PROHIBITED | Tokenizer::TOKEN_LOGICAL_NOT_2, 'operatorUnary' => Tokenizer::TOKEN_MANDATORY | Tokenizer::TOKEN_PROHIBITED | Tokenizer::TOKEN_LOGICAL_NOT | Tokenizer::TOKEN_LOGICAL_NOT_2, 'operatorBinary' => Tokenizer::TOKEN_LOGICAL_AND | Tokenizer::TOKEN_LOGICAL_OR, 'operator' => Tokenizer::TOKEN_LOGICAL_AND | Tokenizer::TOKEN_LOGICAL_OR | Tokenizer::TOKEN_MANDATORY | Tokenizer::TOKEN_PROHIBITED | Tokenizer::TOKEN_LOGICAL_NOT | Tokenizer::TOKEN_LOGICAL_NOT_2, 'groupDelimiter' => Tokenizer::TOKEN_GROUP_BEGIN | Tokenizer::TOKEN_GROUP_END, 'binaryOperatorAndWhitespace' => Tokenizer::TOKEN_LOGICAL_AND | Tokenizer::TOKEN_LOGICAL_OR | Tokenizer::TOKEN_WHITESPACE, ]; private static $shifts = [ Tokenizer::TOKEN_WHITESPACE => 'shiftWhitespace', Tokenizer::TOKEN_TERM => 'shiftTerm', Tokenizer::TOKEN_GROUP_BEGIN => 'shiftGroupBegin', Tokenizer::TOKEN_GROUP_END => 'shiftGroupEnd', Tokenizer::TOKEN_LOGICAL_AND => 'shiftBinaryOperator', Tokenizer::TOKEN_LOGICAL_OR => 'shiftBinaryOperator', Tokenizer::TOKEN_LOGICAL_NOT => 'shiftLogicalNot', Tokenizer::TOKEN_LOGICAL_NOT_2 => 'shiftLogicalNot2', Tokenizer::TOKEN_MANDATORY => 'shiftPreference', Tokenizer::TOKEN_PROHIBITED => 'shiftPreference', Tokenizer::TOKEN_BAILOUT => 'shiftBailout', ]; private static $nodeToReductionGroup = [ Group::class => 'group', LogicalAnd::class => 'logicalAnd', LogicalOr::class => 'logicalOr', LogicalNot::class => 
'unaryOperator', Mandatory::class => 'unaryOperator', Prohibited::class => 'unaryOperator', Term::class => 'term', ]; private static $reductionGroups = [ 'group' => [ 'reduceGroup', 'reducePreference', 'reduceLogicalNot', 'reduceLogicalAnd', 'reduceLogicalOr', ], 'unaryOperator' => [ 'reduceLogicalNot', 'reduceLogicalAnd', 'reduceLogicalOr', ], 'logicalOr' => [], 'logicalAnd' => [ 'reduceLogicalOr', ], 'term' => [ 'reducePreference', 'reduceLogicalNot', 'reduceLogicalAnd', 'reduceLogicalOr', ], ]; /** * Input tokens. * * @var \QueryTranslator\Values\Token[] */ private $tokens; /** * Query stack. * * @var \SplStack */ private $stack; /** * An array of applied corrections. * * @var \QueryTranslator\Values\Correction[] */ private $corrections = []; public function parse(TokenSequence $tokenSequence) { $this->init($tokenSequence->tokens); while (!empty($this->tokens)) { $node = $this->shift(); if ($node instanceof Node) { $this->reduce($node); } } $this->reduceQuery(); return new SyntaxTree($this->stack->top(), $tokenSequence, $this->corrections); } private function shift() { $token = array_shift($this->tokens); $shift = self::$shifts[$token->type]; return $this->{$shift}($token); } private function reduce(Node $node) { $previousNode = null; $reductionIndex = null; while ($node instanceof Node) { // Reset reduction index on first iteration or on Node change if ($node !== $previousNode) { $reductionIndex = 0; } // If there are no reductions to try, put the Node on the stack // and continue shifting $reduction = $this->getReduction($node, $reductionIndex); if ($reduction === null) { $this->stack->push($node); break; } $previousNode = $node; $node = $this->{$reduction}($node); ++$reductionIndex; } } protected function shiftWhitespace() { if ($this->isTopStackToken(self::$tokenShortcuts['operatorPrefix'])) { $this->addCorrection( self::CORRECTION_UNARY_OPERATOR_MISSING_OPERAND_IGNORED, $this->stack->pop() ); } } protected function shiftPreference(Token $token) { return 
$this->shiftAdjacentUnaryOperator($token, self::$tokenShortcuts['operator']); } protected function shiftAdjacentUnaryOperator(Token $token, $tokenMask) { if ($this->isToken(reset($this->tokens), $tokenMask)) { $this->addCorrection( self::CORRECTION_ADJACENT_UNARY_OPERATOR_PRECEDING_OPERATOR_IGNORED, $token ); return null; } $this->stack->push($token); } protected function shiftLogicalNot(Token $token) { $this->stack->push($token); } protected function shiftLogicalNot2(Token $token) { $tokenMask = self::$tokenShortcuts['operator'] & ~Tokenizer::TOKEN_LOGICAL_NOT_2; return $this->shiftAdjacentUnaryOperator($token, $tokenMask); } protected function shiftBinaryOperator(Token $token) { if ($this->stack->isEmpty() || $this->isTopStackToken(Tokenizer::TOKEN_GROUP_BEGIN)) { $this->addCorrection( self::CORRECTION_BINARY_OPERATOR_MISSING_LEFT_OPERAND_IGNORED, $token ); return null; } if ($this->isTopStackToken(self::$tokenShortcuts['operator'])) { $this->ignoreBinaryOperatorFollowingOperator($token); return null; } $this->stack->push($token); } private function ignoreBinaryOperatorFollowingOperator(Token $token) { $precedingOperators = $this->ignorePrecedingOperators(self::$tokenShortcuts['operator']); $followingOperators = $this->ignoreFollowingOperators(); $this->addCorrection( self::CORRECTION_BINARY_OPERATOR_FOLLOWING_OPERATOR_IGNORED, ...array_merge( $precedingOperators, [$token], $followingOperators ) ); } protected function shiftTerm(Token $token) { return new Term($token); } protected function shiftGroupBegin(Token $token) { $this->stack->push($token); } protected function shiftGroupEnd(Token $token) { $this->stack->push($token); return new Group(); } protected function shiftBailout(Token $token) { $this->addCorrection(self::CORRECTION_BAILOUT_TOKEN_IGNORED, $token); } protected function reducePreference(Node $node) { if (!$this->isTopStackToken(self::$tokenShortcuts['operatorPreference'])) { return $node; } $token = $this->stack->pop(); if ($this->isToken($token, 
Tokenizer::TOKEN_MANDATORY)) { return new Mandatory($node, $token); } return new Prohibited($node, $token); } protected function reduceLogicalNot(Node $node) { if (!$this->isTopStackToken(self::$tokenShortcuts['operatorNot'])) { return $node; } if ($node instanceof Mandatory || $node instanceof Prohibited) { $this->ignoreLogicalNotOperatorsPrecedingPreferenceOperator(); return $node; } return new LogicalNot($node, $this->stack->pop()); } public function ignoreLogicalNotOperatorsPrecedingPreferenceOperator() { $precedingOperators = $this->ignorePrecedingOperators(self::$tokenShortcuts['operatorNot']); if (!empty($precedingOperators)) { $this->addCorrection( self::CORRECTION_LOGICAL_NOT_OPERATORS_PRECEDING_PREFERENCE_IGNORED, ...$precedingOperators ); } } protected function reduceLogicalAnd(Node $node) { if ($this->stack->count() <= 1 || !$this->isTopStackToken(Tokenizer::TOKEN_LOGICAL_AND)) { return $node; } $token = $this->stack->pop(); $leftOperand = $this->stack->pop(); return new LogicalAnd($leftOperand, $node, $token); } /** * Reduce logical OR. 
* * @param \QueryTranslator\Values\Node $node * @param bool $inGroup Reduce inside a group * * @return null|\QueryTranslator\Languages\Galach\Values\Node\LogicalOr|\QueryTranslator\Values\Node */ protected function reduceLogicalOr(Node $node, $inGroup = false) { if ($this->stack->count() <= 1 || !$this->isTopStackToken(Tokenizer::TOKEN_LOGICAL_OR)) { return $node; } // If inside a group don't look for following logical AND if (!$inGroup) { $this->popWhitespace(); // If the next token is logical AND, put the node on stack // as that has precedence over logical OR if ($this->isToken(reset($this->tokens), Tokenizer::TOKEN_LOGICAL_AND)) { $this->stack->push($node); return null; } } $token = $this->stack->pop(); $leftOperand = $this->stack->pop(); return new LogicalOr($leftOperand, $node, $token); } protected function reduceGroup(Group $group) { $rightDelimiter = $this->stack->pop(); // Pop dangling tokens $this->popTokens(~Tokenizer::TOKEN_GROUP_BEGIN); if ($this->isTopStackToken(Tokenizer::TOKEN_GROUP_BEGIN)) { $leftDelimiter = $this->stack->pop(); $this->ignoreEmptyGroup($leftDelimiter, $rightDelimiter); $this->reduceRemainingLogicalOr(true); return null; } $this->reduceRemainingLogicalOr(true); $group->nodes = $this->collectTopStackNodes(); $group->tokenLeft = $this->stack->pop(); $group->tokenRight = $rightDelimiter; return $group; } /** * Collect all Nodes from the top of the stack. 
*
 * @return \QueryTranslator\Values\Node[]
 */
    private function collectTopStackNodes()
    {
        $nodes = [];

        // Unshifting keeps the collected nodes in their original input order
        while (!$this->stack->isEmpty() && $this->stack->top() instanceof Node) {
            array_unshift($nodes, $this->stack->pop());
        }

        return $nodes;
    }

    /**
     * Ignore an empty group together with its connecting operators, logging a correction.
     *
     * @param \QueryTranslator\Values\Token $leftDelimiter
     * @param \QueryTranslator\Values\Token $rightDelimiter
     */
    private function ignoreEmptyGroup(Token $leftDelimiter, Token $rightDelimiter)
    {
        $precedingOperators = $this->ignorePrecedingOperators(self::$tokenShortcuts['operator']);
        $followingOperators = $this->ignoreFollowingOperators();

        $this->addCorrection(
            self::CORRECTION_EMPTY_GROUP_IGNORED,
            ...array_merge(
                $precedingOperators,
                [$leftDelimiter, $rightDelimiter],
                $followingOperators
            )
        );
    }

    /**
     * Initialize the parser with given array of $tokens.
     *
     * @param \QueryTranslator\Values\Token[] $tokens
     */
    private function init(array $tokens)
    {
        $this->corrections = [];
        $this->tokens = $tokens;
        // Unmatched group delimiters are removed up front, with corrections logged
        $this->cleanupGroupDelimiters($this->tokens);
        $this->stack = new SplStack();
    }

    /**
     * Return the reduction registered for the node's reduction group at the
     * given index, or null when there is none.
     *
     * @param \QueryTranslator\Values\Node $node
     * @param int $reductionIndex
     *
     * @return mixed|null Entry from self::$reductionGroups, null when exhausted
     */
    private function getReduction(Node $node, $reductionIndex)
    {
        $reductionGroup = self::$nodeToReductionGroup[get_class($node)];

        if (isset(self::$reductionGroups[$reductionGroup][$reductionIndex])) {
            return self::$reductionGroups[$reductionGroup][$reductionIndex];
        }

        return null;
    }

    /**
     * Reduce everything remaining on the stack into the root Query node.
     */
    private function reduceQuery()
    {
        $this->popTokens();
        $this->reduceRemainingLogicalOr();

        $nodes = [];
        while (!$this->stack->isEmpty()) {
            array_unshift($nodes, $this->stack->pop());
        }

        $this->stack->push(new Query($nodes));
    }

    /**
     * Check if the given $token is an instance of Token.
     *
     * Optionally also checks given Token $typeMask.
     *
     * @param mixed $token
     * @param int $typeMask
     *
     * @return bool
     */
    private function isToken($token, $typeMask = null)
    {
        if (!$token instanceof Token) {
            return false;
        }

        // Bitwise match: any overlap with the mask is accepted
        if (null === $typeMask || $token->type & $typeMask) {
            return true;
        }

        return false;
    }

    /**
     * Check that the stack is not empty and that its top is a Token matching $type.
     *
     * @param int $type
     *
     * @return bool
     */
    private function isTopStackToken($type = null)
    {
        return !$this->stack->isEmpty() && $this->isToken($this->stack->top(), $type);
    }

    /**
     * Remove whitespace Tokens from the beginning of the token array.
*/
    private function popWhitespace()
    {
        while ($this->isToken(reset($this->tokens), Tokenizer::TOKEN_WHITESPACE)) {
            array_shift($this->tokens);
        }
    }

    /**
     * Remove all Tokens from the top of the query stack and log Corrections as necessary.
     *
     * Optionally also checks that Token matches given $typeMask.
     *
     * @param int $typeMask
     */
    private function popTokens($typeMask = null)
    {
        while ($this->isTopStackToken($typeMask)) {
            $token = $this->stack->pop();

            // A dangling unary operator lacks its operand; a dangling binary
            // operator lacks its right operand
            if ($token->type & self::$tokenShortcuts['operatorUnary']) {
                $this->addCorrection(
                    self::CORRECTION_UNARY_OPERATOR_MISSING_OPERAND_IGNORED,
                    $token
                );
            } else {
                $this->addCorrection(
                    self::CORRECTION_BINARY_OPERATOR_MISSING_RIGHT_OPERAND_IGNORED,
                    $token
                );
            }
        }
    }

    /**
     * Pop and collect Tokens matching the given $type mask from the top of the stack.
     *
     * @param int $type
     *
     * @return \QueryTranslator\Values\Token[] Popped tokens, in original input order
     */
    private function ignorePrecedingOperators($type)
    {
        $tokens = [];
        while ($this->isTopStackToken($type)) {
            array_unshift($tokens, $this->stack->pop());
        }

        return $tokens;
    }

    /**
     * Shift and collect binary operator Tokens from the remaining input.
     *
     * Whitespace tokens are consumed along the way but not collected.
     *
     * @return \QueryTranslator\Values\Token[]
     */
    private function ignoreFollowingOperators()
    {
        $tokenMask = self::$tokenShortcuts['binaryOperatorAndWhitespace'];

        $tokens = [];
        while ($this->isToken(reset($this->tokens), $tokenMask)) {
            $token = array_shift($this->tokens);

            if ($token->type & self::$tokenShortcuts['operatorBinary']) {
                $tokens[] = $token;
            }
        }

        return $tokens;
    }

    /**
     * Reduce logical OR possibly remaining after reaching end of group or query.
     *
     * @param bool $inGroup Reduce inside a group
     */
    private function reduceRemainingLogicalOr($inGroup = false)
    {
        // Only applies when the top of the stack is a Node (not a Token)
        if (!$this->stack->isEmpty() && !$this->isTopStackToken()) {
            $node = $this->reduceLogicalOr($this->stack->pop(), $inGroup);
            $this->stack->push($node);
        }
    }

    /**
     * Clean up group delimiter tokens, removing unmatched left and right delimiter.
     *
     * Closest group delimiters will be matched first, unmatched remainder is removed.
*
 * @param \QueryTranslator\Values\Token[] $tokens
 */
    private function cleanupGroupDelimiters(array &$tokens)
    {
        $indexes = $this->getUnmatchedGroupDelimiterIndexes($tokens);

        // Remove from the back so the remaining indexes stay valid
        while (!empty($indexes)) {
            $lastIndex = array_pop($indexes);
            $token = $tokens[$lastIndex];
            unset($tokens[$lastIndex]);

            if ($token->type === Tokenizer::TOKEN_GROUP_BEGIN) {
                $this->addCorrection(
                    self::CORRECTION_UNMATCHED_GROUP_LEFT_DELIMITER_IGNORED,
                    $token
                );
            } else {
                $this->addCorrection(
                    self::CORRECTION_UNMATCHED_GROUP_RIGHT_DELIMITER_IGNORED,
                    $token
                );
            }
        }
    }

    /**
     * Return the indexes of unmatched group delimiter tokens, left before right.
     *
     * @param \QueryTranslator\Values\Token[] $tokens
     *
     * @return int[]
     */
    private function getUnmatchedGroupDelimiterIndexes(array &$tokens)
    {
        $trackLeft = [];
        $trackRight = [];

        foreach ($tokens as $index => $token) {
            if (!$this->isToken($token, self::$tokenShortcuts['groupDelimiter'])) {
                continue;
            }

            if ($this->isToken($token, Tokenizer::TOKEN_GROUP_BEGIN)) {
                $trackLeft[] = $index;
                continue;
            }

            // Right delimiter: pair it with the closest open left delimiter,
            // otherwise record it as unmatched
            if (empty($trackLeft)) {
                $trackRight[] = $index;
            } else {
                array_pop($trackLeft);
            }
        }

        return array_merge($trackLeft, $trackRight);
    }

    /**
     * Register a Correction of the given $type over the given $tokens.
     *
     * @param int $type One of the self::CORRECTION_* constants
     * @param \QueryTranslator\Values\Token ...$tokens
     */
    private function addCorrection($type, Token ...$tokens)
    {
        $this->corrections[] = new Correction($type, ...$tokens);
    }
}

================================================ FILE: lib/Languages/Galach/README.md ================================================ # Galach query language To better understand parts of the language processor described below, run the demo: 1. Create the demo project using composer `composer create-project netgen/query-translator-demo` 2. Position into the demo project directory `cd query-translator-demo` 3. Start the web server with `src` as the document root `php -S localhost:8005 -t src` 4. Open [http://localhost:8005](http://localhost:8005) in your browser The demo will present behavior of Query Translator in an interactive way. ### Syntax Galach is based on a syntax that seems to be the unofficial standard for search query as user input. It should feel familiar, as the same basic syntax is used by any popular text-based search engine out there.
It is also very similar to [Lucene Query Parser syntax](https://lucene.apache.org/core/2_9_4/queryparsersyntax.html), used by both Solr and Elasticsearch. Read about it in more detail in the [syntax documentation](SYNTAX.md), here we'll only show a quick cheat sheet: `word` `"phrase"` `(group)` `+mandatory` `-prohibited` `AND` `&&` `OR` `||` `NOT` `!` `#tag` `@user` `domain:term` And an example: ``` cheese AND (bacon OR eggs) +type:breakfast ``` ### How it works The implementation has some of the usual language processor phases, starting with the lexical analysis in [Tokenizer](Tokenizer.php), followed by the syntax analysis in [Parser](Parser.php), and ending with the target code generation in a [Generator](Generators). The output of the Parser is a hierarchical tree structure. It represents the syntax of the query in an abstract way and is easy to process using [tree traversal](https://en.wikipedia.org/wiki/Tree_traversal). From that syntax tree, a target output is generated. When broken into parts, we have a sequence like this: 1. User writes a query string 2. Query string is given to Tokenizer which produces an instance of [TokenSequence](../../Values/TokenSequence.php) 3. TokenSequence instance is given to Parser which produces an instance of [SyntaxTree](../../Values/SyntaxTree.php) 4. SyntaxTree instance is given to the Generator to produce a target output 5. Target output is passed to its consumer Here's how that would look in code: ```php use QueryTranslator\Languages\Galach\Tokenizer; use QueryTranslator\Languages\Galach\TokenExtractor\Full as FullTokenExtractor; use QueryTranslator\Languages\Galach\Parser; use QueryTranslator\Languages\Galach\Generators; // 1. User writes a query string $queryString = $_GET['query_string']; // This is the place where you would perform some sanity checks that are out of the scope // of this library, for example, checking the length of the query string // 2.
Query string is given to Tokenizer which produces an instance of TokenSequence // Note that Tokenizer needs a TokenExtractor, which is an extension point // Here we use Full TokenExtractor which provides full Galach syntax $tokenExtractor = new FullTokenExtractor(); $tokenizer = new Tokenizer($tokenExtractor); $tokenSequence = $tokenizer->tokenize($queryString); // 3. TokenSequence instance is given to Parser which produces an instance of SyntaxTree $parser = new Parser(); $syntaxTree = $parser->parse($tokenSequence); // If needed, here you can access corrections foreach ($syntaxTree->corrections as $correction) { echo $correction->type; } // 4. Now we can build a generator, in this example an ExtendedDisMax generator to target // Solr's Extended DisMax Query Parser // This part is a little bit more involving since we need to build all visitors for different // Nodes in the syntax tree $generator = new Generators\ExtendedDisMax( new Generators\Common\Aggregate([ new Generators\Lucene\Common\BinaryOperator(), new Generators\Lucene\Common\Group(), new Generators\Lucene\Common\Phrase(), new Generators\Lucene\Common\Query(), new Generators\Lucene\Common\Tag(), new Generators\Lucene\Common\UnaryOperator(), new Generators\Lucene\Common\User(), new Generators\Lucene\ExtendedDisMax\Word(), ]) ); // Now we can use the generator to generate the target output $targetString = $generator->generate($syntaxTree); // Finally we can send the generated string to Solr $result = $solrClient->search($targetString); ``` ### Error handling No input is considered invalid. Both Tokenizer and Parser are made to be resistant to errors and will try to process anything you throw at them. When input does contain an error, a correction will be applied. This will be repeated as necessary. The corrections are applied during parsing and are made available in the SyntaxTree as an array of [Correction](../../Values/Correction.php) instances. 
They will contain information about the type of the correction and the tokens affected by it. One type of correction starts in the Tokenizer. When no [Token](../../Values/Token.php) can be extracted at a current position in the input string, a single character will be read as a special `Tokenizer::TOKEN_BAILOUT` type Token. All Tokens of that type will be ignored by the parser. The only known case where this can happen is the occurrence of an unclosed phrase delimiter `"`. Note that, while applying the corrections, the best efforts are made to preserve the intended meaning of the query. The following is a list of corrections, with correction type constant and an example of an incorrect input and a corrected result. 1. Adjacent unary operator preceding another operator is ignored `Parser::CORRECTION_ADJACENT_UNARY_OPERATOR_PRECEDING_OPERATOR_IGNORED` ``` ++one +-two ``` ``` +one -two ``` 2. Unary operator missing an operand is ignored `Parser::CORRECTION_UNARY_OPERATOR_MISSING_OPERAND_IGNORED` ``` one NOT ``` ``` one ``` 3. Binary operator missing left side operand is ignored `Parser::CORRECTION_BINARY_OPERATOR_MISSING_LEFT_OPERAND_IGNORED` ``` AND two ``` ``` two ``` 4. Binary operator missing right side operand is ignored `Parser::CORRECTION_BINARY_OPERATOR_MISSING_RIGHT_OPERAND_IGNORED` ``` one AND ``` ``` one ``` 5. Binary operator following another operator is ignored together with connecting operators `Parser::CORRECTION_BINARY_OPERATOR_FOLLOWING_OPERATOR_IGNORED` ``` one AND OR AND two ``` ``` one two ``` 6. Logical not operators preceding mandatory or prohibited operator are ignored `Parser::CORRECTION_LOGICAL_NOT_OPERATORS_PRECEDING_PREFERENCE_IGNORED` ``` NOT +one NOT -two ``` ``` +one -two ``` 7. Empty group is ignored together with connecting operators `Parser::CORRECTION_EMPTY_GROUP_IGNORED` ``` one AND () OR two ``` ``` one two ``` 8. 
Unmatched left side group delimiter is ignored `Parser::CORRECTION_UNMATCHED_GROUP_LEFT_DELIMITER_IGNORED` ``` one ( AND two ``` ``` one AND two ``` 9. Unmatched right side group delimiter is ignored `Parser::CORRECTION_UNMATCHED_GROUP_RIGHT_DELIMITER_IGNORED` ``` one AND ) two ``` ``` one AND two ``` 10. Any Token of `Tokenizer::TOKEN_BAILOUT` type is ignored `Parser::CORRECTION_BAILOUT_TOKEN_IGNORED` ``` one " two ``` ``` one two ``` ### Customization You can modify the Galach language in a limited way: - By changing special characters and sequences of characters used as part of the language syntax: - operators: `AND` `&&` `OR` `||` `NOT` `!` `+` `-` - grouping and phrase delimiters: `(` `)` `"` - user and tag markers: `@` `#` - domain prefix: `domain:` - By choosing parts of the language that you want to use. You might want to use only a subset of the full syntax, maybe without the grouping feature, using only `+` and `-` operators, disabling domains, and so on. - By implementing custom `Tokenizer::TOKEN_TERM` type token. Read more on that in the text below. Customization happens during the lexical analysis. The Tokenizer is actually marked as `final` and is not intended for extending. You will need to implement your own [TokenExtractor](TokenExtractor.php), a dependency to the Tokenizer. TokenExtractor controls the syntax through regular expressions used to recognize the [Token](../../Values/Token.php), which is a sequence of characters forming the smallest syntactic unit of the language. The following is a list of supported Token types, together with their `Tokenizer::TOKEN_*` constants and an example: 1. Term token – represents a category of term type tokens. Note that [Word](Values/Token/Word.php) and [Phrase](Values/Token/Phrase.php) term tokens can have domain prefix. This can't be used on [User](Values/Token/User.php) and [Tag](Values/Token/Tag.php) term tokens, because those define implicit domains of their own. 
`Tokenizer::TOKEN_TERM` ``` word ``` ``` title:word ``` ``` "this is a phrase" ``` ``` body:"this is a phrase" ``` ``` @user ``` ``` #tag ``` 2. Whitespace token - represents the whitespace in the input string. `Tokenizer::TOKEN_WHITESPACE` ``` one two ^ ``` 3. Logical AND token - combines two adjoining elements with logical AND. `Tokenizer::TOKEN_LOGICAL_AND` ``` one AND two ^^^ ``` 4. Logical OR token - combines two adjoining elements with logical OR. `Tokenizer::TOKEN_LOGICAL_OR` ``` one OR two ^^ ``` 5. Logical NOT token - applies logical NOT to the next (right-side) element. `Tokenizer::TOKEN_LOGICAL_NOT` ``` NOT one ^^^ ``` 6. Shorthand logical NOT token - applies logical NOT to the next (right-side) element. This is an alternative to the `Tokenizer::TOKEN_LOGICAL_NOT` above, with the difference that parser will expect it's placed next (left) to the element it applies to, without the whitespace in between. `Tokenizer::TOKEN_LOGICAL_NOT_2` ``` !one ^ ``` 7. Mandatory operator - applies mandatory inclusion to the next (right side) element. `Tokenizer::TOKEN_MANDATORY` ``` +one ^ ``` 8. Prohibited operator - applies mandatory exclusion to the next (right side) element. `Tokenizer::TOKEN_PROHIBITED` ``` -one ^ ``` 9. Left side delimiter of a group. Note that the left side group delimiter can have domain prefix. `Tokenizer::TOKEN_GROUP_BEGIN` ``` (one AND two) ^ ``` ``` text:(one AND two) ^^^^^^ ``` 10. Right side delimiter of a group. `Tokenizer::TOKEN_GROUP_END` ``` (one AND two) ^ ``` 11. Bailout token. `Tokenizer::TOKEN_BAILOUT` ``` not exactly a phrase" ^ ``` By changing the regular expressions, you can change how tokens are recognized, including special characters used as part of the language syntax. You can also omit regular expressions for some token types. Through that, you can control which elements of the language you want to use. 
There are two abstract methods to implement when extending the base [TokenExtractor](TokenExtractor.php): - `getExpressionTypeMap(): array` Here you must return a map of regular expressions to corresponding Token types. Token type can be one of the predefined constants `Tokenizer::TOKEN_*`. - `createTermToken($position, array $data): Token` Here you receive Token data extracted through regular expression matching and a position where the data was extracted at. From that, you must return the corresponding Token instance of the `Tokenizer::TOKEN_TERM` type. If needed, here you can return an instance of your own Token subtype. You can use regular expressions with named capturing groups to extract meaning from the input string and pass it to the constructor method. Optionally you can override the `createGroupBeginToken()` method. This is useful if you want to customize token of the `Tokenizer::TOKEN_GROUP_BEGIN` type: - `createGroupBeginToken($position, array $data): Token` Here you receive Token data extracted through regular expression matching and a position where the data was extracted at. From that, you must return the corresponding Token instance of the `Tokenizer::TOKEN_GROUP_BEGIN` type. If needed, here you can return an instance of your own Token subtype. You can use regular expressions with named capturing groups to extract meaning from the input string and pass it to the constructor method. Two TokenExtractor implementations are provided out of the box. You can use them as an example and a starting point to implement your own. These are: - [Full](TokenExtractor/Full.php) TokenExtractor, supports full syntax of the language - [Text](TokenExtractor/Text.php) TokenExtractor, supports text related subset of the language #### Parser The Parser is the core of the library. It's marked as `final` and is not intended for extending. 
Method `Parser::parse()` accepts TokenSequence, but it only cares about the type of the Token, so it will be oblivious to any customizations you might do in the Tokenizer. That includes both recognizing only a subset of the full syntax and the custom `Tokenizer::TOKEN_TERM` type tokens. While it's possible to implement a custom Parser, at that point you should consider calling it a new language rather than a customization of Galach. ### Generators A generator is used to generate the target output from the SyntaxTree. Three different ones are provided out of the box: 1. [Native](Generators/Native.php) `Native` generator produces query string in the Galach format. This is mostly useful as an example and for the cleanup of the user input. In case the corrections were applied to the input, the output will be corrected. Also, it will not contain any superfluous whitespace and special characters will be explicitly escaped. 2. [ExtendedDisMax](Generators/ExtendedDisMax.php) Output of `ExtendedDisMax` generator is intended for the `q` parameter of the [Solr Extended DisMax Query Parser](https://cwiki.apache.org/confluence/display/solr/The+Extended+DisMax+Query+Parser). 3. [QueryString](Generators/QueryString.php) Output of `QueryString` generator is intended for the `query` parameter of the [Elasticsearch Query String Query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html). All generators use the same hierarchical [Visitor](Generators/Common/Visitor.php) pattern. Each concrete [Node](../../Values/Node.php) instance has its own visitor, dispatched by checking on the class it implements. This enables customization per Node visitor. Since Term Node can cover different Term tokens (including your custom ones), Term visitors should be dispatched both by the Node instance and the type of Token it aggregates. The visit method also propagates optional `$options` parameter. 
If needed, it can be used to control the behavior of the generator from the outside. This approach should be useful for most custom implementations. Note that the Generator interface is not provided. That is because the generator's output can't be assumed, because it's specific to the intended target. The main job of the Query Translator is producing the syntax tree from which it's easy to generate anything you might need. Following from that - if the provided generators don't meet your needs, feel free to customize them or implement your own. ================================================ FILE: lib/Languages/Galach/SYNTAX.md ================================================ # Galach query language syntax ## Terms 1. `Word` term is a string not containing whitespace, unless that whitespace is escaped. ``` word ``` ``` another\ word ``` 2. `Phrase` term is formed by enclosing words within double quotation marks `"`. ``` "reality exists" ``` ``` "what's not real doesn't exist" ``` 3. `User` term is defined by the leading `@` character, followed by at least one alphanumeric or underscore character, followed by an arbitrary sequence of alphanumeric characters, hyphens, underscores, and dots. Regular expression: ``` @[a-zA-Z0-9_][a-zA-Z0-9_\-.]* ``` Examples: ``` @joe.watt ``` ``` @_alice83 ``` ``` @The-Ronald ``` 4. `Tag` term is defined by the leading `#` character, followed by at least one alphanumeric or underscore character, followed by an arbitrary sequence of alphanumeric characters, hyphens, underscores, and dots. Regular expression: ``` \#[a-zA-Z0-9_][a-zA-Z0-9_\-.]* ``` Examples: ``` #php ``` ``` #PHP-7.1 ``` ``` #query_parser ``` ## Operators Terms can be combined or modified using binary and unary operators: 1. `Logical and` is a binary operator that combines left and right operands so that both must match. It comes in two forms: `AND`, `&&` In both cases, it must be separated from its operands by whitespace. ``` coffee AND milk ``` ``` tea && lemon ``` 2. 
`Logical or` is a binary operator that combines left and right operands so that at least one of them has to match. It comes in two forms: `OR`, `||` In both cases, it must be separated from its operands by whitespace. ``` potato OR tomato ``` ``` true || false ``` 3. `Logical not` is a unary operator that modifies its operand so that it must not match. It comes in two forms: `NOT`, `!` When `NOT` form is used, it must be separated from its operand by whitespace: ``` NOT important ``` When shorthand form `!` is used, it must be adjacent to its operand: ``` !important ``` 4. `Mandatory` is a unary operator that modifies its operand so that it must match. It's represented by a plus sign `+` and must be placed adjacent to its operand. ``` +coffee ``` 5. `Prohibited` is a unary operator that modifies its operand so that it must not match. It's represented by a minus sign `-` and must be placed adjacent to its operand. ``` -cake ``` ### Operator precedence Unary operators are applied first. Since they apply to the first element to the left, they never conflict. They are followed by binary operators, with `Logical and` preceding `Logical or`: 1. `Logical not`, `Mandatory`, `Prohibited` 2. `Logical and` 3. `Logical or` ## Grouping Terms and expressions can be grouped using round brackets. A group is processed as a whole. The following two examples will be processed as the same since grouping follows operator precedence: ``` one OR NOT two AND three ``` ``` one OR ((NOT two) AND three) ``` But you can also use grouping to change the meaning that would follow from operator precedence: ``` (one OR NOT two) AND three ``` ``` one OR NOT (two AND three) ``` ## Domains Domain is an abstract category on which the term or group applies. It's defined by prefixing the term or group with a domain string, followed by a colon `:`. 
Domain string must start with at least one alphanumeric or underscore character and is followed by an arbitrary sequence of alphanumeric characters, hyphens `-`, underscores `_` and dots `.`. Note that the domain cannot be used on `Tag` and `User` terms. These two, in fact, define implicit domains of their own. Regular expression for domain string: ``` [a-zA-Z_][a-zA-Z0-9_\-.]* ``` Examples: ``` type:aeroplane ``` ``` title:"Language processor" ``` ``` description:(wings AND propeller) ``` ## Special characters The characters that are part of the language syntax must be escaped in order not to be recognized as such by the engine. These are: - `(` left paren - `)` right paren - `+` plus - `-` minus - `!` exclamation mark - `"` double quote - `#` hash - `@` at sign - `:` colon - `\` backslash - `␣` blank space Character used for escaping is backslash `\`: ``` joined\ word ``` ``` "escaped \"double quote\"" ``` ``` escaped \+operator domain\:word \@user \#tag \(and so on\) ``` ``` double backslash \\ is a backslash escaped ``` Aside from the quotation marks themselves, escaping is not required inside phrases. Since quotes are used as delimiters, everything between them is taken as-is. Hence these will be interpreted as equal in meaning: ``` "+one -two" ``` ``` "\+one \-two" ``` In some cases the tokenizer will automatically assume that a special character is to be interpreted as if it was escaped. The following pairs will be processed as the same: 1. Colon at the end of a `Word` is considered part of the `Word` ``` word: ``` ``` word\: ``` 2. Colon placed after a domain colon is considered part of the `Word` ``` domain:domain:domain ``` ``` domain:domain\:domain ``` 3. Domain can't be used on a `Tag` and `User` terms ``` domain:#tag domain:@user ``` ``` domain:\#tag domain:\@user ``` 4. 
Characters used for `Mandatory`, `Prohibited` and shorthand `Logical not` operators can be considered part of the `Word`: - When placed after domain colon ``` domain:+word domain:-word domain:!word ``` ``` domain:\+word domain:\-word domain:\!word ``` - When placed in the middle of the word ``` one+two one-two one!two ``` ``` one\+two one\-two one\!two ``` - When placed at the end of the `Word` ``` one+ two- three! ``` ``` one\+ two\- three\! ``` ================================================ FILE: lib/Languages/Galach/TokenExtractor/Full.php ================================================ [\s]+)/Au' => Tokenizer::TOKEN_WHITESPACE, '/(?\+)/Au' => Tokenizer::TOKEN_MANDATORY, '/(?-)/Au' => Tokenizer::TOKEN_PROHIBITED, '/(?!)/Au' => Tokenizer::TOKEN_LOGICAL_NOT_2, '/(?\))/Au' => Tokenizer::TOKEN_GROUP_END, '/(?NOT)(?:[\s"()+\-!]|$)/Au' => Tokenizer::TOKEN_LOGICAL_NOT, '/(?(?:AND|&&))(?:[\s"()+\-!]|$)/Au' => Tokenizer::TOKEN_LOGICAL_AND, '/(?(?:OR|\|\|))(?:[\s"()+\-!]|$)/Au' => Tokenizer::TOKEN_LOGICAL_OR, '/(?(?:(?[a-zA-Z_][a-zA-Z0-9_\-.]*):)?(?\())/Au' => Tokenizer::TOKEN_GROUP_BEGIN, '/(?(?:(?(?[a-zA-Z0-9_][a-zA-Z0-9_\-.]*)))(?:[\s"()+!]|$)/Au' => Tokenizer::TOKEN_TERM, '/(?(?:(?(?[a-zA-Z0-9_][a-zA-Z0-9_\-.]*)))(?:[\s"()+!]|$)/Au' => Tokenizer::TOKEN_TERM, '/(?(?:(?[a-zA-Z_][a-zA-Z0-9_\-.]*):)?(?(?.*?)(?:(? Tokenizer::TOKEN_TERM, '/(?(?:(?[a-zA-Z_][a-zA-Z0-9_\-.]*):)?(?(?:\\\\\\\\|\\\\ |\\\\\(|\\\\\)|\\\\"|[^"()\s])+?))(?:(? 
Tokenizer::TOKEN_TERM, ]; protected function getExpressionTypeMap() { return self::$expressionTypeMap; } protected function createTermToken($position, array $data) { $lexeme = $data['lexeme']; switch (true) { case isset($data['word']): return new Word( $lexeme, $position, $data['domain'], // un-backslash special characters preg_replace('/(?:\\\\(\\\\|(["+\-!():#@ ])))/', '$1', $data['word']) ); case isset($data['phrase']): $quote = $data['quote']; return new Phrase( $lexeme, $position, $data['domain'], $quote, // un-backslash quote preg_replace('/(?:\\\\([' . $quote . ']))/', '$1', $data['phrase']) ); case isset($data['tag']): return new Tag( $lexeme, $position, $data['marker'], $data['tag'] ); case isset($data['user']): return new User( $lexeme, $position, $data['marker'], $data['user'] ); } throw new RuntimeException('Could not extract term token from the given data'); } } ================================================ FILE: lib/Languages/Galach/TokenExtractor/Text.php ================================================ [\s]+)/Au' => Tokenizer::TOKEN_WHITESPACE, '/(?\+)/Au' => Tokenizer::TOKEN_MANDATORY, '/(?-)/Au' => Tokenizer::TOKEN_PROHIBITED, '/(?!)/Au' => Tokenizer::TOKEN_LOGICAL_NOT_2, '/(?\))/Au' => Tokenizer::TOKEN_GROUP_END, '/(?NOT)(?:[\s"()+\-!]|$)/Au' => Tokenizer::TOKEN_LOGICAL_NOT, '/(?(?:AND|&&))(?:[\s"()+\-!]|$)/Au' => Tokenizer::TOKEN_LOGICAL_AND, '/(?(?:OR|\|\|))(?:[\s"()+\-!]|$)/Au' => Tokenizer::TOKEN_LOGICAL_OR, '/(?\()/Au' => Tokenizer::TOKEN_GROUP_BEGIN, '/(?(?(?.*?)(?:(? Tokenizer::TOKEN_TERM, '/(?(?(?:\\\\\\\\|\\\\ |\\\\\(|\\\\\)|\\\\"|[^"()\s])+?))(?:(? 
Tokenizer::TOKEN_TERM, ]; protected function getExpressionTypeMap() { return self::$expressionTypeMap; } protected function createTermToken($position, array $data) { $lexeme = $data['lexeme']; switch (true) { case isset($data['word']): return new Word( $lexeme, $position, '', // un-backslash special chars preg_replace('/(?:\\\\(\\\\|(["+\-!() ])))/', '$1', $data['word']) ); case isset($data['phrase']): $quote = $data['quote']; return new Phrase( $lexeme, $position, '', $quote, // un-backslash quote preg_replace('/(?:\\\\([' . $quote . ']))/', '$1', $data['phrase']) ); } throw new RuntimeException('Could not extract term token from the given data'); } protected function createGroupBeginToken($position, array $data) { return new GroupBegin($data['lexeme'], $position, $data['lexeme'], ''); } } ================================================ FILE: lib/Languages/Galach/TokenExtractor.php ================================================ getByteOffset($string, $position); foreach ($this->getExpressionTypeMap() as $expression => $type) { $success = preg_match($expression, $string, $matches, 0, $byteOffset); if (false === $success) { throw new RuntimeException('PCRE regex error code: ' . preg_last_error()); } if (0 === $success) { continue; } return $this->createToken($type, $position, $matches); } return new Token( Tokenizer::TOKEN_BAILOUT, mb_substr($string, $position, 1), $position ); } /** * Return a map of regular expressions to token types. * * The returned map must be an array where key is a regular expression * and value is a corresponding token type. Regular expression must define * named capturing group 'lexeme' that identifies part of the input string * recognized as token. * * @return array */ abstract protected function getExpressionTypeMap(); /** * Create a term type token by the given parameters. 
* * @throw \RuntimeException If token could not be created from the given $matches data * * @param int $position Position of the token in the input string * @param array $data Regex match data, depends on the matched term token * * @return \QueryTranslator\Values\Token */ abstract protected function createTermToken($position, array $data); /** * Create a token object from the given parameters. * * @param int $type Token type * @param int $position Position of the token in the input string * @param array $data Regex match data, depends on the type of the token * * @return \QueryTranslator\Values\Token */ private function createToken($type, $position, array $data) { if ($type === Tokenizer::TOKEN_GROUP_BEGIN) { return $this->createGroupBeginToken($position, $data); } if ($type === Tokenizer::TOKEN_TERM) { return $this->createTermToken($position, $data); } return new Token($type, $data['lexeme'], $position); } /** * Create an instance of Group token by the given parameters. * * @param $position * @param array $data * * @return \QueryTranslator\Values\Token */ protected function createGroupBeginToken($position, array $data) { return new GroupBegin($data['lexeme'], $position, $data['delimiter'], $data['domain']); } /** * Return the offset of the given $position in the input $string, in bytes. * * Offset in bytes is needed for preg_match $offset parameter. 
* * @param string $string * @param int $position * * @return int */ private function getByteOffset($string, $position) { return strlen(mb_substr($string, 0, $position)); } } ================================================ FILE: lib/Languages/Galach/Tokenizer.php ================================================ tokenExtractor = $tokenExtractor; } public function tokenize($string) { $length = mb_strlen($string); $position = 0; $tokens = []; while ($position < $length) { $token = $this->tokenExtractor->extract($string, $position); $position += mb_strlen($token->lexeme); $tokens[] = $token; } return new TokenSequence($tokens, $string); } } ================================================ FILE: lib/Languages/Galach/Values/Node/Group.php ================================================ nodes = $nodes; $this->tokenLeft = $tokenLeft; $this->tokenRight = $tokenRight; } public function getNodes() { return $this->nodes; } } ================================================ FILE: lib/Languages/Galach/Values/Node/LogicalAnd.php ================================================ leftOperand = $leftOperand; $this->rightOperand = $rightOperand; $this->token = $token; } public function getNodes() { return [ $this->leftOperand, $this->rightOperand, ]; } } ================================================ FILE: lib/Languages/Galach/Values/Node/LogicalNot.php ================================================ operand = $operand; $this->token = $token; } public function getNodes() { return [$this->operand]; } } ================================================ FILE: lib/Languages/Galach/Values/Node/LogicalOr.php ================================================ leftOperand = $leftOperand; $this->rightOperand = $rightOperand; $this->token = $token; } public function getNodes() { return [ $this->leftOperand, $this->rightOperand, ]; } } ================================================ FILE: lib/Languages/Galach/Values/Node/Mandatory.php ================================================ operand = 
$operand; $this->token = $token; } public function getNodes() { return [$this->operand]; } } ================================================ FILE: lib/Languages/Galach/Values/Node/Prohibited.php ================================================ operand = $operand; $this->token = $token; } public function getNodes() { return [$this->operand]; } } ================================================ FILE: lib/Languages/Galach/Values/Node/Query.php ================================================ nodes = $nodes; } public function getNodes() { return $this->nodes; } } ================================================ FILE: lib/Languages/Galach/Values/Node/Term.php ================================================ token = $token; } public function getNodes() { return []; } } ================================================ FILE: lib/Languages/Galach/Values/Token/GroupBegin.php ================================================ delimiter = $delimiter; $this->domain = $domain; parent::__construct(Tokenizer::TOKEN_GROUP_BEGIN, $lexeme, $position); } } ================================================ FILE: lib/Languages/Galach/Values/Token/Phrase.php ================================================ domain = $domain; $this->quote = $quote; $this->phrase = $phrase; parent::__construct(Tokenizer::TOKEN_TERM, $lexeme, $position); } } ================================================ FILE: lib/Languages/Galach/Values/Token/Tag.php ================================================ marker = $marker; $this->tag = $tag; parent::__construct(Tokenizer::TOKEN_TERM, $lexeme, $position); } } ================================================ FILE: lib/Languages/Galach/Values/Token/User.php ================================================ marker = $marker; $this->user = $user; parent::__construct(Tokenizer::TOKEN_TERM, $lexeme, $position); } } ================================================ FILE: lib/Languages/Galach/Values/Token/Word.php ================================================ domain = 
$domain; $this->word = $word; parent::__construct(Tokenizer::TOKEN_TERM, $lexeme, $position); } } ================================================ FILE: lib/Parsing.php ================================================ type = $type; $this->tokens = $tokens; } } ================================================ FILE: lib/Values/Node.php ================================================ rootNode = $rootNode; $this->tokenSequence = $tokenSequence; $this->corrections = $corrections; } } ================================================ FILE: lib/Values/Token.php ================================================ type = $type; $this->lexeme = $lexeme; $this->position = $position; } } ================================================ FILE: lib/Values/TokenSequence.php ================================================ tokens = $tokens; $this->source = $source; } } ================================================ FILE: phpunit.xml ================================================ ./tests ./lib ================================================ FILE: tests/Galach/Generators/AggregateVisitorDispatchTest.php ================================================ getMockBuilder(Node::class)->getMock(); $this->assertTrue((new Aggregate())->accept($nodeMock)); } public function testVisitThrowsException() { $this->expectException(RuntimeException::class); $this->expectExceptionMessage('No visitor available for Mock'); /** @var \QueryTranslator\Values\Node $nodeMock */ $nodeMock = $this->getMockBuilder(Node::class)->getMock(); (new Aggregate())->visit($nodeMock); } } ================================================ FILE: tests/Galach/Generators/ExtendedDisMaxTest.php ================================================ getGenerator(); $tokenSequence = $tokenizer->tokenize($string); $syntaxTree = $parser->parse($tokenSequence); $translatedString = $generator->generate($syntaxTree); $this->assertEquals($expectedTranslatedString, $translatedString); } /** * @return 
\QueryTranslator\Languages\Galach\Generators\ExtendedDisMax */ protected function getGenerator() { $visitors = []; $visitors[] = new Generators\Lucene\Common\Prohibited(); $visitors[] = new Generators\Lucene\Common\Group( [ self::FIELD_TEXT_DOMAIN => self::FIELD_TEXT_DOMAIN_MAPPED, ], self::FIELD_TEXT_DEFAULT ); $visitors[] = new Generators\Lucene\Common\Mandatory(); $visitors[] = new Generators\Lucene\Common\LogicalAnd(); $visitors[] = new Generators\Lucene\Common\LogicalNot(); $visitors[] = new Generators\Lucene\Common\LogicalOr(); $visitors[] = new Generators\Lucene\Common\Phrase( [ self::FIELD_TEXT_DOMAIN => self::FIELD_TEXT_DOMAIN_MAPPED, ], self::FIELD_TEXT_DEFAULT ); $visitors[] = new Generators\Lucene\Common\Query(); $visitors[] = new Generators\Lucene\Common\Tag(self::FIELD_TAG); $visitors[] = new Generators\Lucene\Common\User(self::FIELD_USER); $visitors[] = new Generators\Lucene\ExtendedDisMax\Word( [ self::FIELD_TEXT_DOMAIN => self::FIELD_TEXT_DOMAIN_MAPPED, ], self::FIELD_TEXT_DEFAULT ); $aggregate = new Generators\Common\Aggregate($visitors); return new Generators\ExtendedDisMax($aggregate); } } ================================================ FILE: tests/Galach/Generators/LuceneVisitorDispatchTest.php ================================================ getMockBuilder(Node::class)->getMock(); return [ [ new Group(), $nodeMock, 'Implementation accepts instance of Group Node', ], [ new LogicalAnd(), $nodeMock, 'Implementation accepts instance of LogicalAnd Node', ], [ new LogicalNot(), $nodeMock, 'Implementation accepts instance of LogicalNot Node', ], [ new LogicalOr(), $nodeMock, 'Implementation accepts instance of LogicalOr Node', ], [ new Mandatory(), $nodeMock, 'Implementation accepts instance of Mandatory Node', ], [ new Phrase(), $nodeMock, 'Implementation accepts instance of Term Node', ], [ new Prohibited(), $nodeMock, 'Implementation accepts instance of Prohibited Node', ], [ new Query(), $nodeMock, 'Implementation accepts instance of Query 
Node', ], [ new Tag(), $nodeMock, 'Implementation accepts instance of Term Node', ], [ new User(), $nodeMock, 'Implementation accepts instance of Term Node', ], [ new ExtendedDisMaxWord(), $nodeMock, 'Implementation accepts instance of Term Node', ], [ new QueryStringWord(), $nodeMock, 'Implementation accepts instance of Term Node', ], ]; } /** * @dataProvider providerForTestVisitThrowsLogicExceptionNode * * @param \QueryTranslator\Languages\Galach\Generators\Common\Visitor $visitor * @param \QueryTranslator\Values\Node $node * @param string $expectedExceptionMessage */ public function testVisitThrowsLogicExceptionNode(Visitor $visitor, Node $node, $expectedExceptionMessage) { $this->expectException(LogicException::class); try { $visitor->visit($node); } catch (LogicException $e) { $this->assertSame($expectedExceptionMessage, $e->getMessage()); throw $e; } } public function providerForTestVisitThrowsLogicExceptionToken() { /** @var \QueryTranslator\Values\Token $tokenMock */ $tokenMock = $this->getMockBuilder(Token::class)->disableOriginalConstructor()->getMock(); $node = new Term($tokenMock); return [ [ new Phrase(), $node, 'Implementation accepts instance of Phrase Token', ], [ new Tag(), $node, 'Implementation accepts instance of Tag Token', ], [ new User(), $node, 'Implementation accepts instance of User Token', ], [ new ExtendedDisMaxWord(), $node, 'Implementation accepts instance of Word Token', ], [ new QueryStringWord(), $node, 'Implementation accepts instance of Word Token', ], ]; } /** * @dataProvider providerForTestVisitThrowsLogicExceptionToken * * @param \QueryTranslator\Languages\Galach\Generators\Common\Visitor $visitor * @param \QueryTranslator\Values\Node $node * @param string $expectedExceptionMessage */ public function testVisitThrowsLogicExceptionToken(Visitor $visitor, Node $node, $expectedExceptionMessage) { $this->expectException(LogicException::class); try { $visitor->visit($node); } catch (LogicException $e) { 
$this->assertSame($expectedExceptionMessage, $e->getMessage()); throw $e; } } public function providerForTestVisitThrowsLogicExceptionSubVisitor() { return [ [ new Group(), new GroupNode(), ], [ new LogicalAnd(), new LogicalAndNode(), ], [ new LogicalNot(), new LogicalNotNode(), ], [ new LogicalOr(), new LogicalOrNode(), ], [ new Mandatory(), new MandatoryNode(), ], [ new Prohibited(), new ProhibitedNode(), ], [ new Query(), new QueryNode([]), ], ]; } /** * @dataProvider providerForTestVisitThrowsLogicExceptionSubVisitor * * @param \QueryTranslator\Languages\Galach\Generators\Common\Visitor $visitor * @param \QueryTranslator\Values\Node $node */ public function testVisitThrowsLogicExceptionSubVisitor(Visitor $visitor, Node $node) { $this->expectException(LogicException::class); $this->expectExceptionMessage("Implementation requires sub-visitor"); $visitor->visit($node); } } ================================================ FILE: tests/Galach/Generators/NativeVisitorDispatchTest.php ================================================ getMockBuilder(Node::class)->getMock(); return [ [ new BinaryOperator(), $nodeMock, 'Implementation accepts instance of LogicalAnd or LogicalOr Node', ], [ new Group(), $nodeMock, 'Implementation accepts instance of Group Node', ], [ new Phrase(), $nodeMock, 'Implementation accepts instance of Term Node', ], [ new Query(), $nodeMock, 'Implementation accepts instance of Query Node', ], [ new Tag(), $nodeMock, 'Implementation accepts instance of Term Node', ], [ new UnaryOperator(), $nodeMock, 'Implementation accepts instance of Mandatory, Prohibited or LogicalNot Node', ], [ new User(), $nodeMock, 'Implementation accepts instance of Term Node', ], [ new Word(), $nodeMock, 'Implementation accepts instance of Term Node', ], ]; } /** * @dataProvider providerForTestVisitThrowsLogicExceptionNode * * @param \QueryTranslator\Languages\Galach\Generators\Common\Visitor $visitor * @param \QueryTranslator\Values\Node $node * @param string 
$expectedExceptionMessage */ public function testVisitThrowsLogicExceptionNode(Visitor $visitor, Node $node, $expectedExceptionMessage) { $this->expectException(LogicException::class); try { $visitor->visit($node); } catch (LogicException $e) { $this->assertSame($expectedExceptionMessage, $e->getMessage()); throw $e; } } public function providerForTestVisitThrowsLogicExceptionToken() { /** @var \QueryTranslator\Values\Token $tokenMock */ $tokenMock = $this->getMockBuilder(Token::class)->disableOriginalConstructor()->getMock(); $node = new Term($tokenMock); return [ [ new Phrase(), $node, 'Implementation accepts instance of Phrase Token', ], [ new Tag(), $node, 'Implementation accepts instance of Tag Token', ], [ new User(), $node, 'Implementation accepts instance of User Token', ], [ new Word(), $node, 'Implementation accepts instance of Word Token', ], ]; } /** * @dataProvider providerForTestVisitThrowsLogicExceptionToken * * @param \QueryTranslator\Languages\Galach\Generators\Common\Visitor $visitor * @param \QueryTranslator\Values\Node $node * @param string $expectedExceptionMessage */ public function testVisitThrowsLogicExceptionToken(Visitor $visitor, Node $node, $expectedExceptionMessage) { $this->expectException(LogicException::class); try { $visitor->visit($node); } catch (LogicException $e) { $this->assertSame($expectedExceptionMessage, $e->getMessage()); throw $e; } } public function providerForTestVisitThrowsLogicExceptionSubVisitor() { return [ [ new BinaryOperator(), new LogicalAndNode(), ], [ new BinaryOperator(), new LogicalOrNode(), ], [ new Group(), new GroupNode(), ], [ new Query(), new QueryNode([]), ], [ new UnaryOperator(), new LogicalNotNode(), ], [ new UnaryOperator(), new MandatoryNode(), ], [ new UnaryOperator(), new ProhibitedNode(), ], ]; } /** * @dataProvider providerForTestVisitThrowsLogicExceptionSubVisitor * * @param \QueryTranslator\Languages\Galach\Generators\Common\Visitor $visitor * @param \QueryTranslator\Values\Node $node */ 
public function testVisitThrowsLogicExceptionSubVisitor(Visitor $visitor, Node $node) { $this->expectException(LogicException::class); $this->expectExceptionMessage("Implementation requires sub-visitor"); $visitor->visit($node); } } ================================================ FILE: tests/Galach/Generators/QueryStringTest.php ================================================ ', '\\\\\\>', ], [ '\\<', '\\\\\\<', ], ] ); } /** * @return \QueryTranslator\Languages\Galach\Generators\QueryString */ protected function getGenerator() { $visitors = []; $visitors[] = new Generators\Lucene\Common\Prohibited(); $visitors[] = new Generators\Lucene\Common\Group( [ self::FIELD_TEXT_DOMAIN => self::FIELD_TEXT_DOMAIN_MAPPED, ], self::FIELD_TEXT_DEFAULT ); $visitors[] = new Generators\Lucene\Common\Mandatory(); $visitors[] = new Generators\Lucene\Common\LogicalAnd(); $visitors[] = new Generators\Lucene\Common\LogicalNot(); $visitors[] = new Generators\Lucene\Common\LogicalOr(); $visitors[] = new Generators\Lucene\Common\Phrase( [ self::FIELD_TEXT_DOMAIN => self::FIELD_TEXT_DOMAIN_MAPPED, ], self::FIELD_TEXT_DEFAULT ); $visitors[] = new Generators\Lucene\Common\Query(); $visitors[] = new Generators\Lucene\Common\Tag(self::FIELD_TAG); $visitors[] = new Generators\Lucene\Common\User(self::FIELD_USER); $visitors[] = new Generators\Lucene\QueryString\Word( [ self::FIELD_TEXT_DOMAIN => self::FIELD_TEXT_DOMAIN_MAPPED, ], self::FIELD_TEXT_DEFAULT ); $aggregate = new Generators\Common\Aggregate($visitors); return new QueryString($aggregate); } } ================================================ FILE: tests/Galach/IntegrationTest.php ================================================ doTestQuery($string, $string, $expectedTokens, $expectedTree, []); } /** * @dataProvider providerForTestQueryCorrected * * @param string $string * @param string $correctedString * @param \QueryTranslator\Values\Token[] $expectedTokens * @param \QueryTranslator\Languages\Galach\Values\Node\Query $query * @param 
\QueryTranslator\Values\Correction[] $corrections */ public function testQueryCorrected($string, $correctedString, $expectedTokens, $query, $corrections) { $this->doTestQuery($string, $correctedString, $expectedTokens, $query, $corrections); } /** * @param string $string * @param string $expectedCorrectedString * @param \QueryTranslator\Values\Token[] $expectedTokens * @param \QueryTranslator\Languages\Galach\Values\Node\Query $query * @param \QueryTranslator\Values\Correction[] $corrections */ protected function doTestQuery($string, $expectedCorrectedString, $expectedTokens, $query, $corrections) { $tokenExtractor = new TokenExtractor\Full(); $tokenizer = new Tokenizer($tokenExtractor); $parser = new Parser(); $generator = $this->getNativeGenerator(); $tokenSequence = $tokenizer->tokenize($string); $this->assertInstanceOf(TokenSequence::class, $tokenSequence); $syntaxTree = $parser->parse($tokenSequence); $this->assertInstanceOf(SyntaxTree::class, $syntaxTree); $correctedString = $generator->generate($syntaxTree); $tokensWithoutWhitespace = []; foreach ($tokenSequence->tokens as $token) { if ($token->type !== Tokenizer::TOKEN_WHITESPACE) { $tokensWithoutWhitespace[] = $token; } } $this->assertEquals($expectedCorrectedString, $correctedString); $this->assertEquals($expectedTokens, $tokensWithoutWhitespace); $this->assertEquals($query, $syntaxTree->rootNode); $this->assertEquals($corrections, $syntaxTree->corrections); $this->assertEquals($tokenSequence, $syntaxTree->tokenSequence); } /** * @return \QueryTranslator\Languages\Galach\Generators\Native */ protected function getNativeGenerator() { $visitors = []; $visitors[] = new Generators\Native\Group(); $visitors[] = new Generators\Native\BinaryOperator(); $visitors[] = new Generators\Native\Phrase(); $visitors[] = new Generators\Native\Query(); $visitors[] = new Generators\Native\Tag(); $visitors[] = new Generators\Native\UnaryOperator(); $visitors[] = new Generators\Native\User(); $visitors[] = new 
Generators\Native\Word(); $aggregate = new Generators\Common\Aggregate($visitors); return new Generators\Native($aggregate); } } ================================================ FILE: tests/Galach/Tokenizer/FullTokenizerTest.php ================================================ getTokenExtractor(); $tokenizer = new Tokenizer($tokenExtractor); $tokenSequence = $tokenizer->tokenize($string); $this->assertInstanceOf(TokenSequence::class, $tokenSequence); $this->assertEquals($expectedTokens, $tokenSequence->tokens); $this->assertEquals($string, $tokenSequence->source); } public function providerForTestTokenizeNotRecognized() { return [ [ ( $blah = mb_convert_encoding( '👩‍👩‍👧‍👧', 'UTF-8', 'HTML-ENTITIES' ) ) . '"', [ new WordToken($blah, 0, '', $blah), new Token(Tokenizer::TOKEN_BAILOUT, '"', 7), ], ], [ '"' . $blah, [ new Token(Tokenizer::TOKEN_BAILOUT, '"', 0), new WordToken($blah, 1, '', $blah), ], ], [ 'word"', [ new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_BAILOUT, '"', 4), ], ], [ 'one"two', [ new WordToken('one', 0, '', 'one'), new Token(Tokenizer::TOKEN_BAILOUT, '"', 3), new WordToken('two', 4, '', 'two'), ], ], [ 'šđ"čćž', [ new WordToken('šđ', 0, '', 'šđ'), new Token(Tokenizer::TOKEN_BAILOUT, '"', 2), new WordToken('čćž', 3, '', 'čćž'), ], ], [ 'AND"', [ new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 3), ], ], [ 'OR"', [ new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 2), ], ], [ 'NOT"', [ new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 3), ], ], ]; } /** * @dataProvider providerForTestTokenizeNotRecognized * * @param string $string * @param \QueryTranslator\Values\Token[] $expectedTokens */ public function testTokenizeNotRecognized($string, array $expectedTokens) { $tokenExtractor = $this->getTokenExtractor(); $tokenizer = new Tokenizer($tokenExtractor); $tokenSequence = $tokenizer->tokenize($string); 
$this->assertInstanceOf(TokenSequence::class, $tokenSequence); $this->assertEquals($expectedTokens, $tokenSequence->tokens); $this->assertEquals($string, $tokenSequence->source); } /** * @return \QueryTranslator\Languages\Galach\TokenExtractor */ protected function getTokenExtractor() { return new TokenExtractor\Full(); } } ================================================ FILE: tests/Galach/Tokenizer/TextTokenizerTest.php ================================================ getExpectedFixtureWithOverride($string, $expectedTokens); parent::testTokenize($string, $expectedTokens); } /** * @param string $string * @param array $expectedTokens * * @return \QueryTranslator\Values\Token[] */ protected function getExpectedFixtureWithOverride($string, array $expectedTokens) { $this->setFixtureOverride(); if (isset(self::$fixtureOverride[$string])) { return self::$fixtureOverride[$string]; } return $expectedTokens; } protected function setFixtureOverride() { if (self::$fixtureOverride === null) { self::$fixtureOverride = [ '#tag' => [ new WordToken('#tag', 0, '', '#tag'), ], '\#tag' => [ new WordToken('\#tag', 0, '', '\#tag'), ], '#_tag-tag' => [ new WordToken('#_tag-tag', 0, '', '#_tag-tag'), ], '#tag+' => [ new WordToken('#tag+', 0, '', '#tag+'), ], '#tag-' => [ new WordToken('#tag-', 0, '', '#tag-'), ], '#tag!' 
=> [ new WordToken('#tag!', 0, '', '#tag!'), ], "#tag\n" => [ new WordToken('#tag', 0, '', '#tag'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 4), ], '#tag ' => [ new WordToken('#tag', 0, '', '#tag'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 4), ], '#tag(' => [ new WordToken('#tag', 0, '', '#tag'), new GroupBeginToken('(', 4, '(', null), ], '#tag)' => [ new WordToken('#tag', 0, '', '#tag'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 4), ], '@user' => [ new WordToken('@user', 0, '', '@user'), ], '@user.user' => [ new WordToken('@user.user', 0, '', '@user.user'), ], '\@user' => [ new WordToken('\@user', 0, '', '\@user'), ], '@_user-user' => [ new WordToken('@_user-user', 0, '', '@_user-user'), ], '@user+' => [ new WordToken('@user+', 0, '', '@user+'), ], '@user-' => [ new WordToken('@user-', 0, '', '@user-'), ], '@user!' => [ new WordToken('@user!', 0, '', '@user!'), ], "@user\n" => [ new WordToken('@user', 0, '', '@user'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 5), ], '@user ' => [ new WordToken('@user', 0, '', '@user'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5), ], '@user(' => [ new WordToken('@user', 0, '', '@user'), new GroupBeginToken('(', 5, '(', null), ], '@user)' => [ new WordToken('@user', 0, '', '@user'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 5), ], 'domain:domain:' => [ new WordToken('domain:domain:', 0, '', 'domain:domain:'), ], 'some.domain:some.domain:' => [ new WordToken('some.domain:some.domain:', 0, '', 'some.domain:some.domain:'), ], 'domain:domain:domain:domain' => [ new WordToken('domain:domain:domain:domain', 0, '', 'domain:domain:domain:domain'), ], 'domain\:' => [ new WordToken('domain\:', 0, '', 'domain\:'), ], 'domain\::' => [ new WordToken('domain\::', 0, '', 'domain\::'), ], 'domain:word' => [ new WordToken('domain:word', 0, '', 'domain:word'), ], 'domain\:word' => [ new WordToken('domain\:word', 0, '', 'domain\:word'), ], 'domain:"phrase"' => [ new WordToken('domain:', 0, '', 'domain:'), new PhraseToken('"phrase"', 7, 
'', '"', 'phrase'), ], 'some.domain:"phrase"' => [ new WordToken('some.domain:', 0, '', 'some.domain:'), new PhraseToken('"phrase"', 12, '', '"', 'phrase'), ], 'domain\:"phrase"' => [ new WordToken('domain\:', 0, '', 'domain\:'), new PhraseToken('"phrase"', 8, '', '"', 'phrase'), ], 'domain:(one)' => [ new WordToken('domain:', 0, '', 'domain:'), new GroupBeginToken('(', 7, '(', ''), new WordToken('one', 8, '', 'one'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 11), ], 'some.domain:(one)' => [ new WordToken('some.domain:', 0, '', 'some.domain:'), new GroupBeginToken('(', 12, '(', ''), new WordToken('one', 13, '', 'one'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 16), ], ]; } } /** * @return \QueryTranslator\Languages\Galach\TokenExtractor */ protected function getTokenExtractor() { return new TokenExtractor\Text(); } } ================================================ FILE: tests/Galach/Tokenizer/TokenExtractorTest.php ================================================ expectException(RuntimeException::class); $this->expectExceptionMessage('PCRE regex error code: 2'); /** @var \QueryTranslator\Languages\Galach\TokenExtractor|\PHPUnit_Framework_MockObject_MockObject $extractor */ $extractor = $this->getMockBuilder(TokenExtractor::class) ->setMethods(['getExpressionTypeMap']) ->getMockForAbstractClass(); $extractor->expects($this->once()) ->method('getExpressionTypeMap') ->willReturn( [ '/(?:\D+|<\d+>)*[!?]/' => Tokenizer::TOKEN_WHITESPACE, ] ); $extractor->extract('foobar foobar foobar', 0); } public function testFullExtractTermTokenThrowsException() { $this->expectException(RuntimeException::class); $this->expectExceptionMessage('Could not extract term token from the given data'); $extractor = new Full(); $reflectedClass = new \ReflectionClass($extractor); $reflectedProperty = $reflectedClass->getProperty('expressionTypeMap'); $reflectedProperty->setAccessible(true); $reflectedProperty->setValue( null, [ '/(?foobar)/' => Tokenizer::TOKEN_TERM, ] ); 
$extractor->extract('foobar', 0); } public function testTextExtractTermTokenThrowsException() { $this->expectException(RuntimeException::class); $this->expectExceptionMessage('Could not extract term token from the given data'); $extractor = new Text(); $reflectedClass = new \ReflectionClass($extractor); $reflectedProperty = $reflectedClass->getProperty('expressionTypeMap'); $reflectedProperty->setAccessible(true); $reflectedProperty->setValue( null, [ '/(?foobar)/' => Tokenizer::TOKEN_TERM, ] ); $extractor->extract('foobar', 0); } } ================================================ FILE: tests/Galach/Values/NodeTraversalTest.php ================================================ getMockForAbstractClass(Node::class); $secondMember = $this->getMockForAbstractClass(Node::class); $nodes = (new Group([$firstMember, $secondMember]))->getNodes(); $this->assertSame($firstMember, $nodes[0]); $this->assertSame($secondMember, $nodes[1]); } public function testLogicalAndNode() { $leftOperand = $this->getMockForAbstractClass(Node::class); $rightOperand = $this->getMockForAbstractClass(Node::class); $nodes = (new LogicalAnd($leftOperand, $rightOperand))->getNodes(); $this->assertSame($leftOperand, $nodes[0]); $this->assertSame($rightOperand, $nodes[1]); } public function testLogicalNotNode() { $operand = $this->getMockForAbstractClass(Node::class); $nodes = (new LogicalNot($operand))->getNodes(); $this->assertSame($operand, $nodes[0]); } public function testLogicalOrNode() { $leftOperand = $this->getMockForAbstractClass(Node::class); $rightOperand = $this->getMockForAbstractClass(Node::class); $nodes = (new LogicalOr($leftOperand, $rightOperand))->getNodes(); $this->assertSame($leftOperand, $nodes[0]); $this->assertSame($rightOperand, $nodes[1]); } public function testMandatoryNode() { $operand = $this->getMockForAbstractClass(Node::class); $nodes = (new Mandatory($operand))->getNodes(); $this->assertSame($operand, $nodes[0]); } public function testProhibitedNode() { $operand = 
$this->getMockForAbstractClass(Node::class); $nodes = (new Prohibited($operand))->getNodes(); $this->assertSame($operand, $nodes[0]); } public function testQueryNode() { $firstMember = $this->getMockForAbstractClass(Node::class); $secondMember = $this->getMockForAbstractClass(Node::class); $nodes = (new Query([$firstMember, $secondMember]))->getNodes(); $this->assertSame($firstMember, $nodes[0]); $this->assertSame($secondMember, $nodes[1]); } public function testTermNode() { /** @var \QueryTranslator\Values\Token $token */ $token = $this->getMockBuilder(Token::class)->disableOriginalConstructor()->getMock(); $nodes = (new Term($token))->getNodes(); $this->assertEmpty($nodes); } } ================================================ FILE: tests/bootstrap.php ================================================