From 2eb5a432d2229788ce2fdb09f36c6f4bebdea813 Mon Sep 17 00:00:00 2001
From: Laria Carolin Chabowski
Date: Fri, 7 Feb 2020 09:44:59 +0100
Subject: Initial commit

---
Review notes (not part of the commit message):
 * This copy is damaged: everything between the hunk header and "valid())"
   (the start of the file up to the middle of getItemAndAdvance()) is missing
   and has been left exactly as found.
 * parse(): a parenthesised group without an expression now raises ParseError
   instead of failing the SearchExpr type check of the $putExpr closure.
 * tokenize_tag(): the end-of-input check after a backslash now tests $next
   instead of $c, so a trailing backslash is kept as a literal character.

 src/Search/Parser.php | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 src/Search/Parser.php

(limited to 'src/Search/Parser.php')

diff --git a/src/Search/Parser.php b/src/Search/Parser.php
new file mode 100644
index 0000000..a8efdfd
--- /dev/null
+++ b/src/Search/Parser.php
@@ -0,0 +1,295 @@
+valid())
+            return null;
+        $out = $input->current();
+        $input->next();
+        return $out;
+    }
+
+    /**
+     * Tokenizes the input outside of quoted strings and tags.
+     *
+     * Yields [token type, data] pairs:
+     *  - a run of characters ended by space, tab, '"', '(', ')', '#' or end of
+     *    input becomes TOK_WORD, unless it is exactly "and", "or" or "not"
+     *    (case-sensitive), in which case it becomes TOK_OP;
+     *  - a non-empty run followed by ':' becomes TOK_PROP;
+     *  - '(' and ')' become TOK_PAROPEN / TOK_PARCLOSE with null as data;
+     *  - '"' and '#' hand over to tokenize_string() / tokenize_tag().
+     *
+     * A backslash adds the following character to the current run verbatim.
+     * A null result from CharSource::getNext() is treated as end of input.
+     *
+     * @param CharSource $input
+     * @return Iterator
+     * @throws ParseError Propagated from tokenize_string() for an unclosed string.
+     */
+    private static function tokenize_normal(CharSource $input): Iterator
+    {
+        $buf = "";
+
+        // Emits the buffered run (if any) as TOK_OP or TOK_WORD and then
+        // empties the buffer. It is a generator, so it must be drained with
+        // "yield from" for the buffer reset to happen.
+        $yieldBufAndClear = function () use (&$buf) {
+            if ($buf !== "") {
+                switch ($buf) {
+                    case "and":
+                    case "or":
+                    case "not":
+                        yield [self::TOK_OP, $buf];
+                        break;
+                    default:
+                        yield [self::TOK_WORD, $buf];
+                }
+            }
+            $buf = "";
+        };
+
+        for (;;) {
+            $c = $input->getNext();
+            if ($c === null) {
+                break;
+            }
+
+            switch ($c) {
+                case '\\':
+                    $next = $input->getNext();
+                    if ($next === null) {
+                        // Backslash at the very end: keep it as a literal
+                        // character and leave the read loop.
+                        $buf .= $c;
+                        break 2;
+                    }
+                    $buf .= $next;
+                    break;
+
+                case ' ':
+                case "\t":
+                    yield from $yieldBufAndClear();
+                    break;
+
+                case '"':
+                    yield from $yieldBufAndClear();
+                    yield from self::tokenize_string($input);
+                    break;
+
+                case ':':
+                    // A ':' with nothing buffered in front of it is dropped.
+                    if ($buf !== "") {
+                        yield [self::TOK_PROP, $buf];
+                        $buf = "";
+                    }
+                    break;
+
+                case '(':
+                    yield from $yieldBufAndClear();
+                    yield [self::TOK_PAROPEN, null];
+                    break;
+
+                case ')':
+                    yield from $yieldBufAndClear();
+                    yield [self::TOK_PARCLOSE, null];
+                    break;
+
+                case '#':
+                    yield from $yieldBufAndClear();
+                    yield from self::tokenize_tag($input);
+                    break;
+
+                default:
+                    $buf .= $c;
+            }
+        }
+
+        // Flush whatever is still buffered at end of input.
+        yield from $yieldBufAndClear();
+        return;
+    }
+
+    /**
+     * Parses a search query into an expression tree.
+     *
+     * Expressions that follow each other without an explicit "and" / "or" are
+     * combined with LogicOp::OP_AND. "not" toggles negation of the next
+     * expression. Parentheses group a sub-expression; the state of the
+     * enclosing level is kept on a stack while the group is parsed.
+     *
+     * @param string $input
+     * @return SearchExpr|null Null if the input produced no expression at all.
+     * @throws ParseError On two logic operators in a row, on property tokens
+     *                    (not supported yet), on unbalanced parentheses and on
+     *                    a parenthesised group that contains no expression.
+     */
+    public static function parse(string $input): ?SearchExpr
+    {
+        $tokens = self::tokenize($input);
+
+        $stack = [];
+        $cur = null;
+        $binOp = null;
+        $negated = false;
+
+        // Merges $expr into the expression of the current nesting level,
+        // applying a pending negation and the pending (or default) operator.
+        $putExpr = function (SearchExpr $expr) use (&$cur, &$binOp, &$negated) {
+            if ($negated) {
+                $expr = new NotOp($expr);
+            }
+
+            $cur = $cur === null
+                ? $expr
+                : LogicOp::build($binOp ?? LogicOp::OP_AND, $cur, $expr);
+
+            $binOp = null;
+            $negated = false;
+        };
+
+        // Remembers the operator for the next expression; two operators in a
+        // row are rejected.
+        $setBinOp = function ($op) use (&$binOp) {
+            if ($binOp !== null)
+                throw new ParseError("Unexpected logic operator $op");
+
+            $binOp = $op;
+        };
+
+        for (;;) {
+            $token = self::getItemAndAdvance($tokens);
+            if ($token === null)
+                break;
+
+            [$ttyp, $tdata] = $token;
+
+            switch ($ttyp) {
+
+                case self::TOK_TAG:
+                    $putExpr(new TagExpr($tdata));
+                    break;
+                case self::TOK_OP:
+                    switch ($tdata) {
+                        case "and":
+                            $setBinOp(LogicOp::OP_AND);
+                            break;
+                        case "or":
+                            $setBinOp(LogicOp::OP_OR);
+                            break;
+                        case "not":
+                            $negated = !$negated;
+                            break;
+                        default:
+                            throw new \DomainException("Unexpected data for TOK_OP: $tdata");
+                    }
+                    break;
+                case self::TOK_WORD:
+                    $putExpr(new FTSExpr($tdata));
+                    break;
+                case self::TOK_PROP:
+                    // TODO(laria): Implement this
+                    throw new ParseError("Not yet supported");
+                case self::TOK_PAROPEN:
+                    // Save the enclosing level and start the group with a
+                    // clean state ($negated stays a boolean).
+                    $stack[] = [$cur, $binOp, $negated];
+                    $cur = null;
+                    $binOp = null;
+                    $negated = false;
+                    break;
+                case self::TOK_PARCLOSE:
+                    if (empty($stack))
+                        throw new ParseError("Unexpected closing parenthesis");
+
+                    // A group such as "()" or "(not)" yields no expression.
+                    // Report it as a parse error rather than passing null to
+                    // $putExpr, which only accepts a SearchExpr.
+                    if ($cur === null)
+                        throw new ParseError("Expected an expression before closing parenthesis");
+
+                    $parContent = $cur;
+                    [$cur, $binOp, $negated] = array_pop($stack);
+                    $putExpr($parContent);
+                    break;
+            }
+        }
+
+        if (!empty($stack))
+            throw new ParseError("Unclosed parenthesis");
+
+        return $cur;
+    }
+
+    /**
+     * Tokenizes the remainder of a double-quoted string.
+     *
+     * Called after the opening '"' has been consumed. Reads up to the closing
+     * '"' and yields the collected text as a single TOK_WORD, even if it is
+     * empty. A backslash adds the following character verbatim, so '\"' does
+     * not end the string.
+     *
+     * @param CharSource $input
+     * @return Generator
+     * @throws ParseError If the input ends before the closing quote.
+     */
+    private static function tokenize_string(CharSource $input): Generator
+    {
+        $content = "";
+        for (;;) {
+            $c = $input->getNext();
+            if ($c === null)
+                throw new ParseError("Unclosed string encountered");
+
+            switch ($c) {
+                case '\\':
+                    $next = $input->getNext();
+                    if ($next === null)
+                        throw new ParseError("Unclosed string encountered");
+
+                    $content .= $next;
+                    break;
+
+                case '"':
+                    yield [self::TOK_WORD, $content];
+                    return;
+
+                default:
+                    $content .= $c;
+            }
+        }
+    }
+
+    /**
+     * Tokenizes a tag; called after the leading '#' has been consumed.
+     *
+     * Collects characters up to end of input, a space, a tab, or one of
+     * '(', ')', '#'. The latter three are handed back via CharSource::unget()
+     * (presumably so that the caller reads them again -- confirm against
+     * CharSource). Yields TOK_TAG with the collected name, or TOK_WORD "#" if
+     * no name was collected. A backslash adds the following character
+     * verbatim; a backslash at the very end of the input is kept as a literal
+     * backslash, as in tokenize_normal().
+     *
+     * @param CharSource $input
+     * @return Iterator
+     */
+    private static function tokenize_tag(CharSource $input): Iterator
+    {
+        $tag = "";
+
+        // Emits the tag collected so far, or a plain "#" word if it is empty.
+        $yieldTag = function () use (&$tag) {
+            if ($tag === "")
+                yield [self::TOK_WORD, "#"];
+            else
+                yield [self::TOK_TAG, $tag];
+        };
+
+        for (;;) {
+            $c = $input->getNext();
+            if ($c === null) {
+                yield from $yieldTag();
+                return;
+            }
+
+            switch ($c) {
+                case '\\':
+                    $next = $input->getNext();
+                    // Test the character just read. $c is the backslash
+                    // itself and can never be null at this point.
+                    if ($next === null) {
+                        $tag .= '\\';
+                        yield [self::TOK_TAG, $tag];
+                        return;
+                    }
+                    $tag .= $next;
+                    break;
+
+                case ' ':
+                case "\t":
+                    yield from $yieldTag();
+                    return;
+
+                case '(':
+                case ')':
+                case '#':
+                    $input->unget();
+                    yield from $yieldTag();
+                    return;
+
+                default:
+                    $tag .= $c;
+            }
+        }
+    }
+}
\ No newline at end of file
--
cgit v1.2.3-70-g09d2