From 2eb5a432d2229788ce2fdb09f36c6f4bebdea813 Mon Sep 17 00:00:00 2001 From: Laria Carolin Chabowski Date: Fri, 7 Feb 2020 09:44:59 +0100 Subject: Initial commit --- src/Search/AbstractFTSExpr.php | 31 +++++ src/Search/CharSource.php | 33 +++++ src/Search/FTSExpr.php | 30 +++++ src/Search/FTSLogicOp.php | 46 +++++++ src/Search/FTSNotExpr.php | 29 ++++ src/Search/LogicOp.php | 78 +++++++++++ src/Search/NotOp.php | 32 +++++ src/Search/Pagination.php | 14 ++ src/Search/ParseError.php | 9 ++ src/Search/Parser.php | 295 +++++++++++++++++++++++++++++++++++++++++ src/Search/SQLSearchExpr.php | 11 ++ src/Search/SearchExpr.php | 14 ++ src/Search/SearchResult.php | 160 ++++++++++++++++++++++ src/Search/TagExpr.php | 42 ++++++ src/Search/TrueExpr.php | 25 ++++ 15 files changed, 849 insertions(+) create mode 100644 src/Search/AbstractFTSExpr.php create mode 100644 src/Search/CharSource.php create mode 100644 src/Search/FTSExpr.php create mode 100644 src/Search/FTSLogicOp.php create mode 100644 src/Search/FTSNotExpr.php create mode 100644 src/Search/LogicOp.php create mode 100644 src/Search/NotOp.php create mode 100644 src/Search/Pagination.php create mode 100644 src/Search/ParseError.php create mode 100644 src/Search/Parser.php create mode 100644 src/Search/SQLSearchExpr.php create mode 100644 src/Search/SearchExpr.php create mode 100644 src/Search/SearchResult.php create mode 100644 src/Search/TagExpr.php create mode 100644 src/Search/TrueExpr.php (limited to 'src/Search') diff --git a/src/Search/AbstractFTSExpr.php b/src/Search/AbstractFTSExpr.php new file mode 100644 index 0000000..b72b1b6 --- /dev/null +++ b/src/Search/AbstractFTSExpr.php @@ -0,0 +1,31 @@ +sql = $singleFTS + ? "nc.note_contents MATCH :{$bindPrefix}match" + : "n.content_row IN ( + SELECT rowid + FROM note_contents + WHERE note_contents MATCH :{$bindPrefix}match + )"; + $sqlex->bindings["{$bindPrefix}match"] = $this->fts4Query(); + + return $sqlex; + } + + public function countFTSQueries(): int + { + return 1; + } +} \ No newline at end of file diff --git a/src/Search/CharSource.php b/src/Search/CharSource.php new file mode 100644 index 0000000..165e538 --- /dev/null +++ b/src/Search/CharSource.php @@ -0,0 +1,33 @@ +s = $s; + $this->len = mb_strlen($s); + } + + public function getNext(): ?string + { + if ($this->i >= $this->len) + return null; + + $c = mb_substr($this->s, $this->i, 1); + $this->i++; + return $c; + } + + public function unget(): void + { + $this->i = max(0, $this->i - 1); + } +} \ No newline at end of file diff --git a/src/Search/FTSExpr.php b/src/Search/FTSExpr.php new file mode 100644 index 0000000..1123cf3 --- /dev/null +++ b/src/Search/FTSExpr.php @@ -0,0 +1,30 @@ +term = $term; + } + + public function getTerm(): string + { + return $this->term; + } + + protected function fts4Query(): string + { + return '"' . str_replace('"', '""', $this->term) . '"'; + } + + public function toString(): string + { + return '"' . preg_replace_callback('/(["\\\\])/', fn($s) => "\\$s", $this->term) . '"'; + } +} \ No newline at end of file diff --git a/src/Search/FTSLogicOp.php b/src/Search/FTSLogicOp.php new file mode 100644 index 0000000..452f63b --- /dev/null +++ b/src/Search/FTSLogicOp.php @@ -0,0 +1,46 @@ +op = $op; + $this->a = $a; + $this->b = $b; + } + + private const FTSOPS = [ + LogicOp::OP_AND => "", + LogicOp::OP_OR => "OR", + ]; + + protected function fts4Query(): string + { + $ftsop = self::FTSOPS[$this->op]; + assert($ftsop); + + return "({$this->a->fts4Query()} {$ftsop} {$this->b->fts4Query()})"; + } + + public function toString(): string + { + return "({$this->a->toString()} FTS-{$this->op} {$this->b->toString()})"; + } +} \ No newline at end of file diff --git a/src/Search/FTSNotExpr.php b/src/Search/FTSNotExpr.php new file mode 100644 index 0000000..a4aa219 --- /dev/null +++ b/src/Search/FTSNotExpr.php @@ -0,0 +1,29 @@ +expr = $expr; + } + + protected function fts4Query(): string + { + return "-{$this->expr->fts4Query()}"; + } + + public function toString(): string + { + return "(FTS-NOT {$this->expr->toString()})"; + } +} \ No newline at end of file diff --git a/src/Search/LogicOp.php b/src/Search/LogicOp.php new file mode 100644 index 0000000..85fb8fa --- /dev/null +++ b/src/Search/LogicOp.php @@ -0,0 +1,78 @@ + "AND", + self::OP_OR => "OR", + ]; + + private string $op; + private SearchExpr $a; + private SearchExpr $b; + + public function __construct(string $op, SearchExpr $a, SearchExpr $b) + { + if (!self::checkOp($op)) + throw new \DomainException("{$op} is not a valid operator"); + + $this->op = $op; + $this->a = $a; + $this->b = $b; + } + + public static function build(string $op, SearchExpr $a, SearchExpr $b): SearchExpr + { + return $a instanceof AbstractFTSExpr && $b instanceof AbstractFTSExpr + ? new FTSLogicOp($op, $a, $b) + : new self($op, $a, $b); + } + + /** + * @param string $op + * @return bool + */ + public static function checkOp(string $op): bool + { + return in_array($op, [ + self::OP_AND, + self::OP_OR, + ]); + } + + public function getA(): SearchExpr { return $this->a; } + public function getB(): SearchExpr { return $this->b; } + public function getOp(): string { return $this->op; } + + public function toString(): string + { + return "({$this->a->toString()}) {$this->op} ({$this->b->toString()})"; + } + + public function toSQL($bindPrefix, bool $singleFTS): SQLSearchExpr + { + $sqlex = new SQLSearchExpr(); + + $a = $this->a->toSQL("a_$bindPrefix", $singleFTS); + $b = $this->b->toSQL("b_$bindPrefix", $singleFTS); + $sqlop = self::SQLOPS[$this->op]; + assert($sqlop); + + $sqlex->sql = "(({$a->sql}) {$sqlop} ({$b->sql}))"; + $sqlex->bindings = array_merge($a->bindings, $b->bindings); + + return $sqlex; + } + + public function countFTSQueries(): int + { + return $this->a->countFTSQueries() + $this->b->countFTSQueries(); + } +} \ No newline at end of file diff --git a/src/Search/NotOp.php b/src/Search/NotOp.php new file mode 100644 index 0000000..35fcf1e --- /dev/null +++ b/src/Search/NotOp.php @@ -0,0 +1,32 @@ +expr = $expr; + } + + public function toString(): string + { + return "not ({$this->expr->toString()})"; + } + + public function toSQL(string $bindPrefix, bool $singleFTS): SQLSearchExpr + { + $sqlex = $this->expr->toSQL($bindPrefix, $singleFTS); + $sqlex->sql = "(NOT ({$sqlex->sql}))"; + return $sqlex; + } + + public function countFTSQueries(): int + { + return $this->expr->countFTSQueries(); + } +} \ No newline at end of file diff --git a/src/Search/Pagination.php b/src/Search/Pagination.php new file mode 100644 index 0000000..b4b2447 --- /dev/null +++ b/src/Search/Pagination.php @@ -0,0 +1,14 @@ +valid()) + return null; + $out = $input->current(); + $input->next(); + return $out; + } + + /** + * @return Iterator + * @throws ParseError + */ + private static function tokenize_normal(CharSource $input): Iterator + { + $buf = ""; + + $yieldBufAndClear = function () use (&$buf) { + if ($buf !== "") { + switch ($buf) { + case "and": + case "or": + case "not": + yield [self::TOK_OP, $buf]; + break; + default: + yield [self::TOK_WORD, $buf]; + } + } + $buf = ""; + }; + + for (;;) { + $c = $input->getNext(); + if ($c === null) { + break; + } + + switch ($c) { + case '\\': + $next = $input->getNext(); + if ($next === null) { + $buf .= $c; + break 2; + } + $buf .= $next; + break; + + case ' ': + case "\t": + yield from $yieldBufAndClear(); + break; + + case '"': + yield from $yieldBufAndClear(); + yield from self::tokenize_string($input); + break; + + case ':': + if ($buf !== "") { + yield [self::TOK_PROP, $buf]; + $buf = ""; + } + break; + + case '(': + yield from $yieldBufAndClear(); + yield [self::TOK_PAROPEN, null]; + break; + + case ')': + yield from $yieldBufAndClear(); + yield [self::TOK_PARCLOSE, null]; + break; + + case '#': + yield from $yieldBufAndClear(); + yield from self::tokenize_tag($input); + break; + + default: + $buf .= $c; + } + } + + yield from $yieldBufAndClear(); + return; + } + + /** + * @param string $input + * @return SearchExpr|null + * @throws ParseError + */ + public static function parse(string $input): ?SearchExpr + { + $tokens = self::tokenize($input); + + $stack = []; + $cur = null; + $binOp = null; + $negated = false; + + $putExpr = function (SearchExpr $expr) use (&$cur, &$binOp, &$negated) { + if ($negated) { + $expr = new NotOp($expr); + } + + $cur = $cur === null + ? $expr + : LogicOp::build($binOp ?? LogicOp::OP_AND, $cur, $expr); + + $binOp = null; + $negated = false; + }; + + $setBinOp = function ($op) use (&$binOp) { + if ($binOp !== null) + throw new ParseError("Unexpected logic operator $op"); + + $binOp = $op; + }; + + for (;;) { + $token = self::getItemAndAdvance($tokens); + if ($token === null) + break; + + [$ttyp, $tdata] = $token; + + switch ($ttyp) { + + case self::TOK_TAG: + $putExpr(new TagExpr($tdata)); + break; + case self::TOK_OP: + switch ($tdata) { + case "and": + $setBinOp(LogicOp::OP_AND); + break; + case "or": + $setBinOp(LogicOp::OP_OR); + break; + case "not": + $negated = !$negated; + break; + default: + throw new \DomainException("Unexpected data for TOK_OP: $tdata"); + } + break; + case self::TOK_WORD: + $putExpr(new FTSExpr($tdata)); + break; + case self::TOK_PROP: + // TODO(laria): Implement this + throw new ParseError("Not yet supported"); + case self::TOK_PAROPEN: + $stack[] = [$cur, $binOp, $negated]; + $cur = $binOp = $negated = null; + break; + case self::TOK_PARCLOSE: + if (empty($stack)) + throw new ParseError("Unexpected closing parenthesis"); + + $parContent = $cur; + [$cur, $binOp, $negated] = array_pop($stack); + $putExpr($parContent); + break; + } + } + + if (!empty($stack)) + throw new ParseError("Unclosed parenthesis"); + + return $cur; + } + + /** + * @param CharSource $input + * @return Generator + * @throws ParseError + */ + private static function tokenize_string(CharSource $input): Generator + { + $content = ""; + for (;;) { + $c = $input->getNext(); + if ($c === null) + throw new ParseError("Unclosed string encountered"); + + switch ($c) { + case '\\': + $next = $input->getNext(); + if ($next === null) + throw new ParseError("Unclosed string encountered"); + + $content .= $next; + break; + + case '"': + yield [self::TOK_WORD, $content]; + return; + + default: + $content .= $c; + } + } + } + + /** + * @param CharSource $input + * @return Iterator + */ + private static function tokenize_tag(CharSource $input): Iterator + { + $tag = ""; + + $yieldTag = function () use (&$tag) { + if ($tag === "") + yield [self::TOK_WORD, "#"]; + else + yield [self::TOK_TAG, $tag]; + }; + + for (;;) { + $c = $input->getNext(); + if ($c === null) { + yield from $yieldTag(); + return; + } + + switch ($c) { + case '\\': + $next = $input->getNext(); + if ($c === null) { + $tag .= '\\'; + yield [self::TOK_TAG, $tag]; + return; + } + $tag .= $next; + break; + + case ' ': + case "\t": + yield from $yieldTag(); + return; + + case '(': + case ')': + case '#': + $input->unget(); + yield from $yieldTag(); + return; + + default: + $tag .= $c; + } + } + } +} \ No newline at end of file diff --git a/src/Search/SQLSearchExpr.php b/src/Search/SQLSearchExpr.php new file mode 100644 index 0000000..76306ce --- /dev/null +++ b/src/Search/SQLSearchExpr.php @@ -0,0 +1,11 @@ +note = $note; + $this->highlights = $highlights; + } + + /** + * @param SQLite3 $db + * @param SearchExpr $expr + * @return self[] + */ + public static function search(SQLite3 $db, SearchExpr $expr): array + { + return $expr->countFTSQueries() === 1 + ? self::searchFTS($db, $expr) + : self::searchComplex($db, $expr); + } + + private static function searchComplex(SQLite3 $db, SearchExpr $expr): array + { + $sqlSearchExpr = $expr->toSQL("", false); + + $query = new DbQuery(" + SELECT + n.id + FROM notes n + INNER JOIN note_contents nc + ON nc.rowid = n.content_row + WHERE {$sqlSearchExpr->sql} + "); + + foreach ($sqlSearchExpr->bindings as $k => $v) + $query->bind($k, $v); + + $ids = array_map(fn ($row) => $row[0], $query->fetchRows($db)); + $notes = Note::byIds($db, $ids); + return array_map(fn ($note) => new self($note, []), $notes); + } + + private static function highlightRangeContains(array $range, int $point): bool + { + [$start, $end] = $range; + return $start <= $point && $point <= $end; + } + + private static function areHighlightsOverlapping(array $a, array $b): bool + { + [$aStart, $aEnd] = $a; + [$bStart, $bEnd] = $b; + + return self::highlightRangeContains($a, $bStart) + || self::highlightRangeContains($a, $bEnd) + || self::highlightRangeContains($b, $aStart) + || self::highlightRangeContains($b, $aEnd); + } + + private static function parseOffsetsToHighlights(string $offsets): array + { + $offsets = explode(" ", $offsets); + $offsets = array_map("intval", $offsets); + + $phraseMatches = count($offsets) / 4; + + $highlights = []; + for ($i = 0; $i < $phraseMatches; $i++) { + $off = $offsets[$i * 4 + 2]; + $len = $offsets[$i * 4 + 3]; + + if ($off < 0 || $len === 0) + continue; + + $highlights[] = [$off, $off+$len-1]; + } + + usort($highlights, fn ($a, $b) => ($a[0] <=> $b[0]) ?: ($b[1] <=> $a[1])); + + // merge overlapping areas + for ($i = count($highlights)-1; $i >= 0; $i--) { + for ($j = $i-1; $j >= 0; $j--) { + if (self::areHighlightsOverlapping($highlights[$i], $highlights[$j])) { + [$iStart, $iEnd] = $highlights[$i]; + [$jStart, $jEnd] = $highlights[$j]; + + $highlights[$j] = [min($iStart, $jStart), max($iEnd, $jEnd)]; + unset($highlights[$i]); + break; + } + } + } + + return array_merge($highlights); // array_merge here renumbers the keys + } + + private static function searchFTS(SQLite3 $db, SearchExpr $expr) + { + $sqlSearchExpr = $expr->toSQL("", true); + $query = new DbQuery(" + SELECT + n.id, + offsets(nc.note_contents) AS offsets + FROM notes n + INNER JOIN note_contents nc + ON nc.rowid = n.content_row + WHERE {$sqlSearchExpr->sql} + "); + foreach ($sqlSearchExpr->bindings as $k => $v) + $query->bind($k, $v); + + + $offsets = $query->fetchIndexedValues($db, "offsets", "id"); + + $notes = Note::byIds($db, array_keys($offsets)); + + $out = []; + foreach ($offsets as $id => $offString) { + if (!isset($notes[$id])) + throw new LogicException("Note '{$id}' not loaded but found?"); + + $out[] = new self($notes[$id], self::parseOffsetsToHighlights($offString)); + } + + return $out; + } + + public function renderHighlightedContent(): string + { + $out = ""; + $content = $this->note->getContent(); + $lastOff = 0; + foreach ($this->highlights as [$start, $end]) { + $out .= Esc::e(substr($content, $lastOff, $start - $lastOff), Esc::HTML_WITH_BR); + $out .= '' . Esc::e(substr($content, $start, $end - $start + 1), Esc::HTML_WITH_BR) . ''; + + $lastOff = $end + 1; + } + + $out .= Esc::e(substr($content, $lastOff), Esc::HTML_WITH_BR); + + return $out; + } + + public function getNote(): Note { return $this->note; } +} \ No newline at end of file diff --git a/src/Search/TagExpr.php b/src/Search/TagExpr.php new file mode 100644 index 0000000..b117bbe --- /dev/null +++ b/src/Search/TagExpr.php @@ -0,0 +1,42 @@ +tag = $tag; + } + + public function getTag(): string { return $this->tag; } + + public function toString(): string + { + return "#{$this->tag}"; + } + + public function toSQL(string $bindPrefix, bool $singleFTS): SQLSearchExpr + { + $sqlex = new SQLSearchExpr(); + + $sqlex->sql = "EXISTS ( + SELECT 1 + FROM tags t + WHERE t.tag = :{$bindPrefix}tag + AND t.note_id = n.id + )"; + $sqlex->bindings["{$bindPrefix}tag"] = $this->tag; + + return $sqlex; + } + + public function countFTSQueries(): int + { + return 0; + } +} \ No newline at end of file diff --git a/src/Search/TrueExpr.php b/src/Search/TrueExpr.php new file mode 100644 index 0000000..5f25c7e --- /dev/null +++ b/src/Search/TrueExpr.php @@ -0,0 +1,25 @@ +"; + } + + public function toSQL(string $bindPrefix, bool $singleFTS): SQLSearchExpr + { + $sqlSearchExpr = new SQLSearchExpr(); + $sqlSearchExpr->sql = "1"; + return $sqlSearchExpr; + } + + public function countFTSQueries(): int + { + return 0; + } +} \ No newline at end of file -- cgit v1.2.3-70-g09d2