COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Centos-8.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Centos-8.md
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-18.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-18.md
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-20.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-20.md
- COMMAND mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
+ COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
)
-
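+# Convenience target: serve the documentation locally with live reload.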
+ADD_CUSTOM_TARGET(serve-doc
+ COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs serve
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+)
The documentation is built with mkdocs:
* [mkdocs](https://www.mkdocs.org/) >= 1.1.2
+* [mkdocstrings](https://mkdocstrings.github.io/)
### Installing prerequisites on Ubuntu/Debian
sudo apt install php-cgi phpunit php-codesniffer \
python3-pip python3-setuptools python3-dev pylint
-pip3 install --user behave mkdocs pytest
+pip3 install --user behave mkdocs mkdocstrings pytest
```
The `mkdocs` executable will be located in `.local/bin`. You may have to add
Now you can start a webserver for local testing:
```
-build> mkdocs serve
+build> make serve-doc
[server:296] Serving on http://127.0.0.1:8000
[handlers:62] Start watching changes
```
to your host:
```
-build> mkdocs serve --dev-addr 0.0.0.0:8088
+build> PYTHONPATH=$SRCDIR mkdocs serve --dev-addr 0.0.0.0:8088
[server:296] Serving on http://0.0.0.0:8088
[handlers:62] Start watching changes
```
--- /dev/null
+# Tokenizers
+
+The tokenizer is the component of Nominatim that is responsible for
+analysing names of OSM objects and queries. Nominatim provides different
+tokenizers that use different strategies for normalisation. This page describes
+how tokenizers are expected to work and the public API that needs to be
+implemented when creating a new tokenizer. For information on how to configure
+a specific tokenizer for a database see the
+[tokenizer chapter in the administration guide](../admin/Tokenizers.md).
+
+## Generic Architecture
+
+### About Search Tokens
+
+Search in Nominatim is organised around search tokens. Such a token represents
+a string that can be part of a search query. Tokens are used so that the search
+index does not need to be organised around strings. Instead, the database saves
+for each place which tokens match this place's name, address, house number etc.
+To be able to distinguish between these different types of information stored
+with the place, a search token also always has a certain type: name, house number,
+postcode etc.
+
+During a search, an incoming query is transformed into an ordered list of such
+search tokens (or rather many lists, see below) and this list is then converted
+into a database query to find the right place.
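+
+As a purely illustrative sketch (the decomposition, token IDs and tuple layout
+below are invented for the example), a query like `Birkenweg 5, Potsdam` might
+be turned into a token list along these lines:
+
+```python
+# Hypothetical token list for one interpretation of the query.
+tokens = [
+    ('name', 'birkenweg', 10501),   # name token
+    ('housenumber', '5', 87),       # house number token
+    ('name', 'potsdam', 2201),      # name token
+]
+```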
+
+It is the core task of the tokenizer to create, manage and assign the search
+tokens. The tokenizer is involved in two distinct operations:
+
+* __at import time__: scanning names of OSM objects, normalizing them and
+ building up the list of search tokens.
+* __at query time__: scanning the query and returning the appropriate search
+ tokens.
+
+
+### Importing
+
+The indexer is responsible for enriching an OSM object (or place) with all data
+required for geocoding. It is split into two parts: the controller collects
+the places that require updating, enriches the place information as required
+and hands the place to Postgresql. The controller is part of the Nominatim
+library written in Python. Within Postgresql, the `placex_update`
+trigger is responsible for filling out all secondary tables with extra geocoding
+information. This part is written in PL/pgSQL.
+
+The tokenizer is involved in both parts. When the indexer prepares a place,
+it hands it over to the tokenizer to inspect the names and create all the
+search tokens applicable for the place. This usually involves updating the
+tokenizer's internal token lists and creating a list of all token IDs for
+the specific place. This list is later needed in the PL/pgSQL part where the
+indexer needs to add the token IDs to the appropriate search tables. To be
+able to communicate the list between the Python part and the PL/pgSQL trigger,
+the `placex` table contains a special JSONB column `token_info` which is there
+for the exclusive use of the tokenizer.
+
+The Python part of the tokenizer returns structured information about the
+tokens of a place to the indexer, which converts it to JSON and inserts it into
+the `token_info` column. The content of the column is then handed to the PL/pgSQL
+callbacks of the tokenizer, which extract the required information. Usually
+the tokenizer then removes all information from the `token_info` structure,
+so that no information is ever persistently saved in the table. After all,
+everything that went in should have been processed and put into secondary tables.
+This is, however, not a hard requirement. If the tokenizer needs to store
+additional information about a place permanently, it may do so in the
+`token_info` column. It must never execute searches over this column, though,
+and consequently must not create any special indexes on it.
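+
+To make this more concrete, here is a rough sketch of what a tokenizer might
+put into `token_info` (the layout is entirely up to the individual tokenizer;
+all keys and values below are invented):
+
+```python
+# Hypothetical JSON-serialisable payload returned by the Python part.
+token_info = {
+    'names': [10501, 2201],   # search token IDs for the place names
+    'hnr_tokens': [87],       # house number token IDs
+    'hnr': '5;5a',            # normalized house numbers
+}
+```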
+
+### Querying
+
+At query time, Nominatim builds up multiple _interpretations_ of the search
+query. Each of these interpretations is tried against the database in order
+of how likely they are to match the search query. The first
+interpretation that yields results wins.
+
+The interpretations are encapsulated in the `SearchDescription` class. An
+instance of this class is created by applying a sequence of
+_search tokens_ to an initially empty SearchDescription. It is the
+responsibility of the tokenizer to parse the search query and derive all
+possible sequences of search tokens. To that end the tokenizer needs to parse
+the search query and look up matching words in its own data structures.
+
+## Tokenizer API
+
+The following section describes the functions that need to be implemented
+for a custom tokenizer implementation.
+
+!!! warning
+ This API is currently in early alpha status. While this API is meant to
+ be a public API on which other tokenizers may be implemented, the API is
+    far from stable at the moment.
+
+### Directory Structure
+
+Nominatim expects two files for a tokenizer:
+
+* `nominatim/tokenizer/<NAME>_tokenizer.py` containing the Python part of the
+ implementation
+* `lib-php/tokenizer/<NAME>_tokenizer.php` with the PHP part of the
+ implementation
+
+where `<NAME>` is a unique name for the tokenizer consisting of only lower-case
+letters, digits and underscores. A tokenizer also needs to install some SQL
+functions. By convention, these should be placed in `lib-sql/tokenizer`.
+
+If the tokenizer has a default configuration file, it should be saved as
+`settings/<NAME>_tokenizer.<SUFFIX>`.
+
+### Configuration and Persistence
+
+Tokenizers may define custom settings for their configuration. All settings
+must be prefixed with `NOMINATIM_TOKENIZER_`. Settings may be transient or
+persistent. Transient settings are loaded from the configuration file when
+Nominatim is started and may thus be changed at any time. Persistent settings
+are tied to a database installation and must only be read at installation
+time. If they are needed at runtime, then they must be saved in the
+`nominatim_properties` table and later loaded from there.
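+
+For example, a persistent setting might be written at installation time and
+read back at runtime roughly like this (a minimal sketch; it assumes the
+`nominatim.db.properties` helpers and a database connection `conn`; the
+setting name and the `config` attribute are made up for the example):
+
+```python
+from nominatim.db import properties
+
+# At installation time: copy the transient setting into the database.
+properties.set_property(conn, 'tokenizer_my_setting',
+                        config.TOKENIZER_MY_SETTING)
+
+# At runtime: read the persistent value back from nominatim_properties.
+my_setting = properties.get_property(conn, 'tokenizer_my_setting')
+```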
+
+### The Python module
+
+The Python module is expected to export a single factory function:
+
+```python
+def create(dsn: str, data_dir: Path) -> AbstractTokenizer
+```
+
+The `dsn` parameter contains the DSN of the Nominatim database. The `data_dir`
+is a directory in the project directory that the tokenizer may use to save
+database-specific data. The function must return an instance of the tokenizer
+class as defined below.
+
+### Python Tokenizer Class
+
+All tokenizers must inherit from `nominatim.tokenizer.base.AbstractTokenizer`
+and implement the abstract functions defined there.
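+
+A minimal skeleton, combining the factory function from above with a stubbed
+tokenizer class (the name `MyTokenizer` and all method bodies are placeholders):
+
+```python
+from pathlib import Path
+
+from nominatim.tokenizer.base import AbstractTokenizer
+
+class MyTokenizer(AbstractTokenizer):
+    def __init__(self, dsn: str, data_dir: Path):
+        self.dsn = dsn
+        self.data_dir = data_dir
+
+    def init_new_db(self, config, init_db=True):
+        ...  # set up tables, save persistent settings
+
+    def init_from_project(self):
+        ...  # reload previously saved configuration
+
+    def finalize_import(self, config):
+        ...  # create indexes needed at query time
+
+    def update_sql_functions(self, config):
+        ...  # (re-)install the tokenizer's PL/pgSQL functions
+
+    def check_database(self):
+        return None  # no issues found
+
+    def name_analyzer(self):
+        ...  # return a new AbstractAnalyzer instance
+
+
+def create(dsn: str, data_dir: Path) -> AbstractTokenizer:
+    return MyTokenizer(dsn, data_dir)
+```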
+
+::: nominatim.tokenizer.base.AbstractTokenizer
+ rendering:
+ heading_level: 4
+
+### Python Analyzer Class
+
+::: nominatim.tokenizer.base.AbstractAnalyzer
+ rendering:
+ heading_level: 4
+
+### PL/pgSQL Functions
+
+The tokenizer must provide access functions for the `token_info` column
+to the indexer, which extracts the necessary information for the global
+search tables. If the tokenizer needs additional SQL functions for private
+use, then these functions must be prefixed with `token_` in order to ensure
+that there are no naming conflicts with the SQL indexer code.
+
+The following functions are expected:
+
+```sql
+FUNCTION token_get_name_search_tokens(info JSONB) RETURNS INTEGER[]
+```
+
+Return an array of token IDs of search terms that should match
+the name(s) for the given place. These tokens are used to look up the place
+by name and, where the place functions as part of an address for another place,
+by address. Must return NULL when the place has no name.
+
+```sql
+FUNCTION token_get_name_match_tokens(info JSONB) RETURNS INTEGER[]
+```
+
+Return an array of token IDs of full names of the place that should be used
+to match addresses. The list of match tokens is usually more strict than
+search tokens as it is used to find a match between two OSM tag values which
+are expected to contain matching full names. Partial terms should not be
+used for match tokens. Must return NULL when the place has no name.
+
+```sql
+FUNCTION token_get_housenumber_search_tokens(info JSONB) RETURNS INTEGER[]
+```
+
+Return an array of token IDs of house number tokens that apply to the place.
+Note that a place may have multiple house numbers, for example when apartments
+each have their own number. Must be NULL when the place has no house numbers.
+
+```sql
+FUNCTION token_normalized_housenumber(info JSONB) RETURNS TEXT
+```
+
+Return the house number(s) in the normalized form that can be matched against
+a house number token text. If a place has multiple house numbers they must
+be listed with a semicolon as delimiter. Must be NULL when the place has no
+house numbers.
+
+```sql
+FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[]
+```
+
+Return the match token IDs by which to search for a matching street from the
+`addr:street` tag. These IDs will be matched against the IDs supplied by
+`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street`
+tag.
+
+```sql
+FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[]
+```
+
+Return the match token IDs by which to search for a matching place from the
+`addr:place` tag. These IDs will be matched against the IDs supplied by
+`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place`
+tag.
+
+```sql
+FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
+```
+
+Return the search token IDs extracted from the `addr:place` tag. These tokens
+are used for searches by address when no matching place can be found in the
+database. Must be NULL when the place has no `addr:place` tag.
+
+```sql
+CREATE TYPE token_addresstoken AS (
+ key TEXT,
+ match_tokens INT[],
+ search_tokens INT[]
+);
+
+FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken
+```
+
+Return the match and search token IDs for explicit `addr:*` tags for the place
+other than `addr:street` and `addr:place`. For each address item there are
+three pieces of information returned:
+
+ * _key_ contains the type of address item (city, county, etc.). This is the
+ key handed in with the `address` dictionary.
+ * *match_tokens* is the list of token IDs used to find the corresponding
+ place object for the address part. The list is matched against the IDs
+ from `token_get_name_match_tokens`.
+ * *search_tokens* is the list of token IDs under which to search the address
+ item. It is used when no corresponding place object was found.
+
+```sql
+FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
+```
+
+Return the normalized version of the given postcode. This function must return
+the same value as the Python function `AbstractAnalyzer.normalize_postcode()`.
+
+```sql
+FUNCTION token_strip_info(info JSONB) RETURNS JSONB
+```
+
+Return the part of the `token_info` field that should be stored in the database
+permanently. The indexer calls this function when all processing is done and
+replaces the content of the `token_info` column with the returned value before
+the trigger stores the information in the database. May return NULL if no
+information should be stored permanently.
+
+### PHP Tokenizer class
+
+The PHP tokenizer class is instantiated once per request and responsible for
+analyzing the incoming query. Multiple requests may be in flight in
+parallel.
+
+The class is expected to be found under the
+name `\Nominatim\Tokenizer`. To find the class, the PHP code includes the file
+`tokenizer/tokenizer.php` in the project directory. This file must be created
+when the tokenizer is first set up on import. The file should initialize any
+configuration variables by setting PHP constants and then require the file
+with the actual implementation of the tokenizer.
+
+The tokenizer class must implement the following functions:
+
+```php
+public function __construct(object &$oDB)
+```
+
+The constructor of the class receives a database connection that can be used
+to query persistent data in the database.
+
+```php
+public function checkStatus()
+```
+
+Check that the tokenizer can access its persistent data structures. If there
+is an issue, throw an `\Exception`.
+
+```php
+public function normalizeString(string $sTerm) : string
+```
+
+Normalize the string to a form to be used for comparisons when reordering results.
+Nominatim reweighs results by how well the final display string matches the actual
+query. Before comparing result and query, both are normalised using
+this function. The tokenizer can thus remove all properties that should not be
+taken into account for reweighing, e.g. special characters or case.
+
+```php
+public function tokensForSpecialTerm(string $sTerm) : array
+```
+
+Return the list of special term tokens that match the given term.
+
+```php
+public function extractTokensFromPhrases(array &$aPhrases) : TokenList
+```
+
+Parse the given phrases, split them into word lists and retrieve the
+matching tokens.
+
+The phrase array may take on two forms. In unstructured searches (using `q=`
+parameter) the search query is split at the commas and the elements are
+put into an ordered list. For structured searches the phrase array is an
+associative array where the key designates the type of the term (street, city,
+county etc.). The tokenizer may ignore the phrase type at this stage of parsing.
+Matching the phrase type to the appropriate search token type is done later,
+when the SearchDescription is built.
+
+For each phrase in the list of phrases, the function must analyse the phrase
+string and then call `setWordSets()` to communicate the result of the analysis.
+A word set is a list of strings, where each string refers to a search token.
+A phrase may have multiple interpretations. Therefore a list of word sets is
+usually attached to the phrase. The search tokens themselves are returned
+by the function in an associative array, where the key corresponds to the
+strings given in the word sets. The value is a list of search tokens. Thus
+a single string in the list of word sets may refer to multiple search tokens.
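+
+A schematic example, written as Python-style literals for brevity (the real
+API uses the corresponding PHP arrays; all tokens and values are invented):
+for the phrase `main st 5` the analysis might produce
+
+```python
+# One word set per possible segmentation of the phrase.
+word_sets = [['main st', '5'], ['main', 'st', '5']]
+
+# Returned token map: word-set string -> list of matching search tokens.
+tokens = {
+    'main st': ['<Word token 17>'],
+    'main': ['<Partial token 3>'],
+    'st': ['<Partial token 4>'],
+    '5': ['<HouseNumber token 9>'],
+}
+```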
+
th {
background-color: #eee;
}
+
+/* Indentation for mkdocstrings.
+div.doc-contents:not(.first) {
+ padding-left: 25px;
+  border-left: 4px solid rgb(230, 230, 230);
+ margin-bottom: 60px;
+}*/
+
- 'Architecture Overview' : 'develop/overview.md'
- 'OSM Data Import' : 'develop/Import.md'
- 'Place Ranking' : 'develop/Ranking.md'
+ - 'Tokenizers' : 'develop/Tokenizers.md'
- 'Postcodes' : 'develop/Postcodes.md'
- 'Testing' : 'develop/Testing.md'
- 'External Data Sources': 'develop/data-sources.md'
- toc:
permalink:
extra_css: [extra.css, styles.css]
+plugins:
+ - search
+ - mkdocstrings:
+ handlers:
+ python:
+ rendering:
+ show_source: false
+ show_signature_annotations: false
if ($this->aCountryCodes) {
$oCtx->setCountryList($this->aCountryCodes);
}
- $this->oTokenizer->setCountryRestriction($this->aCountryCodes);
Debug::newSection('Query Preprocessing');
*/
class Phrase
{
- const MAX_WORDSET_LEN = 20;
- const MAX_WORDSETS = 100;
-
// Complete phrase as a string.
private $sPhrase;
// Element type for structured searches.
// Possible segmentations of the phrase.
private $aWordSets;
- public static function cmpByArraylen($aA, $aB)
- {
- $iALen = count($aA);
- $iBLen = count($aB);
-
- if ($iALen == $iBLen) {
- return 0;
- }
-
- return ($iALen < $iBLen) ? -1 : 1;
- }
-
-
public function __construct($sPhrase, $sPhraseType)
{
$this->sPhrase = trim($sPhrase);
return $this->sPhraseType;
}
+ public function setWordSets($aWordSets)
+ {
+ $this->aWordSets = $aWordSets;
+ }
+
/**
* Return the array of possible segmentations of the phrase.
*
}
}
- public function computeWordSets($aWords, $oTokens)
- {
- $iNumWords = count($aWords);
-
- if ($iNumWords == 0) {
- $this->aWordSets = null;
- return;
- }
-
- // Caches the word set for the partial phrase up to word i.
- $aSetCache = array_fill(0, $iNumWords, array());
-
- // Initialise first element of cache. There can only be the word.
- if ($oTokens->containsAny($aWords[0])) {
- $aSetCache[0][] = array($aWords[0]);
- }
-
- // Now do the next elements using what we already have.
- for ($i = 1; $i < $iNumWords; $i++) {
- for ($j = $i; $j > 0; $j--) {
- $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
- if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
- $aPartial = array($sPartial);
- foreach ($aSetCache[$j - 1] as $aSet) {
- if (count($aSet) < Phrase::MAX_WORDSET_LEN) {
- $aSetCache[$i][] = array_merge($aSet, $aPartial);
- }
- }
- if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) {
- usort(
- $aSetCache[$i],
- array('\Nominatim\Phrase', 'cmpByArraylen')
- );
- $aSetCache[$i] = array_slice(
- $aSetCache[$i],
- 0,
- Phrase::MAX_WORDSETS
- );
- }
- }
- }
-
- // finally the current full phrase
- $sPartial = $aWords[0].' '.$sPartial;
- if ($oTokens->containsAny($sPartial)) {
- $aSetCache[$i][] = array($sPartial);
- }
- }
-
- $this->aWordSets = $aSetCache[$iNumWords - 1];
- usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen'));
- $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS);
- }
-
-
public function debugInfo()
{
return array(
public $sqlViewboxLarge = '';
/// Reference along a route (as SQL).
public $sqlViewboxCentre = '';
+ /// List of countries to restrict search to (as array).
+ public $aCountryList = null;
/// List of countries to restrict search to (as SQL).
public $sqlCountryList = '';
/// List of place IDs to exclude (as SQL).
public function setCountryList($aCountries)
{
$this->sqlCountryList = '('.join(',', array_map('addQuotes', $aCountries)).')';
+ $this->aCountryList = $aCountries;
}
/**
return '';
}
+ /**
+ * Check if the given country is covered by the search context.
+ *
+ * @param string $sCountryCode Country code of the country to check.
+ *
+ * @return bool True if no country code restrictions are set or the
+ * country is included in the country list.
+ */
+ public function isCountryApplicable($sCountryCode)
+ {
+ return $this->aCountryList === null || in_array($sCountryCode, $this->aCountryList);
+ }
+
public function debugInfo()
{
return array(
--- /dev/null
+<?php
+
+namespace Nominatim;
+
+/**
+ * A word list creator based on simple splitting by space.
+ *
+ * Creates possible permutations of split phrases by finding all combinations
+ * of splitting the phrase on space boundaries.
+ */
+class SimpleWordList
+{
+ const MAX_WORDSET_LEN = 20;
+ const MAX_WORDSETS = 100;
+
+ // The phrase as a list of simple terms (without spaces).
+ private $aWords;
+
+ /**
+ * Create a new word list
+ *
+     * @param string $sPhrase Phrase to create the word list from. The phrase is
+     *                        expected to be normalised, so that it contains no
+     *                        consecutive spaces.
+ */
+ public function __construct($sPhrase)
+ {
+ if (strlen($sPhrase) > 0) {
+ $this->aWords = explode(' ', $sPhrase);
+ } else {
+ $this->aWords = array();
+ }
+ }
+
+ /**
+ * Get all possible tokens that are present in this word list.
+ *
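+     * For example, the word list (a, b, c) yields the tokens
+     * 'a', 'a b', 'a b c', 'b', 'b c' and 'c'.
+     *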
+ * @return array The list of string tokens in the word list.
+ */
+ public function getTokens()
+ {
+ $aTokens = array();
+ $iNumWords = count($this->aWords);
+
+ for ($i = 0; $i < $iNumWords; $i++) {
+ $sPhrase = $this->aWords[$i];
+ $aTokens[$sPhrase] = $sPhrase;
+
+ for ($j = $i + 1; $j < $iNumWords; $j++) {
+ $sPhrase .= ' '.$this->aWords[$j];
+ $aTokens[$sPhrase] = $sPhrase;
+ }
+ }
+
+ return $aTokens;
+ }
+
+ /**
+ * Compute all possible permutations of phrase splits that result in
+ * words which are in the token list.
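+     *
+     * @param object $oTokens Token list against which candidate words are
+     *                        checked via its containsAny() method.
+     *
+     * @return array[]|null List of word sets, or null for an empty word list.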
+ */
+ public function getWordSets($oTokens)
+ {
+ $iNumWords = count($this->aWords);
+
+ if ($iNumWords == 0) {
+ return null;
+ }
+
+ // Caches the word set for the partial phrase up to word i.
+ $aSetCache = array_fill(0, $iNumWords, array());
+
+        // Initialise the first element of the cache. It can only contain the first word.
+ if ($oTokens->containsAny($this->aWords[0])) {
+ $aSetCache[0][] = array($this->aWords[0]);
+ }
+
+ // Now do the next elements using what we already have.
+ for ($i = 1; $i < $iNumWords; $i++) {
+ for ($j = $i; $j > 0; $j--) {
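+                // $sPartial accumulates the words $j..$i; as $j counts down,
+                // each iteration prepends one more word to the partial phrase.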
+ $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+ if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
+ $aPartial = array($sPartial);
+ foreach ($aSetCache[$j - 1] as $aSet) {
+ if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
+ $aSetCache[$i][] = array_merge($aSet, $aPartial);
+ }
+ }
+ if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
+ usort(
+ $aSetCache[$i],
+ array('\Nominatim\SimpleWordList', 'cmpByArraylen')
+ );
+ $aSetCache[$i] = array_slice(
+ $aSetCache[$i],
+ 0,
+ SimpleWordList::MAX_WORDSETS
+ );
+ }
+ }
+ }
+
+ // finally the current full phrase
+ $sPartial = $this->aWords[0].' '.$sPartial;
+ if ($oTokens->containsAny($sPartial)) {
+ $aSetCache[$i][] = array($sPartial);
+ }
+ }
+
+ $aWordSets = $aSetCache[$iNumWords - 1];
+ usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
+ return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
+ }
+
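+    /**
+     * Compare two word sets by their number of elements (ascending), so
+     * that segmentations with fewer splits sort first and survive the
+     * truncation to MAX_WORDSETS.
+     */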
+ public static function cmpByArraylen($aA, $aB)
+ {
+ $iALen = count($aA);
+ $iBLen = count($aB);
+
+ if ($iALen == $iBLen) {
+ return 0;
+ }
+
+ return ($iALen < $iBLen) ? -1 : 1;
+ }
+
+ public function debugInfo()
+ {
+ return $this->aWords;
+ }
+}
*/
public function isExtendable($oSearch, $oPosition)
{
- return !$oSearch->hasCountry() && $oPosition->maybePhrase('country');
+ return !$oSearch->hasCountry()
+ && $oPosition->maybePhrase('country')
+ && $oSearch->getContext()->isCountryApplicable($this->sCountryCode);
}
/**
namespace Nominatim;
+require_once(CONST_LibDir.'/SimpleWordList.php');
+
class Tokenizer
{
private $oDB;
private $oNormalizer;
private $oTransliterator;
- private $aCountryRestriction;
public function __construct(&$oDB)
{
}
- public function setCountryRestriction($aCountries)
- {
- $this->aCountryRestriction = $aCountries;
- }
-
-
public function normalizeString($sTerm)
{
if ($this->oNormalizer === null) {
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
Debug::printVar('Phrase', $sPhrase);
- if (strlen($sPhrase) > 0) {
- $aWords = explode(' ', $sPhrase);
- Tokenizer::addTokens($aTokens, $aWords);
- $aWordLists[] = $aWords;
- } else {
- $aWordLists[] = array();
- }
+
+ $oWordList = new SimpleWordList($sPhrase);
+ $aTokens = array_merge($aTokens, $oWordList->getTokens());
+ $aWordLists[] = $oWordList;
}
Debug::printVar('Tokens', $aTokens);
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
foreach ($aPhrases as $iPhrase => $oPhrase) {
- $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+ $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}
return $oValidTokens;
switch ($aWord['type']) {
case 'C': // country name tokens
- if ($aWord['word'] !== null
- && (!$this->aCountryRestriction
- || in_array($aWord['word'], $this->aCountryRestriction))
- ) {
+ if ($aWord['word'] !== null) {
$oValidTokens->addToken(
$sTok,
new Token\Country($iId, $aWord['word'])
}
}
}
-
-
- /**
- * Add the tokens from this phrase to the given list of tokens.
- *
- * @param string[] $aTokens List of tokens to append.
- *
- * @return void
- */
- private static function addTokens(&$aTokens, $aWords)
- {
- $iNumWords = count($aWords);
-
- for ($i = 0; $i < $iNumWords; $i++) {
- $sPhrase = $aWords[$i];
- $aTokens[$sPhrase] = $sPhrase;
-
- for ($j = $i + 1; $j < $iNumWords; $j++) {
- $sPhrase .= ' '.$aWords[$j];
- $aTokens[$sPhrase] = $sPhrase;
- }
- }
- }
}
namespace Nominatim;
+require_once(CONST_LibDir.'/SimpleWordList.php');
+
class Tokenizer
{
private $oDB;
private $oNormalizer = null;
- private $aCountryRestriction = null;
public function __construct(&$oDB)
{
}
- public function setCountryRestriction($aCountries)
- {
- $this->aCountryRestriction = $aCountries;
- }
-
-
public function normalizeString($sTerm)
{
if ($this->oNormalizer === null) {
$aWordLists = array();
$aTokens = array();
foreach ($aNormPhrases as $sPhrase) {
- if (strlen($sPhrase) > 0) {
- $aWords = explode(' ', $sPhrase);
- Tokenizer::addTokens($aTokens, $aWords);
- $aWordLists[] = $aWords;
- } else {
- $aWordLists[] = array();
+ $oWordList = new SimpleWordList($sPhrase);
+
+ foreach ($oWordList->getTokens() as $sToken) {
+ $aTokens[' '.$sToken] = ' '.$sToken;
+ $aTokens[$sToken] = $sToken;
}
+
+ $aWordLists[] = $oWordList;
}
Debug::printVar('Tokens', $aTokens);
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
foreach ($aPhrases as $iPhrase => $oPhrase) {
- $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+ $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}
return $oValidTokens;
);
}
} elseif ($aWord['country_code']) {
- // Filter country tokens that do not match restricted countries.
- if (!$this->aCountryRestriction
- || in_array($aWord['country_code'], $this->aCountryRestriction)
- ) {
- $oToken = new Token\Country($iId, $aWord['country_code']);
- }
+ $oToken = new Token\Country($iId, $aWord['country_code']);
} elseif ($aWord['word_token'][0] == ' ') {
$oToken = new Token\Word(
$iId,
}
}
}
-
-
- /**
- * Add the tokens from this phrase to the given list of tokens.
- *
- * @param string[] $aTokens List of tokens to append.
- *
- * @return void
- */
- private static function addTokens(&$aTokens, $aWords)
- {
- $iNumWords = count($aWords);
-
- for ($i = 0; $i < $iNumWords; $i++) {
- $sPhrase = $aWords[$i];
- $aTokens[' '.$sPhrase] = ' '.$sPhrase;
- $aTokens[$sPhrase] = $sPhrase;
-
- for ($j = $i + 1; $j < $iNumWords; $j++) {
- $sPhrase .= ' '.$aWords[$j];
- $aTokens[' '.$sPhrase] = ' '.$sPhrase;
- $aTokens[$sPhrase] = $sPhrase;
- }
- }
- }
}
--- /dev/null
+"""
+Abstract class definitions for tokenizers. These base classes are here
+mainly for documentation purposes.
+"""
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Dict, Any
+
+from nominatim.config import Configuration
+
+# pylint: disable=unnecessary-pass
+
+class AbstractAnalyzer(ABC):
+ """ The analyzer provides the functions for analysing names and building
+ the token database.
+
+    Analyzers are instantiated on a per-thread basis. Access to global data
+ structures must be synchronised accordingly.
+ """
+
+ def __enter__(self) -> 'AbstractAnalyzer':
+ return self
+
+
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
+ self.close()
+
+
+ @abstractmethod
+ def close(self) -> None:
+ """ Free all resources used by the analyzer.
+ """
+ pass
+
+
+ @abstractmethod
+ def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
+ """ Return token information for the given list of words.
+
+ The function is used for testing and debugging only
+ and does not need to be particularly efficient.
+
+ Arguments:
+ words: A list of words to look up the tokens for.
+                       If a word starts with # it is assumed to be a full name,
+                       otherwise it is a partial term.
+
+ Returns:
+ The function returns the list of all tuples that could be
+ found for the given words. Each list entry is a tuple of
+ (original word, word token, word id).
+ """
+ pass
+
+
+ @abstractmethod
+ def normalize_postcode(self, postcode: str) -> str:
+ """ Convert the postcode to its standardized form.
+
+ This function must yield exactly the same result as the SQL function
+ `token_normalized_postcode()`.
+
+ Arguments:
+ postcode: The postcode to be normalized.
+
+ Returns:
+ The given postcode after normalization.
+ """
+ pass
+
+
+ @abstractmethod
+ def update_postcodes_from_db(self) -> None:
+ """ Update the tokenizer's postcode tokens from the current content
+ of the `location_postcode` table.
+ """
+ pass
+
+
+ @abstractmethod
+ def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
+ should_replace: bool) -> None:
+ """ Update the tokenizer's special phrase tokens from the given
+ list of special phrases.
+
+ Arguments:
+ phrases: The new list of special phrases. Each entry is
+ a tuple of (phrase, class, type, operator).
+ should_replace: If true, replace the current list of phrases.
+ When false, just add the given phrases to the
+ ones that already exist.
+ """
+ pass
+
+
+ @abstractmethod
+    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
+ """ Add the given names to the tokenizer's list of country tokens.
+
+ Arguments:
+ country_code: two-letter country code for the country the names
+ refer to.
+ names: Dictionary of name type to name.
+ """
+ pass
+
+
+ @abstractmethod
+ def process_place(self, place: Dict) -> Any:
+ """ Extract tokens for the given place and compute the
+ information to be handed to the PL/pgSQL processor for building
+ the search index.
+
+ Arguments:
+ place: Dictionary with the information about the place. Currently
+ the following fields may be present:
+
+ - *name* is a dictionary of names for the place together
+ with the designation of the name.
+ - *address* is a dictionary of address terms.
+ - *country_feature* is set to a country code when the
+ place describes a country.
+
+ Returns:
+ A JSON-serialisable structure that will be handed into
+ the database via the `token_info` field.
+ """
+
+
+
+class AbstractTokenizer(ABC):
+ """ The tokenizer instance is the central instance of the tokenizer in
+ the system. There will only be a single instance of the tokenizer
+ active at any time.
+ """
+
+ @abstractmethod
+ def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
+ """ Set up a new tokenizer for the database.
+
+ The function should copy all necessary data into the project
+ directory or save it in the property table to make sure that
+ the tokenizer remains stable over updates.
+
+ Arguments:
+              config: Read-only object with configuration options.
+
+ init_db: When set to False, then initialisation of database
+ tables should be skipped. This option is only required for
+                migration purposes and can be safely ignored by custom
+ tokenizers.
+
+ TODO: can we move the init_db parameter somewhere else?
+ """
+ pass
+
+
+ @abstractmethod
+ def init_from_project(self) -> None:
+ """ Initialise the tokenizer from an existing database setup.
+
+ The function should load all previously saved configuration from
+ the project directory and/or the property table.
+ """
+ pass
+
+
+ @abstractmethod
+ def finalize_import(self, config: Configuration) -> None:
+ """ This function is called at the very end of an import when all
+ data has been imported and indexed. The tokenizer may create
+ at this point any additional indexes and data structures needed
+ during query time.
+
+ Arguments:
+              config: Read-only object with configuration options.
+ """
+ pass
+
+
+ @abstractmethod
+ def update_sql_functions(self, config: Configuration) -> None:
+ """ Update the SQL part of the tokenizer. This function is called
+ automatically on migrations or may be called explicitly by the
+ user through the `nominatim refresh --functions` command.
+
+ The tokenizer must only update the code of the tokenizer. The
+ data structures or data itself must not be changed by this function.
+
+ Arguments:
+              config: Read-only object with configuration options.
+ """
+ pass
+
+
+ @abstractmethod
+ def check_database(self) -> str:
+ """ Check that the database is set up correctly and ready for being
+ queried.
+
+ Returns:
+ If an issue was found, return an error message with the
+ description of the issue as well as hints for the user on
+ how to resolve the issue.
+
+            Return `None` if no issue was found.
+ """
+ pass
+
+
+ @abstractmethod
+ def name_analyzer(self) -> AbstractAnalyzer:
+ """ Create a new analyzer for tokenizing names and queries
+        using this tokenizer. Analyzers are context managers and should
+ be used accordingly:
+
+ ```
+ with tokenizer.name_analyzer() as analyzer:
+            analyzer.tokenize()
+ ```
+
+        When used outside the with construct, the caller must make sure to
+        call the close() function before destroying the analyzer.
+ """
+ pass
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
return LegacyICUTokenizer(dsn, data_dir)
-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to covert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
return words
-class LegacyICUNameAnalyzer:
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the ICU library for splitting names.
Each instance opens a connection to the database to request the
self._cache = _TokenCache()
- def __enter__(self):
- return self
-
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
-
def close(self):
""" Free all resources used by the analyzer.
"""
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
raise UsageError("Database module cannot be accessed.") from err
-class LegacyTokenizer:
+class LegacyTokenizer(AbstractTokenizer):
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-class LegacyNameAnalyzer:
+class LegacyNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the special Postgresql module for
splitting names.
self._cache = _TokenCache(self.conn)
- def __enter__(self):
- return self
-
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
-
def close(self):
""" Free all resources used by the analyzer.
"""
+++ /dev/null
-<?php
-
-namespace Nominatim;
-
-require_once(CONST_LibDir.'/Phrase.php');
-
-class TokensFullSet
-{
- public function containsAny($sTerm)
- {
- return true;
- }
-}
-
-// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
-class TokensPartialSet
-{
- public function __construct($aTokens)
- {
- $this->aTokens = array_flip($aTokens);
- }
-
- public function containsAny($sTerm)
- {
- return isset($this->aTokens[$sTerm]);
- }
-}
-
-// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
-class PhraseTest extends \PHPUnit\Framework\TestCase
-{
-
-
- private function serializeSets($aSets)
- {
- $aParts = array();
- foreach ($aSets as $aSet) {
- $aParts[] = '(' . join('|', $aSet) . ')';
- }
- return join(',', $aParts);
- }
-
-
- public function testEmptyPhrase()
- {
- $oPhrase = new Phrase('', '');
- $oPhrase->computeWordSets(array(), new TokensFullSet());
-
- $this->assertNull($oPhrase->getWordSets());
- }
-
-
- public function testSingleWordPhrase()
- {
- $oPhrase = new Phrase('a', '');
- $oPhrase->computeWordSets(array('a'), new TokensFullSet());
-
- $this->assertEquals(
- '(a)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testMultiWordPhrase()
- {
- $oPhrase = new Phrase('a b', '');
- $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
- $this->assertEquals(
- '(a b),(a|b)',
- $this->serializeSets($oPhrase->getWordSets())
- );
-
- $oPhrase = new Phrase('a b c', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
- $this->assertEquals(
- '(a b c),(a|b c),(a b|c),(a|b|c)',
- $this->serializeSets($oPhrase->getWordSets())
- );
-
- $oPhrase = new Phrase('a b c d', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
- $this->assertEquals(
- '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testInverseWordSets()
- {
- $oPhrase = new Phrase('a b c', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
- $oPhrase->invertWordSets();
-
- $this->assertEquals(
- '(a b c),(b c|a),(c|a b),(c|b|a)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testMaxWordSets()
- {
- $aWords = array_fill(0, 4, 'a');
- $oPhrase = new Phrase(join(' ', $aWords), '');
- $oPhrase->computeWordSets($aWords, new TokensFullSet());
- $this->assertEquals(8, count($oPhrase->getWordSets()));
- $oPhrase->invertWordSets();
- $this->assertEquals(8, count($oPhrase->getWordSets()));
-
- $aWords = array_fill(0, 18, 'a');
- $oPhrase = new Phrase(join(' ', $aWords), '');
- $oPhrase->computeWordSets($aWords, new TokensFullSet());
- $this->assertEquals(100, count($oPhrase->getWordSets()));
- $oPhrase->invertWordSets();
- $this->assertEquals(100, count($oPhrase->getWordSets()));
- }
-
-
- public function testPartialTokensShortTerm()
- {
- $oPhrase = new Phrase('a b c d', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
- $this->assertEquals(
- '(a|b c d),(a|b c|d)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testPartialTokensLongTerm()
- {
- $aWords = array_fill(0, 18, 'a');
- $oPhrase = new Phrase(join(' ', $aWords), '');
- $oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
- $this->assertEquals(80, count($oPhrase->getWordSets()));
- }
-}
--- /dev/null
+<?php
+
+namespace Nominatim;
+
+require_once(CONST_LibDir.'/SimpleWordList.php');
+
+class TokensFullSet
+{
+ public function containsAny($sTerm)
+ {
+ return true;
+ }
+}
+
+// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
+class TokensPartialSet
+{
+ public function __construct($aTokens)
+ {
+ $this->aTokens = array_flip($aTokens);
+ }
+
+ public function containsAny($sTerm)
+ {
+ return isset($this->aTokens[$sTerm]);
+ }
+}
+
+// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
+class SimpleWordListTest extends \PHPUnit\Framework\TestCase
+{
+
+
+ private function serializeSets($aSets)
+ {
+ $aParts = array();
+ foreach ($aSets as $aSet) {
+ $aParts[] = '(' . join('|', $aSet) . ')';
+ }
+ return join(',', $aParts);
+ }
+
+
+ public function testEmptyPhrase()
+ {
+ $oList = new SimpleWordList('');
+ $this->assertNull($oList->getWordSets(new TokensFullSet()));
+ }
+
+
+ public function testSingleWordPhrase()
+ {
+ $oList = new SimpleWordList('a');
+
+ $this->assertEquals(
+ '(a)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+ }
+
+
+ public function testMultiWordPhrase()
+ {
+ $oList = new SimpleWordList('a b');
+ $this->assertEquals(
+ '(a b),(a|b)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+
+ $oList = new SimpleWordList('a b c');
+ $this->assertEquals(
+ '(a b c),(a|b c),(a b|c),(a|b|c)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+
+ $oList = new SimpleWordList('a b c d');
+ $this->assertEquals(
+ '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+ }
+
+
+ public function testMaxWordSets()
+ {
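+        // 4 words allow 2^3 = 8 segmentations (each of the 3 gaps is either split or kept).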
+ $aWords = array_fill(0, 4, 'a');
+ $oList = new SimpleWordList(join(' ', $aWords));
+ $this->assertEquals(8, count($oList->getWordSets(new TokensFullSet())));
+
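+        // 18 words would allow 2^17 segmentations; the result is capped at MAX_WORDSETS (100).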
+ $aWords = array_fill(0, 18, 'a');
+ $oList = new SimpleWordList(join(' ', $aWords));
+ $this->assertEquals(100, count($oList->getWordSets(new TokensFullSet())));
+ }
+
+
+ public function testPartialTokensShortTerm()
+ {
+ $oList = new SimpleWordList('a b c d');
+ $this->assertEquals(
+ '(a|b c d),(a|b c|d)',
+ $this->serializeSets($oList->getWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))))
+ );
+ }
+
+
+ public function testPartialTokensLongTerm()
+ {
+ $aWords = array_fill(0, 18, 'a');
+ $oList = new SimpleWordList(join(' ', $aWords));
+ $this->assertEquals(80, count($oList->getWordSets(new TokensPartialSet(array('a', 'a a a a a')))));
+ }
+}