]> git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
authorSarah Hoffmann <lonvia@denofr.de>
Wed, 12 May 2021 14:18:34 +0000 (16:18 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Wed, 12 May 2021 14:18:34 +0000 (16:18 +0200)
101 files changed:
.github/actions/build-nominatim/action.yml
.github/workflows/ci-tests.yml
.pylintrc
data/words.sql
docs/admin/Deployment.md
docs/admin/Update.md
lib-php/Geocode.php
lib-php/Phrase.php
lib-php/PlaceLookup.php
lib-php/Result.php
lib-php/SearchDescription.php
lib-php/Status.php
lib-php/TokenList.php
lib-php/admin/query.php
lib-php/admin/warm.php
lib-php/tokenizer/legacy_icu_tokenizer.php [new file with mode: 0644]
lib-php/tokenizer/legacy_tokenizer.php [new file with mode: 0644]
lib-php/website/details.php
lib-php/website/status.php
lib-sql/aux_tables.sql [deleted file]
lib-sql/functions.sql
lib-sql/functions/address_lookup.sql
lib-sql/functions/aux_property.sql [deleted file]
lib-sql/functions/interpolation.sql
lib-sql/functions/normalization.sql [deleted file]
lib-sql/functions/partition-functions.sql
lib-sql/functions/placex_triggers.sql
lib-sql/functions/utils.sql
lib-sql/indices.sql
lib-sql/tables.sql
lib-sql/tokenizer/legacy_icu_tokenizer.sql [new file with mode: 0644]
lib-sql/tokenizer/legacy_tokenizer.sql [new file with mode: 0644]
lib-sql/tokenizer/legacy_tokenizer_indices.sql [new file with mode: 0644]
lib-sql/tokenizer/legacy_tokenizer_tables.sql [new file with mode: 0644]
lib-sql/words.sql [deleted file]
manual/nominatim.1
nominatim/cli.py
nominatim/clicmd/args.py
nominatim/clicmd/index.py
nominatim/clicmd/refresh.py
nominatim/clicmd/replication.py
nominatim/clicmd/setup.py
nominatim/clicmd/special_phrases.py
nominatim/config.py
nominatim/db/async_connection.py
nominatim/db/sql_preprocessor.py
nominatim/db/status.py
nominatim/indexer/indexer.py
nominatim/indexer/runners.py [new file with mode: 0644]
nominatim/tokenizer/__init__.py [new file with mode: 0644]
nominatim/tokenizer/factory.py [new file with mode: 0644]
nominatim/tokenizer/legacy_icu_tokenizer.py [new file with mode: 0644]
nominatim/tokenizer/legacy_tokenizer.py [new file with mode: 0644]
nominatim/tools/__init__.py
nominatim/tools/check_database.py
nominatim/tools/database_import.py
nominatim/tools/exec_utils.py
nominatim/tools/migration.py
nominatim/tools/postcodes.py
nominatim/tools/refresh.py
nominatim/tools/replication.py
nominatim/tools/special_phrases/__init__.py [new file with mode: 0644]
nominatim/tools/special_phrases/importer_statistics.py [new file with mode: 0644]
nominatim/tools/special_phrases/special_phrases_importer.py [moved from nominatim/tools/special_phrases.py with 71% similarity]
nominatim/version.py
settings/env.defaults
settings/legacy_icu_tokenizer.json [new file with mode: 0644]
test/bdd/api/search/queries.feature
test/bdd/db/import/naming.feature
test/bdd/db/import/rank_computation.feature
test/bdd/db/import/search_name.feature
test/bdd/environment.py
test/bdd/steps/http_responses.py
test/bdd/steps/nominatim_environment.py
test/bdd/steps/steps_db_ops.py
test/php/Nominatim/PhraseTest.php
test/php/Nominatim/StatusTest.php
test/php/Nominatim/TokenListTest.php
test/php/Nominatim/tokenizer.php [new file with mode: 0644]
test/python/conftest.py
test/python/dummy_tokenizer.py [new file with mode: 0644]
test/python/test_cli.py
test/python/test_cli_replication.py
test/python/test_db_sql_preprocessor.py
test/python/test_db_status.py
test/python/test_indexing.py
test/python/test_tokenizer_factory.py [new file with mode: 0644]
test/python/test_tokenizer_legacy.py [new file with mode: 0644]
test/python/test_tokenizer_legacy_icu.py [new file with mode: 0644]
test/python/test_tools_check_database.py
test/python/test_tools_database_import.py
test/python/test_tools_import_special_phrases.py
test/python/test_tools_postcodes.py
test/python/test_tools_refresh_create_functions.py
test/python/test_tools_refresh_setup_website.py
test/python/test_tools_replication.py
test/testdb/specialphrases_testdb.sql
vagrant/Install-on-Centos-7.sh
vagrant/Install-on-Centos-8.sh
vagrant/Install-on-Ubuntu-18.sh
vagrant/Install-on-Ubuntu-20.sh

index 191ef2ee3f8af8c13519cb315bd83a6f47cf13c8..d0a89774637eb9238de77f767daa4451d047e34b 100644 (file)
@@ -6,7 +6,7 @@ runs:
     steps:
         - name: Install prerequisites
           run: |
-            sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev libicu-dev python3-psycopg2 python3-pyosmium python3-dotenv python3-psutil python3-jinja2 python3-icu python3-argparse-manpage
+            sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev libicu-dev python3-psycopg2 python3-pyosmium python3-dotenv python3-psutil python3-jinja2 python3-icu
           shell: bash
 
         - name: Download dependencies
index a1a4344a1105c17c81582298da361f4f25a4731c..3d473751c01614594b8bec09ea0a8d201d2c84d5 100644 (file)
@@ -82,7 +82,18 @@ jobs:
                 verbose: true
 
     import:
-        runs-on: ubuntu-20.04
+        strategy:
+            matrix:
+                ubuntu: [18, 20]
+                include:
+                    - ubuntu: 18
+                      postgresql: 9.5
+                      postgis: 2.5
+                    - ubuntu: 20
+                      postgresql: 13
+                      postgis: 3
+
+        runs-on: ubuntu-${{ matrix.ubuntu }}.04
 
         steps:
             - uses: actions/checkout@v2
@@ -108,12 +119,24 @@ jobs:
                      monaco-latest.osm.pbf
                   key: nominatim-test-data-${{ steps.get-date.outputs.date }}
 
+            - uses: actions/setup-python@v2
+              with:
+                python-version: 3.5
+              if: matrix.ubuntu == 18
+
             - uses: ./Nominatim/.github/actions/setup-postgresql
               with:
-                  postgresql-version: 13
-                  postgis-version: 3
+                  postgresql-version: ${{ matrix.postgresql }}
+                  postgis-version: ${{ matrix.postgis }}
             - uses: ./Nominatim/.github/actions/build-nominatim
 
+            - name: Install extra dependencies for Ubuntu 18
+              run: |
+                sudo apt-get install libicu-dev
+                pip3 install python-dotenv psycopg2==2.7.7 jinja2==2.8 psutil==5.4.2 pyicu osmium
+              shell: bash
+              if: matrix.ubuntu == 18
+
             - name: Clean installation
               run: rm -rf Nominatim build
               shell: bash
@@ -136,10 +159,14 @@ jobs:
               run: nominatim special-phrases --import-from-wiki
               working-directory: data-env
 
-            - name: Check import
+            - name: Check full import
               run: nominatim admin --check-database
               working-directory: data-env
 
+            - name: Warm up database
+              run: nominatim admin --warm
+              working-directory: data-env
+
             - name: Run update
               run: |
                    nominatim replication --init
@@ -147,7 +174,11 @@ jobs:
               working-directory: data-env
 
             - name: Run reverse-only import
-              run : nominatim import --osm-file ../monaco-latest.osm.pbf --reverse-only
+              run : nominatim import --osm-file ../monaco-latest.osm.pbf --reverse-only --no-updates
               working-directory: data-env
               env:
                   NOMINATIM_DATABASE_DSN: pgsql:dbname=reverse
+
+            - name: Check reverse import
+              run: nominatim admin --check-database
+              working-directory: data-env
index eab041818058526209a36951c902e615095d7d23..756bba19e34ecf62382c9d9ad93aa92932ff143b 100644 (file)
--- a/.pylintrc
+++ b/.pylintrc
@@ -10,3 +10,4 @@ ignored-modules=icu
 # closing added here because it sometimes triggers a false positive with
 # 'with' statements.
 ignored-classes=NominatimArgs,closing
+disable=too-few-public-methods,duplicate-code
index ac250739563e9a104c5be0fb41f09d79b69d69eb..5613d927334d21e0053090f2afea6d6c8c675e9b 100644 (file)
@@ -29787,7 +29787,7 @@ st      5557484
 
 -- prefill word table
 
-select count(make_keywords(v)) from (select distinct svals(name) as v from place) as w where v is not null;
+select count(precompute_words(v)) from (select distinct svals(name) as v from place) as w where v is not null;
 select count(getorcreate_housenumber_id(make_standard_name(v))) from (select distinct address->'housenumber' as v from place where address ? 'housenumber') as w;
 
 -- copy the word frequencies
index 9ef7f48992aebe09d2bdd92ca2172d49a165a092..7d62df391f40b4bb049ead8bf5f45635fe91a783 100644 (file)
@@ -1,7 +1,7 @@
 # Deploying Nominatim
 
 The Nominatim API is implemented as a PHP application. The `website/` directory
-in the build directory contains the configured website. You can serve this
+in the project directory contains the configured website. You can serve this
 in a production environment with any web server that is capable to run
 PHP scripts.
 
@@ -13,10 +13,11 @@ to run a web service. Please refer to the documentation of
 for background information on configuring the services.
 
 !!! Note
-    Throughout this page, we assume that your Nominatim build directory is
-    located in `/srv/nominatim/build` and the source code in
-    `/srv/nominatim/Nominatim`. If you have put it somewhere else, you
-    need to adjust the commands and configuration accordingly.
+    Throughout this page, we assume that your Nominatim project directory is
+    located in `/srv/nominatim-project` and that you have installed Nominatim
+    using the default installation prefix `/usr/local`. If you have put it
+    somewhere else, you need to adjust the commands and configuration
+    accordingly.
 
     We further assume that your web server runs as user `www-data`. Older
     versions of CentOS may still use the user name `apache`. You also need
@@ -29,7 +30,7 @@ web server user. You can check that the permissions are correct by accessing
 on of the php files as the web server user:
 
 ``` sh
-sudo -u www-data head -n 1 /srv/nominatim/build/website/search.php
+sudo -u www-data head -n 1 /srv/nominatim-project/website/search.php
 ```
 
 If this shows a permission error, then you need to adapt the permissions of
@@ -40,11 +41,11 @@ web server access. At a minimum the following SELinux labelling should be done
 for Nominatim:
 
 ``` sh
-sudo semanage fcontext -a -t httpd_sys_content_t "/srv/nominatim/Nominatim/(website|lib|settings)(/.*)?"
-sudo semanage fcontext -a -t httpd_sys_content_t "/srv/nominatim/build/(website|settings)(/.*)?"
-sudo semanage fcontext -a -t lib_t "/srv/nominatim/build/module/nominatim.so"
-sudo restorecon -R -v /srv/nominatim/Nominatim
-sudo restorecon -R -v /srv/nominatim/build
+sudo semanage fcontext -a -t httpd_sys_content_t "/usr/local/nominatim/lib/lib-php(/.*)?"
+sudo semanage fcontext -a -t httpd_sys_content_t "/srv/nominatim-project/website(/.*)?"
+sudo semanage fcontext -a -t lib_t "/srv/nominatim-project/module/nominatim.so"
+sudo restorecon -R -v /usr/local/lib/nominatim
+sudo restorecon -R -v /srv/nominatim-project
 ```
 
 ## Nominatim with Apache
@@ -65,13 +66,13 @@ Make sure your Apache configuration contains the required permissions for the
 directory and create an alias:
 
 ``` apache
-<Directory "/srv/nominatim/build/website">
+<Directory "/srv/nominatim-project/website">
   Options FollowSymLinks MultiViews
   AddType text/html   .php
   DirectoryIndex search.php
   Require all granted
 </Directory>
-Alias /nominatim /srv/nominatim/build/website
+Alias /nominatim /srv/nominatim-project/website
 ```
 
 After making changes in the apache config you need to restart apache.
@@ -110,7 +111,7 @@ Tell nginx that php files are special and to fastcgi_pass to the php-fpm
 unix socket by adding the location definition to the default configuration.
 
 ``` nginx
-root /srv/nominatim/build/website;
+root /srv/nominatim-project/website;
 index search.php;
 location / {
     try_files $uri $uri/ @php;
index 256ca3e944245068bfc9e0e832502f239389571f..a2323cfeec931ff985551cde7f220cd1aaa792b6 100644 (file)
@@ -30,9 +30,9 @@ diffs for Ireland from Geofabrik add the following:
 
     # base URL of the replication service
     NOMINATIM_REPLICATION_URL="https://download.geofabrik.de/europe/ireland-and-northern-ireland-updates"
-    # How often upstream publishes diffs
+    # How often upstream publishes diffs (in seconds)
     NOMINATIM_REPLICATION_UPDATE_INTERVAL=86400
-    # How long to sleep if no update found yet
+    # How long to sleep if no update found yet (in seconds)
     NOMINATIM_REPLICATION_RECHECK_INTERVAL=900
 
 To set up the update process now run the following command:
index ec6876faa51bbd4b64402e1abaf3450993cdc81b..53ee49c043f53d80434887978434998987f9c830 100644 (file)
@@ -8,12 +8,14 @@ require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
 require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');
 
 class Geocode
 {
     protected $oDB;
 
     protected $oPlaceLookup;
+    protected $oTokenizer;
 
     protected $aLangPrefOrder = array();
 
@@ -41,23 +43,12 @@ class Geocode
     protected $sQuery = false;
     protected $aStructuredQuery = false;
 
-    protected $oNormalizer = null;
-
 
     public function __construct(&$oDB)
     {
         $this->oDB =& $oDB;
         $this->oPlaceLookup = new PlaceLookup($this->oDB);
-        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
-    }
-
-    private function normTerm($sTerm)
-    {
-        if ($this->oNormalizer === null) {
-            return $sTerm;
-        }
-
-        return $this->oNormalizer->transliterate($sTerm);
+        $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
     }
 
     public function setLanguagePreference($aLangPref)
@@ -510,12 +501,10 @@ class Geocode
         if ($this->aCountryCodes) {
             $oCtx->setCountryList($this->aCountryCodes);
         }
+        $this->oTokenizer->setCountryRestriction($this->aCountryCodes);
 
         Debug::newSection('Query Preprocessing');
 
-        $sNormQuery = $this->normTerm($this->sQuery);
-        Debug::printVar('Normalized query', $sNormQuery);
-
         $sLanguagePrefArraySQL = $this->oDB->getArraySQL(
             $this->oDB->getDBQuotedList($this->aLangPrefOrder)
         );
@@ -569,108 +558,55 @@ class Geocode
             }
 
             if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
-                $sSpecialTerm = pg_escape_string($sSpecialTerm);
-                $sToken = $this->oDB->getOne(
-                    'SELECT make_standard_name(:term)',
-                    array(':term' => $sSpecialTerm),
-                    'Cannot decode query. Wrong encoding?'
-                );
-                $sSQL = 'SELECT class, type FROM word ';
-                $sSQL .= '   WHERE word_token in (\' '.$sToken.'\')';
-                $sSQL .= '   AND class is not null AND class not in (\'place\')';
-
-                Debug::printSQL($sSQL);
-                $aSearchWords = $this->oDB->getAll($sSQL);
-                $aNewSearches = array();
-                foreach ($aSearches as $oSearch) {
-                    foreach ($aSearchWords as $aSearchTerm) {
-                        $oNewSearch = clone $oSearch;
-                        $oNewSearch->setPoiSearch(
-                            Operator::TYPE,
-                            $aSearchTerm['class'],
-                            $aSearchTerm['type']
-                        );
-                        $aNewSearches[] = $oNewSearch;
+                $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
+
+                if (!empty($aTokens)) {
+                    $aNewSearches = array();
+                    foreach ($aSearches as $oSearch) {
+                        foreach ($aTokens as $oToken) {
+                            $oNewSearch = clone $oSearch;
+                            $oNewSearch->setPoiSearch(
+                                $oToken->iOperator,
+                                $oToken->sClass,
+                                $oToken->sType
+                            );
+                            $aNewSearches[] = $oNewSearch;
+                        }
                     }
+                    $aSearches = $aNewSearches;
                 }
-                $aSearches = $aNewSearches;
             }
 
             // Split query into phrases
             // Commas are used to reduce the search space by indicating where phrases split
+            $aPhrases = array();
             if ($this->aStructuredQuery) {
-                $aInPhrases = $this->aStructuredQuery;
+                foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
+                    $aPhrases[] = new Phrase($sPhrase, $iPhrase);
+                }
             } else {
-                $aInPhrases = explode(',', $sQuery);
+                foreach (explode(',', $sQuery) as $sPhrase) {
+                    $aPhrases[] = new Phrase($sPhrase, '');
+                }
             }
 
             Debug::printDebugArray('Search context', $oCtx);
             Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
-            Debug::printVar('Final query phrases', $aInPhrases);
 
-            // Convert each phrase to standard form
-            // Create a list of standard words
-            // Get all 'sets' of words
-            // Generate a complete list of all
             Debug::newSection('Tokenization');
-            $aTokens = array();
-            $aPhrases = array();
-            foreach ($aInPhrases as $iPhrase => $sPhrase) {
-                $sPhrase = $this->oDB->getOne(
-                    'SELECT make_standard_name(:phrase)',
-                    array(':phrase' => $sPhrase),
-                    'Cannot normalize query string (is it a UTF-8 string?)'
-                );
-                if (trim($sPhrase)) {
-                    $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
-                    $oPhrase->addTokens($aTokens);
-                    $aPhrases[] = $oPhrase;
-                }
-            }
-
-            Debug::printVar('Tokens', $aTokens);
-
-            $oValidTokens = new TokenList();
-
-            if (!empty($aTokens)) {
-                $oValidTokens->addTokensFromDB(
-                    $this->oDB,
-                    $aTokens,
-                    $this->aCountryCodes,
-                    $sNormQuery,
-                    $this->oNormalizer
-                );
+            $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
 
+            if ($oValidTokens->count() > 0) {
                 $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
 
-                // Try more interpretations for Tokens that could not be matched.
-                foreach ($aTokens as $sToken) {
-                    if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                        if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
-                            // US ZIP+4 codes - merge in the 5-digit ZIP code
-                            $oValidTokens->addToken(
-                                $sToken,
-                                new Token\Postcode(null, $aData[1], 'us')
-                            );
-                        } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
-                            // Unknown single word token with a number.
-                            // Assume it is a house number.
-                            $oValidTokens->addToken(
-                                $sToken,
-                                new Token\HouseNumber(null, trim($sToken))
-                            );
-                        }
-                    }
-                }
+                $aPhrases = array_filter($aPhrases, function ($oPhrase) {
+                    return $oPhrase->getWordSets() !== null;
+                });
 
                 // Any words that have failed completely?
                 // TODO: suggestions
 
                 Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
-
-                foreach ($aPhrases as $oPhrase) {
-                    $oPhrase->computeWordSets($oValidTokens);
-                }
                 Debug::printDebugTable('Phrases', $aPhrases);
 
                 Debug::newSection('Search candidates');
@@ -829,7 +765,6 @@ class Geocode
                     foreach ($aResults as $oResult) {
                         if (($this->iMaxAddressRank == 30 &&
                              ($oResult->iTable == Result::TABLE_OSMLINE
-                              || $oResult->iTable == Result::TABLE_AUX
                               || $oResult->iTable == Result::TABLE_TIGER))
                             || in_array($oResult->iId, $aFilteredIDs)
                         ) {
index e2643e878ed40a3bb22533600603647418e98011..d14c842df809158aa9644e0badc90244b84f0508 100644 (file)
@@ -16,8 +16,6 @@ class Phrase
     private $sPhrase;
     // Element type for structured searches.
     private $sPhraseType;
-    // Space-separated words of the phrase.
-    private $aWords;
     // Possible segmentations of the phrase.
     private $aWordSets;
 
@@ -38,7 +36,14 @@ class Phrase
     {
         $this->sPhrase = trim($sPhrase);
         $this->sPhraseType = $sPhraseType;
-        $this->aWords = explode(' ', $this->sPhrase);
+    }
+
+    /**
+     * Get the orginal phrase of the string.
+     */
+    public function getPhrase()
+    {
+        return $this->sPhrase;
     }
 
     /**
@@ -63,30 +68,6 @@ class Phrase
         return $this->aWordSets;
     }
 
-    /**
-     * Add the tokens from this phrase to the given list of tokens.
-     *
-     * @param string[] $aTokens List of tokens to append.
-     *
-     * @return void
-     */
-    public function addTokens(&$aTokens)
-    {
-        $iNumWords = count($this->aWords);
-
-        for ($i = 0; $i < $iNumWords; $i++) {
-            $sPhrase = $this->aWords[$i];
-            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-            $aTokens[$sPhrase] = $sPhrase;
-
-            for ($j = $i + 1; $j < $iNumWords; $j++) {
-                $sPhrase .= ' '.$this->aWords[$j];
-                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-                $aTokens[$sPhrase] = $sPhrase;
-            }
-        }
-    }
-
     /**
      * Invert the set of possible segmentations.
      *
@@ -99,21 +80,27 @@ class Phrase
         }
     }
 
-    public function computeWordSets($oTokens)
+    public function computeWordSets($aWords, $oTokens)
     {
-        $iNumWords = count($this->aWords);
+        $iNumWords = count($aWords);
+
+        if ($iNumWords == 0) {
+            $this->aWordSets = null;
+            return;
+        }
+
         // Caches the word set for the partial phrase up to word i.
         $aSetCache = array_fill(0, $iNumWords, array());
 
         // Initialise first element of cache. There can only be the word.
-        if ($oTokens->containsAny($this->aWords[0])) {
-            $aSetCache[0][] = array($this->aWords[0]);
+        if ($oTokens->containsAny($aWords[0])) {
+            $aSetCache[0][] = array($aWords[0]);
         }
 
         // Now do the next elements using what we already have.
         for ($i = 1; $i < $iNumWords; $i++) {
             for ($j = $i; $j > 0; $j--) {
-                $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+                $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
                 if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
                     $aPartial = array($sPartial);
                     foreach ($aSetCache[$j - 1] as $aSet) {
@@ -136,7 +123,7 @@ class Phrase
             }
 
             // finally the current full phrase
-            $sPartial = $this->aWords[0].' '.$sPartial;
+            $sPartial = $aWords[0].' '.$sPartial;
             if ($oTokens->containsAny($sPartial)) {
                 $aSetCache[$i][] = array($sPartial);
             }
@@ -153,7 +140,6 @@ class Phrase
         return array(
                 'Type' => $this->sPhraseType,
                 'Phrase' => $this->sPhrase,
-                'Words' => $this->aWords,
                 'WordSets' => $this->aWordSets
                );
     }
index 6d7b6be1af4ed6dd04d5e1a6e79fb9050746252e..b9fa3b1c08c72b1ef200a426eb75f178dc709523 100644 (file)
@@ -373,42 +373,6 @@ class PlaceLookup
 
                 $aSubSelects[] = $sSQL;
             }
-
-            if (CONST_Use_Aux_Location_data) {
-                $sPlaceIDs = Result::joinIdsByTable($aResults, Result::TABLE_AUX);
-                if ($sPlaceIDs) {
-                    $sHousenumbers = Result::sqlHouseNumberTable($aResults, Result::TABLE_AUX);
-                    $sSQL = '  SELECT ';
-                    $sSQL .= "     'L' AS osm_type, ";
-                    $sSQL .= '     place_id AS osm_id, ';
-                    $sSQL .= "     'place' AS class,";
-                    $sSQL .= "     'house' AS type, ";
-                    $sSQL .= '     null::smallint AS admin_level, ';
-                    $sSQL .= '     30 AS rank_search,';
-                    $sSQL .= '     30 AS rank_address, ';
-                    $sSQL .= '     place_id,';
-                    $sSQL .= '     parent_place_id, ';
-                    $sSQL .= '     housenumber,';
-                    $sSQL .= "     'us' AS country_code, ";
-                    $sSQL .= $this->langAddressSql('-1');
-                    $sSQL .= '     null::text AS placename, ';
-                    $sSQL .= '     null::text AS ref, ';
-                    if ($this->bExtraTags) $sSQL .= 'null::text AS extra, ';
-                    if ($this->bNameDetails) $sSQL .= 'null::text AS names, ';
-                    $sSQL .= '     ST_X(centroid) AS lon, ';
-                    $sSQL .= '     ST_Y(centroid) AS lat, ';
-                    $sSQL .= '     -1.10 AS importance, ';
-                    $sSQL .= $this->addressImportanceSql(
-                        'centroid',
-                        'location_property_aux.parent_place_id'
-                    );
-                    $sSQL .= '     null::text AS extra_place ';
-                    $sSQL .= '  FROM location_property_aux ';
-                    $sSQL .= "  WHERE place_id in ($sPlaceIDs) ";
-
-                    $aSubSelects[] = $sSQL;
-                }
-            }
         }
 
         if (empty($aSubSelects)) {
index a7747ea34d6fee12e98b0a7eb31c9fbaccfb7fb8..be103074040da27e56dfc9a914baa868cc3da8e3 100644 (file)
@@ -13,8 +13,7 @@ class Result
     const TABLE_PLACEX = 0;
     const TABLE_POSTCODE = 1;
     const TABLE_OSMLINE = 2;
-    const TABLE_AUX = 3;
-    const TABLE_TIGER = 4;
+    const TABLE_TIGER = 3;
 
     /// Database table that contains the result.
     public $iTable;
index dd20550214325b952452ec98de3bf4a96351071e..189ffa74e99fbca744efc659f98f62426ca7afc2 100644 (file)
@@ -790,20 +790,6 @@ class SearchDescription
             }
         }
 
-        // If nothing found try the aux fallback table
-        if (CONST_Use_Aux_Location_data && empty($aResults)) {
-            $sSQL = 'SELECT place_id FROM location_property_aux';
-            $sSQL .= ' WHERE parent_place_id in ('.$sPlaceIDs.')';
-            $sSQL .= " AND housenumber = '".$this->sHouseNumber."'";
-            $sSQL .= $this->oContext->excludeSQL(' AND place_id');
-
-            Debug::printSQL($sSQL);
-
-            foreach ($oDB->getCol($sSQL) as $iPlaceId) {
-                $aResults[$iPlaceId] = new Result($iPlaceId, Result::TABLE_AUX);
-            }
-        }
-
         // If nothing found then search in Tiger data (location_property_tiger)
         if (CONST_Use_US_Tiger_Data && $bIsIntHouseNumber && empty($aResults)) {
             $sSQL = 'SELECT place_id FROM location_property_tiger';
index 2d9e78db42606f59a90cb2d6e18ad4d8774a6a2e..4a8f559226ca669b1c0c5b883c7d5be07daf6ca6 100644 (file)
@@ -2,6 +2,8 @@
 
 namespace Nominatim;
 
+require_once(CONST_TokenizerDir.'/tokenizer.php');
+
 use Exception;
 
 class Status
@@ -25,24 +27,8 @@ class Status
             throw new Exception('Database connection failed', 700);
         }
 
-        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
-        if ($sStandardWord === false) {
-            throw new Exception('Module failed', 701);
-        }
-
-        if ($sStandardWord != 'a') {
-            throw new Exception('Module call failed', 702);
-        }
-
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, ';
-        $sSQL .= "operator, search_name_count FROM word WHERE word_token IN (' a')";
-        $iWordID = $this->oDB->getOne($sSQL);
-        if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
-        }
-        if (!$iWordID) {
-            throw new Exception('No value', 704);
-        }
+        $oTokenizer = new \Nominatim\Tokenizer($this->oDB);
+        $oTokenizer->checkStatus();
     }
 
     public function dataDate()
@@ -51,7 +37,7 @@ class Status
         $iDataDateEpoch = $this->oDB->getOne($sSQL);
 
         if ($iDataDateEpoch === false) {
-            throw Exception('Data date query failed '.$iDataDateEpoch->getMessage(), 705);
+            throw new Exception('Import date is not available', 705);
         }
 
         return $iDataDateEpoch;
index a419da6a9ad3429bc9087070038d491392ba1785..2df9fe0586710f120c821b09f809f286cd616f44 100644 (file)
@@ -95,88 +95,6 @@ class TokenList
         return $ids;
     }
 
-    /**
-     * Add token information from the word table in the database.
-     *
-     * @param object   $oDB           Nominatim::DB instance.
-     * @param string[] $aTokens       List of tokens to look up in the database.
-     * @param string[] $aCountryCodes List of country restrictions.
-     * @param string   $sNormQuery    Normalized query string.
-     * @param object   $oNormalizer   Normalizer function to use on tokens.
-     *
-     * @return void
-     */
-    public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
-    {
-        // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
-        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
-        $sSQL .= ' FROM word WHERE word_token in (';
-        $sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';
-
-        Debug::printSQL($sSQL);
-
-        $aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');
-
-        foreach ($aDBWords as $aWord) {
-            $oToken = null;
-            $iId = (int) $aWord['word_id'];
-
-            if ($aWord['class']) {
-                // Special terms need to appear in their normalized form.
-                if ($aWord['word']) {
-                    $sNormWord = $aWord['word'];
-                    if ($oNormalizer != null) {
-                        $sNormWord = $oNormalizer->transliterate($aWord['word']);
-                    }
-                    if (strpos($sNormQuery, $sNormWord) === false) {
-                        continue;
-                    }
-                }
-
-                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
-                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
-                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
-                    if ($aWord['word']
-                        && pg_escape_string($aWord['word']) == $aWord['word']
-                    ) {
-                        $oToken = new Token\Postcode(
-                            $iId,
-                            $aWord['word'],
-                            $aWord['country_code']
-                        );
-                    }
-                } else {
-                    // near and in operator the same at the moment
-                    $oToken = new Token\SpecialTerm(
-                        $iId,
-                        $aWord['class'],
-                        $aWord['type'],
-                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
-                    );
-                }
-            } elseif ($aWord['country_code']) {
-                // Filter country tokens that do not match restricted countries.
-                if (!$aCountryCodes
-                    || in_array($aWord['country_code'], $aCountryCodes)
-                ) {
-                    $oToken = new Token\Country($iId, $aWord['country_code']);
-                }
-            } else {
-                $oToken = new Token\Word(
-                    $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
-                );
-            }
-
-            if ($oToken) {
-                $this->addToken($aWord['word_token'], $oToken);
-            }
-        }
-    }
-
     /**
      * Add a new token for the given word.
      *
index 35fd1184a579e7ebc0219b93a32c240465323c94..21121fbd316ac2269c137c8786361ac8da5735d1 100644 (file)
@@ -2,7 +2,6 @@
 @define('CONST_LibDir', dirname(dirname(__FILE__)));
 
 require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/ParameterParser.php');
 ini_set('memory_limit', '800M');
 
@@ -41,17 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
-@define('CONST_Use_Aux_Location_data', getSettingBool('USE_AUX_LOCATION_DATA'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
 
+require_once(CONST_LibDir.'/Geocode.php');
 
 $oDB = new Nominatim\DB;
 $oDB->connect();
index 827fd9868780214912e39fd0bac1d0184e34d612..d6aa3d9b0d5f0978045b037427520a059a936c3b 100644 (file)
@@ -3,7 +3,6 @@
 
 require_once(CONST_LibDir.'/init-cmd.php');
 require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/PlaceLookup.php');
 require_once(CONST_LibDir.'/ReverseGeocode.php');
 
@@ -26,17 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
-@define('CONST_Use_Aux_Location_data', getSettingBool('USE_AUX_LOCATION_DATA'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
 
+require_once(CONST_LibDir.'/Geocode.php');
 
 $oDB = new Nominatim\DB();
 $oDB->connect();
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
new file mode 100644 (file)
index 0000000..09cfe70
--- /dev/null
@@ -0,0 +1,238 @@
+<?php
+
+namespace Nominatim;
+
+class Tokenizer
+{
+    private $oDB;
+
+    private $oNormalizer;
+    private $oTransliterator;
+    private $aCountryRestriction;
+
+    public function __construct(&$oDB)
+    {
+        $this->oDB =& $oDB;
+        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
+        $this->oTransliterator = \Transliterator::createFromRules(CONST_Transliteration);
+    }
+
+    public function checkStatus()
+    {
+        $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
+        $iWordID = $this->oDB->getOne($sSQL);
+        if ($iWordID === false) {
+            throw new Exception('Query failed', 703);
+        }
+        if (!$iWordID) {
+            throw new Exception('No value', 704);
+        }
+    }
+
+
+    public function setCountryRestriction($aCountries)
+    {
+        $this->aCountryRestriction = $aCountries;
+    }
+
+
+    public function normalizeString($sTerm)
+    {
+        if ($this->oNormalizer === null) {
+            return $sTerm;
+        }
+
+        return $this->oNormalizer->transliterate($sTerm);
+    }
+
+    private function makeStandardWord($sTerm)
+    {
+        $sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' ';
+
+        return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
+    }
+
+
+    public function tokensForSpecialTerm($sTerm)
+    {
+        $aResults = array();
+
+        $sSQL = 'SELECT word_id, class, type FROM word ';
+        $sSQL .= '   WHERE word_token = \' \' || :term';
+        $sSQL .= '   AND class is not null AND class not in (\'place\')';
+
+        Debug::printVar('Term', $sTerm);
+        Debug::printSQL($sSQL);
+        $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $this->makeStandardWord($sTerm)));
+
+        Debug::printVar('Results', $aSearchWords);
+
+        foreach ($aSearchWords as $aSearchTerm) {
+            $aResults[] = new \Nominatim\Token\SpecialTerm(
+                $aSearchTerm['word_id'],
+                $aSearchTerm['class'],
+                $aSearchTerm['type'],
+                \Nominatim\Operator::TYPE
+            );
+        }
+
+        Debug::printVar('Special term tokens', $aResults);
+
+        return $aResults;
+    }
+
+
+    public function extractTokensFromPhrases(&$aPhrases)
+    {
+        $sNormQuery = '';
+        $aWordLists = array();
+        $aTokens = array();
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
+            $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
+            $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
+            if (strlen($sPhrase) > 0) {
+                $aWords = explode(' ', $sPhrase);
+                Tokenizer::addTokens($aTokens, $aWords);
+                $aWordLists[] = $aWords;
+            } else {
+                $aWordLists[] = array();
+            }
+        }
+
+        Debug::printVar('Tokens', $aTokens);
+        Debug::printVar('WordLists', $aWordLists);
+
+        $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
+
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
+            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+        }
+
+        return $oValidTokens;
+    }
+
+
+    private function computeValidTokens($aTokens, $sNormQuery)
+    {
+        $oValidTokens = new TokenList();
+
+        if (!empty($aTokens)) {
+            $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
+
+            // Try more interpretations for Tokens that could not be matched.
+            foreach ($aTokens as $sToken) {
+                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                        // US ZIP+4 codes - merge in the 5-digit ZIP code
+                        $oValidTokens->addToken(
+                            $sToken,
+                            new Token\Postcode(null, $aData[1], 'us')
+                        );
+                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                        // Unknown single word token with a number.
+                        // Assume it is a house number.
+                        $oValidTokens->addToken(
+                            $sToken,
+                            new Token\HouseNumber(null, trim($sToken))
+                        );
+                    }
+                }
+            }
+        }
+
+        return $oValidTokens;
+    }
+
+
+    private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
+    {
+        // Check which tokens we have, get the ID numbers
+        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
+        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
+        $sSQL .= ' FROM word WHERE word_token in (';
+        $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
+
+        Debug::printSQL($sSQL);
+
+        $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
+
+        foreach ($aDBWords as $aWord) {
+            $oToken = null;
+            $iId = (int) $aWord['word_id'];
+
+            if ($aWord['class']) {
+                // Special terms need to appear in their normalized form.
+                // (postcodes are not normalized in the word table)
+                $sNormWord = $this->normalizeString($aWord['word']);
+                if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
+                    continue;
+                }
+
+                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
+                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
+                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
+                    if ($aWord['word']
+                        && pg_escape_string($aWord['word']) == $aWord['word']
+                    ) {
+                        $oToken = new Token\Postcode(
+                            $iId,
+                            $aWord['word'],
+                            $aWord['country_code']
+                        );
+                    }
+                } else {
+                    // near and in operator the same at the moment
+                    $oToken = new Token\SpecialTerm(
+                        $iId,
+                        $aWord['class'],
+                        $aWord['type'],
+                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
+                    );
+                }
+            } elseif ($aWord['country_code']) {
+                // Filter country tokens that do not match restricted countries.
+                if (!$this->aCountryRestriction
+                    || in_array($aWord['country_code'], $this->aCountryRestriction)
+                ) {
+                    $oToken = new Token\Country($iId, $aWord['country_code']);
+                }
+            } else {
+                $oToken = new Token\Word(
+                    $iId,
+                    $aWord['word_token'][0] != ' ',
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
+                );
+            }
+
+            if ($oToken) {
+                $oValidTokens->addToken($aWord['word_token'], $oToken);
+            }
+        }
+    }
+
+
+    /**
+     * Add the tokens from this phrase to the given list of tokens.
+     *
+     * @param string[] $aTokens List of tokens to append.
+     *
+     * @return void
+     */
+    private static function addTokens(&$aTokens, $aWords)
+    {
+        $iNumWords = count($aWords);
+
+        for ($i = 0; $i < $iNumWords; $i++) {
+            $sPhrase = $aWords[$i];
+            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+            $aTokens[$sPhrase] = $sPhrase;
+
+            for ($j = $i + 1; $j < $iNumWords; $j++) {
+                $sPhrase .= ' '.$aWords[$j];
+                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+                $aTokens[$sPhrase] = $sPhrase;
+            }
+        }
+    }
+}
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
new file mode 100644 (file)
index 0000000..0fb37fd
--- /dev/null
@@ -0,0 +1,254 @@
+<?php
+
+namespace Nominatim;
+
+class Tokenizer
+{
+    private $oDB;
+
+    private $oNormalizer = null;
+    private $aCountryRestriction = null;
+
+    public function __construct(&$oDB)
+    {
+        $this->oDB =& $oDB;
+        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
+    }
+
+    public function checkStatus()
+    {
+        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
+        if ($sStandardWord === false) {
+            throw new Exception('Module failed', 701);
+        }
+
+        if ($sStandardWord != 'a') {
+            throw new Exception('Module call failed', 702);
+        }
+
+        $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
+        $iWordID = $this->oDB->getOne($sSQL);
+        if ($iWordID === false) {
+            throw new Exception('Query failed', 703);
+        }
+        if (!$iWordID) {
+            throw new Exception('No value', 704);
+        }
+    }
+
+
+    public function setCountryRestriction($aCountries)
+    {
+        $this->aCountryRestriction = $aCountries;
+    }
+
+
+    public function normalizeString($sTerm)
+    {
+        if ($this->oNormalizer === null) {
+            return $sTerm;
+        }
+
+        return $this->oNormalizer->transliterate($sTerm);
+    }
+
+
+    public function tokensForSpecialTerm($sTerm)
+    {
+        $aResults = array();
+
+        $sSQL = 'SELECT word_id, class, type FROM word ';
+        $sSQL .= '   WHERE word_token = \' \' || make_standard_name(:term)';
+        $sSQL .= '   AND class is not null AND class not in (\'place\')';
+
+        Debug::printVar('Term', $sTerm);
+        Debug::printSQL($sSQL);
+        $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm));
+
+        Debug::printVar('Results', $aSearchWords);
+
+        foreach ($aSearchWords as $aSearchTerm) {
+            $aResults[] = new \Nominatim\Token\SpecialTerm(
+                $aSearchTerm['word_id'],
+                $aSearchTerm['class'],
+                $aSearchTerm['type'],
+                \Nominatim\Operator::TYPE
+            );
+        }
+
+        Debug::printVar('Special term tokens', $aResults);
+
+        return $aResults;
+    }
+
+
+    public function extractTokensFromPhrases(&$aPhrases)
+    {
+        // First get the normalized version of all phrases
+        $sNormQuery = '';
+        $sSQL = 'SELECT ';
+        $aParams = array();
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
+            $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
+            $sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
+            $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
+        }
+        $sSQL = substr($sSQL, 0, -1);
+
+        Debug::printSQL($sSQL);
+        Debug::printVar('SQL parameters', $aParams);
+
+        $aNormPhrases = $this->oDB->getRow($sSQL, $aParams);
+
+        Debug::printVar('SQL result', $aNormPhrases);
+
+        // now compute all possible tokens
+        $aWordLists = array();
+        $aTokens = array();
+        foreach ($aNormPhrases as $sTitle => $sPhrase) {
+            if (strlen($sPhrase) > 0) {
+                $aWords = explode(' ', $sPhrase);
+                Tokenizer::addTokens($aTokens, $aWords);
+                $aWordLists[] = $aWords;
+            } else {
+                $aWordLists[] = array();
+            }
+        }
+
+        Debug::printVar('Tokens', $aTokens);
+        Debug::printVar('WordLists', $aWordLists);
+
+        $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
+
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
+            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+        }
+
+        return $oValidTokens;
+    }
+
+
+    private function computeValidTokens($aTokens, $sNormQuery)
+    {
+        $oValidTokens = new TokenList();
+
+        if (!empty($aTokens)) {
+            $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
+
+            // Try more interpretations for Tokens that could not be matched.
+            foreach ($aTokens as $sToken) {
+                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                        // US ZIP+4 codes - merge in the 5-digit ZIP code
+                        $oValidTokens->addToken(
+                            $sToken,
+                            new Token\Postcode(null, $aData[1], 'us')
+                        );
+                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                        // Unknown single word token with a number.
+                        // Assume it is a house number.
+                        $oValidTokens->addToken(
+                            $sToken,
+                            new Token\HouseNumber(null, trim($sToken))
+                        );
+                    }
+                }
+            }
+        }
+
+        return $oValidTokens;
+    }
+
+
+    private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
+    {
+        // Check which tokens we have, get the ID numbers
+        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
+        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
+        $sSQL .= ' FROM word WHERE word_token in (';
+        $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
+
+        Debug::printSQL($sSQL);
+
+        $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
+
+        foreach ($aDBWords as $aWord) {
+            $oToken = null;
+            $iId = (int) $aWord['word_id'];
+
+            if ($aWord['class']) {
+                // Special terms need to appear in their normalized form.
+                // (postcodes are not normalized in the word table)
+                $sNormWord = $this->normalizeString($aWord['word']);
+                if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
+                    continue;
+                }
+
+                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
+                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
+                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
+                    if ($aWord['word']
+                        && pg_escape_string($aWord['word']) == $aWord['word']
+                    ) {
+                        $oToken = new Token\Postcode(
+                            $iId,
+                            $aWord['word'],
+                            $aWord['country_code']
+                        );
+                    }
+                } else {
+                    // near and in operator the same at the moment
+                    $oToken = new Token\SpecialTerm(
+                        $iId,
+                        $aWord['class'],
+                        $aWord['type'],
+                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
+                    );
+                }
+            } elseif ($aWord['country_code']) {
+                // Filter country tokens that do not match restricted countries.
+                if (!$this->aCountryRestriction
+                    || in_array($aWord['country_code'], $this->aCountryRestriction)
+                ) {
+                    $oToken = new Token\Country($iId, $aWord['country_code']);
+                }
+            } else {
+                $oToken = new Token\Word(
+                    $iId,
+                    $aWord['word_token'][0] != ' ',
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
+                );
+            }
+
+            if ($oToken) {
+                $oValidTokens->addToken($aWord['word_token'], $oToken);
+            }
+        }
+    }
+
+
+    /**
+     * Add the tokens from this phrase to the given list of tokens.
+     *
+     * @param string[] $aTokens List of tokens to append.
+     *
+     * @return void
+     */
+    private static function addTokens(&$aTokens, $aWords)
+    {
+        $iNumWords = count($aWords);
+
+        for ($i = 0; $i < $iNumWords; $i++) {
+            $sPhrase = $aWords[$i];
+            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+            $aTokens[$sPhrase] = $sPhrase;
+
+            for ($j = $i + 1; $j < $iNumWords; $j++) {
+                $sPhrase .= ' '.$aWords[$j];
+                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+                $aTokens[$sPhrase] = $sPhrase;
+            }
+        }
+    }
+}
index bd7df12c23a7cc936423d930335458e27c187860..55a088d1994ab8ee9555211c4d9927d78c910d6b 100644 (file)
@@ -106,11 +106,6 @@ if ($iParentPlaceID) $iPlaceID = $iParentPlaceID;
 $iParentPlaceID = $oDB->getOne('SELECT parent_place_id FROM location_postcode WHERE place_id = '.$iPlaceID);
 if ($iParentPlaceID) $iPlaceID = $iParentPlaceID;
 
-if (CONST_Use_Aux_Location_data) {
-    $iParentPlaceID = $oDB->getOne('SELECT parent_place_id FROM location_property_aux WHERE place_id = '.$iPlaceID);
-    if ($iParentPlaceID) $iPlaceID = $iParentPlaceID;
-}
-
 $hLog = logStart($oDB, 'details', $_SERVER['QUERY_STRING'], $aLangPrefOrder);
 
 // Get the details for this point
index 7c7eb9281c5e449fb09a52f18ce8c1153273e7e7..03e56f655439aa6b13aeb132b92dcb247de190a6 100644 (file)
@@ -17,6 +17,23 @@ if ($sOutputFormat == 'json') {
 try {
     $oStatus = new Nominatim\Status($oDB);
     $oStatus->status();
+
+    if ($sOutputFormat == 'json') {
+        $epoch = $oStatus->dataDate();
+        $aResponse = array(
+                      'status' => 0,
+                      'message' => 'OK',
+                      'data_updated' => (new DateTime('@'.$epoch))->format(DateTime::RFC3339),
+                      'software_version' => CONST_NominatimVersion
+                     );
+        $sDatabaseVersion = $oStatus->databaseVersion();
+        if ($sDatabaseVersion) {
+            $aResponse['database_version'] = $sDatabaseVersion;
+        }
+        javascript_renderData($aResponse);
+    } else {
+        echo 'OK';
+    }
 } catch (Exception $oErr) {
     if ($sOutputFormat == 'json') {
         $aResponse = array(
@@ -28,25 +45,4 @@ try {
         header('HTTP/1.0 500 Internal Server Error');
         echo 'ERROR: '.$oErr->getMessage();
     }
-    exit;
 }
-
-
-if ($sOutputFormat == 'json') {
-    $epoch = $oStatus->dataDate();
-    $aResponse = array(
-                  'status' => 0,
-                  'message' => 'OK',
-                  'data_updated' => (new DateTime('@'.$epoch))->format(DateTime::RFC3339),
-                  'software_version' => CONST_NominatimVersion
-                 );
-    $sDatabaseVersion = $oStatus->databaseVersion();
-    if ($sDatabaseVersion) {
-        $aResponse['database_version'] = $sDatabaseVersion;
-    }
-    javascript_renderData($aResponse);
-} else {
-    echo 'OK';
-}
-
-exit;
diff --git a/lib-sql/aux_tables.sql b/lib-sql/aux_tables.sql
deleted file mode 100644 (file)
index 8105473..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-CREATE TABLE location_property_aux () INHERITS (location_property);
-CREATE INDEX idx_location_property_aux_place_id ON location_property_aux USING BTREE (place_id);
-CREATE INDEX idx_location_property_aux_parent_place_id ON location_property_aux USING BTREE (parent_place_id);
-CREATE INDEX idx_location_property_aux_housenumber_parent_place_id ON location_property_aux USING BTREE (parent_place_id, housenumber);
-GRANT SELECT ON location_property_aux TO "{www-user}";
-
index 750af9f0f5a9efa79e208f18b6295f92a3b248b9..e9419ca2ea92947a2a3f5fe34379092022a8dc51 100644 (file)
@@ -1,5 +1,4 @@
 {% include('functions/utils.sql') %}
-{% include('functions/normalization.sql') %}
 {% include('functions/ranking.sql') %}
 {% include('functions/importance.sql') %}
 {% include('functions/address_lookup.sql') %}
index 03b0ea54d9f68105027e22ab3725f936b74fe1c3..b6c552c492421a66d5254f5d1970ea72831ed1f4 100644 (file)
@@ -135,20 +135,6 @@ BEGIN
   END IF;
   {% endif %}
 
-  -- then additional data
-  {% if config.get_bool('USE_AUX_LOCATION_DATA') %}
-  IF place IS NULL THEN
-    SELECT parent_place_id as place_id, 'us' as country_code,
-           housenumber, postcode,
-           'place' as class, 'house' as type,
-           null as name, null as address,
-           centroid
-      INTO place
-      FROM location_property_aux
-      WHERE place_id = in_place_id;
-  END IF;
-  {% endif %}
-
   -- postcode table
   IF place IS NULL THEN
     SELECT parent_place_id as place_id, country_code,
diff --git a/lib-sql/functions/aux_property.sql b/lib-sql/functions/aux_property.sql
deleted file mode 100644 (file)
index 6dd99eb..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
--- Functions for adding external data (currently unused).
-
-CREATE OR REPLACE FUNCTION aux_create_property(pointgeo GEOMETRY, in_housenumber TEXT,
-                                               in_street TEXT, in_isin TEXT,
-                                               in_postcode TEXT, in_countrycode char(2))
-  RETURNS INTEGER
-  AS $$
-DECLARE
-
-  newpoints INTEGER;
-  place_centroid GEOMETRY;
-  out_partition INTEGER;
-  out_parent_place_id BIGINT;
-  location RECORD;
-  address_street_word_ids INTEGER[];
-  out_postcode TEXT;
-
-BEGIN
-
-  place_centroid := ST_Centroid(pointgeo);
-  out_partition := get_partition(in_countrycode);
-  out_parent_place_id := null;
-
-  address_street_word_ids := word_ids_from_name(in_street);
-  IF address_street_word_ids IS NOT NULL THEN
-    out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
-                                                      address_street_word_ids);
-  END IF;
-
-  IF out_parent_place_id IS NULL THEN
-    SELECT getNearestRoadPlaceId(out_partition, place_centroid)
-      INTO out_parent_place_id;
-    END LOOP;
-  END IF;
-
-  out_postcode := in_postcode;
-  IF out_postcode IS NULL THEN
-    SELECT postcode from placex where place_id = out_parent_place_id INTO out_postcode;
-  END IF;
-  -- XXX look into postcode table
-
-  newpoints := 0;
-  insert into location_property_aux (place_id, partition, parent_place_id,
-                                     housenumber, postcode, centroid)
-    values (nextval('seq_place'), out_partition, out_parent_place_id,
-            in_housenumber, out_postcode, place_centroid);
-  newpoints := newpoints + 1;
-
-  RETURN newpoints;
-END;
-$$
-LANGUAGE plpgsql;
-
index a797cad3ac1de74b3500eb53d90bbd2111b826f1..55e44dfd646e05f497658ba401207b0fd2194b67 100644 (file)
@@ -12,39 +12,47 @@ $$
 LANGUAGE plpgsql IMMUTABLE;
 
 
+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+  AS $$
+DECLARE
+  location RECORD;
+  waynodes BIGINT[];
+BEGIN
+  IF akeys(in_address) != ARRAY['interpolation'] THEN
+    RETURN in_address;
+  END IF;
+
+  SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+  FOR location IN
+    SELECT placex.address, placex.osm_id FROM placex
+     WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+           and placex.address is not null
+           and (placex.address ? 'street' or placex.address ? 'place')
+           and indexed_status < 100
+  LOOP
+    -- mark it as a derived address
+    RETURN location.address || in_address || hstore('_inherited', '');
+  END LOOP;
+
+  RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
-                                                    place TEXT, partition SMALLINT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+                                                    partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
-  addr_street TEXT;
-  addr_place TEXT;
   parent_place_id BIGINT;
-
-  waynodes BIGINT[];
-
   location RECORD;
 BEGIN
-  addr_street = street;
-  addr_place = place;
-
-  IF addr_street is null and addr_place is null THEN
-    select nodes from planet_osm_ways where id = wayid INTO waynodes;
-    FOR location IN SELECT placex.address from placex
-                    where osm_type = 'N' and osm_id = ANY(waynodes)
-                          and placex.address is not null
-                          and (placex.address ? 'street' or placex.address ? 'place')
-                          and indexed_status < 100
-                    limit 1 LOOP
-      addr_street = location.address->'street';
-      addr_place = location.address->'place';
-    END LOOP;
-  END IF;
-
-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             partition, centroid);
+  parent_place_id := find_parent_for_address(street, place, partition, centroid);
 
   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -147,15 +155,15 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';
 
   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
-                                                 NEW.address->'place',
+  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
+                                                 token_addr_place_match_tokens(NEW.token_info),
                                                  NEW.partition, place_centroid, NEW.linegeo);
 
-  IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
-    interpol_postcode := NEW.address->'postcode';
-    housenum := getorcreate_postcode_id(NEW.address->'postcode');
-  ELSE
-    interpol_postcode := NULL;
+  interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
+
+  NEW.token_info := token_strip_info(NEW.token_info);
+  IF NEW.address ? '_inherited' THEN
+    NEW.address := hstore('interpolation', NEW.interpolationtype);
   END IF;
 
   -- if the line was newly inserted, split the line as necessary
@@ -202,12 +210,13 @@ BEGIN
 
             -- determine postcode
             postcode := coalesce(interpol_postcode,
-                                 prevnode.address->'postcode',
-                                 nextnode.address->'postcode',
+                                 token_normalized_postcode(prevnode.address->'postcode'),
+                                 token_normalized_postcode(nextnode.address->'postcode'),
                                  postcode);
 
             IF postcode is NULL THEN
-                SELECT placex.postcode FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+                SELECT token_normalized_postcode(placex.postcode)
+                  FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
             END IF;
             IF postcode is NULL THEN
                 postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);
@@ -217,7 +226,7 @@ BEGIN
                 NEW.startnumber := startnumber;
                 NEW.endnumber := endnumber;
                 NEW.linegeo := sectiongeo;
-                NEW.postcode := upper(trim(postcode));
+                NEW.postcode := postcode;
              ELSE
               insert into location_property_osmline
                      (linegeo, partition, osm_id, parent_place_id,
diff --git a/lib-sql/functions/normalization.sql b/lib-sql/functions/normalization.sql
deleted file mode 100644 (file)
index f283f91..0000000
+++ /dev/null
@@ -1,545 +0,0 @@
--- Functions for term normalisation and access to the 'word' table.
-
-CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
-  AS '{{ modulepath }}/nominatim.so', 'transliteration'
-LANGUAGE c IMMUTABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
-  AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
-LANGUAGE c IMMUTABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
-  AS $$
-DECLARE
-  o TEXT;
-BEGIN
-  o := public.gettokenstring(public.transliteration(name));
-  RETURN trim(substr(o,1,length(o)));
-END;
-$$
-LANGUAGE plpgsql IMMUTABLE;
-
--- returns NULL if the word is too common
-CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT) 
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-  count INTEGER;
-BEGIN
-  lookup_token := trim(lookup_word);
-  SELECT min(word_id), max(search_name_count) FROM word
-    WHERE word_token = lookup_token and class is null and type is null
-    INTO return_word_id, count;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
-  ELSE
-    IF count > get_maxwordfreq() THEN
-      return_word_id := NULL;
-    END IF;
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
--- Create housenumber tokens from an OSM addr:housenumber.
--- The housnumber is split at comma and semicolon as necessary.
--- The function returns the normalized form of the housenumber suitable
--- for comparison.
-CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
-  RETURNS TEXT
-  AS $$
-DECLARE
-  normtext TEXT;
-BEGIN
-  SELECT array_to_string(array_agg(trans), ';')
-    INTO normtext
-    FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
-          FROM (SELECT make_standard_name(h) as lookup_word
-                FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
-
-  return normtext;
-END;
-$$ LANGUAGE plpgsql STABLE STRICT;
-
-CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' ' || trim(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and class='place' and type='house'
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null,
-                             'place', 'house', null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  lookup_word TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_word := upper(trim(postcode));
-  lookup_token := ' ' || make_standard_name(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and word = lookup_word
-          and class='place' and type='postcode'
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
-                             'place', 'postcode', null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
-                                               lookup_country_code varchar(2))
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and country_code=lookup_country_code
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null,
-                             null, null, lookup_country_code, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
-                                               lookup_class text, lookup_type text)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-  WHERE word_token = lookup_token and word = normalized_word
-        and class = lookup_class and type = lookup_type
-  INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
-                             lookup_class, lookup_type, null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
-                                                       normalized_word TEXT,
-                                                       lookup_class text,
-                                                       lookup_type text,
-                                                       op text)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-  WHERE word_token = lookup_token and word = normalized_word
-        and class = lookup_class and type = lookup_type and operator = op
-  INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
-                             lookup_class, lookup_type, null, 0, op);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  nospace_lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-  WHERE word_token = lookup_token and class is null and type is null
-  INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
-                             null, null, null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-BEGIN
-  RETURN getorcreate_name_id(lookup_word, '');
-END;
-$$
-LANGUAGE plpgsql;
-
--- Normalize a string and lookup its word ids (partial words).
-CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
-  RETURNS INTEGER[]
-  AS $$
-DECLARE
-  words TEXT[];
-  id INTEGER;
-  return_word_id INTEGER[];
-  word_ids INTEGER[];
-  j INTEGER;
-BEGIN
-  words := string_to_array(make_standard_name(lookup_word), ' ');
-  IF array_upper(words, 1) IS NOT NULL THEN
-    FOR j IN 1..array_upper(words, 1) LOOP
-      IF (words[j] != '') THEN
-        SELECT array_agg(word_id) INTO word_ids
-          FROM word
-         WHERE word_token = words[j] and class is null and type is null;
-
-        IF word_ids IS NULL THEN
-          id := nextval('seq_word');
-          INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
-          return_word_id := return_word_id || id;
-        ELSE
-          return_word_id := array_merge(return_word_id, word_ids);
-        END IF;
-      END IF;
-    END LOOP;
-  END IF;
-
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
--- Normalize a string and look up its name ids (full words).
-CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
-  RETURNS INTEGER[]
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_ids INTEGER[];
-BEGIN
-  lookup_token := ' '|| make_standard_name(lookup_word);
-  SELECT array_agg(word_id) FROM word
-    WHERE word_token = lookup_token and class is null and type is null
-    INTO return_word_ids;
-  RETURN return_word_ids;
-END;
-$$
-LANGUAGE plpgsql STABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
-  RETURNS VOID
-  AS $$
-DECLARE
-  s TEXT;
-  w INTEGER;
-  words TEXT[];
-  item RECORD;
-  j INTEGER;
-BEGIN
-  FOR item IN SELECT (each(src)).* LOOP
-
-    s := make_standard_name(item.value);
-    w := getorcreate_country(s, country_code);
-
-    words := regexp_split_to_array(item.value, E'[,;()]');
-    IF array_upper(words, 1) != 1 THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        s := make_standard_name(words[j]);
-        IF s != '' THEN
-          w := getorcreate_country(s, country_code);
-        END IF;
-      END LOOP;
-    END IF;
-  END LOOP;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
-  RETURNS INTEGER[]
-  AS $$
-DECLARE
-  result INTEGER[];
-  s TEXT;
-  w INTEGER;
-  words TEXT[];
-  item RECORD;
-  j INTEGER;
-BEGIN
-  result := '{}'::INTEGER[];
-
-  FOR item IN SELECT (each(src)).* LOOP
-
-    s := make_standard_name(item.value);
-    w := getorcreate_name_id(s, item.value);
-
-    IF not(ARRAY[w] <@ result) THEN
-      result := result || w;
-    END IF;
-
-    w := getorcreate_word_id(s);
-
-    IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-      result := result || w;
-    END IF;
-
-    words := string_to_array(s, ' ');
-    IF array_upper(words, 1) IS NOT NULL THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        IF (words[j] != '') THEN
-          w = getorcreate_word_id(words[j]);
-          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-            result := result || w;
-          END IF;
-        END IF;
-      END LOOP;
-    END IF;
-
-    words := regexp_split_to_array(item.value, E'[,;()]');
-    IF array_upper(words, 1) != 1 THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        s := make_standard_name(words[j]);
-        IF s != '' THEN
-          w := getorcreate_word_id(s);
-          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-            result := result || w;
-          END IF;
-        END IF;
-      END LOOP;
-    END IF;
-
-    s := regexp_replace(item.value, '市$', '');
-    IF s != item.value THEN
-      s := make_standard_name(s);
-      IF s != '' THEN
-        w := getorcreate_name_id(s, item.value);
-        IF NOT (ARRAY[w] <@ result) THEN
-          result := result || w;
-        END IF;
-      END IF;
-    END IF;
-
-  END LOOP;
-
-  RETURN result;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION make_keywords(src TEXT)
-  RETURNS INTEGER[]
-  AS $$
-DECLARE
-  result INTEGER[];
-  s TEXT;
-  w INTEGER;
-  words TEXT[];
-  i INTEGER;
-  j INTEGER;
-BEGIN
-  result := '{}'::INTEGER[];
-
-  s := make_standard_name(src);
-  w := getorcreate_name_id(s, src);
-
-  IF NOT (ARRAY[w] <@ result) THEN
-    result := result || w;
-  END IF;
-
-  w := getorcreate_word_id(s);
-
-  IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-    result := result || w;
-  END IF;
-
-  words := string_to_array(s, ' ');
-  IF array_upper(words, 1) IS NOT NULL THEN
-    FOR j IN 1..array_upper(words, 1) LOOP
-      IF (words[j] != '') THEN
-        w = getorcreate_word_id(words[j]);
-        IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-          result := result || w;
-        END IF;
-      END IF;
-    END LOOP;
-  END IF;
-
-  words := regexp_split_to_array(src, E'[,;()]');
-  IF array_upper(words, 1) != 1 THEN
-    FOR j IN 1..array_upper(words, 1) LOOP
-      s := make_standard_name(words[j]);
-      IF s != '' THEN
-        w := getorcreate_word_id(s);
-        IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-          result := result || w;
-        END IF;
-      END IF;
-    END LOOP;
-  END IF;
-
-  s := regexp_replace(src, '市$', '');
-  IF s != src THEN
-    s := make_standard_name(s);
-    IF s != '' THEN
-      w := getorcreate_name_id(s, src);
-      IF NOT (ARRAY[w] <@ result) THEN
-        result := result || w;
-      END IF;
-    END IF;
-  END IF;
-
-  RETURN result;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
-                                                   in_partition SMALLINT,
-                                                   parent_place_id BIGINT,
-                                                   address HSTORE,
-                                                   country TEXT,
-                                                   housenumber TEXT,
-                                                   initial_name_vector INTEGER[],
-                                                   geometry GEOMETRY,
-                                                   OUT name_vector INTEGER[],
-                                                   OUT nameaddress_vector INTEGER[])
-  AS $$
-DECLARE
-  parent_name_vector INTEGER[];
-  parent_address_vector INTEGER[];
-  addr_place_ids INTEGER[];
-
-  addr_item RECORD;
-  parent_address_place_ids BIGINT[];
-  filtered_address HSTORE;
-BEGIN
-  nameaddress_vector := '{}'::INTEGER[];
-
-  SELECT s.name_vector, s.nameaddress_vector
-    INTO parent_name_vector, parent_address_vector
-    FROM search_name s
-    WHERE s.place_id = parent_place_id;
-
-  -- Find all address tags that don't appear in the parent search names.
-  SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
-    FROM (SELECT skeys(address) as k, svals(address) as v) a
-   WHERE not addr_ids_from_name(v) && parent_address_vector
-         AND k not in ('country', 'street', 'place', 'postcode',
-                       'housenumber', 'streetnumber', 'conscriptionnumber');
-
-  -- Compute all search terms from the addr: tags.
-  IF filtered_address IS NOT NULL THEN
-    FOR addr_item IN
-      SELECT * FROM
-        get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
-    LOOP
-        IF addr_item.place_id is null THEN
-            nameaddress_vector := array_merge(nameaddress_vector,
-                                              addr_item.keywords);
-            CONTINUE;
-        END IF;
-
-        IF parent_address_place_ids is null THEN
-            SELECT array_agg(parent_place_id) INTO parent_address_place_ids
-              FROM place_addressline
-             WHERE place_id = parent_place_id;
-        END IF;
-
-        IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
-            nameaddress_vector := array_merge(nameaddress_vector,
-                                              addr_item.keywords);
-
-            INSERT INTO place_addressline (place_id, address_place_id, fromarea,
-                                           isaddress, distance, cached_rank_address)
-            VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
-                    true, addr_item.distance, addr_item.rank_address);
-        END IF;
-    END LOOP;
-  END IF;
-
-  name_vector := initial_name_vector;
-
-  -- Check if the parent covers all address terms.
-  -- If not, create a search name entry with the house number as the name.
-  -- This is unusual for the search_name table but prevents that the place
-  -- is returned when we only search for the street/place.
-
-  IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
-    name_vector := array_merge(name_vector,
-                               ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
-  END IF;
-
-  IF not address ? 'street' and address ? 'place' THEN
-    addr_place_ids := addr_ids_from_name(address->'place');
-    IF not addr_place_ids <@ parent_name_vector THEN
-      -- make sure addr:place terms are always searchable
-      nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-      -- If there is a housenumber, also add the place name as a name,
-      -- so we can search it by the usual housenumber+place algorithms.
-      IF housenumber is not null THEN
-        name_vector := array_merge(name_vector,
-                                   ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
-      END IF;
-    END IF;
-  END IF;
-
-  -- Cheating here by not recomputing all terms but simply using the ones
-  -- from the parent object.
-  nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
-  nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
-
-END;
-$$
-LANGUAGE plpgsql;
index cfa151de14c8aa268ebad34b70901e94c38058f5..53aba22c90a3e15290c76ad125c0c7f11982ae52 100644 (file)
@@ -63,54 +63,36 @@ END
 $$
 LANGUAGE plpgsql STABLE;
 
-CREATE OR REPLACE FUNCTION get_places_for_addr_tags(in_partition SMALLINT,
-                                                    feature GEOMETRY,
-                                                    address HSTORE, country TEXT)
-  RETURNS SETOF nearfeaturecentr
+
+CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
+                                             from_rank SMALLINT, to_rank SMALLINT,
+                                             extent FLOAT, tokens INT[])
+  RETURNS nearfeaturecentr
   AS $$
 DECLARE
   r nearfeaturecentr%rowtype;
-  item RECORD;
 BEGIN
-  FOR item IN
-    SELECT (get_addr_tag_rank(key, country)).*, key, name FROM
-      (SELECT skeys(address) as key, svals(address) as name) x
-  LOOP
-   IF item.from_rank is null THEN
-     CONTINUE;
-   END IF;
-
 {% for partition in db.partitions %}
-    IF in_partition = {{ partition }} THEN
-        SELECT place_id, keywords, rank_address, rank_search,
-               min(ST_Distance(feature, centroid)) as distance,
-               isguess, postcode, centroid INTO r
+  IF in_partition = {{ partition }} THEN
+      SELECT place_id, keywords, rank_address, rank_search,
+             min(ST_Distance(feature, centroid)) as distance,
+             isguess, postcode, centroid INTO r
         FROM location_area_large_{{ partition }}
-        WHERE geometry && ST_Expand(feature, item.extent)
-          AND rank_address between item.from_rank and item.to_rank
-          AND word_ids_from_name(item.name) && keywords
+        WHERE geometry && ST_Expand(feature, extent)
+              AND rank_address between from_rank and to_rank
+              AND tokens && keywords
         GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
         ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
-      IF r.place_id is null THEN
-        -- If we cannot find a place for the term, just return the
-        -- search term for the given name. That ensures that the address
-        -- element can still be searched for, even though it will not be
-        -- displayed.
-        RETURN NEXT ROW(null, addr_ids_from_name(item.name), null, null,
-                        null, null, null, null)::nearfeaturecentr;
-      ELSE
-        RETURN NEXT r;
-      END IF;
-      CONTINUE;
-    END IF;
+      RETURN r;
+  END IF;
 {% endfor %}
 
-    RAISE EXCEPTION 'Unknown partition %', in_partition;
-  END LOOP;
+  RAISE EXCEPTION 'Unknown partition %', in_partition;
 END;
 $$
 LANGUAGE plpgsql STABLE;
 
+
 create or replace function deleteLocationArea(in_partition INTEGER, in_place_id BIGINT, in_rank_search INTEGER) RETURNS BOOLEAN AS $$
 DECLARE
 BEGIN
index 812bc79ff5c44314858af1ae350ec7d2c8cd0597..9a31f3ae327c338dc0d1538f68b51e03f1591148 100644 (file)
@@ -1,5 +1,84 @@
 -- Trigger functions for the placex table.
 
+-- Retrieve the data needed by the indexer for updating the place.
+--
+-- Return parameters:
+--  name            list of names
+--  address         list of address tags, either from the object or a surrounding
+--                  building
+--  country_feature If the place is a country feature, this contains the
+--                  country code, otherwise it is null.
+CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
+                                                 OUT name HSTORE,
+                                                 OUT address HSTORE,
+                                                 OUT country_feature VARCHAR)
+  AS $$
+BEGIN
+  -- For POI nodes, check if the address should be derived from a surrounding
+  -- building.
+  IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
+    address := p.address;
+  ELSE
+    -- The additional && condition works around the misguided query
+    -- planner of postgis 3.0.
+    SELECT placex.address || hstore('_inherited', '') INTO address
+      FROM placex
+     WHERE ST_Covers(geometry, p.centroid)
+           and geometry && p.centroid
+           and placex.address is not null
+           and (placex.address ? 'housenumber' or placex.address ? 'street' or placex.address ? 'place')
+           and rank_search = 30 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
+     LIMIT 1;
+  END IF;
+
+  address := address - '_unlisted_place'::TEXT;
+  name := p.name;
+
+  country_feature := CASE WHEN p.admin_level = 2
+                               and p.class = 'boundary' and p.type = 'administrative'
+                               and p.osm_type = 'R'
+                          THEN p.country_code
+                          ELSE null
+                     END;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+CREATE OR REPLACE FUNCTION find_associated_street(poi_osm_type CHAR(1),
+                                                  poi_osm_id BIGINT)
+  RETURNS BIGINT
+  AS $$
+DECLARE
+  location RECORD;
+  parent RECORD;
+BEGIN
+  FOR location IN
+    SELECT members FROM planet_osm_rels
+    WHERE parts @> ARRAY[poi_osm_id]
+          and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
+          and tags @> ARRAY['associatedStreet']
+  LOOP
+    FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
+      IF location.members[i+1] = 'street' THEN
+        FOR parent IN
+          SELECT place_id from placex
+           WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
+                 and name is not null
+                 and rank_search between 26 and 27
+        LOOP
+          RETURN parent.place_id;
+        END LOOP;
+      END IF;
+    END LOOP;
+  END LOOP;
+
+  RETURN NULL;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
 -- Find the parent road of a POI.
 --
 -- \returns Place ID of parent object or NULL if none
@@ -10,118 +89,89 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street TEXT,
-                                               addr_place TEXT,
-                                               fallback BOOL = true)
+                                               addr_street INTEGER[],
+                                               addr_place INTEGER[],
+                                               is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent_place_id BIGINT DEFAULT NULL;
   location RECORD;
-  parent RECORD;
 BEGIN
-    {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
+  {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
+
+  -- Is this object part of an associatedStreet relation?
+  parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
 
-    -- Is this object part of an associatedStreet relation?
+  IF parent_place_id is null THEN
+    parent_place_id := find_parent_for_address(addr_street, addr_place,
+                                               poi_partition, bbox);
+  END IF;
+
+  IF parent_place_id is null and poi_osm_type = 'N' THEN
+    -- Is this node part of an interpolation?
     FOR location IN
-      SELECT members FROM planet_osm_rels
-      WHERE parts @> ARRAY[poi_osm_id]
-        and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
-        and tags @> ARRAY['associatedStreet']
+      SELECT q.parent_place_id
+        FROM location_property_osmline q, planet_osm_ways x
+       WHERE q.linegeo && bbox and x.id = q.osm_id
+             and poi_osm_id = any(x.nodes)
+       LIMIT 1
     LOOP
-      FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
-        IF location.members[i+1] = 'street' THEN
-          FOR parent IN
-            SELECT place_id from placex
-             WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
-               and name is not null
-               and rank_search between 26 and 27
-          LOOP
-            RETURN parent.place_id;
-          END LOOP;
-        END IF;
-      END LOOP;
+      {% if debug %}RAISE WARNING 'Get parent from interpolation: %', location.parent_place_id;{% endif %}
+      RETURN location.parent_place_id;
     END LOOP;
 
-    parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                               poi_partition, bbox);
-    IF parent_place_id is not null THEN
-      RETURN parent_place_id;
-    END IF;
+    FOR location IN
+      SELECT p.place_id, p.osm_id, p.rank_search, p.address,
+             coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
+        FROM placex p, planet_osm_ways w
+       WHERE p.osm_type = 'W' and p.rank_search >= 26
+             and p.geometry && bbox
+             and w.id = p.osm_id and poi_osm_id = any(w.nodes)
+    LOOP
+      {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}
+
+      -- Way IS a road then we are on it - that must be our road
+      IF location.rank_search < 28 THEN
+        {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
+        RETURN location.place_id;
+      END IF;
+
+      parent_place_id := find_associated_street('W', location.osm_id);
+    END LOOP;
+  END IF;
 
-    IF poi_osm_type = 'N' THEN
-      -- Is this node part of an interpolation?
-      FOR parent IN
-        SELECT q.parent_place_id
-          FROM location_property_osmline q, planet_osm_ways x
-         WHERE q.linegeo && bbox and x.id = q.osm_id
-               and poi_osm_id = any(x.nodes)
-         LIMIT 1
+  IF parent_place_id is NULL THEN
+    IF is_place_addr THEN
+      -- The address is attached to a place we don't know.
+      -- Instead simply use the containing area with the largest rank.
+      FOR location IN
+        SELECT place_id FROM placex
+         WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
+               AND rank_address between 5 and 25
+         ORDER BY rank_address desc
       LOOP
-        {% if debug %}RAISE WARNING 'Get parent from interpolation: %', parent.parent_place_id;{% endif %}
-        RETURN parent.parent_place_id;
+        RETURN location.place_id;
       END LOOP;
-
-      -- Is this node part of any other way?
+    ELSEIF ST_Area(bbox) < 0.005 THEN
+      -- for smaller features get the nearest road
+      SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
+      {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
+    ELSE
+      -- for larger features simply find the area with the largest rank that
+      -- contains the bbox, only use addressable features
       FOR location IN
-        SELECT p.place_id, p.osm_id, p.rank_search, p.address,
-               coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
-          FROM placex p, planet_osm_ways w
-         WHERE p.osm_type = 'W' and p.rank_search >= 26
-               and p.geometry && bbox
-               and w.id = p.osm_id and poi_osm_id = any(w.nodes)
+        SELECT place_id FROM placex
+         WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
+               AND rank_address between 5 and 25
+        ORDER BY rank_address desc
       LOOP
-        {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}
-
-        -- Way IS a road then we are on it - that must be our road
-        IF location.rank_search < 28 THEN
-          {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
-          return location.place_id;
-        END IF;
-
-        SELECT find_parent_for_poi('W', location.osm_id, poi_partition,
-                                   location.centroid,
-                                   location.address->'street',
-                                   location.address->'place',
-                                   false)
-          INTO parent_place_id;
-        IF parent_place_id is not null THEN
-          RETURN parent_place_id;
-        END IF;
+        RETURN location.place_id;
       END LOOP;
     END IF;
+  END IF;
 
-    IF fallback THEN
-      IF addr_street is null and addr_place is not null THEN
-        -- The address is attached to a place we don't know.
-        -- Instead simply use the containing area with the largest rank.
-        FOR location IN
-          SELECT place_id FROM placex
-            WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
-                  AND rank_address between 5 and 25
-            ORDER BY rank_address desc
-        LOOP
-            RETURN location.place_id;
-        END LOOP;
-      ELSEIF ST_Area(bbox) < 0.005 THEN
-        -- for smaller features get the nearest road
-        SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
-        {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
-      ELSE
-        -- for larger features simply find the area with the largest rank that
-        -- contains the bbox, only use addressable features
-        FOR location IN
-          SELECT place_id FROM placex
-            WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
-                  AND rank_address between 5 and 25
-            ORDER BY rank_address desc
-        LOOP
-            RETURN location.place_id;
-        END LOOP;
-      END IF;
-    END IF;
-
-    RETURN parent_place_id;
+  RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
@@ -240,6 +290,101 @@ $$
 LANGUAGE plpgsql STABLE;
 
 
+CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
+                                                   in_partition SMALLINT,
+                                                   parent_place_id BIGINT,
+                                                   is_place_addr BOOLEAN,
+                                                   country TEXT,
+                                                   token_info JSONB,
+                                                   geometry GEOMETRY,
+                                                   OUT name_vector INTEGER[],
+                                                   OUT nameaddress_vector INTEGER[])
+  AS $$
+DECLARE
+  parent_name_vector INTEGER[];
+  parent_address_vector INTEGER[];
+  addr_place_ids INTEGER[];
+  hnr_vector INTEGER[];
+
+  addr_item RECORD;
+  addr_place RECORD;
+  parent_address_place_ids BIGINT[];
+BEGIN
+  nameaddress_vector := '{}'::INTEGER[];
+
+  SELECT s.name_vector, s.nameaddress_vector
+    INTO parent_name_vector, parent_address_vector
+    FROM search_name s
+    WHERE s.place_id = parent_place_id;
+
+  FOR addr_item IN
+    SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+      FROM token_get_address_tokens(token_info)
+      WHERE not search_tokens <@ parent_address_vector
+  LOOP
+    addr_place := get_address_place(in_partition, geometry,
+                                    addr_item.from_rank, addr_item.to_rank,
+                                    addr_item.extent, addr_item.match_tokens);
+
+    IF addr_place is null THEN
+      -- No place found in OSM that matches. Make it at least searchable.
+      nameaddress_vector := array_merge(nameaddress_vector, addr_item.search_tokens);
+    ELSE
+      IF parent_address_place_ids is null THEN
+        SELECT array_agg(parent_place_id) INTO parent_address_place_ids
+          FROM place_addressline
+          WHERE place_id = parent_place_id;
+      END IF;
+
+      -- If the parent already lists the place in place_addressline, then we
+      -- are done. Otherwise, add a new place_addressline entry.
+      IF not parent_address_place_ids @> ARRAY[addr_place.place_id] THEN
+        nameaddress_vector := array_merge(nameaddress_vector, addr_place.keywords);
+
+        INSERT INTO place_addressline (place_id, address_place_id, fromarea,
+                                       isaddress, distance, cached_rank_address)
+          VALUES (obj_place_id, addr_place.place_id, not addr_place.isguess,
+                    true, addr_place.distance, addr_place.rank_address);
+      END IF;
+    END IF;
+  END LOOP;
+
+  name_vector := token_get_name_search_tokens(token_info);
+
+  -- Check if the parent covers all address terms.
+  -- If not, create a search name entry with the house number as the name.
+  -- This is unusual for the search_name table but prevents that the place
+  -- is returned when we only search for the street/place.
+
+  hnr_vector := token_get_housenumber_search_tokens(token_info);
+
+  IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
+    name_vector := array_merge(name_vector, hnr_vector);
+  END IF;
+
+  IF is_place_addr THEN
+    addr_place_ids := token_addr_place_search_tokens(token_info);
+    IF not addr_place_ids <@ parent_name_vector THEN
+      -- make sure addr:place terms are always searchable
+      nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
+      -- If there is a housenumber, also add the place name as a name,
+      -- so we can search it by the usual housenumber+place algorithms.
+      IF hnr_vector is not null THEN
+        name_vector := array_merge(name_vector, addr_place_ids);
+      END IF;
+    END IF;
+  END IF;
+
+  -- Cheating here by not recomputing all terms but simply using the ones
+  -- from the parent object.
+  nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
+  nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
+
+END;
+$$
+LANGUAGE plpgsql;
+
+
 -- Insert address of a place into the place_addressline table.
 --
 -- \param obj_place_id  Place_id of the place to compute the address for.
@@ -260,7 +405,7 @@ LANGUAGE plpgsql STABLE;
 CREATE OR REPLACE FUNCTION insert_addresslines(obj_place_id BIGINT,
                                                partition SMALLINT,
                                                maxrank SMALLINT,
-                                               address HSTORE,
+                                               token_info JSONB,
                                                geometry GEOMETRY,
                                                country TEXT,
                                                OUT parent_place_id BIGINT,
@@ -275,7 +420,8 @@ DECLARE
   current_node_area GEOMETRY := NULL;
 
   parent_place_rank INT := 0;
-  addr_place_ids BIGINT[];
+  addr_place_ids BIGINT[] := '{}'::int[];
+  new_address_vector INT[];
 
   location RECORD;
 BEGIN
@@ -285,16 +431,21 @@ BEGIN
   address_havelevel := array_fill(false, ARRAY[maxrank]);
 
   FOR location IN
-    SELECT * FROM get_places_for_addr_tags(partition, geometry,
-                                                   address, country)
-    ORDER BY rank_address, distance, isguess desc
+    SELECT (get_address_place(partition, geometry, from_rank, to_rank,
+                              extent, match_tokens)).*, search_tokens
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+              FROM token_get_address_tokens(token_info)) x
+      ORDER BY rank_address, distance, isguess desc
   LOOP
-    {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector,
-                                        location.keywords::int[]);
-    {% endif %}
+    IF location.place_id is null THEN
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      {% endif %}
+    ELSE
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.keywords::INTEGER[]);
+      {% endif %}
 
-    IF location.place_id is not null THEN
       location_isaddress := not address_havelevel[location.rank_address];
       IF not address_havelevel[location.rank_address] THEN
         address_havelevel[location.rank_address] := true;
@@ -309,13 +460,13 @@ BEGIN
         VALUES (obj_place_id, location.place_id, not location.isguess,
                 true, location.distance, location.rank_address);
 
-      addr_place_ids := array_append(addr_place_ids, location.place_id);
+      addr_place_ids := addr_place_ids || location.place_id;
     END IF;
   END LOOP;
 
   FOR location IN
     SELECT * FROM getNearFeatures(partition, geometry, maxrank)
-    WHERE addr_place_ids is null or not addr_place_ids @> ARRAY[place_id]
+    WHERE not addr_place_ids @> ARRAY[place_id]
     ORDER BY rank_address, isguess asc,
              distance *
                CASE WHEN rank_address = 16 AND rank_search = 15 THEN 0.2
@@ -397,10 +548,11 @@ BEGIN
   NEW.place_id := nextval('seq_place');
   NEW.indexed_status := 1; --STATUS_NEW
 
-  NEW.country_code := lower(get_country_code(NEW.geometry));
+  NEW.centroid := ST_PointOnSurface(NEW.geometry);
+  NEW.country_code := lower(get_country_code(NEW.centroid));
 
   NEW.partition := get_partition(NEW.country_code);
-  NEW.geometry_sector := geometry_sector(NEW.partition, NEW.geometry);
+  NEW.geometry_sector := geometry_sector(NEW.partition, NEW.centroid);
 
   IF NEW.osm_type = 'X' THEN
     -- E'X'ternal records should already be in the right format so do nothing
@@ -522,8 +674,8 @@ DECLARE
   parent_address_level SMALLINT;
   place_address_level SMALLINT;
 
-  addr_street TEXT;
-  addr_place TEXT;
+  addr_street INTEGER[];
+  addr_place INTEGER[];
 
   max_rank SMALLINT;
 
@@ -531,12 +683,11 @@ DECLARE
   nameaddress_vector INTEGER[];
   addr_nameaddress_vector INTEGER[];
 
-  inherited_address HSTORE;
-
   linked_node_id BIGINT;
   linked_importance FLOAT;
   linked_wikipedia TEXT;
 
+  is_place_address BOOLEAN;
   result BOOLEAN;
 BEGIN
   -- deferred delete
@@ -566,9 +717,9 @@ BEGIN
   -- update not necessary for osmline, cause linked_place_id does not exist
 
   NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
-  NEW.address := NEW.address - '_unlisted_place'::TEXT;
 
   IF NEW.linked_place_id is not null THEN
+    NEW.token_info := null;
     {% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
     RETURN NEW;
   END IF;
@@ -579,13 +730,34 @@ BEGIN
   -- imported as place=postcode. That's why relations are allowed to pass here.
   -- This can go away in a couple of versions.
   IF NEW.class = 'place'  and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN
+    NEW.token_info := null;
     RETURN NEW;
   END IF;
 
-  -- Speed up searches - just use the centroid of the feature
-  -- cheaper but less acurate
+  -- Compute a preliminary centroid.
   NEW.centroid := ST_PointOnSurface(NEW.geometry);
-  {% if debug %}RAISE WARNING 'Computing preliminary centroid at %',ST_AsText(NEW.centroid);{% endif %}
+
+    -- recalculate country and partition
+  IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
+    -- for countries, believe the mapped country code,
+    -- so that we remain in the right partition if the boundaries
+    -- suddenly expand.
+    NEW.country_code := lower(NEW.address->'country');
+    NEW.partition := get_partition(lower(NEW.country_code));
+    IF NEW.partition = 0 THEN
+      NEW.country_code := lower(get_country_code(NEW.centroid));
+      NEW.partition := get_partition(NEW.country_code);
+    END IF;
+  ELSE
+    IF NEW.rank_search >= 4 THEN
+      NEW.country_code := lower(get_country_code(NEW.centroid));
+    ELSE
+      NEW.country_code := NULL;
+    END IF;
+    NEW.partition := get_partition(NEW.country_code);
+  END IF;
+  {% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
+
 
   -- recompute the ranks, they might change when linking changes
   SELECT * INTO NEW.rank_search, NEW.rank_address
@@ -665,54 +837,12 @@ BEGIN
     parent_address_level := 3;
   END IF;
 
-  {% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
-  -- housenumber is a computed field, so start with an empty value
-  NEW.housenumber := NULL;
-  IF NEW.address is not NULL THEN
-      IF NEW.address ? 'conscriptionnumber' THEN
-        IF NEW.address ? 'streetnumber' THEN
-            NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
-        ELSE
-            NEW.housenumber := NEW.address->'conscriptionnumber';
-        END IF;
-      ELSEIF NEW.address ? 'streetnumber' THEN
-        NEW.housenumber := NEW.address->'streetnumber';
-      ELSEIF NEW.address ? 'housenumber' THEN
-        NEW.housenumber := NEW.address->'housenumber';
-      END IF;
-      NEW.housenumber := create_housenumber_id(NEW.housenumber);
-
-      addr_street := NEW.address->'street';
-      addr_place := NEW.address->'place';
-
-      IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
-        i := getorcreate_postcode_id(NEW.address->'postcode');
-      END IF;
-  END IF;
+  NEW.housenumber := token_normalized_housenumber(NEW.token_info);
+  addr_street := token_addr_street_match_tokens(NEW.token_info);
+  addr_place := token_addr_place_match_tokens(NEW.token_info);
 
   NEW.postcode := null;
 
-  -- recalculate country and partition
-  IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
-    -- for countries, believe the mapped country code,
-    -- so that we remain in the right partition if the boundaries
-    -- suddenly expand.
-    NEW.country_code := lower(NEW.address->'country');
-    NEW.partition := get_partition(lower(NEW.country_code));
-    IF NEW.partition = 0 THEN
-      NEW.country_code := lower(get_country_code(NEW.centroid));
-      NEW.partition := get_partition(NEW.country_code);
-    END IF;
-  ELSE
-    IF NEW.rank_search >= 4 THEN
-      NEW.country_code := lower(get_country_code(NEW.centroid));
-    ELSE
-      NEW.country_code := NULL;
-    END IF;
-    NEW.partition := get_partition(NEW.country_code);
-  END IF;
-  {% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
-
   -- waterway ways are linked when they are part of a relation and have the same class/type
   IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN
       FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[]
@@ -749,33 +879,14 @@ BEGIN
 
     {% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %}
     NEW.parent_place_id := null;
-
-    -- if we have a POI and there is no address information,
-    -- see if we can get it from a surrounding building
-    inherited_address := ''::HSTORE;
-    IF NEW.osm_type = 'N' AND addr_street IS NULL AND addr_place IS NULL
-       AND NEW.housenumber IS NULL THEN
-      FOR location IN
-        -- The additional && condition works around the misguided query
-        -- planner of postgis 3.0.
-        SELECT address from placex where ST_Covers(geometry, NEW.centroid)
-            and geometry && NEW.centroid
-            and (address ? 'housenumber' or address ? 'street' or address ? 'place')
-            and rank_search > 28 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
-            limit 1
-      LOOP
-        NEW.housenumber := location.address->'housenumber';
-        addr_street := location.address->'street';
-        addr_place := location.address->'place';
-        inherited_address := location.address;
-      END LOOP;
-    END IF;
+    is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE);
 
     -- We have to find our parent road.
     NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                                NEW.partition,
                                                ST_Envelope(NEW.geometry),
-                                               addr_street, addr_place);
+                                               addr_street, addr_place,
+                                               is_place_address);
 
     -- If we found the road take a shortcut here.
     -- Otherwise fall back to the full address getting method below.
@@ -785,12 +896,12 @@ BEGIN
       SELECT p.country_code, p.postcode, p.name FROM placex p
        WHERE p.place_id = NEW.parent_place_id INTO location;
 
-      IF addr_street is null and addr_place is not null THEN
+      IF is_place_address THEN
         -- Check if the addr:place tag is part of the parent name
         SELECT count(*) INTO i
-          FROM svals(location.name) AS pname WHERE pname = addr_place;
+          FROM svals(location.name) AS pname WHERE pname = NEW.address->'place';
         IF i = 0 THEN
-          NEW.address = NEW.address || hstore('_unlisted_place', addr_place);
+          NEW.address = NEW.address || hstore('_unlisted_place', NEW.address->'place');
         END IF;
       END IF;
 
@@ -798,39 +909,21 @@ BEGIN
       {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
 
       -- determine postcode
-      IF NEW.address is not null AND NEW.address ? 'postcode' THEN
-          NEW.postcode = upper(trim(NEW.address->'postcode'));
-      ELSE
-         NEW.postcode := location.postcode;
-      END IF;
-      IF NEW.postcode is null THEN
-        NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
-      END IF;
+      NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+                               location.postcode,
+                               get_nearest_postcode(NEW.country_code, NEW.geometry));
 
       IF NEW.name is not NULL THEN
           NEW.name := add_default_place_name(NEW.country_code, NEW.name);
-          name_vector := make_keywords(NEW.name);
-
-          IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
-            result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
-                                   name_vector, NEW.rank_search, NEW.rank_address,
-                                   upper(trim(NEW.address->'postcode')), NEW.geometry,
-                                   NEW.centroid);
-            {% if debug %}RAISE WARNING 'Place added to location table';{% endif %}
-          END IF;
-
       END IF;
 
       {% if not db.reverse_only %}
-      IF array_length(name_vector, 1) is not NULL
-         OR inherited_address is not NULL OR NEW.address is not NULL
-      THEN
+      IF NEW.name is not NULL OR NEW.address is not NULL THEN
         SELECT * INTO name_vector, nameaddress_vector
           FROM create_poi_search_terms(NEW.place_id,
                                        NEW.partition, NEW.parent_place_id,
-                                       inherited_address || NEW.address,
-                                       NEW.country_code, NEW.housenumber,
-                                       name_vector, NEW.centroid);
+                                       is_place_address, NEW.country_code,
+                                       NEW.token_info, NEW.centroid);
 
         IF array_length(name_vector, 1) is not NULL THEN
           INSERT INTO search_name (place_id, search_rank, address_rank,
@@ -844,6 +937,17 @@ BEGIN
       END IF;
       {% endif %}
 
+      NEW.token_info := token_strip_info(NEW.token_info);
+      -- If the address was inherited from a surrounding building,
+      -- do not add it permanently to the table.
+      IF NEW.address ? '_inherited' THEN
+        IF NEW.address ? '_unlisted_place' THEN
+          NEW.address := hstore('_unlisted_place', NEW.address->'_unlisted_place');
+        ELSE
+          NEW.address := null;
+        END IF;
+      END IF;
+
       RETURN NEW;
     END IF;
 
@@ -914,19 +1018,11 @@ BEGIN
     END IF;
   END IF;
 
-  -- Initialise the name vector using our name
-  NEW.name := add_default_place_name(NEW.country_code, NEW.name);
-  name_vector := make_keywords(NEW.name);
-
-  -- make sure all names are in the word table
   IF NEW.admin_level = 2
      AND NEW.class = 'boundary' AND NEW.type = 'administrative'
      AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R'
   THEN
-    PERFORM create_country(NEW.name, lower(NEW.country_code));
-    {% if debug %}RAISE WARNING 'Country names updated';{% endif %}
-
-    -- Also update the list of country names. Adding an additional sanity
+    -- Update the list of country names. Adding an additional sanity
     -- check here: make sure the country does overlap with the area where
     -- we expect it to be as per static country grid.
     FOR location IN
@@ -959,29 +1055,28 @@ BEGIN
   ELSEIF NEW.rank_address > 25 THEN
     max_rank := 25;
   ELSE
-    max_rank = NEW.rank_address;
+    max_rank := NEW.rank_address;
   END IF;
 
   SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank,
-                                    NEW.address, geom, NEW.country_code)
+                                    NEW.token_info, geom, NEW.country_code)
     INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector;
 
   {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
 
-  IF NEW.address is not null AND NEW.address ? 'postcode' 
-     AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
-    NEW.postcode := upper(trim(NEW.address->'postcode'));
-  END IF;
-
-  IF NEW.postcode is null AND NEW.rank_search > 8 THEN
-    NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
-  END IF;
+  NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+                           NEW.postcode);
 
   -- if we have a name add this to the name search table
   IF NEW.name IS NOT NULL THEN
+    -- Initialise the name vector using our name
+    NEW.name := add_default_place_name(NEW.country_code, NEW.name);
+    name_vector := token_get_name_search_tokens(NEW.token_info);
 
     IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
-      result := add_location(NEW.place_id, NEW.country_code, NEW.partition, name_vector, NEW.rank_search, NEW.rank_address, upper(trim(NEW.address->'postcode')), NEW.geometry, NEW.centroid);
+      result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
+                             name_vector, NEW.rank_search, NEW.rank_address,
+                             NEW.postcode, NEW.geometry, NEW.centroid);
       {% if debug %}RAISE WARNING 'added to location (full)';{% endif %}
     END IF;
 
@@ -990,8 +1085,11 @@ BEGIN
       {% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %}
     END IF;
 
-    result := insertSearchName(NEW.partition, NEW.place_id, name_vector,
-                               NEW.rank_search, NEW.rank_address, NEW.geometry);
+    IF NEW.rank_address between 16 and 27 THEN
+      result := insertSearchName(NEW.partition, NEW.place_id,
+                                 token_get_name_match_tokens(NEW.token_info),
+                                 NEW.rank_search, NEW.rank_address, NEW.geometry);
+    END IF;
     {% if debug %}RAISE WARNING 'added to search name (full)';{% endif %}
 
     {% if not db.reverse_only %}
@@ -1002,11 +1100,15 @@ BEGIN
                        NEW.importance, NEW.country_code, name_vector,
                        nameaddress_vector, NEW.centroid);
     {% endif %}
+  END IF;
 
+  IF NEW.postcode is null AND NEW.rank_search > 8 THEN
+    NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
   END IF;
 
   {% if debug %}RAISE WARNING 'place update % % finsihed.', NEW.osm_type, NEW.osm_id;{% endif %}
 
+  NEW.token_info := token_strip_info(NEW.token_info);
   RETURN NEW;
 END;
 $$
index 4868b828e4bf6e988f1daf601f7655ac57bcba87..c308d0259b8505887d8c8fdc9a85630a20b3313f 100644 (file)
@@ -221,37 +221,30 @@ LANGUAGE plpgsql STABLE;
 -- \param centroid   Location of the address.
 --
 -- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street TEXT, place TEXT,
+CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
                                                    partition SMALLINT,
                                                    centroid GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent_place_id BIGINT;
-  word_ids INTEGER[];
 BEGIN
   IF street is not null THEN
     -- Check for addr:street attributes
     -- Note that addr:street links can only be indexed, once the street itself is indexed
-    word_ids := word_ids_from_name(street);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
     END IF;
   END IF;
 
   -- Check for addr:place attributes.
   IF place is not null THEN
-    word_ids := word_ids_from_name(place);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
     END IF;
   END IF;
 
index a6f7cf95fcb6c7e0346ee0cdc8ca54fe2a77be81..81299544573c0c4c1ffea2850b8be54477f6a2bb 100644 (file)
@@ -1,9 +1,6 @@
 -- Indices used only during search and update.
 -- These indices are created only after the indexing process is done.
 
-CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
-  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
-
 CREATE INDEX {{sql.if_index_not_exists}} idx_place_addressline_address_place_id
   ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};
 
index aa213dbaccb4ebd49436fc68853ed0ab8d9c6789..9732c26cb3b82623e2fe5dd799d94fe24492b226 100644 (file)
@@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
 );
 GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";
 
-drop table IF EXISTS word;
-CREATE TABLE word (
-  word_id INTEGER,
-  word_token text,
-  word text,
-  class text,
-  type text,
-  country_code varchar(2),
-  search_name_count INTEGER,
-  operator TEXT
-  ) {{db.tablespace.search_data}};
-CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-
 drop table IF EXISTS location_area CASCADE;
 CREATE TABLE location_area (
   place_id BIGINT,
@@ -84,22 +68,6 @@ CREATE TABLE location_area_country (
 CREATE INDEX idx_location_area_country_geometry ON location_area_country USING GIST (geometry) {{db.tablespace.address_index}};
 
 
-drop table IF EXISTS location_property CASCADE;
-CREATE TABLE location_property (
-  place_id BIGINT,
-  parent_place_id BIGINT,
-  partition SMALLINT,
-  housenumber TEXT,
-  postcode TEXT,
-  centroid GEOMETRY(Point, 4326)
-  );
-
-CREATE TABLE location_property_aux () INHERITS (location_property);
-CREATE INDEX idx_location_property_aux_place_id ON location_property_aux USING BTREE (place_id);
-CREATE INDEX idx_location_property_aux_parent_place_id ON location_property_aux USING BTREE (parent_place_id);
-CREATE INDEX idx_location_property_aux_housenumber_parent_place_id ON location_property_aux USING BTREE (parent_place_id, housenumber);
-GRANT SELECT ON location_property_aux TO "{{config.DATABASE_WEBUSER}}";
-
 CREATE TABLE location_property_tiger (
   place_id BIGINT,
   parent_place_id BIGINT,
@@ -125,6 +93,7 @@ CREATE TABLE location_property_osmline (
     linegeo GEOMETRY,
     interpolationtype TEXT,
     address HSTORE,
+    token_info JSONB, -- custom column for tokenizer use only
     postcode TEXT,
     country_code VARCHAR(2)
   ){{db.tablespace.search_data}};
@@ -174,6 +143,7 @@ CREATE TABLE placex (
   indexed_status SMALLINT,
   LIKE place INCLUDING CONSTRAINTS,
   wikipedia TEXT, -- calculated wikipedia article name (language:title)
+  token_info JSONB, -- custom column for tokenizer use only
   country_code varchar(2),
   housenumber TEXT,
   postcode TEXT,
@@ -184,6 +154,10 @@ CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tabl
 CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
 CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
 CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
+CREATE INDEX idx_placex_geometry_buildings ON placex
+  USING GIST (geometry) {{db.tablespace.search_index}}
+  WHERE address is not null and rank_search = 30
+        and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
 CREATE INDEX idx_placex_geometry_placenode ON placex
   USING GIST (geometry) {{db.tablespace.search_index}}
   WHERE osm_type = 'N' and rank_search < 26
@@ -194,7 +168,6 @@ DROP SEQUENCE IF EXISTS seq_place;
 CREATE SEQUENCE seq_place start 1;
 GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;
diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
new file mode 100644 (file)
index 0000000..8fd0ede
--- /dev/null
@@ -0,0 +1,134 @@
+-- Get tokens used for searching the given place.
+--
+-- These are the tokens that will be saved in the search_name table.
+CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Get tokens for matching the place name against others.
+--
+-- This should usually be restricted to full name tokens.
+CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber tokens applicable for the place.
+CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'hnr_tokens')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber in the form that it can be matched during search.
+CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
+  RETURNS TEXT
+AS $$
+  SELECT info->>'hnr';
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'street')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'place_match')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'place_search')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+DROP TYPE IF EXISTS token_addresstoken CASCADE;
+CREATE TYPE token_addresstoken AS (
+  key TEXT,
+  match_tokens INT[],
+  search_tokens INT[]
+);
+
+CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
+  RETURNS SETOF token_addresstoken
+AS $$
+  SELECT key, (value->>1)::int[] as match_tokens,
+         (value->>0)::int[] as search_tokens
+  FROM jsonb_each(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+  RETURNS TEXT
+AS $$
+  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return token info that should be saved permanently in the database.
+CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
+  RETURNS JSONB
+AS $$
+  SELECT NULL::JSONB;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+--------------- private functions ----------------------------------------------
+
+CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  return_id INTEGER;
+  term_count INTEGER;
+BEGIN
+  SELECT min(word_id), max(search_name_count) INTO return_id, term_count
+    FROM word WHERE word_token = lookup_term and class is null and type is null;
+
+  IF return_id IS NULL THEN
+    return_id := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, search_name_count)
+      VALUES (return_id, lookup_term, 0);
+  ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
+    return_id := 0;
+  END IF;
+
+  RETURN return_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  return_id INTEGER;
+BEGIN
+  SELECT min(word_id) INTO return_id
+    FROM word
+    WHERE word_token = '  '  || lookup_term
+          and class = 'place' and type = 'house';
+
+  IF return_id IS NULL THEN
+    return_id := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, class, type, search_name_count)
+      VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0);
+  END IF;
+
+  RETURN return_id;
+END;
+$$
+LANGUAGE plpgsql;
diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql
new file mode 100644 (file)
index 0000000..fe82762
--- /dev/null
@@ -0,0 +1,399 @@
+-- Get tokens used for searching the given place.
+--
+-- These are the tokens that will be saved in the search_name table.
+CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Get tokens for matching the place name against others.
+--
+-- This should usually be restricted to full name tokens.
+CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber tokens applicable for the place.
+CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'hnr_tokens')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber in the form that it can be matched during search.
+CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
+  RETURNS TEXT
+AS $$
+  SELECT info->>'hnr';
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'street')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'place_match')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'place_search')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+DROP TYPE IF EXISTS token_addresstoken CASCADE;
+CREATE TYPE token_addresstoken AS (
+  key TEXT,
+  match_tokens INT[],
+  search_tokens INT[]
+);
+
+CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
+  RETURNS SETOF token_addresstoken
+AS $$
+  SELECT key, (value->>1)::int[] as match_tokens,
+         (value->>0)::int[] as search_tokens
+  FROM jsonb_each(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+  RETURNS TEXT
+AS $$
+  -- Reject multi-value postcodes (comma/semicolon lists); otherwise normalise.
+  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return token info that should be saved permanently in the database.
+CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
+  RETURNS JSONB
+AS $$
+  SELECT NULL::JSONB;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+--------------- private functions ----------------------------------------------
+
+-- Functions for term normalisation and access to the 'word' table.
+
+CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
+  AS '{{ modulepath }}/nominatim.so', 'transliteration'
+LANGUAGE c IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
+  AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
+LANGUAGE c IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
+  AS $$
+DECLARE
+  o TEXT;
+BEGIN
+  o := public.gettokenstring(public.transliteration(name));
+  RETURN trim(substr(o,1,length(o)));
+END;
+$$
+LANGUAGE plpgsql IMMUTABLE;
+
+-- returns NULL if the word is too common
+CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT) 
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+  count INTEGER;
+BEGIN
+  lookup_token := trim(lookup_word);
+  SELECT min(word_id), max(search_name_count) FROM word
+    WHERE word_token = lookup_token and class is null and type is null
+    INTO return_word_id, count;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
+  ELSE
+    IF count > {{ max_word_freq }} THEN
+      return_word_id := NULL;
+    END IF;
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Create housenumber tokens from an OSM addr:housenumber.
+-- The housenumber is split at comma and semicolon as necessary.
+-- The function returns the normalized form of the housenumber suitable
+-- for comparison.
+CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
+                                               OUT tokens TEXT,
+                                               OUT normtext TEXT)
+  AS $$
+BEGIN
+  SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
+    INTO normtext, tokens
+    FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
+          FROM (SELECT make_standard_name(h) as lookup_word
+                FROM unnest(housenumbers) h) x) y;
+END;
+$$ LANGUAGE plpgsql STABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' ' || trim(lookup_word);
+  SELECT min(word_id) FROM word
+    WHERE word_token = lookup_token and class='place' and type='house'
+    INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, null,
+                             'place', 'house', null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+  RETURNS BOOLEAN
+  AS $$
+DECLARE
+  r RECORD;
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' ' || make_standard_name(postcode);
+  FOR r IN
+    SELECT word_id FROM word
+    WHERE word_token = lookup_token and word = postcode
+          and class='place' and type='postcode'
+  LOOP
+    RETURN false;
+  END LOOP;
+
+  INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
+                           'place', 'postcode', null, 0);
+  RETURN true;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  nospace_lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+  WHERE word_token = lookup_token and class is null and type is null
+  INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
+                             null, null, null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Normalize a string and lookup its word ids (partial words).
+CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
+  RETURNS INTEGER[]
+  AS $$
+DECLARE
+  words TEXT[];
+  id INTEGER;
+  return_word_id INTEGER[];
+  word_ids INTEGER[];
+  j INTEGER;
+BEGIN
+  words := string_to_array(make_standard_name(lookup_word), ' ');
+  IF array_upper(words, 1) IS NOT NULL THEN
+    FOR j IN 1..array_upper(words, 1) LOOP
+      IF (words[j] != '') THEN
+        SELECT array_agg(word_id) INTO word_ids
+          FROM word
+         WHERE word_token = words[j] and class is null and type is null;
+
+        IF word_ids IS NULL THEN
+          id := nextval('seq_word');
+          INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
+          return_word_id := return_word_id || id;
+        ELSE
+          return_word_id := array_merge(return_word_id, word_ids);
+        END IF;
+      END IF;
+    END LOOP;
+  END IF;
+
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Normalize a string and look up its name ids (full words).
+CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
+  RETURNS INTEGER[]
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_ids INTEGER[];
+BEGIN
+  lookup_token := ' '|| make_standard_name(lookup_word);
+  SELECT array_agg(word_id) FROM word
+    WHERE word_token = lookup_token and class is null and type is null
+    INTO return_word_ids;
+  RETURN return_word_ids;
+END;
+$$
+LANGUAGE plpgsql STABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
+  RETURNS INTEGER[]
+  AS $$
+DECLARE
+  result INTEGER[];
+  s TEXT;
+  w INTEGER;
+  words TEXT[];
+  item RECORD;
+  j INTEGER;
+BEGIN
+  result := '{}'::INTEGER[];
+
+  FOR item IN SELECT (each(src)).* LOOP
+
+    s := make_standard_name(item.value);
+    w := getorcreate_name_id(s, item.value);
+
+    IF not(ARRAY[w] <@ result) THEN
+      result := result || w;
+    END IF;
+
+    w := getorcreate_word_id(s);
+
+    IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+      result := result || w;
+    END IF;
+
+    words := string_to_array(s, ' ');
+    IF array_upper(words, 1) IS NOT NULL THEN
+      FOR j IN 1..array_upper(words, 1) LOOP
+        IF (words[j] != '') THEN
+          w = getorcreate_word_id(words[j]);
+          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+            result := result || w;
+          END IF;
+        END IF;
+      END LOOP;
+    END IF;
+
+    words := regexp_split_to_array(item.value, E'[,;()]');
+    IF array_upper(words, 1) != 1 THEN
+      FOR j IN 1..array_upper(words, 1) LOOP
+        s := make_standard_name(words[j]);
+        IF s != '' THEN
+          w := getorcreate_word_id(s);
+          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+            result := result || w;
+          END IF;
+        END IF;
+      END LOOP;
+    END IF;
+
+    s := regexp_replace(item.value, '市$', '');
+    IF s != item.value THEN
+      s := make_standard_name(s);
+      IF s != '' THEN
+        w := getorcreate_name_id(s, item.value);
+        IF NOT (ARRAY[w] <@ result) THEN
+          result := result || w;
+        END IF;
+      END IF;
+    END IF;
+
+  END LOOP;
+
+  RETURN result;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  s TEXT;
+  w INTEGER;
+  words TEXT[];
+  i INTEGER;
+  j INTEGER;
+BEGIN
+  s := make_standard_name(src);
+  w := getorcreate_name_id(s, src);
+
+  w := getorcreate_word_id(s);
+
+  words := string_to_array(s, ' ');
+  IF array_upper(words, 1) IS NOT NULL THEN
+    FOR j IN 1..array_upper(words, 1) LOOP
+      IF (words[j] != '') THEN
+        w := getorcreate_word_id(words[j]);
+      END IF;
+    END LOOP;
+  END IF;
+
+  words := regexp_split_to_array(src, E'[,;()]');
+  IF array_upper(words, 1) != 1 THEN
+    FOR j IN 1..array_upper(words, 1) LOOP
+      s := make_standard_name(words[j]);
+      IF s != '' THEN
+        w := getorcreate_word_id(s);
+      END IF;
+    END LOOP;
+  END IF;
+
+  s := regexp_replace(src, '市$', '');
+  IF s != src THEN
+    s := make_standard_name(s);
+    IF s != '' THEN
+      w := getorcreate_name_id(s, src);
+    END IF;
+  END IF;
+
+  RETURN 1;
+END;
+$$
+LANGUAGE plpgsql;
diff --git a/lib-sql/tokenizer/legacy_tokenizer_indices.sql b/lib-sql/tokenizer/legacy_tokenizer_indices.sql
new file mode 100644 (file)
index 0000000..44a2909
--- /dev/null
@@ -0,0 +1,2 @@
+CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
+  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
diff --git a/lib-sql/tokenizer/legacy_tokenizer_tables.sql b/lib-sql/tokenizer/legacy_tokenizer_tables.sql
new file mode 100644 (file)
index 0000000..937eaaa
--- /dev/null
@@ -0,0 +1,21 @@
+DROP TABLE IF EXISTS word;
+CREATE TABLE word (
+  word_id INTEGER,
+  word_token text NOT NULL,
+  word text,
+  class text,
+  type text,
+  country_code varchar(2),
+  search_name_count INTEGER,
+  operator TEXT
+) {{db.tablespace.search_data}};
+
+CREATE INDEX idx_word_word_token ON word
+    USING BTREE (word_token) {{db.tablespace.search_index}};
+CREATE INDEX idx_word_word ON word
+    USING BTREE (word) {{db.tablespace.search_index}} WHERE word is not null;
+GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+DROP SEQUENCE IF EXISTS seq_word;
+CREATE SEQUENCE seq_word start 1;
+GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
diff --git a/lib-sql/words.sql b/lib-sql/words.sql
deleted file mode 100644 (file)
index 8be1781..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-CREATE TABLE word_frequencies AS
-  (SELECT unnest(make_keywords(v)) as id, sum(count) as count
-     FROM (select svals(name) as v, count(*)from place group by v) cnt
-    WHERE v is not null
- GROUP BY id);
-
-select count(getorcreate_postcode_id(v)) from (select distinct address->'postcode' as v from place where address ? 'postcode') as w where v is not null;
-select count(create_housenumber_id(v)) from (select distinct address->'housenumber' as v from place where address ? 'housenumber') as w;
-
--- copy the word frequencies
-update word set search_name_count = count from word_frequencies wf where wf.id = word.word_id;
-
--- and drop the temporary frequency table again
-drop table word_frequencies;
index c5563bb58e83eede774062b7e91cfa32fe484651..a26861ffcf716be9a1df29cdddc1a242d360b9a2 100644 (file)
@@ -3,7 +3,7 @@
 nominatim
 .SH SYNOPSIS
 .B nominatim
-[-h] {import,freeze,replication,special-phrases,add-data,index,refresh,admin,export,serve,search,reverse,lookup,details,status,transition} ...
+[-h] {import,freeze,replication,special-phrases,add-data,index,refresh,admin,export,serve,search,reverse,lookup,details,status} ...
 .SH DESCRIPTION
     Command\-line tools for importing, updating, administrating and
     querying the Nominatim database.
@@ -58,9 +58,6 @@ nominatim
 .TP
 \fBnominatim\fR \fI\,status\/\fR
     Execute API status query.
-.TP
-\fBnominatim\fR \fI\,transition\/\fR
-    Internal functions for code transition. Do not use.
 .SH OPTIONS 'nominatim import'
 usage: nominatim import [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
                         (--osm-file FILE | --continue {load-data,indexing,db-postprocess})
@@ -244,7 +241,7 @@ usage: nominatim add-data [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 
     Add additional data from a file or an online source.
 
-    Data is only imported, not indexed. You need to call `nominatim\-update index`
+    Data is only imported, not indexed. You need to call `nominatim index`
     to complete the process.
     
 
@@ -909,106 +906,6 @@ Number of parallel threads to use
 \fB\-\-format\fR {text,json}
 Format of result
 
-.SH OPTIONS 'nominatim transition'
-usage: nominatim transition [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
-                            [--create-db] [--setup-db] [--import-data]
-                            [--load-data] [--create-tables]
-                            [--create-partition-tables] [--index]
-                            [--create-search-indices] [--create-country-names]
-                            [--no-partitions] [--osm-file FILE] [--drop]
-                            [--osm2pgsql-cache SIZE] [--no-analyse]
-                            [--ignore-errors] [--reverse-only]
-                            [--tiger-data FILE]
-
-    Internal functions for code transition. Do not use.
-    
-
-
-
-.TP
-\fB\-q\fR, \fB\-\-quiet\fR
-Print only error messages
-
-.TP
-\fB\-v\fR, \fB\-\-verbose\fR
-Increase verboseness of output
-
-.TP
-\fB\-\-project\-dir\fR DIR
-Base directory of the Nominatim installation (default:.)
-
-.TP
-\fB\-j\fR NUM, \fB\-\-threads\fR NUM
-Number of parallel threads to use
-
-.TP
-\fB\-\-create\-db\fR
-Create nominatim db
-
-.TP
-\fB\-\-setup\-db\fR
-Build a blank nominatim db
-
-.TP
-\fB\-\-import\-data\fR
-Import a osm file
-
-.TP
-\fB\-\-load\-data\fR
-Copy data to live tables from import table
-
-.TP
-\fB\-\-create\-tables\fR
-Create main tables
-
-.TP
-\fB\-\-create\-partition\-tables\fR
-Create required partition tables
-
-.TP
-\fB\-\-index\fR
-Index the data
-
-.TP
-\fB\-\-create\-search\-indices\fR
-Create additional indices required for search and update
-
-.TP
-\fB\-\-create\-country\-names\fR
-Create search index for default country names.
-
-.TP
-\fB\-\-no\-partitions\fR
-Do not partition search indices
-
-.TP
-\fB\-\-osm\-file\fR FILE
-File to import
-
-.TP
-\fB\-\-drop\fR
-Drop tables needed for updates, making the database readonly
-
-.TP
-\fB\-\-osm2pgsql\-cache\fR SIZE
-Size of cache to be used by osm2pgsql (in MB)
-
-.TP
-\fB\-\-no\-analyse\fR
-Do not perform analyse operations during index
-
-.TP
-\fB\-\-ignore\-errors\fR
-Ignore certain erros on import.
-
-.TP
-\fB\-\-reverse\-only\fR
-Do not create search tables and indexes
-
-.TP
-\fB\-\-tiger\-data\fR FILE
-File to import
-
 .SH DISTRIBUTION
 The latest version of Nominatim may be downloaded from
 .UR https://nominatim.org
index 55f51aac72dc7d6a2b2337b198f2966d991a30f6..20a9c5f167c225c17d358d9a726e034e7a9618c1 100644 (file)
@@ -121,7 +121,7 @@ class UpdateAddData:
     """\
     Add additional data from a file or an online source.
 
-    Data is only imported, not indexed. You need to call `nominatim-update index`
+    Data is only imported, not indexed. You need to call `nominatim index`
     to complete the process.
     """
 
index 47007579f6f69cf1a97edac668c8646ed2046d40..ee1941875d56b8c1007cf6bea1d222672080fd22 100644 (file)
@@ -3,7 +3,7 @@ Provides custom functions over command-line arguments.
 """
 
 
-class NominatimArgs: # pylint: disable=too-few-public-methods
+class NominatimArgs:
     """ Customized namespace class for the nominatim command line tool
         to receive the command-line arguments.
     """
index 8fd4f6011251f480a3d37ff2572d979c7b971cb5..ea95e4565270a72dad8b9cdd2c609f51ee27a9d3 100644 (file)
@@ -32,8 +32,11 @@ class UpdateIndex:
     @staticmethod
     def run(args):
         from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory
 
-        indexer = Indexer(args.config.get_libpq_dsn(),
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                           args.threads or psutil.cpu_count() or 1)
 
         if not args.no_boundaries:
index ddc00d497bbe786a944664fc64ca0f45e69a70f3..e6e749121d24100d1393d6e59cc5d7c71bfa8218 100644 (file)
@@ -46,6 +46,7 @@ class UpdateRefresh:
     @staticmethod
     def run(args):
         from ..tools import refresh
+        from ..tokenizer import factory as tokenizer_factory
 
         if args.postcodes:
             LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.create_functions(conn, args.config,
                                          args.diffs, args.enable_debug_statements)
+                tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+                tokenizer.update_sql_functions(args.config)
 
         if args.wiki_data:
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH
index c75322d9bf4c5591b9d252bdd288496424dccf54..69939430188838bb69f89bfe9ae041eb3cac1989 100644 (file)
@@ -83,6 +83,7 @@ class UpdateReplication:
     def _update(args):
         from ..tools import replication
         from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory
 
         params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
         params.update(base_url=args.config.REPLICATION_URL,
@@ -106,6 +107,8 @@ class UpdateReplication:
                 raise UsageError("Bad argument '--no-index'.")
             recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
 
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
         while True:
             with connect(args.config.get_libpq_dsn()) as conn:
                 start = dt.datetime.now(dt.timezone.utc)
@@ -116,7 +119,7 @@ class UpdateReplication:
 
             if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                 index_start = dt.datetime.now(dt.timezone.utc)
-                indexer = Indexer(args.config.get_libpq_dsn(),
+                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                                   args.threads or 1)
                 indexer.index_boundaries(0, 30)
                 indexer.index_by_rank(0, 30)
index 2014ff9e2faa32453483cbbef1993def03832bf0..eb0178a9f560f563a867488b5e608e67a0c024ad 100644 (file)
@@ -56,6 +56,7 @@ class SetupAll:
         from ..tools import refresh
         from ..indexer.indexer import Indexer
         from ..tools import postcodes
+        from ..tokenizer import factory as tokenizer_factory
 
         if args.osm_file and not Path(args.osm_file).is_file():
             LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
@@ -67,12 +68,6 @@ class SetupAll:
                                                     args.no_partitions,
                                                     rouser=args.config.DATABASE_WEBUSER)
 
-            LOG.warning('Installing database module')
-            with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.install_module(args.module_dir, args.project_dir,
-                                               args.config.DATABASE_MODULE_PATH,
-                                               conn=conn)
-
             LOG.warning('Importing OSM data file')
             database_import.import_osm_data(Path(args.osm_file),
                                             args.osm2pgsql_options(0, 1),
@@ -105,22 +100,31 @@ class SetupAll:
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Initialise tables')
             with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+                database_import.truncate_data_tables(conn)
 
             LOG.warning('Load data into placex table')
             database_import.load_data(args.config.get_libpq_dsn(),
-                                      args.data_dir,
                                       args.threads or psutil.cpu_count() or 1)
 
+        LOG.warning("Setting up tokenizer")
+        if args.continue_at is None or args.continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        else:
+            # just load the tokenizer
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+        if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
-            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+                                       tokenizer)
 
         if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
             if args.continue_at is not None and args.continue_at != 'load-data':
                 with connect(args.config.get_libpq_dsn()) as conn:
                     SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
             LOG.warning('Indexing places')
-            indexer = Indexer(args.config.get_libpq_dsn(),
+            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                               args.threads or psutil.cpu_count() or 1)
             indexer.index_full(analyse=not args.index_noanalyse)
 
@@ -129,7 +133,9 @@ class SetupAll:
             database_import.create_search_indices(conn, args.config,
                                                   drop=args.no_updates)
             LOG.warning('Create search index for default country names.')
-            database_import.create_country_names(conn, args.config)
+            database_import.create_country_names(conn, tokenizer,
+                                                 args.config.LANGUAGES)
+        tokenizer.finalize_import(args.config)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
index 99e825925ad63443b305543624cde60572740fee..002960feb2049b9d850ea54aeebc7b1644ba4dd1 100644 (file)
@@ -2,13 +2,15 @@
     Implementation of the 'special-phrases' command.
 """
 import logging
-from nominatim.tools.special_phrases import SpecialPhrasesImporter
+from nominatim.tools import SpecialPhrasesImporter
 from nominatim.db.connection import connect
 
 LOG = logging.getLogger()
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
+# Using non-top-level imports to avoid eventually unused imports.
+# pylint: disable=E0012,C0415
 
 class ImportSpecialPhrases:
     """\
@@ -22,10 +24,13 @@ class ImportSpecialPhrases:
 
     @staticmethod
     def run(args):
+        from ..tokenizer import factory as tokenizer_factory
+
         if args.import_from_wiki:
             LOG.warning('Special phrases importation starting')
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
             with connect(args.config.get_libpq_dsn()) as db_connection:
                 SpecialPhrasesImporter(
                     args.config, args.phplib_dir, db_connection
-                ).import_from_wiki()
+                ).import_from_wiki(tokenizer)
         return 0
index d1df17b726bac3db70d7c8a788c98a9abbf4401b..72aaf0bd6b436ed78322f39ccec34feb560d542d 100644 (file)
@@ -30,7 +30,7 @@ class Configuration:
         self.project_dir = project_dir
         self.config_dir = config_dir
         self._config = dotenv_values(str((config_dir / 'env.defaults').resolve()))
-        if project_dir is not None:
+        if project_dir is not None and (project_dir / '.env').is_file():
             self._config.update(dotenv_values(str((project_dir / '.env').resolve())))
 
         # Add defaults for variables that are left empty to set the default.
@@ -39,7 +39,7 @@ class Configuration:
             self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG'] = \
                 str(config_dir / 'address-levels.json')
 
-        class _LibDirs: # pylint: disable=too-few-public-methods
+        class _LibDirs:
             pass
 
         self.lib_dir = _LibDirs()
index c5d6872bf0790abfaa10f7b07d2567ba0913663d..a4f554965ab9144a89bb609296ee80ca82049ac3 100644 (file)
@@ -14,7 +14,7 @@ from psycopg2.extras import wait_select
 try:
     import psycopg2.errors # pylint: disable=no-name-in-module,import-error
     __has_psycopg2_errors__ = True
-except ModuleNotFoundError:
+except ImportError:
     __has_psycopg2_errors__ = False
 
 LOG = logging.getLogger()
@@ -48,14 +48,14 @@ class DBConnection:
     """ A single non-blocking database connection.
     """
 
-    def __init__(self, dsn):
+    def __init__(self, dsn, cursor_factory=None):
         self.current_query = None
         self.current_params = None
         self.dsn = dsn
 
         self.conn = None
         self.cursor = None
-        self.connect()
+        self.connect(cursor_factory=cursor_factory)
 
     def close(self):
         """ Close all open connections. Does not wait for pending requests.
@@ -66,7 +66,7 @@ class DBConnection:
 
         self.conn = None
 
-    def connect(self):
+    def connect(self, cursor_factory=None):
         """ (Re)connect to the database. Creates an asynchronous connection
             with JIT and parallel processing disabled. If a connection was
             already open, it is closed and a new connection established.
@@ -79,7 +79,7 @@ class DBConnection:
         self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True})
         self.wait()
 
-        self.cursor = self.conn.cursor()
+        self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
         # Disable JIT and parallel workers as they are known to cause problems.
         # Update pg_settings instead of using SET because it does not yield
         # errors on older versions of Postgres where the settings are not
index c7009b34fc0a7b1e9d8a3a0717d8ac875a7c9b41..dafc5de434bb3bf69a69014d7d7e20a2059f9313 100644 (file)
@@ -64,7 +64,7 @@ def _setup_postgresql_features(conn):
         'has_index_non_key_column' : pg_version >= (11, 0, 0)
     }
 
-class SQLPreprocessor: # pylint: disable=too-few-public-methods
+class SQLPreprocessor:
     """ A environment for preprocessing SQL files from the
         lib-sql directory.
 
@@ -89,8 +89,6 @@ class SQLPreprocessor: # pylint: disable=too-few-public-methods
         self.env.globals['db'] = db_info
         self.env.globals['sql'] = _setup_postgres_sql(conn)
         self.env.globals['postgres'] = _setup_postgresql_features(conn)
-        self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
-                                         str((config.project_dir / 'module').resolve())
 
 
     def run_sql_file(self, conn, name, **kwargs):
index e63a40f9ba67a6ec81ec43003c4ce1984ab6d9e1..c2ff63dbce91f80c37fd0a4b288cc71c3c0fedc1 100644 (file)
@@ -9,6 +9,7 @@ from nominatim.tools.exec_utils import get_url
 from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
+ISODATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
 
 def compute_database_date(conn):
     """ Determine the date of the database from the newest object in the
@@ -34,9 +35,9 @@ def compute_database_date(conn):
                   "URL used: %s", node_url)
         raise UsageError("Bad API data.")
 
-    LOG.debug("Found timestamp %s", match[1])
+    LOG.debug("Found timestamp %s", match.group(1))
 
-    return dt.datetime.fromisoformat(match[1]).replace(tzinfo=dt.timezone.utc)
+    return dt.datetime.strptime(match.group(1), ISODATE_FORMAT).replace(tzinfo=dt.timezone.utc)
 
 
 def set_status(conn, date, seq=None, indexed=True):
index 4f4de2189e57dfde2c5cce5e7ec4274170f4d5b2..b7673abaddc8090a896351c7ad230f372742a739 100644 (file)
 """
 Main work horse for indexing (computing addresses) the database.
 """
-# pylint: disable=C0111
 import logging
 import select
+import time
 
-import psycopg2
+import psycopg2.extras
 
 from nominatim.indexer.progress import ProgressLogger
+from nominatim.indexer import runners
 from nominatim.db.async_connection import DBConnection
+from nominatim.db.connection import connect
 
 LOG = logging.getLogger()
 
-class RankRunner:
-    """ Returns SQL commands for indexing one rank within the placex table.
+
+class PlaceFetcher:
+    """ Asynchronous connection that fetches place details for processing.
     """
+    def __init__(self, dsn, setup_conn):
+        self.wait_time = 0
+        self.current_ids = None
+        self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)
+
+        with setup_conn.cursor() as cur:
+            # need to fetch those manually because register_hstore cannot
+            # fetch them on an asynchronous connection below.
+            hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
+            hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
+
+        psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
+                                        array_oid=hstore_array_oid)
+
+    def close(self):
+        """ Close the underlying asynchronous connection.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
 
-    def __init__(self, rank):
-        self.rank = rank
 
-    def name(self):
-        return "rank {}".format(self.rank)
+    def fetch_next_batch(self, cur, runner):
+        """ Send a request for the next batch of places.
+            If details for the places are required, they will be fetched
+            asynchronously.
 
-    def sql_count_objects(self):
-        return """SELECT count(*) FROM placex
-                  WHERE rank_address = {} and indexed_status > 0
-               """.format(self.rank)
+            Returns true if there is still data available.
+        """
+        ids = cur.fetchmany(100)
 
-    def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_address = {}
-                  ORDER BY geometry_sector""".format(self.rank)
+        if not ids:
+            self.current_ids = None
+            return False
 
-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))
+        if hasattr(runner, 'get_place_details'):
+            runner.get_place_details(self.conn, ids)
+            self.current_ids = []
+        else:
+            self.current_ids = ids
 
+        return True
 
-class InterpolationRunner:
-    """ Returns SQL commands for indexing the address interpolation table
-        location_property_osmline.
-    """
+    def get_batch(self):
+        """ Get the next batch of data, previously requested with
+            `fetch_next_batch`.
+        """
+        if self.current_ids is not None and not self.current_ids:
+            tstart = time.time()
+            self.conn.wait()
+            self.wait_time += time.time() - tstart
+            self.current_ids = self.conn.cursor.fetchall()
+
+        return self.current_ids
+
+    def __enter__(self):
+        return self
 
-    @staticmethod
-    def name():
-        return "interpolation lines (location_property_osmline)"
-
-    @staticmethod
-    def sql_count_objects():
-        return """SELECT count(*) FROM location_property_osmline
-                  WHERE indexed_status > 0"""
-
-    @staticmethod
-    def sql_get_objects():
-        return """SELECT place_id FROM location_property_osmline
-                  WHERE indexed_status > 0
-                  ORDER BY geometry_sector"""
-
-    @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_property_osmline
-                  SET indexed_status = 0 WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
-
-class BoundaryRunner:
-    """ Returns SQL commands for indexing the administrative boundaries
-        of a certain rank.
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.wait()
+        self.close()
+
+class WorkerPool:
+    """ A pool of asynchronous database connections.
+
+        The pool may be used as a context manager.
     """
+    REOPEN_CONNECTIONS_AFTER = 100000
 
-    def __init__(self, rank):
-        self.rank = rank
+    def __init__(self, dsn, pool_size):
+        self.threads = [DBConnection(dsn) for _ in range(pool_size)]
+        self.free_workers = self._yield_free_worker()
+        self.wait_time = 0
 
-    def name(self):
-        return "boundaries rank {}".format(self.rank)
 
-    def sql_count_objects(self):
-        return """SELECT count(*) FROM placex
-                  WHERE indexed_status > 0
-                    AND rank_search = {}
-                    AND class = 'boundary' and type = 'administrative'
-               """.format(self.rank)
+    def finish_all(self):
+        """ Wait for all connection to finish.
+        """
+        for thread in self.threads:
+            while not thread.is_done():
+                thread.wait()
 
-    def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_search = {}
-                        and class = 'boundary' and type = 'administrative'
-                  ORDER BY partition, admin_level
-               """.format(self.rank)
+        self.free_workers = self._yield_free_worker()
 
-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))
+    def close(self):
+        """ Close all connections and clear the pool.
+        """
+        for thread in self.threads:
+            thread.close()
+        self.threads = []
+        self.free_workers = None
 
 
-class PostcodeRunner:
-    """ Provides the SQL commands for indexing the location_postcode table.
-    """
+    def next_free_worker(self):
+        """ Get the next free connection.
+        """
+        return next(self.free_workers)
 
-    @staticmethod
-    def name():
-        return "postcodes (location_postcode)"
 
-    @staticmethod
-    def sql_count_objects():
-        return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'
+    def _yield_free_worker(self):
+        ready = self.threads
+        command_stat = 0
+        while True:
+            for thread in ready:
+                if thread.is_done():
+                    command_stat += 1
+                    yield thread
+
+            if command_stat > self.REOPEN_CONNECTIONS_AFTER:
+                for thread in self.threads:
+                    while not thread.is_done():
+                        thread.wait()
+                    thread.connect()
+                ready = self.threads
+                command_stat = 0
+            else:
+                tstart = time.time()
+                _, ready, _ = select.select([], self.threads, [])
+                self.wait_time += time.time() - tstart
 
-    @staticmethod
-    def sql_get_objects():
-        return """SELECT place_id FROM location_postcode
-                  WHERE indexed_status > 0
-                  ORDER BY country_code, postcode"""
 
-    @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_postcode SET indexed_status = 0
-                  WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
+    def __enter__(self):
+        return self
 
 
-def _analyse_db_if(conn, condition):
-    if condition:
-        with conn.cursor() as cur:
-            cur.execute('ANALYSE')
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.finish_all()
+        self.close()
 
 
 class Indexer:
     """ Main indexing routine.
     """
 
-    def __init__(self, dsn, num_threads):
+    def __init__(self, dsn, tokenizer, num_threads):
         self.dsn = dsn
+        self.tokenizer = tokenizer
         self.num_threads = num_threads
-        self.conn = None
-        self.threads = []
-
-
-    def _setup_connections(self):
-        self.conn = psycopg2.connect(self.dsn)
-        self.threads = [DBConnection(self.dsn) for _ in range(self.num_threads)]
-
-
-    def _close_connections(self):
-        if self.conn:
-            self.conn.close()
-            self.conn = None
-
-        for thread in self.threads:
-            thread.close()
-        self.threads = []
 
 
     def index_full(self, analyse=True):
@@ -158,26 +165,31 @@ class Indexer:
             database will be analysed at the appropriate places to
             ensure that database statistics are updated.
         """
-        conn = psycopg2.connect(self.dsn)
-        conn.autocommit = True
+        with connect(self.dsn) as conn:
+            conn.autocommit = True
+
+            if analyse:
+                def _analyze():
+                    with conn.cursor() as cur:
+                        cur.execute('ANALYZE')
+            else:
+                def _analyze():
+                    pass
 
-        try:
             self.index_by_rank(0, 4)
-            _analyse_db_if(conn, analyse)
+            _analyze()
 
             self.index_boundaries(0, 30)
-            _analyse_db_if(conn, analyse)
+            _analyze()
 
             self.index_by_rank(5, 25)
-            _analyse_db_if(conn, analyse)
+            _analyze()
 
             self.index_by_rank(26, 30)
-            _analyse_db_if(conn, analyse)
+            _analyze()
 
             self.index_postcodes()
-            _analyse_db_if(conn, analyse)
-        finally:
-            conn.close()
+            _analyze()
 
 
     def index_boundaries(self, minrank, maxrank):
@@ -186,13 +198,9 @@ class Indexer:
         LOG.warning("Starting indexing boundaries using %s threads",
                     self.num_threads)
 
-        self._setup_connections()
-
-        try:
+        with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(minrank, 4), min(maxrank, 26)):
-                self.index(BoundaryRunner(rank))
-        finally:
-            self._close_connections()
+                self._index(runners.BoundaryRunner(rank, analyzer))
 
     def index_by_rank(self, minrank, maxrank):
         """ Index all entries of placex in the given rank range (inclusive)
@@ -205,20 +213,16 @@ class Indexer:
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                     minrank, maxrank, self.num_threads)
 
-        self._setup_connections()
-
-        try:
+        with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(1, minrank), maxrank):
-                self.index(RankRunner(rank))
+                self._index(runners.RankRunner(rank, analyzer))
 
             if maxrank == 30:
-                self.index(RankRunner(0))
-                self.index(InterpolationRunner(), 20)
-                self.index(RankRunner(30), 20)
+                self._index(runners.RankRunner(0, analyzer))
+                self._index(runners.InterpolationRunner(analyzer), 20)
+                self._index(runners.RankRunner(30, analyzer), 20)
             else:
-                self.index(RankRunner(maxrank))
-        finally:
-            self._close_connections()
+                self._index(runners.RankRunner(maxrank, analyzer))
 
 
     def index_postcodes(self):
@@ -226,89 +230,58 @@ class Indexer:
         """
         LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
 
-        self._setup_connections()
+        self._index(runners.PostcodeRunner(), 20)
 
-        try:
-            self.index(PostcodeRunner(), 20)
-        finally:
-            self._close_connections()
 
     def update_status_table(self):
         """ Update the status in the status table to 'indexed'.
         """
-        conn = psycopg2.connect(self.dsn)
-
-        try:
+        with connect(self.dsn) as conn:
             with conn.cursor() as cur:
                 cur.execute('UPDATE import_status SET indexed = true')
 
             conn.commit()
-        finally:
-            conn.close()
 
-    def index(self, obj, batch=1):
-        """ Index a single rank or table. `obj` describes the SQL to use
+    def _index(self, runner, batch=1):
+        """ Index a single rank or table. `runner` describes the SQL to use
             for indexing. `batch` describes the number of objects that
             should be processed with a single SQL statement
         """
-        LOG.warning("Starting %s (using batch size %s)", obj.name(), batch)
+        LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
 
-        cur = self.conn.cursor()
-        cur.execute(obj.sql_count_objects())
-
-        total_tuples = cur.fetchone()[0]
-        LOG.debug("Total number of rows: %i", total_tuples)
+        with connect(self.dsn) as conn:
+            psycopg2.extras.register_hstore(conn)
+            with conn.cursor() as cur:
+                total_tuples = cur.scalar(runner.sql_count_objects())
+                LOG.debug("Total number of rows: %i", total_tuples)
 
-        cur.close()
+            conn.commit()
 
-        progress = ProgressLogger(obj.name(), total_tuples)
+            progress = ProgressLogger(runner.name(), total_tuples)
 
-        if total_tuples > 0:
-            cur = self.conn.cursor(name='places')
-            cur.execute(obj.sql_get_objects())
+            if total_tuples > 0:
+                with conn.cursor(name='places') as cur:
+                    cur.execute(runner.sql_get_objects())
 
-            next_thread = self.find_free_thread()
-            while True:
-                places = [p[0] for p in cur.fetchmany(batch)]
-                if not places:
-                    break
+                    with PlaceFetcher(self.dsn, conn) as fetcher:
+                        with WorkerPool(self.dsn, self.num_threads) as pool:
+                            has_more = fetcher.fetch_next_batch(cur, runner)
+                            while has_more:
+                                places = fetcher.get_batch()
 
-                LOG.debug("Processing places: %s", str(places))
-                thread = next(next_thread)
+                                # asynchronously get the next batch
+                                has_more = fetcher.fetch_next_batch(cur, runner)
 
-                thread.perform(obj.sql_index_place(places))
-                progress.add(len(places))
+                                # And insert the current batch
+                                for idx in range(0, len(places), batch):
+                                    part = places[idx:idx+batch]
+                                    LOG.debug("Processing places: %s", str(part))
+                                    runner.index_places(pool.next_free_worker(), part)
+                                    progress.add(len(part))
 
-            cur.close()
+                            LOG.info("Wait time: fetcher: %.2fs,  pool: %.2fs",
+                                     fetcher.wait_time, pool.wait_time)
 
-            for thread in self.threads:
-                thread.wait()
+                conn.commit()
 
         progress.done()
-
-    def find_free_thread(self):
-        """ Generator that returns the next connection that is free for
-            sending a query.
-        """
-        ready = self.threads
-        command_stat = 0
-
-        while True:
-            for thread in ready:
-                if thread.is_done():
-                    command_stat += 1
-                    yield thread
-
-            # refresh the connections occasionaly to avoid potential
-            # memory leaks in Postgresql.
-            if command_stat > 100000:
-                for thread in self.threads:
-                    while not thread.is_done():
-                        thread.wait()
-                    thread.connect()
-                command_stat = 0
-                ready = self.threads
-            else:
-                ready, _, _ = select.select(self.threads, [], [])
-
-        assert False, "Unreachable code"
diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py
new file mode 100644 (file)
index 0000000..aa607fa
--- /dev/null
@@ -0,0 +1,162 @@
+"""
+Mix-ins that provide the actual commands for the indexer for various indexing
+tasks.
+"""
+import functools
+
+import psycopg2.extras
+
+# pylint: disable=C0111
+
+class AbstractPlacexRunner:
+    """ Returns SQL commands for indexing of the placex table.
+    """
+    SELECT_SQL = 'SELECT place_id FROM placex'
+
+    def __init__(self, rank, analyzer):
+        self.rank = rank
+        self.analyzer = analyzer
+
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE placex
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))
+
+
+    @staticmethod
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+                          FROM placex WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
+
+class RankRunner(AbstractPlacexRunner):
+    """ Returns SQL commands for indexing one rank within the placex table.
+    """
+
+    def name(self):
+        return "rank {}".format(self.rank)
+
+    def sql_count_objects(self):
+        return """SELECT count(*) FROM placex
+                  WHERE rank_address = {} and indexed_status > 0
+               """.format(self.rank)
+
+    def sql_get_objects(self):
+        return """{} WHERE indexed_status > 0 and rank_address = {}
+                     ORDER BY geometry_sector
+               """.format(self.SELECT_SQL, self.rank)
+
+
+class BoundaryRunner(AbstractPlacexRunner):
+    """ Returns SQL commands for indexing the administrative boundaries
+        of a certain rank.
+    """
+
+    def name(self):
+        return "boundaries rank {}".format(self.rank)
+
+    def sql_count_objects(self):
+        return """SELECT count(*) FROM placex
+                  WHERE indexed_status > 0
+                    AND rank_search = {}
+                    AND class = 'boundary' and type = 'administrative'
+               """.format(self.rank)
+
+    def sql_get_objects(self):
+        return """{} WHERE indexed_status > 0 and rank_search = {}
+                           and class = 'boundary' and type = 'administrative'
+                     ORDER BY partition, admin_level
+               """.format(self.SELECT_SQL, self.rank)
+
+
+class InterpolationRunner:
+    """ Returns SQL commands for indexing the address interpolation table
+        location_property_osmline.
+    """
+
+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
+
+    @staticmethod
+    def name():
+        return "interpolation lines (location_property_osmline)"
+
+    @staticmethod
+    def sql_count_objects():
+        return """SELECT count(*) FROM location_property_osmline
+                  WHERE indexed_status > 0"""
+
+    @staticmethod
+    def sql_get_objects():
+        return """SELECT place_id
+                  FROM location_property_osmline
+                  WHERE indexed_status > 0
+                  ORDER BY geometry_sector"""
+
+
+    @staticmethod
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
+                          FROM location_property_osmline WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE location_property_osmline
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))
+
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
+
+
+class PostcodeRunner:
+    """ Provides the SQL commands for indexing the location_postcode table.
+    """
+
+    @staticmethod
+    def name():
+        return "postcodes (location_postcode)"
+
+    @staticmethod
+    def sql_count_objects():
+        return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'
+
+    @staticmethod
+    def sql_get_objects():
+        return """SELECT place_id FROM location_postcode
+                  WHERE indexed_status > 0
+                  ORDER BY country_code, postcode"""
+
+    @staticmethod
+    def index_places(worker, ids):
+        worker.perform(""" UPDATE location_postcode SET indexed_status = 0
+                           WHERE place_id IN ({})
+                       """.format(','.join((str(i[0]) for i in ids))))
diff --git a/nominatim/tokenizer/__init__.py b/nominatim/tokenizer/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py
new file mode 100644 (file)
index 0000000..e0c0629
--- /dev/null
@@ -0,0 +1,88 @@
+"""
+Functions for creating a tokenizer or initialising the right one for an
+existing database.
+
+A tokenizer is something that is bound to the lifetime of a database. It
+can be chosen and configured before the initial import but then needs to
+be used consistently when querying and updating the database.
+
+This module provides the functions to create and configure a new tokenizer
+as well as instantiating the appropriate tokenizer for updating an existing
+database.
+
+A tokenizer usually also includes PHP code for querying. The appropriate PHP
+normalizer module is installed, when the tokenizer is created.
+"""
+import logging
+import importlib
+
+from ..errors import UsageError
+from ..db import properties
+from ..db.connection import connect
+
+LOG = logging.getLogger()
+
+def _import_tokenizer(name):
+    """ Load the tokenizer.py module from project directory.
+    """
+    try:
+        return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
+    except ModuleNotFoundError as exp:
+        LOG.fatal("No tokenizer named '%s' available. "
+                  "Check the setting of NOMINATIM_TOKENIZER.", name)
+        raise UsageError('Tokenizer not found') from exp
+
+
+def create_tokenizer(config, init_db=True, module_name=None):
+    """ Create a new tokenizer as defined by the given configuration.
+
+        The tokenizer data and code is copied into the 'tokenizer' directory
+        of the project directory and the tokenizer loaded from its new location.
+    """
+    if module_name is None:
+        module_name = config.TOKENIZER
+
+    # Create the directory for the tokenizer data
+    basedir = config.project_dir / 'tokenizer'
+    if not basedir.exists():
+        basedir.mkdir()
+    elif not basedir.is_dir():
+        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
+        raise UsageError("Tokenizer setup failed.")
+
+    # Import and initialize the tokenizer.
+    tokenizer_module = _import_tokenizer(module_name)
+
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer.init_new_db(config, init_db=init_db)
+
+    with connect(config.get_libpq_dsn()) as conn:
+        properties.set_property(conn, 'tokenizer', module_name)
+
+    return tokenizer
+
+
+def get_tokenizer_for_db(config):
+    """ Instantiate a tokenizer for an existing database.
+
+        The function looks up the appropriate tokenizer in the database
+        and initialises it.
+    """
+    basedir = config.project_dir / 'tokenizer'
+    if not basedir.is_dir():
+        LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
+        raise UsageError('Cannot initialize tokenizer.')
+
+    with connect(config.get_libpq_dsn()) as conn:
+        name = properties.get_property(conn, 'tokenizer')
+
+    if name is None:
+        LOG.fatal("Tokenizer was not set up properly. Database property missing.")
+        raise UsageError('Cannot initialize tokenizer.')
+
+    tokenizer_module = _import_tokenizer(name)
+
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer.init_from_project()
+
+    return tokenizer
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
new file mode 100644 (file)
index 0000000..065fdb0
--- /dev/null
@@ -0,0 +1,632 @@
+"""
+Tokenizer implementing normalisation as used before Nominatim 4 but using
+libICU instead of the PostgreSQL module.
+"""
+from collections import Counter
+import functools
+import io
+import itertools
+import json
+import logging
+import re
+from textwrap import dedent
+from pathlib import Path
+
+from icu import Transliterator
+import psycopg2.extras
+
+from nominatim.db.connection import connect
+from nominatim.db.properties import set_property, get_property
+from nominatim.db.sql_preprocessor import SQLPreprocessor
+
+DBCFG_NORMALIZATION = "tokenizer_normalization"
+DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
+DBCFG_TRANSLITERATION = "tokenizer_transliteration"
+DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+
+LOG = logging.getLogger()
+
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.

        Standard factory function called by the tokenizer loader.
        dsn: libpq connection string of the target database.
        data_dir: 'tokenizer' directory inside the project directory.
    """
    return LegacyICUTokenizer(dsn, data_dir)
+
+
class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        # dsn: libpq connection string of the database the tokenizer works on.
        self.dsn = dsn
        # data_dir: 'tokenizer' directory inside the project directory.
        self.data_dir = data_dir
        # The three settings below are loaded lazily, either from the
        # tokenizer configuration (init_new_db) or from database
        # properties (init_from_project).
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.

            When init_db is False, creation of the SQL functions and of the
            word table is skipped.
        """
        # An explicitly configured rule file takes precedence over the
        # default file shipped with the configuration.
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'

        rules = json.loads(cfgfile.read_text())
        # NOTE(review): the transliteration rules come from the config key
        # named 'normalization' - presumably for historical reasons; confirm
        # against the shipped legacy_icu_tokenizer.json.
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.

            Reads back the settings that _save_config() stored as
            database properties.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.

            Creates the indices over the word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            # The word frequency is baked into the SQL functions, so it is
            # read back from the database properties before regeneration.
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.

            Returns an error description string when properties are
            missing, None when everything is in order.
        """
        self.init_from_project()

        if self.normalization is None\
           or self.transliteration is None\
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' are missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyser.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)
        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.

            Writes a tokenizer.php into the data directory defining the
            constants the PHP frontend needs before requiring the actual
            PHP implementation.
        """
        # Transpose [(full, abbr), ...] into ([full, ...], [abbr, ...]).
        # NOTE(review): raises IndexError when the abbreviation list is
        # empty - confirm configurations always provide at least one entry.
        abbr_inverse = list(zip(*self.abbreviations))
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration', "{0.transliteration}");
            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config,
                       "','".join(abbr_inverse[0]),
                       "','".join(abbr_inverse[1]))))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            # (the named cursor streams the place names server-side)
            words = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as cur:
                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        if term:
                            for word in term.split():
                                words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))


            with conn.cursor() as cur:
                copystr.seek(0)
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                # Assign fresh ids to the freshly copied rows.
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()
+
+
class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        # Dedicated connection in autocommit mode - every statement issued
        # by the analyzer is independent.
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations

        # Per-analyzer token cache (not thread-safe).
        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.

            May be called multiple times; only the first call closes the
            database connection.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, conn, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id). The word id is None
            when the token is not in the word table.

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                # Full-name tokens are stored with a leading blank.
                tokens[word] = ' ' + self.make_standard_word(word[1:])
            else:
                tokens[word] = self.make_standard_word(word)

        with conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        # Use dict.get() so that tokens missing from the word table yield
        # None instead of raising KeyError in this debugging helper.
        return [(k, v, ids.get(v)) for k, v in tokens.items()]


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)

    @functools.lru_cache(maxsize=1024)
    def make_standard_word(self, name):
        """ Create the normalised version of the input.
        """
        # NOTE(review): lru_cache on an instance method includes 'self' in
        # the cache key and keeps the analyzer alive until eviction -
        # acceptable as long as analyzers are short-lived; confirm.
        # Pad with blanks so abbreviation replacement can match whole words.
        norm = ' ' + self.transliterator.transliterate(name) + ' '
        for full, abbr in self.abbreviations:
            if full in norm:
                norm = norm.replace(full, abbr)

        return norm.strip()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        # Pure digits need no transliteration at all.
        if hnr.isdigit():
            return hnr

        return self.transliterator.transliterate(hnr)

    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            cur.execute("SELECT distinct(postcode) FROM location_postcode")
            for (postcode, ) in cur:
                copystr.write(postcode)
                copystr.write('\t ')
                copystr.write(self.transliterator.transliterate(postcode))
                copystr.write('\tplace\tpostcode\t0\n')

            copystr.seek(0)
            cur.copy_from(copystr, 'word',
                          columns=['word', 'word_token', 'class', 'type',
                                   'search_name_count'])
            # Don't really need an ID for postcodes....
            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
            #                WHERE word_id is null and type = 'postcode'""")


    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.

            phrases is an iterable of (label, class, type, operator) tuples.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                # '-' is the in-memory placeholder for a NULL operator.
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                # Bulk-insert the new phrases via COPY.
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.make_standard_word(word)
                    if term:
                        copystr.write(word)
                        copystr.write('\t ')
                        copystr.write(term)
                        copystr.write('\t')
                        copystr.write(cls)
                        copystr.write('\t')
                        copystr.write(typ)
                        copystr.write('\t')
                        copystr.write(oper if oper in ('in', 'near') else '\\N')
                        copystr.write('\t0\n')

                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])

            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')
        self._add_normalized_country_names(country_code, full_names)


    def _add_normalized_country_names(self, country_code, names):
        """ Add already normalised names for the given country to the
            search index.
        """
        # Full-name tokens carry a leading blank.
        word_tokens = set((' ' + name for name in names))
        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                # NOTE(review): country_code is interpolated into the SQL
                # string; callers pass validated two-letter codes (see
                # process_place) - confirm for all call sites.
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            full_names = set((self.make_standard_word(name) for name in names.values()))
            full_names.discard('')

            token_info.add_names(self.conn, full_names)

            # Names of a country feature double as country search terms.
            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self._add_normalized_country_names(country_feature.lower(),
                                                   full_names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, self.make_standard_word(value))
                elif key == 'place':
                    token_info.add_place(self.conn, self.make_standard_word(value))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, self.make_standard_word(value)))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        # Postcodes containing list separators are ignored entirely.
        if re.search(r'[:,;]', postcode) is None and postcode not in self._cache.postcodes:
            term = self.make_standard_word(postcode)
            if not term:
                return

            with self.conn.cursor() as cur:
                # no word_id needed for postcodes
                cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                 search_name_count)
                               (SELECT pc, %s, 'place', 'postcode', 0
                                FROM (VALUES (%s)) as v(pc)
                                WHERE NOT EXISTS
                                 (SELECT * FROM word
                                  WHERE word = pc and class='place' and type='postcode'))
                            """, (' ' + term, postcode))
            self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        # Expand housenumber lists like '1;2;3' into separate entries.
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                # Deduplicate (note: order of the result is unspecified).
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
+
+
+
+
+class _TokenInfo:
+    """ Collect token information to be sent back to the database.
+    """
+    def __init__(self, cache):
+        self.cache = cache
+        self.data = {}
+
+    @staticmethod
+    def _mk_array(tokens):
+        return '{%s}' % ','.join((str(s) for s in tokens))
+
+
+    def add_names(self, conn, names):
+        """ Adds token information for the normalised names.
+        """
+        # Start with all partial names
+        terms = set((part for ns in names for part in ns.split()))
+        # Add partials for the full terms (TO BE REMOVED)
+        terms.update((n for n in names))
+        # Add the full names
+        terms.update((' ' + n for n in names))
+
+        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+
+
+    def add_housenumbers(self, conn, hnrs):
+        """ Extract housenumber information from a list of normalised
+            housenumbers.
+        """
+        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+        self.data['hnr'] = ';'.join(hnrs)
+
+
+    def add_street(self, conn, street):
+        """ Add addr:street match terms.
+        """
+        if not street:
+            return
+
+        term = ' ' + street
+
+        tid = self.cache.names.get(term)
+
+        if tid is None:
+            with conn.cursor() as cur:
+                cur.execute("""SELECT word_id FROM word
+                                WHERE word_token = %s
+                                      and class is null and type is null""",
+                            (term, ))
+                if cur.rowcount > 0:
+                    tid = cur.fetchone()[0]
+                    self.cache.names[term] = tid
+
+        if tid is not None:
+            self.data['street'] = '{%d}' % tid
+
+
+    def add_place(self, conn, place):
+        """ Add addr:place search and match terms.
+        """
+        if not place:
+            return
+
+        partial_ids = self.cache.get_term_tokens(conn, place.split())
+        tid = self.cache.get_term_tokens(conn, [' ' + place])
+
+        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
+        self.data['place_match'] = '{%s}' % tid[0]
+
+
+    def add_address_terms(self, conn, terms):
+        """ Add additional address terms.
+        """
+        tokens = {}
+
+        for key, value in terms:
+            if not value:
+                continue
+            partial_ids = self.cache.get_term_tokens(conn, value.split())
+            term = ' ' + value
+            tid = self.cache.names.get(term)
+
+            if tid is None:
+                with conn.cursor() as cur:
+                    cur.execute("""SELECT word_id FROM word
+                                    WHERE word_token = %s
+                                          and class is null and type is null""",
+                                (term, ))
+                    if cur.rowcount > 0:
+                        tid = cur.fetchone()[0]
+                        self.cache.names[term] = tid
+
+            tokens[key] = [self._mk_array(partial_ids),
+                           '{%s}' % ('' if tid is None else str(tid))]
+
+        if tokens:
+            self.data['addr'] = tokens
+
+
+class _TokenCache:
+    """ Cache for token information to avoid repeated database queries.
+
+        This cache is not thread-safe and needs to be instantiated per
+        analyzer.
+    """
+    def __init__(self):
+        self.names = {}
+        self.postcodes = set()
+        self.housenumbers = {}
+
+
+    def get_term_tokens(self, conn, terms):
+        """ Get token ids for a list of terms, looking them up in the database
+            if necessary.
+        """
+        tokens = []
+        askdb = []
+
+        for term in terms:
+            token = self.names.get(term)
+            if token is None:
+                askdb.append(term)
+            elif token != 0:
+                tokens.append(token)
+
+        if askdb:
+            with conn.cursor() as cur:
+                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
+                            (askdb, ))
+                for term, tid in cur:
+                    self.names[term] = tid
+                    if tid != 0:
+                        tokens.append(tid)
+
+        return tokens
+
+
+    def get_hnr_tokens(self, conn, terms):
+        """ Get token ids for a list of housenumbers, looking them up in the
+            database if necessary.
+        """
+        tokens = []
+        askdb = []
+
+        for term in terms:
+            token = self.housenumbers.get(term)
+            if token is None:
+                askdb.append(term)
+            else:
+                tokens.append(token)
+
+        if askdb:
+            with conn.cursor() as cur:
+                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
+                            (askdb, ))
+                for term, tid in cur:
+                    self.housenumbers[term] = tid
+                    tokens.append(tid)
+
+        return tokens
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
new file mode 100644 (file)
index 0000000..438a5af
--- /dev/null
@@ -0,0 +1,567 @@
+"""
+Tokenizer implementing normalisation as used before Nominatim 4.
+"""
+from collections import OrderedDict
+import logging
+import re
+import shutil
+from textwrap import dedent
+
+from icu import Transliterator
+import psycopg2
+import psycopg2.extras
+
+from nominatim.db.connection import connect
+from nominatim.db import properties
+from nominatim.db import utils as db_utils
+from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.errors import UsageError
+
+DBCFG_NORMALIZATION = "tokenizer_normalization"
+DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
+
+LOG = logging.getLogger()
+
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.

        Standard factory function called by the tokenizer loader.
        dsn: libpq connection string of the target database.
        data_dir: 'tokenizer' directory inside the project directory.
    """
    return LegacyTokenizer(dsn, data_dir)
+
+
def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.

        Returns the directory from which the module should be loaded.
    """
    # An explicitly configured module location is used verbatim.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode: source and target being the same directory means
    # we run from the build directory and must leave the module alone.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install a copy into the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    target = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(target))
    # Make the shared object world-executable (0755) so the database
    # server process can load it.
    target.chmod(0o755)

    LOG.info('Database module installed at %s', str(target))

    return module_dir
+
+
def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.

        Raises a UsageError when the database server cannot load the
        module.
    """
    with conn.cursor() as cur:
        try:
            # Creating and immediately dropping a C function forces the
            # server to load the shared object once.
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
+
+
class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        # dsn: libpq connection string of the database the tokenizer works on.
        self.dsn = dsn
        # data_dir: 'tokenizer' directory inside the project directory.
        self.data_dir = data_dir
        # Loaded by init_new_db()/init_from_project()/migrate_database().
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.

            When init_db is False, creation of the SQL functions and of the
            word table is skipped.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            # Fail early when PostgreSQL cannot load the module.
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.

            Reads the normalization rules back from the database
            properties where _save_config() stored them.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.

            Creates the indices over the word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            # Fall back to the module copy in the project directory when no
            # custom module path has been configured.
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.

            Returns a hint string describing the problem on failure,
            None when the module works.
        """
        hint = """\
             The Postgresql extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    # NOTE(review): scalar() is provided by Nominatim's
                    # custom cursor class, not by plain psycopg2.
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyser.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.

            Writes a tokenizer.php into the data directory defining the
            constants the PHP frontend needs before requiring the actual
            PHP implementation.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+
+
+class LegacyNameAnalyzer:
+    """ The legacy analyzer uses the special Postgresql module for
+        splitting names.
+
+        Each instance opens a connection to the database to request the
+        normalization.
+
+        Not thread-safe. Use as a context manager or call close() when done
+        to release the database connection.
+    """
+
+    def __init__(self, dsn, normalizer):
+        # Dedicated DB connection in autocommit mode, so token-creating
+        # SQL functions take effect immediately.
+        self.conn = connect(dsn).connection
+        self.conn.autocommit = True
+        self.normalizer = normalizer
+        # Allow passing Python dicts as hstore parameters (used for names).
+        psycopg2.extras.register_hstore(self.conn)
+
+        self._cache = _TokenCache(self.conn)
+
+
+    def __enter__(self):
+        return self
+
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def close(self):
+        """ Free all resources used by the analyzer.
+
+            Safe to call multiple times; subsequent calls are no-ops.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+
+    @staticmethod
+    def get_word_token_info(conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name
+            otherwise is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        # Full names ('#'-prefixed) are stored with a leading space in
+        # front of the standardized token; partial names are stored plain.
+        with conn.cursor() as cur:
+            cur.execute("""SELECT t.term, word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = (CASE
+                                   WHEN left(t.term, 1) = '#' THEN
+                                     ' ' || make_standard_name(substring(t.term from 2))
+                                   ELSE
+                                     make_standard_name(t.term)
+                                   END)
+                                 and class is null and country_code is null""",
+                        (words, ))
+
+            return [(r[0], r[1], r[2]) for r in cur]
+
+
+    def normalize(self, phrase):
+        """ Normalize the given phrase, i.e. remove all properties that
+            are irrelevant for search.
+        """
+        return self.normalizer.transliterate(phrase)
+
+
+    def add_postcodes_from_db(self):
+        """ Add postcodes from the location_postcode table to the word table.
+        """
+        # count() merely forces evaluation of create_postcode_id() for
+        # every distinct postcode; the result itself is discarded.
+        with self.conn.cursor() as cur:
+            cur.execute("""SELECT count(create_postcode_id(pc))
+                           FROM (SELECT distinct(postcode) as pc
+                                 FROM location_postcode) x""")
+
+
+    def update_special_phrases(self, phrases):
+        """ Replace the search index for special phrases with the new phrases.
+
+            `phrases` is an iterable of (label, class, type, operator)
+            tuples; labels are normalized before comparison.
+        """
+        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+                            for p in phrases))
+
+        with self.conn.cursor() as cur:
+            # Get the old phrases.
+            existing_phrases = set()
+            cur.execute("""SELECT word, class, type, operator FROM word
+                           WHERE class != 'place'
+                                 OR (type != 'house' AND type != 'postcode')""")
+            for label, cls, typ, oper in cur:
+                # '-' is used as a sentinel for a NULL operator so that
+                # tuples compare correctly against the new phrases.
+                existing_phrases.add((label, cls, typ, oper or '-'))
+
+            to_add = norm_phrases - existing_phrases
+            to_delete = existing_phrases - norm_phrases
+
+            if to_add:
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ INSERT INTO word (word_id, word_token, word, class, type,
+                                          search_name_count, operator)
+                        (SELECT nextval('seq_word'), make_standard_name(name), name,
+                                class, type, 0,
+                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
+                           FROM (VALUES %s) as v(name, class, type, op))""",
+                    to_add)
+
+            if to_delete:
+                # op = '-' matches rows where operator is NULL (see above).
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                        WHERE word = name and class = in_class and type = in_type
+                              and ((op = '-' and operator is null) or op = operator)""",
+                    to_delete)
+
+        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
+                 len(norm_phrases), len(to_add), len(to_delete))
+
+
+    def add_country_names(self, country_code, names):
+        """ Add names for the given country to the search index.
+
+            Only names not yet present for the country are inserted.
+        """
+        # Country name tokens are stored as full names (leading space).
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """INSERT INTO word (word_id, word_token, country_code)
+                   (SELECT nextval('seq_word'), lookup_token, %s
+                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
+                            FROM unnest(%s)n) y
+                      WHERE NOT EXISTS(SELECT * FROM word
+                                       WHERE word_token = lookup_token and country_code = %s))
+                """, (country_code, names, country_code))
+
+
+    def process_place(self, place):
+        """ Determine tokenizer information about the given place.
+
+            Returns a JSON-serialisable structure that will be handed into
+            the database via the token_info field.
+        """
+        token_info = _TokenInfo(self._cache)
+
+        names = place.get('name')
+
+        if names:
+            token_info.add_names(self.conn, names)
+
+            # Country features carry a two-letter country code; register
+            # all their names as country names as well.
+            country_feature = place.get('country_feature')
+            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
+                self.add_country_names(country_feature.lower(), list(names.values()))
+
+        address = place.get('address')
+
+        if address:
+            hnrs = []
+            addr_terms = []
+            for key, value in address.items():
+                if key == 'postcode':
+                    self._add_postcode(value)
+                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                    hnrs.append(value)
+                elif key == 'street':
+                    token_info.add_street(self.conn, value)
+                elif key == 'place':
+                    token_info.add_place(self.conn, value)
+                # Internal keys ('_'-prefixed), country and full address
+                # are not searchable address parts.
+                elif not key.startswith('_') and \
+                     key not in ('country', 'full'):
+                    addr_terms.append((key, value))
+
+            if hnrs:
+                token_info.add_housenumbers(self.conn, hnrs)
+
+            if addr_terms:
+                token_info.add_address_terms(self.conn, addr_terms)
+
+        return token_info.data
+
+
+    def _add_postcode(self, postcode):
+        """ Make sure the normalized postcode is present in the word table.
+        """
+        def _create_postcode_from_db(pcode):
+            with self.conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+
+        # Values containing list separators are not single postcodes
+        # and are ignored.
+        if re.search(r'[:,;]', postcode) is None:
+            self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
+class _TokenInfo:
+    """ Collect token information to be sent back to the database.
+
+        The accumulated data lives in `self.data`, a JSON-serialisable
+        dict (see LegacyNameAnalyzer.process_place).
+    """
+    def __init__(self, cache):
+        # Shared per-analyzer _TokenCache for DB-lookup results.
+        self.cache = cache
+        self.data = {}
+
+
+    def add_names(self, conn, names):
+        """ Add token information for the names of the place.
+        """
+        with conn.cursor() as cur:
+            # Create the token IDs for all names.
+            # NOTE(review): cur.scalar is an extension on the project's
+            # cursor class (defined outside this file) — presumably it
+            # returns the first column of the first row.
+            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
+                                            (names, ))
+
+
+    def add_housenumbers(self, conn, hnrs):
+        """ Extract housenumber information from the address.
+        """
+        # Fast path: a single simple housenumber may already be cached.
+        if len(hnrs) == 1:
+            token = self.cache.get_housenumber(hnrs[0])
+            if token is not None:
+                self.data['hnr_tokens'] = token
+                self.data['hnr'] = hnrs[0]
+                return
+
+        # split numbers if necessary
+        simple_list = []
+        for hnr in hnrs:
+            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
+
+        if len(simple_list) > 1:
+            # Remove duplicates resulting from the split.
+            simple_list = list(set(simple_list))
+
+        with conn.cursor() as cur:
+            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
+            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+
+
+    def add_street(self, conn, street):
+        """ Add addr:street match terms.
+        """
+        def _get_street(name):
+            with conn.cursor() as cur:
+                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
+
+        # LRU-cached: the DB is only queried on a cache miss.
+        self.data['street'] = self.cache.streets.get(street, _get_street)
+
+
+    def add_place(self, conn, place):
+        """ Add addr:place search and match terms.
+        """
+        def _get_place(name):
+            with conn.cursor() as cur:
+                cur.execute("""SELECT (addr_ids_from_name(%s)
+                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
+                                      word_ids_from_name(%s)::text""",
+                            (name, name, name))
+                return cur.fetchone()
+
+        # Returns a (search terms, match terms) pair from the cache.
+        self.data['place_search'], self.data['place_match'] = \
+            self.cache.places.get(place, _get_place)
+
+
+    def add_address_terms(self, conn, terms):
+        """ Add additional address terms.
+
+            `terms` is an iterable of (address key, value) pairs.
+        """
+        def _get_address_term(name):
+            with conn.cursor() as cur:
+                cur.execute("""SELECT addr_ids_from_name(%s)::text,
+                                      word_ids_from_name(%s)::text""",
+                            (name, name))
+                return cur.fetchone()
+
+        tokens = {}
+        for key, value in terms:
+            tokens[key] = self.cache.address_terms.get(value, _get_address_term)
+
+        self.data['addr'] = tokens
+
+
+class _LRU:
+    """ Least recently used cache that accepts a generator function to
+        produce the item when there is a cache miss.
+
+        Note: a stored value of None is indistinguishable from a miss,
+        so None results are regenerated (and re-stored) on every access.
+    """
+
+    def __init__(self, maxsize=128, init_data=None):
+        # OrderedDict tracks insertion/access order for LRU eviction.
+        self.data = init_data or OrderedDict()
+        self.maxsize = maxsize
+        if init_data is not None and len(init_data) > maxsize:
+            # Grow the limit so pre-seeded entries are never evicted.
+            self.maxsize = len(init_data)
+
+    def get(self, key, generator):
+        """ Get the item with the given key from the cache. If nothing
+            is found in the cache, generate the value through the
+            generator function and store it in the cache.
+        """
+        value = self.data.get(key)
+        if value is not None:
+            # Cache hit: mark as most recently used.
+            self.data.move_to_end(key)
+        else:
+            value = generator(key)
+            if len(self.data) >= self.maxsize:
+                # Evict the least recently used entry.
+                self.data.popitem(last=False)
+            self.data[key] = value
+
+        return value
+
+
+class _TokenCache:
+    """ Cache for token information to avoid repeated database queries.
+
+        This cache is not thread-safe and needs to be instantiated per
+        analyzer.
+    """
+    def __init__(self, conn):
+        # various LRU caches
+        self.streets = _LRU(maxsize=256)
+        self.places = _LRU(maxsize=128)
+        self.address_terms = _LRU(maxsize=1024)
+
+        # Lookup housenumbers up to 100 and cache them
+        with conn.cursor() as cur:
+            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
+                           FROM generate_series(1, 100) as i""")
+            self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
+
+        # Get postcodes that are already saved
+        # NOTE(review): the pre-seeded values are None and _LRU treats a
+        # stored None as a miss, so known postcodes still trigger the
+        # generator on every access — confirm this is intended.
+        postcodes = OrderedDict()
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word FROM word
+                           WHERE class ='place' and type = 'postcode'""")
+            for row in cur:
+                postcodes[row[0]] = None
+        self.postcodes = _LRU(maxsize=32, init_data=postcodes)
+
+    def get_housenumber(self, number):
+        """ Get a housenumber token from the cache.
+
+            Returns None for numbers outside the pre-computed range (1-100).
+        """
+        return self._cached_housenumbers.get(number)
index cab6fb8b89ce541f1054ad7887cd5aafcdd0d5c1..cc5d3e9ba2890ac52993bac07cf57472e128a5d3 100644 (file)
@@ -2,3 +2,5 @@
 Module with functions for importing, updating Nominatim databases
 as well as general maintenance helpers.
 """
+
+from nominatim.tools.special_phrases.special_phrases_importer import SpecialPhrasesImporter
index 265f8666adf65e24657c68760ced7888d289e5af..d4f793b46334c27f304fbfe898268a4f23f564a9 100644 (file)
@@ -4,10 +4,9 @@ Collection of functions that check if the database is complete and functional.
 from enum import Enum
 from textwrap import dedent
 
-import psycopg2
-
 from nominatim.db.connection import connect
 from nominatim.errors import UsageError
+from nominatim.tokenizer import factory as tokenizer_factory
 
 CHECKLIST = []
 
@@ -47,7 +46,7 @@ def _check(hint=None):
 
     return decorator
 
-class _BadConnection: # pylint: disable=R0903
+class _BadConnection:
 
     def __init__(self, msg):
         self.msg = msg
@@ -78,14 +77,12 @@ def check_database(config):
 
 
 def _get_indexes(conn):
-    indexes = ['idx_word_word_id',
-               'idx_place_addressline_address_place_id',
+    indexes = ['idx_place_addressline_address_place_id',
                'idx_placex_rank_search',
                'idx_placex_rank_address',
                'idx_placex_parent_place_id',
                'idx_placex_geometry_reverse_lookuppolygon',
                'idx_placex_geometry_placenode',
-               'idx_placex_housenumber',
                'idx_osmline_parent_place_id',
                'idx_osmline_parent_osm_id',
                'idx_postcode_id',
@@ -95,6 +92,9 @@ def _get_indexes(conn):
         indexes.extend(('idx_search_name_nameaddress_vector',
                         'idx_search_name_name_vector',
                         'idx_search_name_centroid'))
+        if conn.server_version_tuple() >= (11, 0, 0):
+            indexes.extend(('idx_placex_housenumber',
+                            'idx_osmline_parent_osm_id_with_hnr'))
     if conn.table_exists('place'):
         indexes.extend(('idx_placex_pendingsector',
                         'idx_location_area_country_place_id',
@@ -147,7 +147,7 @@ def check_placex_table(conn, config):
 
 
 @_check(hint="""placex table has no data. Did the import finish sucessfully?""")
-def check_placex_size(conn, config): # pylint: disable=W0613
+def check_placex_size(conn, _):
     """ Checking for placex content
     """
     with conn.cursor() as cur:
@@ -156,38 +156,30 @@ def check_placex_size(conn, config): # pylint: disable=W0613
     return CheckState.OK if cnt > 0 else CheckState.FATAL
 
 
-@_check(hint="""\
-             The Postgresql extension nominatim.so was not correctly loaded.
-
-             Error: {error}
-
-             Hints:
-             * Check the output of the CMmake/make installation step
-             * Does nominatim.so exist?
-             * Does nominatim.so exist on the database server?
-             * Can nominatim.so be accessed by the database user?
-             """)
-def check_module(conn, config): # pylint: disable=W0613
-    """ Checking that nominatim.so module is installed
+@_check(hint="""{msg}""")
+def check_tokenizer(_, config):
+    """ Checking that tokenizer works
     """
-    with conn.cursor() as cur:
-        try:
-            out = cur.scalar("SELECT make_standard_name('a')")
-        except psycopg2.ProgrammingError as err:
-            return CheckState.FAIL, dict(error=str(err))
+    try:
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+    except UsageError:
+        return CheckState.FAIL, dict(msg="""\
+            Cannot load tokenizer. Did the import finish sucessfully?""")
 
-        if out != 'a':
-            return CheckState.FAIL, dict(error='Unexpected result for make_standard_name()')
+    result = tokenizer.check_database()
 
+    if result is None:
         return CheckState.OK
 
+    return CheckState.FAIL, dict(msg=result)
+
 
 @_check(hint="""\
              The indexing didn't finish. {count} entries are not yet indexed.
 
              To index the remaining entries, run:   {index_cmd}
              """)
-def check_indexing(conn, config): # pylint: disable=W0613
+def check_indexing(conn, _):
     """ Checking indexing status
     """
     with conn.cursor() as cur:
@@ -196,7 +188,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
     if cnt == 0:
         return CheckState.OK
 
-    if conn.index_exists('idx_word_word_id'):
+    if conn.index_exists('idx_placex_rank_search'):
         # Likely just an interrupted update.
         index_cmd = 'nominatim index'
     else:
@@ -212,7 +204,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
 
              Rerun the index creation with:   nominatim import --continue db-postprocess
              """)
-def check_database_indexes(conn, config): # pylint: disable=W0613
+def check_database_indexes(conn, _):
     """ Checking that database indexes are complete
     """
     missing = []
@@ -234,7 +226,7 @@ def check_database_indexes(conn, config): # pylint: disable=W0613
              Invalid indexes:
                {indexes}
              """)
-def check_database_index_valid(conn, config): # pylint: disable=W0613
+def check_database_index_valid(conn, _):
     """ Checking that all database indexes are valid
     """
     with conn.cursor() as cur:
index 25efedb9a9a6a5ffb2c165df73e2c8a51f4431b4..664d3c6b39ed2fafb57e45960c052ccd76401505 100644 (file)
@@ -5,11 +5,10 @@ import logging
 import os
 import selectors
 import subprocess
-import shutil
 from pathlib import Path
 
 import psutil
-import psycopg2
+import psycopg2.extras
 
 from nominatim.db.connection import connect, get_pg_env
 from nominatim.db import utils as db_utils
@@ -89,49 +88,6 @@ def setup_extensions(conn):
         raise UsageError('PostGIS version is too old.')
 
 
-def install_module(src_dir, project_dir, module_dir, conn=None):
-    """ Copy the normalization module from src_dir into the project
-        directory under the '/module' directory. If 'module_dir' is set, then
-        use the module from there instead and check that it is accessible
-        for Postgresql.
-
-        The function detects when the installation is run from the
-        build directory. It doesn't touch the module in that case.
-
-        If 'conn' is given, then the function also tests if the module
-        can be access via the given database.
-    """
-    if not module_dir:
-        module_dir = project_dir / 'module'
-
-        if not module_dir.exists() or not src_dir.samefile(module_dir):
-
-            if not module_dir.exists():
-                module_dir.mkdir()
-
-            destfile = module_dir / 'nominatim.so'
-            shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
-            destfile.chmod(0o755)
-
-            LOG.info('Database module installed at %s', str(destfile))
-        else:
-            LOG.info('Running from build directory. Leaving database module as is.')
-    else:
-        LOG.info("Using custom path for database module at '%s'", module_dir)
-
-    if conn is not None:
-        with conn.cursor() as cur:
-            try:
-                cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
-                               RETURNS text AS '{}/nominatim.so', 'transliteration'
-                               LANGUAGE c IMMUTABLE STRICT;
-                               DROP FUNCTION nominatim_test_import_func(text)
-                            """.format(module_dir))
-            except psycopg2.DatabaseError as err:
-                LOG.fatal("Error accessing database module: %s", err)
-                raise UsageError("Database module cannot be accessed.") from err
-
-
 def import_base_data(dsn, sql_dir, ignore_partitions=False):
     """ Create and populate the tables with basic static data that provides
         the background for geocoding. Data is assumed to not yet exist.
@@ -205,16 +161,14 @@ def create_partition_tables(conn, config):
     sql.run_sql_file(conn, 'partition-tables.src.sql')
 
 
-def truncate_data_tables(conn, max_word_frequency=None):
+def truncate_data_tables(conn):
     """ Truncate all data tables to prepare for a fresh load.
     """
     with conn.cursor() as cur:
-        cur.execute('TRUNCATE word')
         cur.execute('TRUNCATE placex')
         cur.execute('TRUNCATE place_addressline')
         cur.execute('TRUNCATE location_area')
         cur.execute('TRUNCATE location_area_country')
-        cur.execute('TRUNCATE location_property')
         cur.execute('TRUNCATE location_property_tiger')
         cur.execute('TRUNCATE location_property_osmline')
         cur.execute('TRUNCATE location_postcode')
@@ -229,23 +183,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
         for table in [r[0] for r in list(cur)]:
             cur.execute('TRUNCATE ' + table)
 
-        if max_word_frequency is not None:
-            # Used by getorcreate_word_id to ignore frequent partial words.
-            cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
-                           RETURNS integer AS $$
-                             SELECT {} as maxwordfreq;
-                           $$ LANGUAGE SQL IMMUTABLE
-                        """.format(max_word_frequency))
-        conn.commit()
+    conn.commit()
 
 _COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
 
-def load_data(dsn, data_dir, threads):
+def load_data(dsn, threads):
     """ Copy data into the word and placex table.
     """
-    # Pre-calculate the most important terms in the word list.
-    db_utils.execute_file(dsn, data_dir / 'words.sql')
-
     sel = selectors.DefaultSelector()
     # Then copy data from place to placex in <threads - 1> chunks.
     place_threads = max(1, threads - 1)
@@ -307,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
 
     sql.run_sql_file(conn, 'indices.sql', drop=drop)
 
-def create_country_names(conn, config):
-    """ Create search index for default country names.
+def create_country_names(conn, tokenizer, languages=None):
+    """ Add default country names to search index. `languages` is a comma-
+        separated list of language codes as used in OSM. If `languages` is not
+        empty then only name translations for the given languages are added
+        to the index.
     """
+    if languages:
+        languages = languages.split(',')
+
+    def _include_key(key):
+        return key == 'name' or \
+               (key.startswith('name:') \
+                and (not languages or key[5:] in languages))
 
     with conn.cursor() as cur:
-        cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
-        cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(country_code),
-                       country_code) FROM country_name WHERE country_code is not null) AS x""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(name->'name'), country_code) 
-                       FROM country_name WHERE name ? 'name') AS x""")
-        sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
-                           country_code) FROM (SELECT country_code, skeys(name)
-                           AS k, svals(name) AS v FROM country_name) x WHERE k"""
-
-        languages = config.LANGUAGES
-
-        if languages:
-            sql_statement = "{} IN (".format(sql_statement)
-            delim = ''
-            for language in languages.split(','):
-                sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
-                delim = ', '
-            sql_statement = '{})'.format(sql_statement)
-        else:
-            sql_statement = "{} LIKE 'name:%'".format(sql_statement)
-        sql_statement = "{}) v".format(sql_statement)
-        cur.execute(sql_statement)
+        psycopg2.extras.register_hstore(cur)
+        cur.execute("""SELECT country_code, name FROM country_name
+                       WHERE country_code is not null""")
+
+        with tokenizer.name_analyzer() as analyzer:
+            for code, name in cur:
+                names = [code]
+                if code == 'gb':
+                    names.append('UK')
+                if code == 'us':
+                    names.append('United States')
+
+                # country names (only in languages as provided)
+                if name:
+                    names.extend((v for k, v in name.items() if _include_key(k)))
+
+                analyzer.add_country_names(code, names)
+
     conn.commit()
index 96679d279472fd49d568b7ce41fdfeeebde1a206..9888d96a73e83ca35abe90a01ebc13bc1eec0df9 100644 (file)
@@ -18,16 +18,16 @@ def run_legacy_script(script, *args, nominatim_env=None, throw_on_fail=False):
         then throw a `CalledProcessError` on a non-zero exit.
     """
     cmd = ['/usr/bin/env', 'php', '-Cq',
-           nominatim_env.phplib_dir / 'admin' / script]
+           str(nominatim_env.phplib_dir / 'admin' / script)]
     cmd.extend([str(a) for a in args])
 
     env = nominatim_env.config.get_os_env()
     env['NOMINATIM_DATADIR'] = str(nominatim_env.data_dir)
     env['NOMINATIM_SQLDIR'] = str(nominatim_env.sqllib_dir)
     env['NOMINATIM_CONFIGDIR'] = str(nominatim_env.config_dir)
-    env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = nominatim_env.module_dir
+    env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str(nominatim_env.module_dir)
     if not env['NOMINATIM_OSM2PGSQL_BINARY']:
-        env['NOMINATIM_OSM2PGSQL_BINARY'] = nominatim_env.osm2pgsql_path
+        env['NOMINATIM_OSM2PGSQL_BINARY'] = str(nominatim_env.osm2pgsql_path)
 
     proc = subprocess.run(cmd, cwd=str(nominatim_env.project_dir), env=env,
                           check=throw_on_fail)
@@ -99,7 +99,7 @@ def run_osm2pgsql(options):
     """ Run osm2pgsql with the given options.
     """
     env = get_pg_env(options['dsn'])
-    cmd = [options['osm2pgsql'],
+    cmd = [str(options['osm2pgsql']),
            '--hstore', '--latlon', '--slim',
            '--with-forward-dependencies', 'false',
            '--log-progress', 'true',
index 4af5cb4879ef8068c60b2426c4f7d64293a97b6a..ddf25cd91ba30307fef2c4d2ee65cae2a38596ce 100644 (file)
@@ -6,7 +6,8 @@ import logging
 from nominatim.db import properties
 from nominatim.db.connection import connect
 from nominatim.version import NOMINATIM_VERSION
-from nominatim.tools import refresh, database_import
+from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
 from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
@@ -43,11 +44,14 @@ def migrate(config, paths):
                             '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
                 kwargs = dict(conn=conn, config=config, paths=paths)
                 func(**kwargs)
+                conn.commit()
                 has_run_migration = True
 
         if has_run_migration:
             LOG.warning('Updating SQL functions.')
             refresh.create_functions(conn, config)
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+            tokenizer.update_sql_functions(config)
 
         properties.set_property(conn, 'database_version',
                                 '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
@@ -108,17 +112,6 @@ def import_status_timestamp_change(conn, **_):
                        TYPE timestamp with time zone;""")
 
 
-@_migration(3, 5, 0, 99)
-def install_database_module_in_project_directory(conn, config, paths, **_):
-    """ Install database module in project directory.
-
-        The database module needs to be present in the project directory
-        since those were introduced.
-    """
-    database_import.install_module(paths.module_dir, paths.project_dir,
-                                   config.DATABASE_MODULE_PATH, conn=conn)
-
-
 @_migration(3, 5, 0, 99)
 def add_nominatim_property_table(conn, config, **_):
     """ Add nominatim_property table.
@@ -137,6 +130,9 @@ def change_housenumber_transliteration(conn, **_):
 
         The database schema switched from saving raw housenumbers in
         placex.housenumber to saving transliterated ones.
+
+        Note: the function create_housenumber_id() has been dropped in later
+              versions.
     """
     with conn.cursor() as cur:
         cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
@@ -173,3 +169,25 @@ def switch_placenode_geometry_index(conn, **_):
                               and class = 'place' and type != 'postcode'
                               and linked_place_id is null""")
         cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
+
+
+@_migration(3, 7, 0, 1)
+def install_legacy_tokenizer(conn, config, **_):
+    """ Setup legacy tokenizer.
+
+        If no other tokenizer has been configured yet, then create the
+        configuration for the backwards-compatible legacy tokenizer
+    """
+    if properties.get_property(conn, 'tokenizer') is None:
+        with conn.cursor() as cur:
+            for table in ('placex', 'location_property_osmline'):
+                has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
+                                           WHERE table_name = %s
+                                           and column_name = 'token_info'""",
+                                        (table, ))
+            if has_column == 0:
+                cur.execute('ALTER TABLE {} ADD COLUMN token_info JSONB'.format(table))
+        tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
+                                                       module_name='legacy')
+
+        tokenizer.migrate_database(config)
index 0a568cbafc73c125c4f9d34b7141b42b05eb6b11..78bd8cb9490c5646754cef2ef2bbf2348d5e2a74 100644 (file)
@@ -6,7 +6,7 @@ of artificial postcode centroids.
 from nominatim.db.utils import execute_file
 from nominatim.db.connection import connect
 
-def import_postcodes(dsn, project_dir):
+def import_postcodes(dsn, project_dir, tokenizer):
     """ Set up the initial list of postcodes.
     """
 
@@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
                 SELECT nextval('seq_place'), 1, country_code,
-                       upper(trim (both ' ' from address->'postcode')) as pc,
+                       token_normalized_postcode(address->'postcode') as pc,
                        ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                   FROM placex
-                 WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+                 WHERE address ? 'postcode'
+                       and token_normalized_postcode(address->'postcode') is not null
                        AND geometry IS NOT null
                  GROUP BY country_code, pc
             """)
@@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'us', postcode,
+                SELECT nextval('seq_place'), 1, 'us',
+                       token_normalized_postcode(postcode),
                        ST_SetSRID(ST_Point(x,y),4326)
-                  FROM us_postcode WHERE postcode NOT IN
+                  FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
                         (SELECT postcode FROM location_postcode
                           WHERE country_code = 'us')
             """)
@@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
-                  FROM gb_postcode WHERE postcode NOT IN
+                SELECT nextval('seq_place'), 1, 'gb',
+                       token_normalized_postcode(postcode), geometry
+                  FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
                            (SELECT postcode FROM location_postcode
                              WHERE country_code = 'gb')
             """)
@@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
                     DELETE FROM word WHERE class='place' and type='postcode'
                     and word NOT IN (SELECT postcode FROM location_postcode)
             """)
-
-            cur.execute("""
-                SELECT count(getorcreate_postcode_id(v)) FROM
-                (SELECT distinct(postcode) as v FROM location_postcode) p
-            """)
         conn.commit()
+
+        with tokenizer.name_analyzer() as analyzer:
+            analyzer.add_postcodes_from_db()
index d38cb216865003c0ce75a9f1f735c3db18a49e19..6720465fd9220387781b8df939742a1aee482a0e 100644 (file)
@@ -104,14 +104,11 @@ PHP_CONST_DEFS = (
     ('Default_Language', 'DEFAULT_LANGUAGE', str),
     ('Log_DB', 'LOG_DB', bool),
     ('Log_File', 'LOG_FILE', str),
-    ('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
     ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
     ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
     ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
     ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
     ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
-    ('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
-    ('Use_Aux_Location_data', 'USE_AUX_LOCATION_DATA', bool),
     ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
     ('MapIcon_URL', 'MAPICON_URL', str),
 )
@@ -176,9 +173,11 @@ def setup_website(basedir, config):
 
                       @define('CONST_Debug', $_GET['debug'] ?? false);
                       @define('CONST_LibDir', '{0}');
+                      @define('CONST_TokenizerDir', '{2}');
                       @define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
 
-                      """.format(config.lib_dir.php, NOMINATIM_VERSION))
+                      """.format(config.lib_dir.php, NOMINATIM_VERSION,
+                                 config.project_dir / 'tokenizer'))
 
     for php_name, conf_name, var_type in PHP_CONST_DEFS:
         if var_type == bool:
index d6e8089161bc96cc71f1119b8252e3046af654b3..c167a49f81fa0a28c6cbdbf877e3646d0b2c168b 100644 (file)
@@ -13,7 +13,7 @@ from nominatim.errors import UsageError
 try:
     from osmium.replication.server import ReplicationServer
     from osmium import WriteHandler
-except ModuleNotFoundError as exc:
+except ImportError as exc:
     logging.getLogger().fatal("pyosmium not installed. Replication functions not available.\n"
                               "To install pyosmium via pip: pip3 install osmium")
     raise UsageError("replication tools not available") from exc
diff --git a/nominatim/tools/special_phrases/__init__.py b/nominatim/tools/special_phrases/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/tools/special_phrases/importer_statistics.py b/nominatim/tools/special_phrases/importer_statistics.py
new file mode 100644 (file)
index 0000000..9b97bca
--- /dev/null
@@ -0,0 +1,101 @@
+"""
+    Contains the class which handles statistics for the
+    import of special phrases.
+"""
+import logging
+LOG = logging.getLogger()
+
+class SpecialPhrasesImporterStatistics():
+    # pylint: disable-msg=too-many-instance-attributes
+    """
+        Class handling statistics of the import
+        process of special phrases.
+    """
+    def __init__(self):
+        self._set_lang_values_to_0()
+        self._set_global_values_to_0()
+
+    def _set_global_values_to_0(self):
+        """
+            Set all counts for the global
+            import to 0.
+        """
+        self.tables_created = 0
+        self.tables_deleted = 0
+        self.tables_ignored = 0
+        self.global_phrases_invalid = 0
+
+    def _set_lang_values_to_0(self):
+        """
+            Set all counts for the current
+            lang to 0.
+        """
+        self.lang_phrases_invalid = 0
+
+    def notify_one_phrase_invalid(self):
+        """
+            Add +1 to the count of invalid entries
+            fetched from the wiki.
+        """
+        self.lang_phrases_invalid += 1
+        self.global_phrases_invalid += 1
+
+    def notify_one_table_created(self):
+        """
+            Add +1 to the count of created tables.
+        """
+        self.tables_created += 1
+
+    def notify_one_table_deleted(self):
+        """
+            Add +1 to the count of deleted tables.
+        """
+        self.tables_deleted += 1
+
+    def notify_one_table_ignored(self):
+        """
+            Add +1 to the count of ignored tables.
+        """
+        self.tables_ignored += 1
+
+
+    def notify_import_done(self):
+        """
+            Print stats for the whole import process
+            and reset all values.
+        """
+        LOG.info('====================================================================')
+        LOG.info('Final statistics of the import:')
+        LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
+        if self.global_phrases_invalid > 0:
+            LOG.info('  Those invalid phrases have been skipped.')
+        LOG.info('- %s tables were ignored as they already exist on the database',
+                 self.tables_ignored)
+        LOG.info('- %s tables were created', self.tables_created)
+        LOG.info('- %s tables were deleted from the database', self.tables_deleted)
+        if self.tables_deleted > 0:
+            LOG.info('  They were deleted as they are not valid anymore.')
+
+        if self.global_phrases_invalid > 0:
+            LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
+                        self.global_phrases_invalid)
+
+        self._set_global_values_to_0()
+
+    def notify_current_lang_done(self, lang):
+        """
+            Print stats for the current lang
+            and then reset lang values.
+        """
+        LOG.info('====================================================================')
+        LOG.info('Statistics for the import of %s:', lang)
+        LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
+        if self.lang_phrases_invalid > 0:
+            LOG.info('  Those invalid phrases have been skipped.')
+        LOG.info('====================================================================')
+
+        if self.lang_phrases_invalid > 0:
+            LOG.warning('%s phrases were invalid and have been skipped for the import of lang %s.',
+                        self.lang_phrases_invalid, lang)
+
+        self._set_lang_values_to_0()
similarity index 71%
rename from nominatim/tools/special_phrases.py
rename to nominatim/tools/special_phrases/special_phrases_importer.py
index 9d0259dc6e5533314b1fd664e33ce451fc936a27..9649f94b1a736b0d561d489d1b9e67ef8546d28b 100644 (file)
@@ -3,24 +3,26 @@
 """
 import logging
 import os
+from os.path import isfile
 from pathlib import Path
 import re
 import subprocess
 import json
-from os.path import isfile
-from icu import Transliterator
+
 from psycopg2.sql import Identifier, Literal, SQL
+
 from nominatim.tools.exec_utils import get_url
 from nominatim.errors import UsageError
+from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
 
 LOG = logging.getLogger()
 class SpecialPhrasesImporter():
     # pylint: disable-msg=too-many-instance-attributes
-    # pylint: disable-msg=too-few-public-methods
     """
         Class handling the process of special phrases importations.
     """
     def __init__(self, config, phplib_dir, db_connection) -> None:
+        self.statistics_handler = SpecialPhrasesImporterStatistics()
         self.db_connection = db_connection
         self.config = config
         self.phplib_dir = phplib_dir
@@ -30,21 +32,14 @@ class SpecialPhrasesImporter():
             r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
         )
         self.sanity_check_pattern = re.compile(r'^\w+$')
-        self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
-                                                             self.config.TERM_NORMALIZATION)
-        #This set will contain all existing phrases from the word table which
-        #no longer exist on the wiki.
-        #It contain tuples with the following format: (normalized_word, class, type, operator)
-        self.words_phrases_to_delete = set()
-        #This set will contain the phrases which still exist from the wiki.
-        #It is used to prevent duplicates on the wiki by removing them from
-        #the word_phrases_to_delete only at the end.
-        self.words_phrases_still_exist = set()
+        # This set will contain all existing phrases to be added.
+        # It contains tuples with the following format: (label, class, type, operator)
+        self.word_phrases = set()
         #This set will contain all existing place_classtype tables which doesn't match any
         #special phrases class/type on the wiki.
         self.table_phrases_to_delete = set()
 
-    def import_from_wiki(self, languages=None):
+    def import_from_wiki(self, tokenizer, languages=None):
         """
             Iterate through all specified languages and
             extract corresponding special phrases from the wiki.
@@ -52,7 +47,6 @@ class SpecialPhrasesImporter():
         if languages is not None and not isinstance(languages, list):
             raise TypeError('The \'languages\' argument should be of type list.')
 
-        self._fetch_existing_words_phrases()
         self._fetch_existing_place_classtype_tables()
 
         #Get all languages to process.
@@ -62,34 +56,21 @@ class SpecialPhrasesImporter():
         class_type_pairs = set()
 
         for lang in languages:
-            LOG.warning('Import phrases for lang: %s', lang)
+            LOG.warning('Importing phrases for lang: %s...', lang)
             wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
             class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
+            self.statistics_handler.notify_current_lang_done(lang)
 
         self._create_place_classtype_table_and_indexes(class_type_pairs)
-        self._remove_non_existent_phrases_from_db()
+        self._remove_non_existent_tables_from_db()
         self.db_connection.commit()
+
+        with tokenizer.name_analyzer() as analyzer:
+            analyzer.update_special_phrases(self.word_phrases)
+
         LOG.warning('Import done.')
+        self.statistics_handler.notify_import_done()
 
-    def _fetch_existing_words_phrases(self):
-        """
-            Fetch existing special phrases from the word table.
-            Fill the word_phrases_to_delete set of the class.
-        """
-        #Only extract special phrases terms:
-        #If class=place and type=house then it is a housenumber term.
-        #If class=place and type=postcode then it is a postcode term.
-        word_query = """
-            SELECT word, class, type, operator FROM word
-            WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
-        """
-        with self.db_connection.cursor() as db_cursor:
-            db_cursor.execute(SQL(word_query))
-            for row in db_cursor:
-                row[3] = '-' if row[3] is None else row[3]
-                self.words_phrases_to_delete.add(
-                    (row[0], row[1], row[2], row[3])
-                )
 
     def _fetch_existing_place_classtype_tables(self):
         """
@@ -116,7 +97,7 @@ class SpecialPhrasesImporter():
         if self.config.PHRASE_CONFIG:
             settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
 
-        with open(settings_path, "r") as json_settings:
+        with settings_path.open("r") as json_settings:
             settings = json.load(json_settings)
         return settings['blackList'], settings['whiteList']
 
@@ -152,7 +133,7 @@ class SpecialPhrasesImporter():
         type_matchs = self.sanity_check_pattern.findall(phrase_type)
         class_matchs = self.sanity_check_pattern.findall(phrase_class)
 
-        if len(class_matchs) < 1 or len(type_matchs) < 1:
+        if not class_matchs or not type_matchs:
             LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
                         lang, phrase_class, phrase_type)
             return False
@@ -171,7 +152,6 @@ class SpecialPhrasesImporter():
 
         for match in matches:
             phrase_label = match[0].strip()
-            normalized_label = self.transliterator.transliterate(phrase_label)
             phrase_class = match[1].strip()
             phrase_type = match[2].strip()
             phrase_operator = match[3].strip()
@@ -193,53 +173,18 @@ class SpecialPhrasesImporter():
             ):
                 continue
 
-            #Check if the phrase already exists in the database.
-            if (
-                    (normalized_label, phrase_class, phrase_type, phrase_operator)
-                    in self.words_phrases_to_delete
-            ):
-                #Remove this phrase from the ones to delete as it still exist on the wiki.
-                self.words_phrases_still_exist.add(
-                    (normalized_label, phrase_class, phrase_type, phrase_operator)
-                )
-                class_type_pairs.add((phrase_class, phrase_type))
-                #Dont need to add this phrase as it already exists in the word table.
-                continue
-
             #sanity check, in case somebody added garbage in the wiki
             if not self._check_sanity(lang, phrase_class, phrase_type):
+                self.statistics_handler.notify_one_phrase_invalid()
                 continue
 
             class_type_pairs.add((phrase_class, phrase_type))
 
-            self._process_amenity(
-                phrase_label, normalized_label, phrase_class,
-                phrase_type, phrase_operator
-            )
+            self.word_phrases.add((phrase_label, phrase_class,
+                                   phrase_type, phrase_operator))
 
         return class_type_pairs
 
-    def _process_amenity(self, phrase_label, normalized_label,
-                         phrase_class, phrase_type, phrase_operator):
-        # pylint: disable-msg=too-many-arguments
-        """
-            Add phrase lookup and corresponding class and
-            type to the word table based on the operator.
-        """
-        with self.db_connection.cursor() as db_cursor:
-            if phrase_operator == 'near':
-                db_cursor.execute("""SELECT getorcreate_amenityoperator(
-                                  make_standard_name(%s), %s, %s, %s, 'near')""",
-                                  (phrase_label, normalized_label, phrase_class, phrase_type))
-            elif phrase_operator == 'in':
-                db_cursor.execute("""SELECT getorcreate_amenityoperator(
-                                  make_standard_name(%s), %s, %s, %s, 'in')""",
-                                  (phrase_label, normalized_label, phrase_class, phrase_type))
-            else:
-                db_cursor.execute("""SELECT getorcreate_amenity(
-                                  make_standard_name(%s), %s, %s, %s)""",
-                                  (phrase_label, normalized_label, phrase_class, phrase_type))
-
 
     def _create_place_classtype_table_and_indexes(self, class_type_pairs):
         """
@@ -262,6 +207,7 @@ class SpecialPhrasesImporter():
             table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
 
             if table_name in self.table_phrases_to_delete:
+                self.statistics_handler.notify_one_table_ignored()
                 #Remove this table from the ones to delete as it match a class/type
                 #still existing on the special phrases of the wiki.
                 self.table_phrases_to_delete.remove(table_name)
@@ -277,6 +223,8 @@ class SpecialPhrasesImporter():
             #Grant access on read to the web user.
             self._grant_access_to_webuser(phrase_class, phrase_type)
 
+            self.statistics_handler.notify_one_table_created()
+
         with self.db_connection.cursor() as db_cursor:
             db_cursor.execute("DROP INDEX idx_placex_classtype")
 
@@ -328,34 +276,18 @@ class SpecialPhrasesImporter():
                               .format(Identifier(table_name),
                                       Identifier(self.config.DATABASE_WEBUSER)))
 
-    def _remove_non_existent_phrases_from_db(self):
+    def _remove_non_existent_tables_from_db(self):
         """
             Remove special phrases which doesn't exist on the wiki anymore.
-            Delete from the word table and delete the place_classtype tables.
+            Delete the place_classtype tables.
         """
         LOG.warning('Cleaning database...')
-        self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
         #Array containing all queries to execute. Contain tuples of format (query, parameters)
         queries_parameters = []
 
-        #Delete phrases from the word table which are not on the wiki anymore.
-        for phrase_to_delete in self.words_phrases_to_delete:
-            if phrase_to_delete[3] == '-':
-                query = """
-                    DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
-                """
-                parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
-                queries_parameters.append((query, parameters))
-            else:
-                query = """
-                    DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
-                """
-                parameters = (phrase_to_delete[0], phrase_to_delete[1],
-                              phrase_to_delete[2], phrase_to_delete[3], )
-                queries_parameters.append((query, parameters))
-
         #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
         for table in self.table_phrases_to_delete:
+            self.statistics_handler.notify_one_table_deleted()
             query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
             queries_parameters.append((query, ()))
 
index 9670ea6062b2c7a14971fa39993e16d165f99a1e..6f9005eaf868d8dabce8f26ade77ec28612d63e4 100644 (file)
@@ -10,7 +10,7 @@ Version information for Nominatim.
 # and must always be increased when there is a change to the database or code
 # that requires a migration.
 # Released versions always have a database patch level of 0.
-NOMINATIM_VERSION = (3, 7, 0, 1)
+NOMINATIM_VERSION = (3, 7, 0, 2)
 
 POSTGRESQL_REQUIRED_VERSION = (9, 3)
 POSTGIS_REQUIRED_VERSION = (2, 2)
index 4069270eb73161073fba7ed0af593a8c3730e69c..cf1f5108c4ac1ff839c6a4336e9070833af00cc5 100644 (file)
@@ -18,6 +18,12 @@ NOMINATIM_DATABASE_WEBUSER="www-data"
 # Changing this value requires to run 'nominatim refresh --functions'.
 NOMINATIM_DATABASE_MODULE_PATH=
 
+# Tokenizer used for normalizing and parsing queries and names.
+# The tokenizer is set up during import and cannot be changed afterwards
+# without a reimport.
+# Currently available tokenizers: legacy
+NOMINATIM_TOKENIZER="legacy"
+
 # Number of occurances of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
 # or handled differently during search.
@@ -40,6 +46,12 @@ NOMINATIM_LANGUAGES=
 # Changing this value requires a reimport.
 NOMINATIM_TERM_NORMALIZATION=":: NFD (); [[:Nonspacing Mark:] [:Cf:]] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"
 
+# Configuration file for the tokenizer.
+# The content depends on the tokenizer used. If left empty the default settings
+# for the chosen tokenizer will be used. The configuration can only be set
+# on import and not be changed afterwards.
+NOMINATIM_TOKENIZER_CONFIG=
+
 # Search in the Tiger house number data for the US.
 # Note: The tables must already exist or queries will throw errors.
 # Changing this value requires to run ./utils/setup --create-functions --setup-website.
@@ -150,14 +162,14 @@ NOMINATIM_REPLICATION_URL="https://planet.openstreetmap.org/replication/minute"
 # Size is in MB.
 NOMINATIM_REPLICATION_MAX_DIFF=50
 
-# Publication interval of the replication service.
+# Publication interval of the replication service (in seconds).
 # Determines when Nominatim will attempt again to download a new
 # update. The time is computed from the publication date of the last diff
 # downloaded. Setting this to a slightly higher value than the actual
 # publication interval avoids unnecessary rechecks.
 NOMINATIM_REPLICATION_UPDATE_INTERVAL=75
 
-# Wait time to recheck for a pending update.
+# Wait time to recheck for a pending update (in seconds).
 # Time to wait after an expected update was not available on the server.
 NOMINATIM_REPLICATION_RECHECK_INTERVAL=60
 
diff --git a/settings/legacy_icu_tokenizer.json b/settings/legacy_icu_tokenizer.json
new file mode 100644 (file)
index 0000000..d09528e
--- /dev/null
@@ -0,0 +1,5829 @@
+{ "normalization": [ ":: Latin ()",
+                     "'ª' > 'a';",
+                     "'µ' > 'u';",
+                     "'º' > 'o';",
+                     "'Ƅ' > '6';",
+                     "'ƅ' > '6';",
+                     "'Ɔ' > 'o';",
+                     "'ƍ' > 'd';",
+                     "'Ǝ' > '3';",
+                     "'Ɣ' > 'g';",
+                     "'ƛ' > 'l';",
+                     "'Ɯ' > 'w';",
+                     "'Ɵ' > 'o';",
+                     "'Ʀ' > 'yr';",
+                     "'Ƨ' > '2';",
+                     "'ƨ' > '2';",
+                     "'Ʃ' > 'sh';",
+                     "'ƪ' > 'sh';",
+                     "'Ʊ' > 'y';",
+                     "'Ʒ' > 'zh';",
+                     "'Ƹ' > 'zh';",
+                     "'ƹ' > 'zh';",
+                     "'ƺ' > 'zh';",
+                     "'ƻ' > '2';",
+                     "'Ƽ' > '5';",
+                     "'ƽ' > '5';",
+                     "'ƾ' > 'ts';",
+                     "'ƿ' > 'w';",
+                     "'Ƕ' > 'hv';",
+                     "'Ƿ' > 'w';",
+                     "'Ȝ' > 'y';",
+                     "'ȝ' > 'y';",
+                     "'Ƞ' > 'n';",
+                     "'Ȣ' > 'ou';",
+                     "'ȣ' > 'ou';",
+                     "'Ʌ' > 'v';",
+                     "'Ɋ' > 'q';",
+                     "'ɋ' > 'q';",
+                     "'ɐ' > 'a';",
+                     "'ɑ' > 'a';",
+                     "'ɒ' > 'a';",
+                     "'ɔ' > 'o';",
+                     "'ɘ' > 'e';",
+                     "'ɜ' > 'e';",
+                     "'ɝ' > 'e';",
+                     "'ɞ' > 'e';",
+                     "'ɣ' > 'g';",
+                     "'ɤ' > 'u';",
+                     "'ɥ' > 'y';",
+                     "'ɩ' > 'i';",
+                     "'ɮ' > 'lz';",
+                     "'ɯ' > 'w';",
+                     "'ɰ' > 'w';",
+                     "'ɵ' > 'o';",
+                     "'ɷ' > 'o';",
+                     "'ɸ' > 'f';",
+                     "'ɹ' > 'r';",
+                     "'ɺ' > 'r';",
+                     "'ɻ' > 'r';",
+                     "'ɿ' > 'r';",
+                     "'ʁ' > 'r';",
+                     "'ʃ' > 's';",
+                     "'ʄ' > 'j';",
+                     "'ʅ' > 's';",
+                     "'ʆ' > 's';",
+                     "'ʇ' > 't';",
+                     "'ʊ' > 'u';",
+                     "'ʍ' > 'w';",
+                     "'ʎ' > 'y';",
+                     "'ʒ' > 'z';",
+                     "'ʓ' > 'z';",
+                     "'ʗ' > 'c';",
+                     "'ʚ' > 'e';",
+                     "'ʞ' > 'k';",
+                     "'ʤ' > 'dz';",
+                     "'ʧ' > 'ts';",
+                     "'ʨ' > 'tc';",
+                     "'ʩ' > 'fn';",
+                     "'ʬ' > 'ww';",
+                     "'ʮ' > 'h';",
+                     "'ʯ' > 'h';",
+                     "'ʰ' > 'k';",
+                     "'ʱ' > 'h';",
+                     "'ʲ' > 'j';",
+                     "'ʳ' > 'r';",
+                     "'ʴ' > 'r';",
+                     "'ʵ' > 'r';",
+                     "'ʶ' > 'r';",
+                     "'ʷ' > 'w';",
+                     "'ʸ' > 'y';",
+                     "'ˇ' > 'v';",
+                     "'ˉ' > ' ';",
+                     "'ˊ' > ' ';",
+                     "'ˌ' > ' ';",
+                     "'ˎ' > ' ';",
+                     "'ˏ' > ' ';",
+                     "'ˑ' > ' ';",
+                     "'ˠ' > 'g';",
+                     "'ˡ' > 'l';",
+                     "'ˢ' > 's';",
+                     "'ˣ' > 'x';",
+                     "'ˬ' > 'v';",
+                     "'Ͱ' > 'heta';",
+                     "'ͱ' > 'heta';",
+                     "'Ͳ' > 'sampi';",
+                     "'ͳ' > 'sampi';",
+                     "'ϗ' > ' ';",
+                     "'Ϙ' > 'koppa';",
+                     "'ϙ' > 'koppa';",
+                     "'Ϛ' > 'st';",
+                     "'ϛ' > 'st';",
+                     "'Ϝ' > 'w';",
+                     "'ϝ' > 'w';",
+                     "'Ϟ' > 'q';",
+                     "'ϟ' > 'q';",
+                     "'Ϡ' > 'sp';",
+                     "'ϡ' > 'sp';",
+                     "'Ϣ' > 'sh';",
+                     "'ϣ' > 'sh';",
+                     "'Ϥ' > 'f';",
+                     "'ϥ' > 'f';",
+                     "'Ϧ' > 'kh';",
+                     "'ϧ' > 'kh';",
+                     "'Ϩ' > 'h';",
+                     "'ϩ' > 'h';",
+                     "'Ϫ' > 'g';",
+                     "'ϫ' > 'g';",
+                     "'Ϭ' > 'ch';",
+                     "'ϭ' > 'ch';",
+                     "'Ϯ' > 'ti';",
+                     "'ϯ' > 'ti';",
+                     "'Ѡ' > 'o';",
+                     "'ѡ' > 'o';",
+                     "'Ѣ' > 'e';",
+                     "'ѣ' > 'e';",
+                     "'Ѥ' > 'ie';",
+                     "'ѥ' > 'ie';",
+                     "'Ѧ' > 'e';",
+                     "'ѧ' > 'e';",
+                     "'Ѩ' > 'ie';",
+                     "'ѩ' > 'ie';",
+                     "'Ѫ' > 'o';",
+                     "'ѫ' > 'o';",
+                     "'Ѭ' > 'io';",
+                     "'ѭ' > 'io';",
+                     "'Ѯ' > 'ks';",
+                     "'ѯ' > 'ks';",
+                     "'Ѱ' > 'ps';",
+                     "'ѱ' > 'ps';",
+                     "'Ѳ' > 'f';",
+                     "'ѳ' > 'f';",
+                     "'Ѵ' > 'y';",
+                     "'ѵ' > 'y';",
+                     "'Ѷ' > 'y';",
+                     "'ѷ' > 'y';",
+                     "'Ѹ' > 'u';",
+                     "'ѹ' > 'u';",
+                     "'Ѻ' > 'o';",
+                     "'ѻ' > 'o';",
+                     "'Ѽ' > 'o';",
+                     "'ѽ' > 'o';",
+                     "'Ѿ' > 'ot';",
+                     "'ѿ' > 'ot';",
+                     "'Ҁ' > 'q';",
+                     "'ҁ' > 'q';",
+                     "'Ҋ' > 'i';",
+                     "'ҋ' > 'i';",
+                     "'Ҏ' > 'r';",
+                     "'ҏ' > 'r';",
+                     "'Җ' > 'zh';",
+                     "'җ' > 'zh';",
+                     "'Ҝ' > 'k';",
+                     "'ҝ' > 'k';",
+                     "'Ҟ' > 'k';",
+                     "'ҟ' > 'k';",
+                     "'Ҡ' > 'k';",
+                     "'ҡ' > 'k';",
+                     "'Ң' > 'n';",
+                     "'ң' > 'n';",
+                     "'Ҥ' > 'ng';",
+                     "'ҥ' > 'ng';",
+                     "'Ҧ' > 'p';",
+                     "'ҧ' > 'p';",
+                     "'Ҩ' > 'kh';",
+                     "'ҩ' > 'kh';",
+                     "'Ҫ' > 's';",
+                     "'ҫ' > 's';",
+                     "'Ҭ' > 't';",
+                     "'ҭ' > 't';",
+                     "'Ү' > 'u';",
+                     "'ү' > 'u';",
+                     "'Ұ' > 'u';",
+                     "'ұ' > 'u';",
+                     "'Ҳ' > 'kh';",
+                     "'ҳ' > 'kh';",
+                     "'Ҵ' > 'tts';",
+                     "'ҵ' > 'tts';",
+                     "'Ҷ' > 'ch';",
+                     "'ҷ' > 'ch';",
+                     "'Ҹ' > 'ch';",
+                     "'ҹ' > 'ch';",
+                     "'Һ' > 'h';",
+                     "'һ' > 'h';",
+                     "'Ҽ' > 'ch';",
+                     "'ҽ' > 'ch';",
+                     "'Ҿ' > 'ch';",
+                     "'ҿ' > 'ch';",
+                     "'Ӄ' > 'k';",
+                     "'ӄ' > 'k';",
+                     "'Ӆ' > 'el';",
+                     "'ӆ' > 'el';",
+                     "'Ӈ' > 'n';",
+                     "'ӈ' > 'n';",
+                     "'Ӊ' > 'en';",
+                     "'ӊ' > 'en';",
+                     "'Ӌ' > 'ch';",
+                     "'ӌ' > 'ch';",
+                     "'Ӎ' > 'em';",
+                     "'ӎ' > 'em';",
+                     "'ӏ' > 'palochka';",
+                     "'Ӡ' > 'dz';",
+                     "'ӡ' > 'dz';",
+                     "'Ө' > 'o';",
+                     "'ө' > 'o';",
+                     "'Ӫ' > 'o';",
+                     "'ӫ' > 'o';",
+                     "'Ӷ' > 'ghe';",
+                     "'ӷ' > 'ghe';",
+                     "'Ӻ' > 'ghe';",
+                     "'ӻ' > 'ghe';",
+                     "'Ӽ' > 'ha';",
+                     "'ӽ' > 'ha';",
+                     "'Ӿ' > 'ha';",
+                     "'ӿ' > 'ha';",
+                     "'Ԁ' > 'de';",
+                     "'ԁ' > 'de';",
+                     "'Ԃ' > 'dje';",
+                     "'ԃ' > 'dje';",
+                     "'Ԅ' > 'zje';",
+                     "'ԅ' > 'zje';",
+                     "'Ԇ' > 'dzje';",
+                     "'ԇ' > 'dzje';",
+                     "'Ԉ' > 'lje';",
+                     "'ԉ' > 'lje';",
+                     "'Ԋ' > 'nje';",
+                     "'ԋ' > 'nje';",
+                     "'Ԍ' > 'sje';",
+                     "'ԍ' > 'sje';",
+                     "'Ԏ' > 'tje';",
+                     "'ԏ' > 'tje';",
+                     "'Ԑ' > 'ze';",
+                     "'ԑ' > 'ze';",
+                     "'Ԓ' > 'el';",
+                     "'ԓ' > 'el';",
+                     "'Ԕ' > 'lha';",
+                     "'ԕ' > 'lha';",
+                     "'Ԗ' > 'rha';",
+                     "'ԗ' > 'rha';",
+                     "'Ԙ' > 'yae';",
+                     "'ԙ' > 'yae';",
+                     "'Ԛ' > 'qa';",
+                     "'ԛ' > 'qa';",
+                     "'Ԝ' > 'we';",
+                     "'ԝ' > 'we';",
+                     "'Ԟ' > 'aleut';",
+                     "'ԟ' > 'aleut';",
+                     "'Ԡ' > 'el';",
+                     "'ԡ' > 'el';",
+                     "'Ԣ' > 'en';",
+                     "'ԣ' > 'en';",
+                     "'ՙ' > 'left';",
+                     "'ػ' > 'keheh';",
+                     "'ؼ' > 'keheh';",
+                     "'ٮ' > 'beh';",
+                     "'ٯ' > 'qaf';",
+                     "'ٱ' > 'alef';",
+                     "'ٲ' > 'alef';",
+                     "'ٳ' > 'alef';",
+                     "'ٴ' > 'high';",
+                     "'ٹ' > 'tt';",
+                     "'ٺ' > 'tth';",
+                     "'ٻ' > 'b';",
+                     "'ټ' > 't';",
+                     "'ٽ' > 't';",
+                     "'ٿ' > 'th';",
+                     "'ڀ' > 'bh';",
+                     "'ځ' > 'hah';",
+                     "'ڂ' > 'h';",
+                     "'ڃ' > 'ny';",
+                     "'ڄ' > 'dy';",
+                     "'څ' > 'h';",
+                     "'ڇ' > 'cch';",
+                     "'ڈ' > 'dd';",
+                     "'ډ' > 'd';",
+                     "'ڊ' > 'd';",
+                     "'ڋ' > 'dt';",
+                     "'ڌ' > 'dh';",
+                     "'ڍ' > 'ddh';",
+                     "'ڎ' > 'd';",
+                     "'ڏ' > 'd';",
+                     "'ڐ' > 'd';",
+                     "'ڑ' > 'rr';",
+                     "'ڒ' > 'r';",
+                     "'ړ' > 'r';",
+                     "'ڔ' > 'r';",
+                     "'ڕ' > 'r';",
+                     "'ږ' > 'r';",
+                     "'ڗ' > 'r';",
+                     "'ڙ' > 'r';",
+                     "'ڛ' > 's';",
+                     "'ڜ' > 's';",
+                     "'ڝ' > 's';",
+                     "'ڞ' > 's';",
+                     "'ڟ' > 't';",
+                     "'ڠ' > 'gh';",
+                     "'ڡ' > 'f';",
+                     "'ڢ' > 'f';",
+                     "'ڣ' > 'f';",
+                     "'ڥ' > 'f';",
+                     "'ڦ' > 'ph';",
+                     "'ڧ' > 'q';",
+                     "'ڨ' > 'q';",
+                     "'ڪ' > 'k';",
+                     "'ګ' > 'k';",
+                     "'ڬ' > 'k';",
+                     "'ڮ' > 'k';",
+                     "'ڰ' > 'g';",
+                     "'ڱ' > 'n';",
+                     "'ڲ' > 'g';",
+                     "'ڳ' > 'g';",
+                     "'ڴ' > 'g';",
+                     "'ڵ' > 'l';",
+                     "'ڶ' > 'l';",
+                     "'ڷ' > 'l';",
+                     "'ڸ' > 'l';",
+                     "'ڹ' > 'n';",
+                     "'ں' > 'n';",
+                     "'ڻ' > 'n';",
+                     "'ڼ' > 'n';",
+                     "'ڽ' > 'n';",
+                     "'ھ' > 'h';",
+                     "'ڿ' > 'ch';",
+                     "'ہ' > 'h';",
+                     "'ۃ' > 'teh';",
+                     "'ۄ' > 'w';",
+                     "'ۅ' > 'oe';",
+                     "'ۆ' > 'oe';",
+                     "'ۇ' > 'u';",
+                     "'ۈ' > 'yu';",
+                     "'ۉ' > 'yu';",
+                     "'ۊ' > 'w';",
+                     "'ۍ' > 'y';",
+                     "'ێ' > 'y';",
+                     "'ۏ' > 'w';",
+                     "'ې' > 'e';",
+                     "'ۑ' > 'yeh';",
+                     "'ے' > 'y';",
+                     "'ە' > 'ae';",
+                     "'ۮ' > 'dal';",
+                     "'ۯ' > 'reh';",
+                     "'ۺ' > 'sh';",
+                     "'ۻ' > 'd';",
+                     "'ۼ' > 'gh';",
+                     "'ۿ' > 'heh';",
+                     "'ݐ' > 'beh';",
+                     "'ݑ' > 'beh';",
+                     "'ݒ' > 'beh';",
+                     "'ݓ' > 'beh';",
+                     "'ݔ' > 'beh';",
+                     "'ݕ' > 'beh';",
+                     "'ݖ' > 'beh';",
+                     "'ݗ' > 'hah';",
+                     "'ݘ' > 'hah';",
+                     "'ݙ' > 'dal';",
+                     "'ݚ' > 'dal';",
+                     "'ݛ' > 'reh';",
+                     "'ݜ' > 'seen';",
+                     "'ݝ' > 'ain';",
+                     "'ݞ' > 'ain';",
+                     "'ݟ' > 'ain';",
+                     "'ݠ' > 'feh';",
+                     "'ݡ' > 'feh';",
+                     "'ݢ' > 'keheh';",
+                     "'ݣ' > 'keheh';",
+                     "'ݤ' > 'keheh';",
+                     "'ݥ' > 'meem';",
+                     "'ݦ' > 'meem';",
+                     "'ݧ' > 'noon';",
+                     "'ݨ' > 'noon';",
+                     "'ݩ' > 'noon';",
+                     "'ݪ' > 'lam';",
+                     "'ݫ' > 'reh';",
+                     "'ݬ' > 'reh';",
+                     "'ݭ' > 'seen';",
+                     "'ݮ' > 'hah';",
+                     "'ݯ' > 'hah';",
+                     "'ݰ' > 'seen';",
+                     "'ݱ' > 'reh';",
+                     "'ݲ' > 'hah';",
+                     "'ݳ' > 'alef';",
+                     "'ݴ' > 'alef';",
+                     "'ݸ' > 'waw';",
+                     "'ݹ' > 'waw';",
+                     "'ݺ' > 'yeh';",
+                     "'ݻ' > 'yeh';",
+                     "'ݼ' > 'hah';",
+                     "'ݽ' > 'seen';",
+                     "'ݾ' > 'seen';",
+                     "'ݿ' > 'kaf';",
+                     "'ޜ' > 'z';",
+                     "'ޡ' > 'z';",
+                     "'ޥ' > 'w';",
+                     "'ޱ' > 'naa';",
+                     "'ߊ' > 'a';",
+                     "'ߋ' > 'ee';",
+                     "'ߌ' > 'i';",
+                     "'ߍ' > 'e';",
+                     "'ߎ' > 'u';",
+                     "'ߏ' > 'oo';",
+                     "'ߐ' > 'o';",
+                     "'ߑ' > 'dagbasinna';",
+                     "'ߒ' > 'n';",
+                     "'ߓ' > 'ba';",
+                     "'ߔ' > 'pa';",
+                     "'ߕ' > 'ta';",
+                     "'ߖ' > 'ja';",
+                     "'ߗ' > 'cha';",
+                     "'ߘ' > 'da';",
+                     "'ߙ' > 'ra';",
+                     "'ߚ' > 'rra';",
+                     "'ߛ' > 'sa';",
+                     "'ߜ' > 'gba';",
+                     "'ߝ' > 'fa';",
+                     "'ߞ' > 'ka';",
+                     "'ߟ' > 'la';",
+                     "'ߠ' > 'na';",
+                     "'ߡ' > 'ma';",
+                     "'ߢ' > 'nya';",
+                     "'ߣ' > 'na';",
+                     "'ߤ' > 'ha';",
+                     "'ߥ' > 'wa';",
+                     "'ߦ' > 'ya';",
+                     "'ߧ' > 'nya';",
+                     "'ߨ' > 'jona';",
+                     "'ߩ' > 'jona';",
+                     "'ߪ' > 'jona';",
+                     "'ॱ' > 'high';",
+                     "'ॲ' > 'candra';",
+                     "'ॻ' > 'gga';",
+                     "'ॼ' > 'jja';",
+                     "'ॾ' > 'ddda';",
+                     "'ॿ' > 'bba';",
+                     "'ௐ' > 'aum';",
+                     "'ఽ' > 'avagraha';",
+                     "'ౘ' > 'tsa';",
+                     "'ౙ' > 'dza';",
+                     "'ೱ' > 'jihvamuliya';",
+                     "'ೲ' > 'upadhmaniya';",
+                     "'ഽ' > 'avagraha';",
+                     "'අ' > 'a';",
+                     "'ආ' > 'aa';",
+                     "'ඇ' > 'i';",
+                     "'ඈ' > 'ii';",
+                     "'ඉ' > 'u';",
+                     "'ඊ' > 'uu';",
+                     "'උ' > 'r';",
+                     "'ඌ' > 'l';",
+                     "'ඍ' > 'iruyanna';",
+                     "'ඎ' > 'e';",
+                     "'ඏ' > 'ee';",
+                     "'ඐ' > 'ai';",
+                     "'එ' > 'eyanna';",
+                     "'ඒ' > 'o';",
+                     "'ඓ' > 'oo';",
+                     "'ඔ' > 'au';",
+                     "'ඕ' > 'k';",
+                     "'ඖ' > 'kh';",
+                     "'ක' > 'c';",
+                     "'ඛ' > 'ch';",
+                     "'ග' > 'j';",
+                     "'ඝ' > 'jh';",
+                     "'ඞ' > 'ny';",
+                     "'ඟ' > 'tt';",
+                     "'ච' > 'tth';",
+                     "'ඡ' > 'dd';",
+                     "'ජ' > 'ddh';",
+                     "'ඣ' > 'nn';",
+                     "'ඤ' > 't';",
+                     "'ඥ' > 'th';",
+                     "'ඦ' > 'd';",
+                     "'ට' > 'dh';",
+                     "'ඨ' > 'n';",
+                     "'ඩ' > 'alpapraana';",
+                     "'ඪ' > 'p';",
+                     "'ණ' > 'ph';",
+                     "'ඬ' > 'b';",
+                     "'ත' > 'bh';",
+                     "'ථ' > 'm';",
+                     "'ද' > 'y';",
+                     "'ධ' > 'r';",
+                     "'න' > 'rr';",
+                     "'ඳ' > 'll';",
+                     "'ප' > 'alpapraana';",
+                     "'ඵ' > 'v';",
+                     "'බ' > 'sh';",
+                     "'භ' > 'ss';",
+                     "'ම' > 's';",
+                     "'ඹ' > 'h';",
+                     "'ය' > 'yayanna';",
+                     "'ර' > 'rayanna';",
+                     "'ල' > 'dantaja';",
+                     "'ව' > 'ii';",
+                     "'ශ' > 'u';",
+                     "'ෂ' > 'uu';",
+                     "'ස' > 'r';",
+                     "'හ' > 'rr';",
+                     "'ළ' > 'muurdhaja';",
+                     "'ෆ' > 'e';",
+                     "'ກ' > 'ko';",
+                     "'ຂ' > 'n';",
+                     "'ຄ' > 'kho';",
+                     "'ງ' > 'ae';",
+                     "'ຈ' > 'aae';",
+                     "'ຊ' > 'ii';",
+                     "'ຍ' > 'r';",
+                     "'ດ' > 'o';",
+                     "'ຕ' > 'oo';",
+                     "'ຖ' > 'au';",
+                     "'ທ' > 'tho';",
+                     "'ນ' > 'no';",
+                     "'ບ' > 'k';",
+                     "'ປ' > 'kh';",
+                     "'ຜ' > 'g';",
+                     "'ຝ' > 'gh';",
+                     "'ພ' > 'ng';",
+                     "'ຟ' > 'nng';",
+                     "'ມ' > 'ch';",
+                     "'ຢ' > 'j';",
+                     "'ຣ' > 'jh';",
+                     "'ລ' > 'jny';",
+                     "'ວ' > 'tt';",
+                     "'ສ' > 'ddh';",
+                     "'ຫ' > 'nn';",
+                     "'ອ' > 't';",
+                     "'ຮ' > 'th';",
+                     "'ຯ' > 'd';",
+                     "'ະ' > 'dh';",
+                     "'າ' > 'aa';",
+                     "'ຳ' > 'nd';",
+                     "'ຽ' > 'l';",
+                     "'ເ' > 'v';",
+                     "'ແ' > 'sh';",
+                     "'ໂ' > 'ss';",
+                     "'ໃ' > 's';",
+                     "'ໄ' > 'h';",
+                     "'ໆ' > 'f';",
+                     "'ໜ' > 'o';",
+                     "'ໝ' > 'oo';",
+                     "'ໞ' > 'au';",
+                     "'ໟ' > 'l';",
+                     "'ༀ' > 'om';",
+                     "'ཀ' > 'e';",
+                     "'ཁ' > 'ae';",
+                     "'ག' > 'o';",
+                     "'གྷ' > 'ai';",
+                     "'ང' > 'ai';",
+                     "'ཅ' > 'ao';",
+                     "'ཆ' > 'cha';",
+                     "'ཇ' > 'ja';",
+                     "'ཉ' > 'nya';",
+                     "'ཊ' > 'tta';",
+                     "'ཋ' > 'ttha';",
+                     "'ཌ' > 'dda';",
+                     "'ཌྷ' > 'm';",
+                     "'ཎ' > 'nna';",
+                     "'ཏ' > 'ta';",
+                     "'ཐ' > 'tha';",
+                     "'ད' > 'da';",
+                     "'དྷ' > 'dha';",
+                     "'ན' > 'na';",
+                     "'པ' > 'pa';",
+                     "'ཕ' > 'pha';",
+                     "'བ' > 'ba';",
+                     "'བྷ' > 'bha';",
+                     "'མ' > 'ma';",
+                     "'ཙ' > 'tsa';",
+                     "'ཚ' > 'tsha';",
+                     "'ཛ' > 'dza';",
+                     "'ཛྷ' > 'dzha';",
+                     "'ཝ' > 'wa';",
+                     "'ཞ' > 'zha';",
+                     "'ཟ' > 'za';",
+                     "'འ' > '-a';",
+                     "'ཡ' > 'ya';",
+                     "'ར' > 'ra';",
+                     "'ལ' > 'la';",
+                     "'ཤ' > 'sha';",
+                     "'ཥ' > 'ssa';",
+                     "'ས' > 'sa';",
+                     "'ཧ' > 'ha';",
+                     "'ཨ' > 'a';",
+                     "'ཀྵ' > 'kssa';",
+                     "'ཫ' > 'kka';",
+                     "'ཬ' > 'rra';",
+                     "'ྈ' > 'ch';",
+                     "'ྉ' > 'mchu';",
+                     "'ྊ' > 's';",
+                     "'ྋ' > 'gru';",
+                     "'က' > 'aum';",
+                     "'ခ' > 'kha';",
+                     "'ဂ' > 'ga';",
+                     "'ဃ' > 'gha';",
+                     "'င' > 'nga';",
+                     "'စ' > 'ca';",
+                     "'ဆ' > 'cha';",
+                     "'ဇ' > 'ja';",
+                     "'ဈ' > 'jha';",
+                     "'ဉ' > 'nya';",
+                     "'ည' > 'nnya';",
+                     "'ဋ' > 'tta';",
+                     "'ဌ' > 'ttha';",
+                     "'ဍ' > 'dda';",
+                     "'ဎ' > 'ddha';",
+                     "'ဏ' > 'nna';",
+                     "'တ' > 'ta';",
+                     "'ထ' > 'tha';",
+                     "'ဒ' > 'da';",
+                     "'ဓ' > 'dha';",
+                     "'န' > 'na';",
+                     "'ပ' > 'pa';",
+                     "'ဖ' > 'pha';",
+                     "'ဗ' > 'ba';",
+                     "'ဘ' > 'bha';",
+                     "'မ' > 'ma';",
+                     "'ယ' > 'ya';",
+                     "'ရ' > 'ra';",
+                     "'လ' > 'la';",
+                     "'ဝ' > 'wa';",
+                     "'သ' > 'sa';",
+                     "'ဟ' > 'ha';",
+                     "'ဠ' > 'lla';",
+                     "'အ' > 'a';",
+                     "'ဢ' > 'shan';",
+                     "'ဣ' > 'i';",
+                     "'ဤ' > 'ii';",
+                     "'ဥ' > 'u';",
+                     "'ဦ' > 'uu';",
+                     "'ဧ' > 'e';",
+                     "'ဨ' > 'mon';",
+                     "'ဩ' > 'o';",
+                     "'ဪ' > 'au';",
+                     "'ၐ' > 'th';",
+                     "'ၑ' > 'd';",
+                     "'ၒ' > 'dh';",
+                     "'ၓ' > 'n';",
+                     "'ၔ' > 'p';",
+                     "'ၕ' > 'ph';",
+                     "'ၚ' > 'tsh';",
+                     "'ၛ' > 'dz';",
+                     "'ၜ' > 'dzh';",
+                     "'ၝ' > 'w';",
+                     "'ၡ' > 'y';",
+                     "'ၥ' > 'ssh';",
+                     "'ၦ' > 's';",
+                     "'ၵ' > 'uu';",
+                     "'ၶ' > 'r';",
+                     "'ၷ' > 'rr';",
+                     "'ၸ' > 'l';",
+                     "'ၹ' > 'll';",
+                     "'ၺ' > 'e';",
+                     "'ၻ' > 'ee';",
+                     "'ၼ' > 'o';",
+                     "'ၽ' > 'oo';",
+                     "'ၾ' > 'm';",
+                     "'ၿ' > 'h';",
+                     "'ႀ' > 'i';",
+                     "'ႁ' > 'ii';",
+                     "'ႎ' > 'rumai';",
+                     "'Ⴀ' > 'th';",
+                     "'Ⴁ' > 'd';",
+                     "'Ⴂ' > 'dh';",
+                     "'Ⴃ' > 'n';",
+                     "'Ⴄ' > 'p';",
+                     "'Ⴅ' > 'ph';",
+                     "'Ⴆ' > 'b';",
+                     "'Ⴇ' > 'bh';",
+                     "'Ⴈ' > 'm';",
+                     "'Ⴉ' > 'ts';",
+                     "'Ⴊ' > 'tsh';",
+                     "'Ⴋ' > 'dz';",
+                     "'Ⴌ' > 'dzh';",
+                     "'Ⴍ' > 'w';",
+                     "'Ⴎ' > 'zh';",
+                     "'Ⴏ' > 'z';",
+                     "'Ⴐ' > 'rae';",
+                     "'Ⴑ' > 'y';",
+                     "'Ⴒ' > 'r';",
+                     "'Ⴓ' > 'l';",
+                     "'Ⴔ' > 'sh';",
+                     "'Ⴕ' > 'ss';",
+                     "'Ⴖ' > 's';",
+                     "'Ⴗ' > 'h';",
+                     "'Ⴘ' > 'a';",
+                     "'Ⴙ' > 'kss';",
+                     "'Ⴚ' > 'w';",
+                     "'Ⴛ' > 'y';",
+                     "'Ⴜ' > 'r';",
+                     "'Ⴞ' > 'x';",
+                     "'Ⴟ' > 'jhan';",
+                     "'Ⴠ' > 'hae';",
+                     "'Ⴡ' > 'he';",
+                     "'Ⴢ' > 'hie';",
+                     "'Ⴣ' > 'we';",
+                     "'Ⴤ' > 'har';",
+                     "'Ⴥ' > 'hoe';",
+                     "'ჱ' > 'he';",
+                     "'ჲ' > 'hie';",
+                     "'ჵ' > 'hoe';",
+                     "'ჶ' > 'fi';",
+                     "'ჷ' > 'yn';",
+                     "'ჸ' > 'elifi';",
+                     "'ჹ' > 'gan';",
+                     "'ჺ' > 'ain';",
+                     "'ᄓ' > 'dh';",
+                     "'ᄔ' > 'n';",
+                     "'ᄕ' > 'p';",
+                     "'ᄖ' > 'ph';",
+                     "'ᄗ' > 'b';",
+                     "'ᄘ' > 'bh';",
+                     "'ᄙ' > 'm';",
+                     "'ᄚ' > 'y';",
+                     "'ᄛ' > 'r';",
+                     "'ᄜ' > 'l';",
+                     "'ᄝ' > 'w';",
+                     "'ᄞ' > 's';",
+                     "'ᄟ' > 'h';",
+                     "'ᄠ' > 'll';",
+                     "'ᄡ' > 'a';",
+                     "'ᄣ' > 'i';",
+                     "'ᄤ' > 'ii';",
+                     "'ᄥ' > 'u';",
+                     "'ᄦ' > 'uu';",
+                     "'ᄧ' > 'e';",
+                     "'ᄩ' > 'o';",
+                     "'ᄪ' > 'au';",
+                     "'ᄬ' > 'aa';",
+                     "'ᄭ' > 'i';",
+                     "'ᄮ' > 'ii';",
+                     "'ᄯ' > 'u';",
+                     "'ᄰ' > 'uu';",
+                     "'ᄱ' > 'e';",
+                     "'ᄲ' > 'ai';",
+                     "'ᄶ' > 'n';",
+                     "'ᅌ' > 'n';",
+                     "'ᅍ' > 'r';",
+                     "'ᅎ' > 'l';",
+                     "'ᅏ' > 'e';",
+                     "'ᅐ' > 'sh';",
+                     "'ᅑ' > 'ss';",
+                     "'ᅒ' > 'r';",
+                     "'ᅓ' > 'rr';",
+                     "'ᅔ' > 'l';",
+                     "'ᅕ' > 'll';",
+                     "'ᅖ' > 'r';",
+                     "'ᅗ' > 'rr';",
+                     "'ᅘ' > 'l';",
+                     "'ᅙ' > 'll';",
+                     "'ᅶ' > 'a-o';",
+                     "'ᅷ' > 'a-u';",
+                     "'ᅸ' > 'ya-o';",
+                     "'ᅹ' > 'ya-yo';",
+                     "'ᅺ' > 'eo-o';",
+                     "'ᅻ' > 'eo-u';",
+                     "'ᅼ' > 'eo-eu';",
+                     "'ᅽ' > 'yeo-o';",
+                     "'ᅾ' > 'yeo-u';",
+                     "'ᅿ' > 'o-eo';",
+                     "'ᆀ' > 'o-e';",
+                     "'ᆁ' > 'o-ye';",
+                     "'ᆂ' > 'o-o';",
+                     "'ᆃ' > 'o-u';",
+                     "'ᆄ' > 'yo-ya';",
+                     "'ᆅ' > 'yo-yae';",
+                     "'ᆆ' > 'yo-yeo';",
+                     "'ᆇ' > 'yo-o';",
+                     "'ᆈ' > 'yo-i';",
+                     "'ᆉ' > 'u-a';",
+                     "'ᆊ' > 'u-ae';",
+                     "'ᆋ' > 'u-eo-eu';",
+                     "'ᆌ' > 'u-ye';",
+                     "'ᆍ' > 'u-u';",
+                     "'ᆎ' > 'yu-a';",
+                     "'ᆏ' > 'yu-eo';",
+                     "'ᆐ' > 'yu-e';",
+                     "'ᆑ' > 'yu-yeo';",
+                     "'ᆒ' > 'yu-ye';",
+                     "'ᆓ' > 'yu-u';",
+                     "'ᆔ' > 'yu-i';",
+                     "'ᆕ' > 'eu-u';",
+                     "'ᆖ' > 'eu-eu';",
+                     "'ᆗ' > 'yi-u';",
+                     "'ᆘ' > 'i-a';",
+                     "'ᆙ' > 'i-ya';",
+                     "'ᆚ' > 'i-o';",
+                     "'ᆛ' > 'i-u';",
+                     "'ᆜ' > 'i-eu';",
+                     "'ᆝ' > 'i-araea';",
+                     "'ᆞ' > 'araea';",
+                     "'ᆟ' > 'araea-eo';",
+                     "'ᆠ' > 'a';",
+                     "'ᆡ' > 'b';",
+                     "'ᆢ' > 'g';",
+                     "'ᆣ' > 'd';",
+                     "'ᆤ' > 'e';",
+                     "'ᆥ' > 'v';",
+                     "'ᆦ' > 'z';",
+                     "'ᆧ' > 't';",
+                     "'ᇃ' > 'w';",
+                     "'ᇄ' > 'xh';",
+                     "'ᇅ' > 'oe';",
+                     "'ᇆ' > 'nieun-tikeut';",
+                     "'ᇇ' > 'nieun-sios';",
+                     "'ᇈ' > 'nieun-pansios';",
+                     "'ᇉ' > 'nieun-thieuth';",
+                     "'ᇊ' > 'tikeut-kiyeok';",
+                     "'ᇋ' > 'tikeut-rieul';",
+                     "'ᇌ' > 'rieul-kiyeok-sios';",
+                     "'ᇍ' > 'rieul-nieun';",
+                     "'ᇎ' > 'rieul-tikeut';",
+                     "'ᇏ' > 'rieul-tikeut-hieuh';",
+                     "'ᇐ' > 'a';",
+                     "'ᇑ' > 'b';",
+                     "'ᇒ' > 'g';",
+                     "'ᇓ' > 'd';",
+                     "'ᇔ' > 'e';",
+                     "'ᇕ' > 'v';",
+                     "'ᇖ' > 'z';",
+                     "'ᇗ' > 't';",
+                     "'ᇘ' > 'i';",
+                     "'ᇙ' > 'k';",
+                     "'ᇚ' > 'l';",
+                     "'ᇛ' > 'm';",
+                     "'ᇜ' > 'n';",
+                     "'ᇝ' > 'o';",
+                     "'ᇞ' > 'p';",
+                     "'ᇟ' > 'zh';",
+                     "'ᇠ' > 'r';",
+                     "'ᇡ' > 's';",
+                     "'ᇢ' > 't';",
+                     "'ᇣ' > 'u';",
+                     "'ᇤ' > 'p';",
+                     "'ᇥ' > 'k';",
+                     "'ᇦ' > 'g';",
+                     "'ᇧ' > 'q';",
+                     "'ᇨ' > 'sh';",
+                     "'ᇩ' > 'ch';",
+                     "'ᇪ' > 'c';",
+                     "'ᇫ' > 'z';",
+                     "'ᇬ' > 'c';",
+                     "'ᇭ' > 'ch';",
+                     "'ᇮ' > 'x';",
+                     "'ᇯ' > 'j';",
+                     "'ᇰ' > 'h';",
+                     "'ᇱ' > 'e';",
+                     "'ᇲ' > 'y';",
+                     "'ᇳ' > 'w';",
+                     "'ᇴ' > 'xh';",
+                     "'ᇵ' > 'oe';",
+                     "'ᇶ' > 'f';",
+                     "'ᇷ' > 'hieuh-mieum';",
+                     "'ᇸ' > 'hieuh-pieup';",
+                     "'ᇹ' > 'yeorinhieuh';",
+                     "'ሀ' > 'g';",
+                     "'ሁ' > 'gg';",
+                     "'ሂ' > 'n';",
+                     "'ሃ' > 'd';",
+                     "'ሄ' > 'dd';",
+                     "'ህ' > 'r';",
+                     "'ሆ' > 'm';",
+                     "'ሇ' > 'b';",
+                     "'ለ' > 'bb';",
+                     "'ሉ' > 's';",
+                     "'ሊ' > 'ss';",
+                     "'ላ' > 'laa';",
+                     "'ሌ' > 'j';",
+                     "'ል' > 'jj';",
+                     "'ሎ' > 'c';",
+                     "'ሏ' > 'k';",
+                     "'ሐ' > 't';",
+                     "'ሑ' > 'p';",
+                     "'ሒ' > 'h';",
+                     "'ሓ' > 'ng';",
+                     "'ሔ' > 'nn';",
+                     "'ሕ' > 'nd';",
+                     "'ሖ' > 'nb';",
+                     "'ሗ' > 'dg';",
+                     "'መ' > 'rn';",
+                     "'ሙ' > 'rr';",
+                     "'ሚ' > 'rh';",
+                     "'ማ' > 'rn';",
+                     "'ሜ' > 'mb';",
+                     "'ም' > 'mn';",
+                     "'ሞ' > 'bg';",
+                     "'ሟ' > 'bn';",
+                     "'ሠ' > 'sza';",
+                     "'ሡ' > 'bs';",
+                     "'ሢ' > 'bsg';",
+                     "'ሣ' > 'bst';",
+                     "'ሤ' > 'bsb';",
+                     "'ሥ' > 'bss';",
+                     "'ሦ' > 'bsj';",
+                     "'ሧ' > 'bj';",
+                     "'ረ' > 'bc';",
+                     "'ሩ' > 'bt';",
+                     "'ሪ' > 'bp';",
+                     "'ራ' > 'bn';",
+                     "'ሬ' > 'bbn';",
+                     "'ር' > 'sg';",
+                     "'ሮ' > 'sn';",
+                     "'ሯ' > 'sd';",
+                     "'ሰ' > 'sr';",
+                     "'ሱ' > 'sm';",
+                     "'ሲ' > 'sb';",
+                     "'ሳ' > 'sbg';",
+                     "'ሴ' > 'sss';",
+                     "'ስ' > 's';",
+                     "'ሶ' > 'sj';",
+                     "'ሷ' > 'sc';",
+                     "'ሸ' > 'sk';",
+                     "'ሹ' > 'st';",
+                     "'ሺ' > 'sp';",
+                     "'ሻ' > 'sh';",
+                     "'ሼ' > 'shee';",
+                     "'ሽ' > 'she';",
+                     "'ሾ' > 'sho';",
+                     "'ሿ' > 'shwa';",
+                     "'ቀ' > 'z';",
+                     "'ቁ' > 'g';",
+                     "'ቂ' > 'd';",
+                     "'ቃ' > 'm';",
+                     "'ቄ' > 'b';",
+                     "'ቅ' > 's';",
+                     "'ቆ' > 'z';",
+                     "'ቇ' > 'qoa';",
+                     "'ቈ' > 'j';",
+                     "'ቊ' > 't';",
+                     "'ቋ' > 'p';",
+                     "'ቌ' > 'n';",
+                     "'ቍ' > 'j';",
+                     "'ቐ' > 'qha';",
+                     "'ቑ' > 'qhu';",
+                     "'ቒ' > 'ck';",
+                     "'ቓ' > 'ch';",
+                     "'ቔ' > 'qhee';",
+                     "'ቕ' > 'qhe';",
+                     "'ቖ' > 'pb';",
+                     "'ቘ' > 'hh';",
+                     "'ቚ' > 'qhwi';",
+                     "'ቛ' > 'qhwaa';",
+                     "'ቜ' > 'qhwee';",
+                     "'ቝ' > 'qhwe';",
+                     "'በ' > 'ba';",
+                     "'ቡ' > 'a';",
+                     "'ቢ' > 'ae';",
+                     "'ባ' > 'ya';",
+                     "'ቤ' > 'yae';",
+                     "'ብ' > 'eo';",
+                     "'ቦ' > 'e';",
+                     "'ቧ' > 'yeo';",
+                     "'ቨ' > 'ye';",
+                     "'ቩ' > 'o';",
+                     "'ቪ' > 'wa';",
+                     "'ቫ' > 'wae';",
+                     "'ቬ' > 'oe';",
+                     "'ቭ' > 'yo';",
+                     "'ቮ' > 'u';",
+                     "'ቯ' > 'weo';",
+                     "'ተ' > 'we';",
+                     "'ቱ' > 'wi';",
+                     "'ቲ' > 'yu';",
+                     "'ታ' > 'eu';",
+                     "'ቴ' > 'yi';",
+                     "'ት' > 'i';",
+                     "'ቶ' > 'a-o';",
+                     "'ቷ' > 'a-u';",
+                     "'ቸ' > 'ya-o';",
+                     "'ቹ' > 'ya-yo';",
+                     "'ቺ' > 'eo-o';",
+                     "'ቻ' > 'eo-u';",
+                     "'ቼ' > 'eo-eu';",
+                     "'ች' > 'yeo-o';",
+                     "'ቾ' > 'yeo-u';",
+                     "'ቿ' > 'o-eo';",
+                     "'ኀ' > 'o-e';",
+                     "'ኁ' > 'o-ye';",
+                     "'ኂ' > 'o-o';",
+                     "'ኃ' > 'o-u';",
+                     "'ኄ' > 'yo-ya';",
+                     "'ኅ' > 'yo-yae';",
+                     "'ኆ' > 'yo-yeo';",
+                     "'ኇ' > 'yo-o';",
+                     "'ኈ' > 'yo-i';",
+                     "'ኊ' > 'u-ae';",
+                     "'ኋ' > 'u-eo-eu';",
+                     "'ኌ' > 'u-ye';",
+                     "'ኍ' > 'u-u';",
+                     "'ነ' > 'yu-e';",
+                     "'ኑ' > 'yu-yeo';",
+                     "'ኒ' > 'yu-ye';",
+                     "'ና' > 'yu-u';",
+                     "'ኔ' > 'yu-i';",
+                     "'ን' > 'eu-u';",
+                     "'ኖ' > 'eu-eu';",
+                     "'ኗ' > 'yi-u';",
+                     "'ኘ' > 'i-a';",
+                     "'ኙ' > 'i-ya';",
+                     "'ኚ' > 'i-o';",
+                     "'ኛ' > 'i-u';",
+                     "'ኜ' > 'i-eu';",
+                     "'ኝ' > 'i-u';",
+                     "'ኞ' > 'u';",
+                     "'ኟ' > 'u-eo';",
+                     "'አ' > 'u-u';",
+                     "'ኡ' > 'u-i';",
+                     "'ኢ' > 'uu';",
+                     "'ኣ' > 'aa';",
+                     "'ኤ' > 'ee';",
+                     "'እ' > 'e';",
+                     "'ኦ' > 'o';",
+                     "'ኧ' > 'wa';",
+                     "'ከ' > 'g';",
+                     "'ኩ' > 'gg';",
+                     "'ኪ' > 'gs';",
+                     "'ካ' > 'n';",
+                     "'ኬ' > 'nj';",
+                     "'ክ' > 'nh';",
+                     "'ኮ' > 'd';",
+                     "'ኯ' > 'l';",
+                     "'ኰ' > 'lg';",
+                     "'ኲ' > 'lb';",
+                     "'ኳ' > 'ls';",
+                     "'ኴ' > 'lt';",
+                     "'ኵ' > 'lp';",
+                     "'ኸ' > 'b';",
+                     "'ኹ' > 'bs';",
+                     "'ኺ' > 's';",
+                     "'ኻ' > 'ss';",
+                     "'ኼ' > 'ng';",
+                     "'ኽ' > 'j';",
+                     "'ኾ' > 'c';",
+                     "'ዀ' > 't';",
+                     "'ዂ' > 'h';",
+                     "'ዃ' > 'gl';",
+                     "'ዄ' > 'gsg';",
+                     "'ዅ' > 'ng';",
+                     "'ወ' > 'nz';",
+                     "'ዉ' > 'nt';",
+                     "'ዊ' > 'dg';",
+                     "'ዋ' > 'tl';",
+                     "'ዌ' > 'lgs';",
+                     "'ው' > 'ln';",
+                     "'ዎ' > 'ld';",
+                     "'ዏ' > 'lth';",
+                     "'ዐ' > 'll';",
+                     "'ዑ' > 'lmg';",
+                     "'ዒ' > 'lms';",
+                     "'ዓ' > 'lbs';",
+                     "'ዔ' > 'lbh';",
+                     "'ዕ' > 'rnp';",
+                     "'ዖ' > 'lss';",
+                     "'ዘ' > 'lk';",
+                     "'ዙ' > 'lq';",
+                     "'ዚ' > 'mg';",
+                     "'ዛ' > 'ml';",
+                     "'ዜ' > 'mb';",
+                     "'ዝ' > 'ms';",
+                     "'ዞ' > 'mss';",
+                     "'ዟ' > 'mz';",
+                     "'ዠ' > 'mc';",
+                     "'ዡ' > 'mh';",
+                     "'ዢ' > 'mn';",
+                     "'ዣ' > 'bl';",
+                     "'ዤ' > 'bp';",
+                     "'ዥ' > 'ph';",
+                     "'ዦ' > 'pn';",
+                     "'ዧ' > 'sg';",
+                     "'የ' > 'sd';",
+                     "'ዩ' > 'sl';",
+                     "'ዪ' > 'sb';",
+                     "'ያ' > 'z';",
+                     "'ዬ' > 'g';",
+                     "'ይ' > 'ss';",
+                     "'ዮ' > 'yo';",
+                     "'ዯ' > 'kh';",
+                     "'ደ' > 'n';",
+                     "'ዱ' > 'ns';",
+                     "'ዲ' > 'nz';",
+                     "'ዳ' > 'pb';",
+                     "'ዴ' > 'pn';",
+                     "'ድ' > 'hn';",
+                     "'ዶ' > 'hl';",
+                     "'ዷ' > 'hm';",
+                     "'ዸ' > 'hb';",
+                     "'ዹ' > 'q';",
+                     "'ዺ' > 'ddi';",
+                     "'ዻ' > 'ddaa';",
+                     "'ዼ' > 'ddee';",
+                     "'ዽ' > 'dde';",
+                     "'ዾ' > 'ddo';",
+                     "'ዿ' > 'ddwa';",
+                     "'ጀ' > 'ha';",
+                     "'ጁ' > 'hu';",
+                     "'ጂ' > 'hi';",
+                     "'ጃ' > 'haa';",
+                     "'ጄ' > 'hee';",
+                     "'ጅ' > 'he';",
+                     "'ጆ' > 'ho';",
+                     "'ጇ' > 'jwa';",
+                     "'ገ' > 'la';",
+                     "'ጉ' > 'lu';",
+                     "'ጊ' > 'li';",
+                     "'ጋ' > 'laa';",
+                     "'ጌ' > 'lee';",
+                     "'ግ' > 'le';",
+                     "'ጎ' > 'lo';",
+                     "'ጏ' > 'lwa';",
+                     "'ጐ' > 'hha';",
+                     "'ጒ' > 'hhi';",
+                     "'ጓ' > 'hhaa';",
+                     "'ጔ' > 'hhee';",
+                     "'ጕ' > 'hhe';",
+                     "'ጘ' > 'ma';",
+                     "'ጙ' > 'mu';",
+                     "'ጚ' > 'mi';",
+                     "'ጛ' > 'maa';",
+                     "'ጜ' > 'mee';",
+                     "'ጝ' > 'me';",
+                     "'ጞ' > 'mo';",
+                     "'ጟ' > 'mwa';",
+                     "'ጠ' > 'sza';",
+                     "'ጡ' > 'szu';",
+                     "'ጢ' > 'szi';",
+                     "'ጣ' > 'szaa';",
+                     "'ጤ' > 'szee';",
+                     "'ጥ' > 'sze';",
+                     "'ጦ' > 'szo';",
+                     "'ጧ' > 'szwa';",
+                     "'ጨ' > 'ra';",
+                     "'ጩ' > 'ru';",
+                     "'ጪ' > 'ri';",
+                     "'ጫ' > 'raa';",
+                     "'ጬ' > 'ree';",
+                     "'ጭ' > 're';",
+                     "'ጮ' > 'ro';",
+                     "'ጯ' > 'rwa';",
+                     "'ጰ' > 'sa';",
+                     "'ጱ' > 'su';",
+                     "'ጲ' > 'si';",
+                     "'ጳ' > 'saa';",
+                     "'ጴ' > 'see';",
+                     "'ጵ' > 'se';",
+                     "'ጶ' > 'so';",
+                     "'ጷ' > 'swa';",
+                     "'ጸ' > 'sha';",
+                     "'ጹ' > 'shu';",
+                     "'ጺ' > 'shi';",
+                     "'ጻ' > 'shaa';",
+                     "'ጼ' > 'shee';",
+                     "'ጽ' > 'she';",
+                     "'ጾ' > 'sho';",
+                     "'ጿ' > 'shwa';",
+                     "'ፀ' > 'qa';",
+                     "'ፁ' > 'qu';",
+                     "'ፂ' > 'qi';",
+                     "'ፃ' > 'qaa';",
+                     "'ፄ' > 'qee';",
+                     "'ፅ' > 'qe';",
+                     "'ፆ' > 'qo';",
+                     "'ፇ' > 'tzoa';",
+                     "'ፈ' > 'qwa';",
+                     "'ፉ' > 'fu';",
+                     "'ፊ' > 'qwi';",
+                     "'ፋ' > 'qwaa';",
+                     "'ፌ' > 'qwee';",
+                     "'ፍ' > 'qwe';",
+                     "'ፎ' > 'fo';",
+                     "'ፏ' > 'fwa';",
+                     "'ፐ' > 'qha';",
+                     "'ፑ' > 'qhu';",
+                     "'ፒ' > 'qhi';",
+                     "'ፓ' > 'qhaa';",
+                     "'ፔ' > 'qhee';",
+                     "'ፕ' > 'qhe';",
+                     "'ፖ' > 'qho';",
+                     "'ፗ' > 'pwa';",
+                     "'ፘ' > 'qhwa';",
+                     "'ፙ' > 'mya';",
+                     "'ፚ' > 'qhwi';",
+                     "'ᎀ' > 'xa';",
+                     "'ᎁ' > 'xu';",
+                     "'ᎂ' > 'xi';",
+                     "'ᎃ' > 'xaa';",
+                     "'ᎄ' > 'xee';",
+                     "'ᎅ' > 'xe';",
+                     "'ᎆ' > 'xo';",
+                     "'ᎇ' > 'bwe';",
+                     "'ᎈ' > 'xwa';",
+                     "'ᎉ' > 'fwi';",
+                     "'ᎊ' > 'xwi';",
+                     "'ᎋ' > 'xwaa';",
+                     "'ᎌ' > 'xwee';",
+                     "'ᎍ' > 'xwe';",
+                     "'ᎎ' > 'pwee';",
+                     "'ᎏ' > 'pwe';",
+                     "'Ꭰ' > 'a';",
+                     "'Ꭱ' > 'e';",
+                     "'Ꭲ' > 'i';",
+                     "'Ꭳ' > 'o';",
+                     "'Ꭴ' > 'u';",
+                     "'Ꭵ' > 'v';",
+                     "'Ꭶ' > 'ga';",
+                     "'Ꭷ' > 'ka';",
+                     "'Ꭸ' > 'ka';",
+                     "'Ꭹ' > 'ku';",
+                     "'Ꭺ' > 'ki';",
+                     "'Ꭻ' > 'kaa';",
+                     "'Ꭼ' > 'kee';",
+                     "'Ꭽ' > 'ke';",
+                     "'Ꭾ' > 'ko';",
+                     "'Ꭿ' > 'hi';",
+                     "'Ꮀ' > 'kwa';",
+                     "'Ꮁ' > 'hu';",
+                     "'Ꮂ' > 'kwi';",
+                     "'Ꮃ' > 'kwaa';",
+                     "'Ꮄ' > 'kwee';",
+                     "'Ꮅ' > 'kwe';",
+                     "'Ꮆ' > 'lo';",
+                     "'Ꮇ' > 'lu';",
+                     "'Ꮈ' > 'kxa';",
+                     "'Ꮉ' > 'kxu';",
+                     "'Ꮊ' > 'kxi';",
+                     "'Ꮋ' > 'kxaa';",
+                     "'Ꮌ' > 'kxee';",
+                     "'Ꮍ' > 'kxe';",
+                     "'Ꮎ' > 'kxo';",
+                     "'Ꮏ' > 'hna';",
+                     "'Ꮐ' > 'kxwa';",
+                     "'Ꮑ' > 'ne';",
+                     "'Ꮒ' > 'kxwi';",
+                     "'Ꮓ' > 'kxwaa';",
+                     "'Ꮔ' > 'kxwee';",
+                     "'Ꮕ' > 'kxwe';",
+                     "'Ꮖ' > 'qua';",
+                     "'Ꮗ' > 'que';",
+                     "'Ꮘ' > 'wa';",
+                     "'Ꮙ' > 'wu';",
+                     "'Ꮚ' > 'wi';",
+                     "'Ꮛ' > 'waa';",
+                     "'Ꮜ' > 'wee';",
+                     "'Ꮝ' > 'we';",
+                     "'Ꮞ' > 'wo';",
+                     "'Ꮟ' > 'si';",
+                     "'Ꮠ' > 'so';",
+                     "'Ꮡ' > 'su';",
+                     "'Ꮢ' > 'sv';",
+                     "'Ꮣ' > 'da';",
+                     "'Ꮤ' > 'ta';",
+                     "'Ꮥ' > 'de';",
+                     "'Ꮦ' > 'te';",
+                     "'Ꮧ' > 'di';",
+                     "'Ꮨ' > 'za';",
+                     "'Ꮩ' > 'zu';",
+                     "'Ꮪ' > 'zi';",
+                     "'Ꮫ' > 'zaa';",
+                     "'Ꮬ' > 'zee';",
+                     "'Ꮭ' > 'ze';",
+                     "'Ꮮ' > 'zo';",
+                     "'Ꮯ' > 'zwa';",
+                     "'Ꮰ' > 'zha';",
+                     "'Ꮱ' > 'zhu';",
+                     "'Ꮲ' > 'zhi';",
+                     "'Ꮳ' > 'zhaa';",
+                     "'Ꮴ' > 'zhee';",
+                     "'Ꮵ' > 'zhe';",
+                     "'Ꮶ' > 'zho';",
+                     "'Ꮷ' > 'zhwa';",
+                     "'Ꮸ' > 'ya';",
+                     "'Ꮹ' > 'yu';",
+                     "'Ꮺ' > 'yi';",
+                     "'Ꮻ' > 'yaa';",
+                     "'Ꮼ' > 'yee';",
+                     "'Ꮽ' > 'ye';",
+                     "'Ꮾ' > 'yo';",
+                     "'Ꮿ' > 'ya';",
+                     "'Ᏸ' > 'da';",
+                     "'Ᏹ' > 'du';",
+                     "'Ᏺ' > 'di';",
+                     "'Ᏻ' > 'daa';",
+                     "'Ᏼ' > 'dee';",
+                     "'Ᏽ' > 'de';",
+                     "'ᏸ' > 'dda';",
+                     "'ᏹ' > 'ddu';",
+                     "'ᏺ' > 'ddi';",
+                     "'ᏻ' > 'ddaa';",
+                     "'ᏼ' > 'ddee';",
+                     "'ᏽ' > 'dde';",
+                     "'ᐁ' > 'ju';",
+                     "'ᐂ' > 'ji';",
+                     "'ᐃ' > 'jaa';",
+                     "'ᐄ' > 'jee';",
+                     "'ᐅ' > 'je';",
+                     "'ᐆ' > 'jo';",
+                     "'ᐇ' > 'jwa';",
+                     "'ᐈ' > 'ga';",
+                     "'ᐉ' > 'gu';",
+                     "'ᐊ' > 'gi';",
+                     "'ᐋ' > 'gaa';",
+                     "'ᐌ' > 'gee';",
+                     "'ᐍ' > 'ge';",
+                     "'ᐎ' > 'go';",
+                     "'ᐐ' > 'gwa';",
+                     "'ᐒ' > 'gwi';",
+                     "'ᐓ' > 'gwaa';",
+                     "'ᐔ' > 'gwee';",
+                     "'ᐕ' > 'gwe';",
+                     "'ᐘ' > 'gga';",
+                     "'ᐙ' > 'ggu';",
+                     "'ᐚ' > 'ggi';",
+                     "'ᐛ' > 'ggaa';",
+                     "'ᐜ' > 'ggee';",
+                     "'ᐝ' > 'gge';",
+                     "'ᐞ' > 'ggo';",
+                     "'ᐠ' > 'tha';",
+                     "'ᐡ' > 'thu';",
+                     "'ᐢ' > 'thi';",
+                     "'ᐣ' > 'thaa';",
+                     "'ᐤ' > 'thee';",
+                     "'ᐥ' > 'the';",
+                     "'ᐦ' > 'tho';",
+                     "'ᐧ' > 'thwa';",
+                     "'ᐨ' > 'cha';",
+                     "'ᐩ' > 'chu';",
+                     "'ᐪ' > 'chi';",
+                     "'ᐫ' > 'chaa';",
+                     "'ᐬ' > 'chee';",
+                     "'ᐭ' > 'che';",
+                     "'ᐮ' > 'cho';",
+                     "'ᐯ' > 'chwa';",
+                     "'ᐰ' > 'pha';",
+                     "'ᐱ' > 'phu';",
+                     "'ᐲ' > 'phi';",
+                     "'ᐳ' > 'phaa';",
+                     "'ᐴ' > 'phee';",
+                     "'ᐵ' > 'phe';",
+                     "'ᐶ' > 'pho';",
+                     "'ᐷ' > 'phwa';",
+                     "'ᐸ' > 'tsa';",
+                     "'ᐹ' > 'tsu';",
+                     "'ᐺ' > 'tsi';",
+                     "'ᐻ' > 'tsaa';",
+                     "'ᐼ' > 'tsee';",
+                     "'ᐽ' > 'tse';",
+                     "'ᐾ' > 'tso';",
+                     "'ᐿ' > 'tswa';",
+                     "'ᑀ' > 'tza';",
+                     "'ᑁ' > 'tzu';",
+                     "'ᑂ' > 'tzi';",
+                     "'ᑃ' > 'tzaa';",
+                     "'ᑄ' > 'tzee';",
+                     "'ᑅ' > 'tze';",
+                     "'ᑆ' > 'tzo';",
+                     "'ᑈ' > 'fa';",
+                     "'ᑉ' > 'fu';",
+                     "'ᑊ' > 'fi';",
+                     "'ᑋ' > 'faa';",
+                     "'ᑌ' > 'fee';",
+                     "'ᑍ' > 'fe';",
+                     "'ᑎ' > 'fo';",
+                     "'ᑏ' > 'fwa';",
+                     "'ᑐ' > 'pa';",
+                     "'ᑑ' > 'pu';",
+                     "'ᑒ' > 'pi';",
+                     "'ᑓ' > 'paa';",
+                     "'ᑔ' > 'pee';",
+                     "'ᑕ' > 'pe';",
+                     "'ᑖ' > 'po';",
+                     "'ᑗ' > 'pwa';",
+                     "'ᑘ' > 'rya';",
+                     "'ᑙ' > 'mya';",
+                     "'ᑚ' > 'fya';",
+                     "'ᒠ' > 'a';",
+                     "'ᒡ' > 'e';",
+                     "'ᒢ' > 'i';",
+                     "'ᒣ' > 'o';",
+                     "'ᒤ' > 'u';",
+                     "'ᒥ' > 'v';",
+                     "'ᒦ' > 'ga';",
+                     "'ᒧ' > 'ka';",
+                     "'ᒨ' > 'ge';",
+                     "'ᒩ' > 'gi';",
+                     "'ᒪ' > 'go';",
+                     "'ᒫ' > 'gu';",
+                     "'ᒬ' > 'gv';",
+                     "'ᒭ' > 'ha';",
+                     "'ᒮ' > 'he';",
+                     "'ᒯ' > 'hi';",
+                     "'ᒰ' > 'ho';",
+                     "'ᒱ' > 'hu';",
+                     "'ᒲ' > 'hv';",
+                     "'ᒳ' > 'la';",
+                     "'ᒴ' > 'le';",
+                     "'ᒵ' > 'li';",
+                     "'ᒶ' > 'lo';",
+                     "'ᒷ' > 'lu';",
+                     "'ᒸ' > 'lv';",
+                     "'ᒹ' > 'ma';",
+                     "'ᒺ' > 'me';",
+                     "'ᒻ' > 'mi';",
+                     "'ᒼ' > 'mo';",
+                     "'ᒽ' > 'mu';",
+                     "'ᒾ' > 'na';",
+                     "'ᒿ' > 'hna';",
+                     "'ᓀ' > 'nah';",
+                     "'ᓁ' > 'ne';",
+                     "'ᓂ' > 'ni';",
+                     "'ᓃ' > 'no';",
+                     "'ᓄ' > 'nu';",
+                     "'ᓅ' > 'nv';",
+                     "'ᓆ' > 'qua';",
+                     "'ᓇ' > 'que';",
+                     "'ᓈ' > 'qui';",
+                     "'ᓉ' > 'quo';",
+                     "'ᓊ' > 'quu';",
+                     "'ᓋ' > 'quv';",
+                     "'ᓌ' > 'sa';",
+                     "'ᓍ' > 's';",
+                     "'ᓎ' > 'se';",
+                     "'ᓏ' > 'si';",
+                     "'ᓐ' > 'so';",
+                     "'ᓑ' > 'su';",
+                     "'ᓒ' > 'sv';",
+                     "'ᓓ' > 'da';",
+                     "'ᓔ' > 'ta';",
+                     "'ᓕ' > 'de';",
+                     "'ᓖ' > 'te';",
+                     "'ᓗ' > 'di';",
+                     "'ᓘ' > 'ti';",
+                     "'ᓙ' > 'do';",
+                     "'ᓚ' > 'du';",
+                     "'ᓛ' > 'dv';",
+                     "'ᓜ' > 'dla';",
+                     "'ᓝ' > 'tla';",
+                     "'ᓞ' > 'tle';",
+                     "'ᓟ' > 'tli';",
+                     "'ᓠ' > 'tlo';",
+                     "'ᓡ' > 'tlu';",
+                     "'ᓢ' > 'tlv';",
+                     "'ᓣ' > 'tsa';",
+                     "'ᓤ' > 'tse';",
+                     "'ᓥ' > 'tsi';",
+                     "'ᓦ' > 'tso';",
+                     "'ᓧ' > 'tsu';",
+                     "'ᓨ' > 'tsv';",
+                     "'ᓩ' > 'wa';",
+                     "'ᓪ' > 'we';",
+                     "'ᓫ' > 'wi';",
+                     "'ᓬ' > 'wo';",
+                     "'ᓭ' > 'wu';",
+                     "'ᓮ' > 'wv';",
+                     "'ᓯ' > 'ya';",
+                     "'ᓰ' > 'ye';",
+                     "'ᓱ' > 'yi';",
+                     "'ᓲ' > 'yo';",
+                     "'ᓳ' > 'yu';",
+                     "'ᓴ' > 'yv';",
+                     "'ᔁ' > 'e';",
+                     "'ᔂ' > 'aai';",
+                     "'ᔃ' > 'i';",
+                     "'ᔄ' > 'ii';",
+                     "'ᔅ' > 'o';",
+                     "'ᔆ' > 'oo';",
+                     "'ᔇ' > 'oo';",
+                     "'ᔈ' > 'ee';",
+                     "'ᔉ' > 'i';",
+                     "'ᔊ' > 'a';",
+                     "'ᔋ' > 'aa';",
+                     "'ᔌ' > 'we';",
+                     "'ᔍ' > 'we';",
+                     "'ᔎ' > 'wi';",
+                     "'ᔏ' > 'wi';",
+                     "'ᔐ' > 'wii';",
+                     "'ᔑ' > 'wii';",
+                     "'ᔒ' > 'wo';",
+                     "'ᔓ' > 'wo';",
+                     "'ᔔ' > 'woo';",
+                     "'ᔕ' > 'woo';",
+                     "'ᔖ' > 'woo';",
+                     "'ᔗ' > 'wa';",
+                     "'ᔘ' > 'wa';",
+                     "'ᔙ' > 'waa';",
+                     "'ᔚ' > 'waa';",
+                     "'ᔛ' > 'waa';",
+                     "'ᔜ' > 'ai';",
+                     "'ᔝ' > 'w';",
+                     "'ᔟ' > 't';",
+                     "'ᔠ' > 'k';",
+                     "'ᔡ' > 'sh';",
+                     "'ᔢ' > 's';",
+                     "'ᔣ' > 'n';",
+                     "'ᔤ' > 'w';",
+                     "'ᔥ' > 'n';",
+                     "'ᔧ' > 'w';",
+                     "'ᔨ' > 'c';",
+                     "'ᔪ' > 'l';",
+                     "'ᔫ' > 'en';",
+                     "'ᔬ' > 'in';",
+                     "'ᔭ' > 'on';",
+                     "'ᔮ' > 'an';",
+                     "'ᔯ' > 'pe';",
+                     "'ᔰ' > 'paai';",
+                     "'ᔱ' > 'pi';",
+                     "'ᔲ' > 'pii';",
+                     "'ᔳ' > 'po';",
+                     "'ᔴ' > 'poo';",
+                     "'ᔵ' > 'poo';",
+                     "'ᔶ' > 'hee';",
+                     "'ᔷ' > 'hi';",
+                     "'ᔸ' > 'pa';",
+                     "'ᔹ' > 'paa';",
+                     "'ᔺ' > 'pwe';",
+                     "'ᔻ' > 'pwe';",
+                     "'ᔼ' > 'pwi';",
+                     "'ᔽ' > 'pwi';",
+                     "'ᔾ' > 'pwii';",
+                     "'ᔿ' > 'pwii';",
+                     "'ᕀ' > 'pwo';",
+                     "'ᕁ' > 'pwo';",
+                     "'ᕂ' > 'pwoo';",
+                     "'ᕃ' > 'pwoo';",
+                     "'ᕄ' > 'pwa';",
+                     "'ᕅ' > 'pwa';",
+                     "'ᕆ' > 'pwaa';",
+                     "'ᕇ' > 'pwaa';",
+                     "'ᕈ' > 'pwaa';",
+                     "'ᕉ' > 'p';",
+                     "'ᕊ' > 'p';",
+                     "'ᕋ' > 'h';",
+                     "'ᕌ' > 'te';",
+                     "'ᕍ' > 'taai';",
+                     "'ᕎ' > 'ti';",
+                     "'ᕏ' > 'tii';",
+                     "'ᕐ' > 'to';",
+                     "'ᕑ' > 'too';",
+                     "'ᕒ' > 'too';",
+                     "'ᕓ' > 'dee';",
+                     "'ᕔ' > 'di';",
+                     "'ᕕ' > 'ta';",
+                     "'ᕖ' > 'taa';",
+                     "'ᕗ' > 'twe';",
+                     "'ᕘ' > 'twe';",
+                     "'ᕙ' > 'twi';",
+                     "'ᕚ' > 'twi';",
+                     "'ᕛ' > 'twii';",
+                     "'ᕜ' > 'twii';",
+                     "'ᕝ' > 'two';",
+                     "'ᕞ' > 'two';",
+                     "'ᕟ' > 'twoo';",
+                     "'ᕠ' > 'twoo';",
+                     "'ᕡ' > 'twa';",
+                     "'ᕢ' > 'twa';",
+                     "'ᕣ' > 'twaa';",
+                     "'ᕤ' > 'twaa';",
+                     "'ᕥ' > 'twaa';",
+                     "'ᕦ' > 't';",
+                     "'ᕧ' > 'tte';",
+                     "'ᕨ' > 'tti';",
+                     "'ᕩ' > 'tto';",
+                     "'ᕪ' > 'tta';",
+                     "'ᕫ' > 'ke';",
+                     "'ᕬ' > 'kaai';",
+                     "'ᕭ' > 'ki';",
+                     "'ᕮ' > 'kii';",
+                     "'ᕯ' > 'ko';",
+                     "'ᕰ' > 'koo';",
+                     "'ᕱ' > 'koo';",
+                     "'ᕲ' > 'ka';",
+                     "'ᕳ' > 'kaa';",
+                     "'ᕴ' > 'kwe';",
+                     "'ᕵ' > 'kwe';",
+                     "'ᕶ' > 'kwi';",
+                     "'ᕷ' > 'kwi';",
+                     "'ᕸ' > 'kwii';",
+                     "'ᕹ' > 'kwii';",
+                     "'ᕺ' > 'kwo';",
+                     "'ᕻ' > 'kwo';",
+                     "'ᕼ' > 'kwoo';",
+                     "'ᕽ' > 'kwoo';",
+                     "'ᕾ' > 'kwa';",
+                     "'ᕿ' > 'kwa';",
+                     "'ᖀ' > 'kwaa';",
+                     "'ᖁ' > 'kwaa';",
+                     "'ᖂ' > 'kwaa';",
+                     "'ᖃ' > 'k';",
+                     "'ᖄ' > 'kw';",
+                     "'ᖅ' > 'keh';",
+                     "'ᖆ' > 'kih';",
+                     "'ᖇ' > 'koh';",
+                     "'ᖈ' > 'kah';",
+                     "'ᖉ' > 'ce';",
+                     "'ᖊ' > 'caai';",
+                     "'ᖋ' > 'ci';",
+                     "'ᖌ' > 'cii';",
+                     "'ᖍ' > 'co';",
+                     "'ᖎ' > 'coo';",
+                     "'ᖏ' > 'coo';",
+                     "'ᖐ' > 'ca';",
+                     "'ᖑ' > 'caa';",
+                     "'ᖒ' > 'cwe';",
+                     "'ᖓ' > 'cwe';",
+                     "'ᖔ' > 'cwi';",
+                     "'ᖕ' > 'cwi';",
+                     "'ᖖ' > 'cwii';",
+                     "'ᖗ' > 'cwii';",
+                     "'ᖘ' > 'cwo';",
+                     "'ᖙ' > 'cwo';",
+                     "'ᖚ' > 'cwoo';",
+                     "'ᖛ' > 'cwoo';",
+                     "'ᖜ' > 'cwa';",
+                     "'ᖝ' > 'cwa';",
+                     "'ᖞ' > 'cwaa';",
+                     "'ᖟ' > 'cwaa';",
+                     "'ᖠ' > 'cwaa';",
+                     "'ᖡ' > 'c';",
+                     "'ᖢ' > 'th';",
+                     "'ᖣ' > 'me';",
+                     "'ᖤ' > 'maai';",
+                     "'ᖥ' > 'mi';",
+                     "'ᖦ' > 'mii';",
+                     "'ᖧ' > 'mo';",
+                     "'ᖨ' > 'moo';",
+                     "'ᖩ' > 'moo';",
+                     "'ᖪ' > 'ma';",
+                     "'ᖫ' > 'maa';",
+                     "'ᖬ' > 'mwe';",
+                     "'ᖭ' > 'mwe';",
+                     "'ᖮ' > 'mwi';",
+                     "'ᖯ' > 'mwi';",
+                     "'ᖰ' > 'mwii';",
+                     "'ᖱ' > 'mwii';",
+                     "'ᖲ' > 'mwo';",
+                     "'ᖳ' > 'mwo';",
+                     "'ᖴ' > 'mwoo';",
+                     "'ᖵ' > 'mwoo';",
+                     "'ᖶ' > 'mwa';",
+                     "'ᖷ' > 'mwa';",
+                     "'ᖸ' > 'mwaa';",
+                     "'ᖹ' > 'mwaa';",
+                     "'ᖺ' > 'mwaa';",
+                     "'ᖻ' > 'm';",
+                     "'ᖼ' > 'm';",
+                     "'ᖽ' > 'mh';",
+                     "'ᖾ' > 'm';",
+                     "'ᖿ' > 'm';",
+                     "'ᗀ' > 'ne';",
+                     "'ᗁ' > 'naai';",
+                     "'ᗂ' > 'ni';",
+                     "'ᗃ' > 'nii';",
+                     "'ᗄ' > 'no';",
+                     "'ᗅ' > 'noo';",
+                     "'ᗆ' > 'noo';",
+                     "'ᗇ' > 'na';",
+                     "'ᗈ' > 'naa';",
+                     "'ᗉ' > 'nwe';",
+                     "'ᗊ' > 'nwe';",
+                     "'ᗋ' > 'nwa';",
+                     "'ᗌ' > 'nwa';",
+                     "'ᗍ' > 'nwaa';",
+                     "'ᗎ' > 'nwaa';",
+                     "'ᗏ' > 'nwaa';",
+                     "'ᗐ' > 'n';",
+                     "'ᗑ' > 'ng';",
+                     "'ᗒ' > 'nh';",
+                     "'ᗓ' > 'le';",
+                     "'ᗔ' > 'laai';",
+                     "'ᗕ' > 'li';",
+                     "'ᗖ' > 'lii';",
+                     "'ᗗ' > 'lo';",
+                     "'ᗘ' > 'loo';",
+                     "'ᗙ' > 'loo';",
+                     "'ᗚ' > 'la';",
+                     "'ᗛ' > 'laa';",
+                     "'ᗜ' > 'lwe';",
+                     "'ᗝ' > 'lwe';",
+                     "'ᗞ' > 'lwi';",
+                     "'ᗟ' > 'lwi';",
+                     "'ᗠ' > 'lwii';",
+                     "'ᗡ' > 'lwii';",
+                     "'ᗢ' > 'lwo';",
+                     "'ᗣ' > 'lwo';",
+                     "'ᗤ' > 'lwoo';",
+                     "'ᗥ' > 'lwoo';",
+                     "'ᗦ' > 'lwa';",
+                     "'ᗧ' > 'lwa';",
+                     "'ᗨ' > 'lwaa';",
+                     "'ᗩ' > 'lwaa';",
+                     "'ᗪ' > 'l';",
+                     "'ᗫ' > 'l';",
+                     "'ᗬ' > 'l';",
+                     "'ᗭ' > 'se';",
+                     "'ᗮ' > 'saai';",
+                     "'ᗯ' > 'si';",
+                     "'ᗰ' > 'sii';",
+                     "'ᗱ' > 'so';",
+                     "'ᗲ' > 'soo';",
+                     "'ᗳ' > 'soo';",
+                     "'ᗴ' > 'sa';",
+                     "'ᗵ' > 'saa';",
+                     "'ᗶ' > 'swe';",
+                     "'ᗷ' > 'swe';",
+                     "'ᗸ' > 'swi';",
+                     "'ᗹ' > 'swi';",
+                     "'ᗺ' > 'swii';",
+                     "'ᗻ' > 'swii';",
+                     "'ᗼ' > 'swo';",
+                     "'ᗽ' > 'swo';",
+                     "'ᗾ' > 'swoo';",
+                     "'ᗿ' > 'swoo';",
+                     "'ᘀ' > 'swa';",
+                     "'ᘁ' > 'swa';",
+                     "'ᘂ' > 'swaa';",
+                     "'ᘃ' > 'swaa';",
+                     "'ᘄ' > 'swaa';",
+                     "'ᘅ' > 's';",
+                     "'ᘆ' > 's';",
+                     "'ᘇ' > 'sw';",
+                     "'ᘈ' > 's';",
+                     "'ᘉ' > 'sk';",
+                     "'ᘊ' > 'skw';",
+                     "'ᘋ' > 'sw';",
+                     "'ᘌ' > 'spwa';",
+                     "'ᘍ' > 'stwa';",
+                     "'ᘎ' > 'skwa';",
+                     "'ᘏ' > 'scwa';",
+                     "'ᘐ' > 'she';",
+                     "'ᘑ' > 'shi';",
+                     "'ᘒ' > 'shii';",
+                     "'ᘓ' > 'sho';",
+                     "'ᘔ' > 'shoo';",
+                     "'ᘕ' > 'sha';",
+                     "'ᘖ' > 'shaa';",
+                     "'ᘗ' > 'shwe';",
+                     "'ᘘ' > 'shwe';",
+                     "'ᘙ' > 'shwi';",
+                     "'ᘚ' > 'shwi';",
+                     "'ᘛ' > 'shwii';",
+                     "'ᘜ' > 'shwii';",
+                     "'ᘝ' > 'shwo';",
+                     "'ᘞ' > 'shwo';",
+                     "'ᘟ' > 'shwoo';",
+                     "'ᘠ' > 'shwoo';",
+                     "'ᘡ' > 'shwa';",
+                     "'ᘢ' > 'shwa';",
+                     "'ᘣ' > 'shwaa';",
+                     "'ᘤ' > 'shwaa';",
+                     "'ᘥ' > 'sh';",
+                     "'ᘦ' > 'ye';",
+                     "'ᘧ' > 'yaai';",
+                     "'ᘨ' > 'yi';",
+                     "'ᘩ' > 'yii';",
+                     "'ᘪ' > 'yo';",
+                     "'ᘫ' > 'yoo';",
+                     "'ᘬ' > 'yoo';",
+                     "'ᘭ' > 'ya';",
+                     "'ᘮ' > 'yaa';",
+                     "'ᘯ' > 'ywe';",
+                     "'ᘰ' > 'ywe';",
+                     "'ᘱ' > 'ywi';",
+                     "'ᘲ' > 'ywi';",
+                     "'ᘳ' > 'ywii';",
+                     "'ᘴ' > 'ywii';",
+                     "'ᘵ' > 'ywo';",
+                     "'ᘶ' > 'ywo';",
+                     "'ᘷ' > 'ywoo';",
+                     "'ᘸ' > 'ywoo';",
+                     "'ᘹ' > 'ywa';",
+                     "'ᘺ' > 'ywa';",
+                     "'ᘻ' > 'ywaa';",
+                     "'ᘼ' > 'ywaa';",
+                     "'ᘽ' > 'ywaa';",
+                     "'ᘾ' > 'y';",
+                     "'ᘿ' > 'y';",
+                     "'ᙀ' > 'y';",
+                     "'ᙁ' > 'yi';",
+                     "'ᙂ' > 're';",
+                     "'ᙃ' > 're';",
+                     "'ᙄ' > 'le';",
+                     "'ᙅ' > 'raai';",
+                     "'ᙆ' > 'ri';",
+                     "'ᙇ' > 'rii';",
+                     "'ᙈ' > 'ro';",
+                     "'ᙉ' > 'roo';",
+                     "'ᙊ' > 'lo';",
+                     "'ᙋ' > 'ra';",
+                     "'ᙌ' > 'raa';",
+                     "'ᙍ' > 'la';",
+                     "'ᙎ' > 'rwaa';",
+                     "'ᙏ' > 'rwaa';",
+                     "'ᙐ' > 'r';",
+                     "'ᙑ' > 'r';",
+                     "'ᙒ' > 'r';",
+                     "'ᙓ' > 'fe';",
+                     "'ᙔ' > 'faai';",
+                     "'ᙕ' > 'fi';",
+                     "'ᙖ' > 'fii';",
+                     "'ᙗ' > 'fo';",
+                     "'ᙘ' > 'foo';",
+                     "'ᙙ' > 'fa';",
+                     "'ᙚ' > 'faa';",
+                     "'ᙛ' > 'fwaa';",
+                     "'ᙜ' > 'fwaa';",
+                     "'ᙝ' > 'f';",
+                     "'ᙞ' > 'the';",
+                     "'ᙟ' > 'the';",
+                     "'ᙠ' > 'thi';",
+                     "'ᙡ' > 'thi';",
+                     "'ᙢ' > 'thii';",
+                     "'ᙣ' > 'thii';",
+                     "'ᙤ' > 'tho';",
+                     "'ᙥ' > 'thoo';",
+                     "'ᙦ' > 'tha';",
+                     "'ᙧ' > 'thaa';",
+                     "'ᙨ' > 'thwaa';",
+                     "'ᙩ' > 'thwaa';",
+                     "'ᙪ' > 'th';",
+                     "'ᙫ' > 'tthe';",
+                     "'ᙬ' > 'tthi';",
+                     "'ᙯ' > 'tth';",
+                     "'ᙰ' > 'tye';",
+                     "'ᙱ' > 'tyi';",
+                     "'ᙲ' > 'tyo';",
+                     "'ᙳ' > 'tya';",
+                     "'ᙴ' > 'he';",
+                     "'ᙵ' > 'hi';",
+                     "'ᙶ' > 'hii';",
+                     "'ᙷ' > 'ho';",
+                     "'ᙸ' > 'hoo';",
+                     "'ᙹ' > 'ha';",
+                     "'ᙺ' > 'haa';",
+                     "'ᙻ' > 'h';",
+                     "'ᙼ' > 'h';",
+                     "'ᙽ' > 'hk';",
+                     "'ᙾ' > 'qaai';",
+                     "'ᙿ' > 'qi';",
+                     "'ᚁ' > 'qo';",
+                     "'ᚂ' > 'qoo';",
+                     "'ᚃ' > 'qa';",
+                     "'ᚄ' > 'qaa';",
+                     "'ᚅ' > 'q';",
+                     "'ᚆ' > 'tlhe';",
+                     "'ᚇ' > 'tlhi';",
+                     "'ᚈ' > 'tlho';",
+                     "'ᚉ' > 'tlha';",
+                     "'ᚊ' > 're';",
+                     "'ᚋ' > 'ri';",
+                     "'ᚌ' > 'ro';",
+                     "'ᚍ' > 'ra';",
+                     "'ᚎ' > 'ngaai';",
+                     "'ᚏ' > 'ngi';",
+                     "'ᚐ' > 'ngii';",
+                     "'ᚑ' > 'ngo';",
+                     "'ᚒ' > 'ngoo';",
+                     "'ᚓ' > 'nga';",
+                     "'ᚔ' > 'ngaa';",
+                     "'ᚕ' > 'ng';",
+                     "'ᚖ' > 'nng';",
+                     "'ᚗ' > 'she';",
+                     "'ᚘ' > 'shi';",
+                     "'ᚙ' > 'sho';",
+                     "'ᚚ' > 'sha';",
+                     "'ᚠ' > 'lhi';",
+                     "'ᚡ' > 'lhii';",
+                     "'ᚢ' > 'lho';",
+                     "'ᚣ' > 'lhoo';",
+                     "'ᚤ' > 'lha';",
+                     "'ᚥ' > 'lhaa';",
+                     "'ᚦ' > 'lh';",
+                     "'ᚧ' > 'the';",
+                     "'ᚨ' > 'thi';",
+                     "'ᚩ' > 'thii';",
+                     "'ᚪ' > 'tho';",
+                     "'ᚫ' > 'thoo';",
+                     "'ᚬ' > 'tha';",
+                     "'ᚭ' > 'thaa';",
+                     "'ᚮ' > 'th';",
+                     "'ᚯ' > 'b';",
+                     "'ᚰ' > 'e';",
+                     "'ᚱ' > 'i';",
+                     "'ᚲ' > 'o';",
+                     "'ᚳ' > 'a';",
+                     "'ᚴ' > 'we';",
+                     "'ᚵ' > 'wi';",
+                     "'ᚶ' > 'wo';",
+                     "'ᚷ' > 'wa';",
+                     "'ᚸ' > 'ne';",
+                     "'ᚹ' > 'ni';",
+                     "'ᚺ' > 'no';",
+                     "'ᚻ' > 'na';",
+                     "'ᚼ' > 'ke';",
+                     "'ᚽ' > 'ki';",
+                     "'ᚾ' > 'ko';",
+                     "'ᚿ' > 'ka';",
+                     "'ᛀ' > 'he';",
+                     "'ᛁ' > 'hi';",
+                     "'ᛂ' > 'ho';",
+                     "'ᛃ' > 'ha';",
+                     "'ᛄ' > 'ghu';",
+                     "'ᛅ' > 'gho';",
+                     "'ᛆ' > 'ghe';",
+                     "'ᛇ' > 'ghee';",
+                     "'ᛈ' > 'ghi';",
+                     "'ᛉ' > 'gha';",
+                     "'ᛊ' > 'ru';",
+                     "'ᛋ' > 'ro';",
+                     "'ᛌ' > 're';",
+                     "'ᛍ' > 'ree';",
+                     "'ᛎ' > 'ri';",
+                     "'ᛏ' > 'ra';",
+                     "'ᛐ' > 'wu';",
+                     "'ᛑ' > 'wo';",
+                     "'ᛒ' > 'we';",
+                     "'ᛓ' > 'wee';",
+                     "'ᛔ' > 'wi';",
+                     "'ᛕ' > 'wa';",
+                     "'ᛖ' > 'hwu';",
+                     "'ᛗ' > 'hwo';",
+                     "'ᛘ' > 'hwe';",
+                     "'ᛙ' > 'hwee';",
+                     "'ᛚ' > 'hwi';",
+                     "'ᛛ' > 'hwa';",
+                     "'ᛜ' > 'thu';",
+                     "'ᛝ' > 'tho';",
+                     "'ᛞ' > 'the';",
+                     "'ᛟ' > 'thee';",
+                     "'ᛠ' > 'thi';",
+                     "'ᛡ' > 'tha';",
+                     "'ᛢ' > 'ttu';",
+                     "'ᛣ' > 'tto';",
+                     "'ᛤ' > 'tte';",
+                     "'ᛥ' > 'ttee';",
+                     "'ᛦ' > 'tti';",
+                     "'ᛧ' > 'tta';",
+                     "'ᛨ' > 'pu';",
+                     "'ᛩ' > 'po';",
+                     "'ᛪ' > 'pe';",
+                     "'ᛱ' > 'ge';",
+                     "'ᛲ' > 'gee';",
+                     "'ᛳ' > 'gi';",
+                     "'ᛴ' > 'ga';",
+                     "'ᛵ' > 'khu';",
+                     "'ᛶ' > 'kho';",
+                     "'ᛷ' > 'khe';",
+                     "'ᛸ' > 'khee';",
+                     "'ᜀ' > 'kka';",
+                     "'ᜁ' > 'kk';",
+                     "'ᜂ' > 'nu';",
+                     "'ᜃ' > 'no';",
+                     "'ᜄ' > 'ne';",
+                     "'ᜅ' > 'nee';",
+                     "'ᜆ' > 'ni';",
+                     "'ᜇ' > 'na';",
+                     "'ᜈ' > 'mu';",
+                     "'ᜉ' > 'mo';",
+                     "'ᜊ' > 'me';",
+                     "'ᜋ' > 'mee';",
+                     "'ᜌ' > 'mi';",
+                     "'ᜎ' > 'yu';",
+                     "'ᜏ' > 'yo';",
+                     "'ᜐ' > 'ye';",
+                     "'ᜑ' > 'yee';",
+                     "'ᜠ' > 'jji';",
+                     "'ᜡ' > 'jja';",
+                     "'ᜢ' > 'lu';",
+                     "'ᜣ' > 'lo';",
+                     "'ᜤ' > 'le';",
+                     "'ᜥ' > 'lee';",
+                     "'ᜦ' > 'li';",
+                     "'ᜧ' > 'la';",
+                     "'ᜨ' > 'dlu';",
+                     "'ᜩ' > 'dlo';",
+                     "'ᜪ' > 'dle';",
+                     "'ᜫ' > 'dlee';",
+                     "'ᜬ' > 'dli';",
+                     "'ᜭ' > 'dla';",
+                     "'ᜮ' > 'lhu';",
+                     "'ᜯ' > 'lho';",
+                     "'ᜰ' > 'lhe';",
+                     "'ᜱ' > 'lhee';",
+                     "'ᝀ' > 'zu';",
+                     "'ᝁ' > 'zo';",
+                     "'ᝂ' > 'ze';",
+                     "'ᝃ' > 'zee';",
+                     "'ᝄ' > 'zi';",
+                     "'ᝅ' > 'za';",
+                     "'ᝆ' > 'z';",
+                     "'ᝇ' > 'z';",
+                     "'ᝈ' > 'dzu';",
+                     "'ᝉ' > 'dzo';",
+                     "'ᝊ' > 'dze';",
+                     "'ᝋ' > 'dzee';",
+                     "'ᝌ' > 'dzi';",
+                     "'ᝍ' > 'dza';",
+                     "'ᝎ' > 'su';",
+                     "'ᝏ' > 'so';",
+                     "'ᝐ' > 'se';",
+                     "'ᝑ' > 'see';",
+                     "'ᝠ' > 'tsa';",
+                     "'ᝡ' > 'chu';",
+                     "'ᝢ' > 'cho';",
+                     "'ᝣ' > 'che';",
+                     "'ᝤ' > 'chee';",
+                     "'ᝥ' > 'chi';",
+                     "'ᝦ' > 'cha';",
+                     "'ᝧ' > 'ttsu';",
+                     "'ᝨ' > 'ttso';",
+                     "'ᝩ' > 'ttse';",
+                     "'ᝪ' > 'ttsee';",
+                     "'ᝫ' > 'ttsi';",
+                     "'ᝬ' > 'ttsa';",
+                     "'ᝮ' > 'la';",
+                     "'ᝯ' > 'qai';",
+                     "'ᝰ' > 'ngai';",
+                     "'ក' > 'ka';",
+                     "'ខ' > 'b';",
+                     "'គ' > 'l';",
+                     "'ឃ' > 'f';",
+                     "'ង' > 's';",
+                     "'ច' > 'n';",
+                     "'ឆ' > 'h';",
+                     "'ជ' > 'd';",
+                     "'ឈ' > 't';",
+                     "'ញ' > 'c';",
+                     "'ដ' > 'q';",
+                     "'ឋ' > 'm';",
+                     "'ឌ' > 'g';",
+                     "'ឍ' > 'ng';",
+                     "'ណ' > 'z';",
+                     "'ត' > 'r';",
+                     "'ថ' > 'a';",
+                     "'ទ' > 'o';",
+                     "'ធ' > 'u';",
+                     "'ន' > 'e';",
+                     "'ប' > 'i';",
+                     "'ផ' > 'ch';",
+                     "'ព' > 'th';",
+                     "'ភ' > 'ph';",
+                     "'ម' > 'p';",
+                     "'យ' > 'x';",
+                     "'រ' > 'p';",
+                     "'ល' > 'lo';",
+                     "'វ' > 'vo';",
+                     "'ឝ' > 'sha';",
+                     "'ឞ' > 'sso';",
+                     "'ស' > 'sa';",
+                     "'ហ' > 'f';",
+                     "'ឡ' > 'v';",
+                     "'អ' > 'u';",
+                     "'ឣ' > 'yr';",
+                     "'ឤ' > 'y';",
+                     "'ឥ' > 'w';",
+                     "'ឦ' > 'th';",
+                     "'ឧ' > 'th';",
+                     "'ឨ' > 'a';",
+                     "'ឩ' > 'o';",
+                     "'ឪ' > 'ac';",
+                     "'ឫ' > 'ae';",
+                     "'ឬ' > 'o';",
+                     "'ឭ' > 'o';",
+                     "'ឮ' > 'o';",
+                     "'ឯ' > 'oe';",
+                     "'ឰ' > 'on';",
+                     "'ឱ' > 'r';",
+                     "'ឲ' > 'k';",
+                     "'ឳ' > 'c';",
+                     "'ៗ' > 'm';",
+                     "'ៜ' > 'ng';",
+                     "'ᠠ' > 'a';",
+                     "'ᠡ' > 'e';",
+                     "'ᠢ' > 'i';",
+                     "'ᠣ' > 'o';",
+                     "'ᠤ' > 'u';",
+                     "'ᠥ' > 'oe';",
+                     "'ᠦ' > 'ue';",
+                     "'ᠧ' > 'ee';",
+                     "'ᠨ' > 'na';",
+                     "'ᠩ' > 'ang';",
+                     "'ᠪ' > 'ba';",
+                     "'ᠫ' > 'pa';",
+                     "'ᠬ' > 'qa';",
+                     "'ᠭ' > 'ga';",
+                     "'ᠮ' > 'ma';",
+                     "'ᠯ' > 'la';",
+                     "'ᠰ' > 'sa';",
+                     "'ᠱ' > 'sha';",
+                     "'ᠲ' > 'ta';",
+                     "'ᠳ' > 'da';",
+                     "'ᠴ' > 'cha';",
+                     "'ᠵ' > 'ja';",
+                     "'ᠶ' > 'ya';",
+                     "'ᠷ' > 'ra';",
+                     "'ᠸ' > 'wa';",
+                     "'ᠹ' > 'fa';",
+                     "'ᠺ' > 'ka';",
+                     "'ᠻ' > 'kha';",
+                     "'ᠼ' > 'tsa';",
+                     "'ᠽ' > 'za';",
+                     "'ᠾ' > 'haa';",
+                     "'ᠿ' > 'zra';",
+                     "'ᡀ' > 'lha';",
+                     "'ᡁ' > 'zhi';",
+                     "'ᡂ' > 'chi';",
+                     "'ᢀ' > 'k';",
+                     "'ᢁ' > 'kh';",
+                     "'ᢂ' > 'g';",
+                     "'ᢃ' > 'gh';",
+                     "'ᢄ' > 'ng';",
+                     "'ᢇ' > 'j';",
+                     "'ᢈ' > 'jh';",
+                     "'ᢉ' > 'ny';",
+                     "'ᢊ' > 't';",
+                     "'ᢋ' > 'tth';",
+                     "'ᢌ' > 'd';",
+                     "'ᢍ' > 'ddh';",
+                     "'ᢎ' > 'nn';",
+                     "'ᢏ' > 't';",
+                     "'ᢐ' > 'th';",
+                     "'ᢑ' > 'd';",
+                     "'ᢒ' > 'dh';",
+                     "'ᢓ' > 'n';",
+                     "'ᢔ' > 'p';",
+                     "'ᢕ' > 'ph';",
+                     "'ᢖ' > 'b';",
+                     "'ᢗ' > 'bh';",
+                     "'ᢘ' > 'm';",
+                     "'ᢙ' > 'y';",
+                     "'ᢚ' > 'r';",
+                     "'ᢛ' > 'l';",
+                     "'ᢜ' > 'v';",
+                     "'ᢝ' > 'sh';",
+                     "'ᢞ' > 'ss';",
+                     "'ᢟ' > 's';",
+                     "'ᢠ' > 'h';",
+                     "'ᢡ' > 'l';",
+                     "'ᢢ' > 'q';",
+                     "'ᢣ' > 'a';",
+                     "'ᢤ' > 'aa';",
+                     "'ᢥ' > 'i';",
+                     "'ᢦ' > 'ii';",
+                     "'ᢧ' > 'u';",
+                     "'ᢨ' > 'uk';",
+                     "'ᢪ' > 'uuv';",
+                     "'ᢰ' > 'ai';",
+                     "'ᢱ' > 'oo';",
+                     "'ᢲ' > 'oo';",
+                     "'ᢳ' > 'au';",
+                     "'ᢴ' > 'a';",
+                     "'ᢵ' > 'aa';",
+                     "'ᢶ' > 'aa';",
+                     "'ᢷ' > 'i';",
+                     "'ᢸ' > 'ii';",
+                     "'ᢹ' > 'y';",
+                     "'ᢺ' > 'yy';",
+                     "'ᢻ' > 'u';",
+                     "'ᢼ' > 'uu';",
+                     "'ᢽ' > 'ua';",
+                     "'ᢾ' > 'oe';",
+                     "'ᢿ' > 'ya';",
+                     "'ᣀ' > 'ie';",
+                     "'ᣁ' > 'e';",
+                     "'ᣂ' > 'ae';",
+                     "'ᣃ' > 'ai';",
+                     "'ᣄ' > 'oo';",
+                     "'ᣅ' > 'au';",
+                     "'ᣆ' > 'm';",
+                     "'ᣇ' > 'h';",
+                     "'ᣈ' > 'a';",
+                     "'ᣌ' > 'r';",
+                     "'ᣛ' > 'kr';",
+                     "'ᤁ' > 'ka';",
+                     "'ᤂ' > 'kha';",
+                     "'ᤃ' > 'ga';",
+                     "'ᤄ' > 'gha';",
+                     "'ᤅ' > 'nga';",
+                     "'ᤆ' > 'ca';",
+                     "'ᤇ' > 'cha';",
+                     "'ᤈ' > 'ja';",
+                     "'ᤉ' > 'jha';",
+                     "'ᤊ' > 'yan';",
+                     "'ᤋ' > 'ta';",
+                     "'ᤌ' > 'tha';",
+                     "'ᤍ' > 'da';",
+                     "'ᤎ' > 'dha';",
+                     "'ᤏ' > 'na';",
+                     "'ᤐ' > 'pa';",
+                     "'ᤑ' > 'pha';",
+                     "'ᤒ' > 'ba';",
+                     "'ᤓ' > 'bha';",
+                     "'ᤔ' > 'ma';",
+                     "'ᤕ' > 'ya';",
+                     "'ᤖ' > 'ra';",
+                     "'ᤗ' > 'la';",
+                     "'ᤘ' > 'wa';",
+                     "'ᤙ' > 'sha';",
+                     "'ᤚ' > 'ssa';",
+                     "'ᤛ' > 'sa';",
+                     "'ᤜ' > 'ha';",
+                     "'ᥐ' > 'ka';",
+                     "'ᥑ' > 'xa';",
+                     "'ᥒ' > 'nga';",
+                     "'ᥓ' > 'tsa';",
+                     "'ᥔ' > 'sa';",
+                     "'ᥕ' > 'ya';",
+                     "'ᥖ' > 'ta';",
+                     "'ᥗ' > 'tha';",
+                     "'ᥘ' > 'la';",
+                     "'ᥙ' > 'pa';",
+                     "'ᥚ' > 'pha';",
+                     "'ᥛ' > 'ma';",
+                     "'ᥜ' > 'fa';",
+                     "'ᥝ' > 'va';",
+                     "'ᥞ' > 'ha';",
+                     "'ᥟ' > 'qa';",
+                     "'ᥠ' > 'kha';",
+                     "'ᥡ' > 'tsha';",
+                     "'ᥢ' > 'na';",
+                     "'ᥣ' > 'a';",
+                     "'ᥤ' > 'i';",
+                     "'ᥥ' > 'ee';",
+                     "'ᥦ' > 'eh';",
+                     "'ᥧ' > 'u';",
+                     "'ᥨ' > 'oo';",
+                     "'ᥩ' > 'o';",
+                     "'ᥪ' > 'ue';",
+                     "'ᥫ' > 'e';",
+                     "'ᥬ' > 'aue';",
+                     "'ᥭ' > 'ai';",
+                     "'ᦁ' > 'qa';",
+                     "'ᦅ' > 'ka';",
+                     "'ᦆ' > 'xa';",
+                     "'ᦇ' > 'nga';",
+                     "'ᦋ' > 'tsa';",
+                     "'ᦌ' > 'sa';",
+                     "'ᦍ' > 'ya';",
+                     "'ᦑ' > 'ta';",
+                     "'ᦒ' > 'tha';",
+                     "'ᦓ' > 'na';",
+                     "'ᦗ' > 'pa';",
+                     "'ᦘ' > 'pha';",
+                     "'ᦙ' > 'ma';",
+                     "'ᦝ' > 'fa';",
+                     "'ᦞ' > 'va';",
+                     "'ᦟ' > 'la';",
+                     "'ᦣ' > 'ha';",
+                     "'ᦤ' > 'da';",
+                     "'ᦥ' > 'ba';",
+                     "'ᦨ' > 'kva';",
+                     "'ᦩ' > 'xva';",
+                     "'ᦱ' > 'aa';",
+                     "'ᦲ' > 'ii';",
+                     "'ᦳ' > 'u';",
+                     "'ᦴ' > 'uu';",
+                     "'ᦵ' > 'e';",
+                     "'ᦶ' > 'ae';",
+                     "'ᦷ' > 'o';",
+                     "'ᦸ' > 'oa';",
+                     "'ᦹ' > 'ue';",
+                     "'ᦺ' > 'ay';",
+                     "'ᦻ' > 'aay';",
+                     "'ᦼ' > 'uy';",
+                     "'ᦽ' > 'oy';",
+                     "'ᦾ' > 'oay';",
+                     "'ᦿ' > 'uey';",
+                     "'ᧀ' > 'iy';",
+                     "'ᨀ' > 'ka';",
+                     "'ᨁ' > 'ga';",
+                     "'ᨂ' > 'nga';",
+                     "'ᨃ' > 'ngka';",
+                     "'ᨄ' > 'pa';",
+                     "'ᨅ' > 'ba';",
+                     "'ᨆ' > 'ma';",
+                     "'ᨇ' > 'mpa';",
+                     "'ᨈ' > 'ta';",
+                     "'ᨉ' > 'da';",
+                     "'ᨊ' > 'na';",
+                     "'ᨋ' > 'nra';",
+                     "'ᨌ' > 'ca';",
+                     "'ᨍ' > 'ja';",
+                     "'ᨎ' > 'nya';",
+                     "'ᨏ' > 'nyca';",
+                     "'ᨐ' > 'ya';",
+                     "'ᨑ' > 'ra';",
+                     "'ᨒ' > 'la';",
+                     "'ᨓ' > 'va';",
+                     "'ᨔ' > 'sa';",
+                     "'ᨕ' > 'a';",
+                     "'ᨖ' > 'ha';",
+                     "'ᬅ' > 'akara';",
+                     "'ᬆ' > 'akara';",
+                     "'ᬇ' > 'ikara';",
+                     "'ᬈ' > 'ikara';",
+                     "'ᬉ' > 'ukara';",
+                     "'ᬊ' > 'ukara';",
+                     "'ᬋ' > 'ra';",
+                     "'ᬌ' > 'ra';",
+                     "'ᬍ' > 'la';",
+                     "'ᬎ' > 'la';",
+                     "'ᬏ' > 'ekara';",
+                     "'ᬐ' > 'aikara';",
+                     "'ᬑ' > 'okara';",
+                     "'ᬒ' > 'okara';",
+                     "'ᬓ' > 'ka';",
+                     "'ᬔ' > 'ka';",
+                     "'ᬕ' > 'ga';",
+                     "'ᬖ' > 'ga';",
+                     "'ᬗ' > 'nga';",
+                     "'ᬘ' > 'ca';",
+                     "'ᬙ' > 'ca';",
+                     "'ᬚ' > 'ja';",
+                     "'ᬛ' > 'ja';",
+                     "'ᬜ' > 'nya';",
+                     "'ᬝ' > 'ta';",
+                     "'ᬞ' > 'ta';",
+                     "'ᬟ' > 'da';",
+                     "'ᬠ' > 'da';",
+                     "'ᬡ' > 'na';",
+                     "'ᬢ' > 'ta';",
+                     "'ᬣ' > 'ta';",
+                     "'ᬤ' > 'da';",
+                     "'ᬥ' > 'da';",
+                     "'ᬦ' > 'na';",
+                     "'ᬧ' > 'pa';",
+                     "'ᬨ' > 'pa';",
+                     "'ᬩ' > 'ba';",
+                     "'ᬪ' > 'ba';",
+                     "'ᬫ' > 'ma';",
+                     "'ᬬ' > 'ya';",
+                     "'ᬭ' > 'ra';",
+                     "'ᬮ' > 'la';",
+                     "'ᬯ' > 'wa';",
+                     "'ᬰ' > 'sa';",
+                     "'ᬱ' > 'sa';",
+                     "'ᬲ' > 'sa';",
+                     "'ᬳ' > 'ha';",
+                     "'ᭅ' > 'kaf';",
+                     "'ᭆ' > 'khot';",
+                     "'ᭇ' > 'tzir';",
+                     "'ᭈ' > 'ef';",
+                     "'ᭉ' > 've';",
+                     "'ᭊ' > 'zal';",
+                     "'ᭋ' > 'asyura';",
+                     "'ᮃ' > 'a';",
+                     "'ᮄ' > 'i';",
+                     "'ᮅ' > 'u';",
+                     "'ᮆ' > 'ae';",
+                     "'ᮇ' > 'o';",
+                     "'ᮈ' > 'e';",
+                     "'ᮉ' > 'eu';",
+                     "'ᮊ' > 'ka';",
+                     "'ᮋ' > 'qa';",
+                     "'ᮌ' > 'ga';",
+                     "'ᮍ' > 'nga';",
+                     "'ᮎ' > 'ca';",
+                     "'ᮏ' > 'ja';",
+                     "'ᮐ' > 'za';",
+                     "'ᮑ' > 'nya';",
+                     "'ᮒ' > 'ta';",
+                     "'ᮓ' > 'da';",
+                     "'ᮔ' > 'na';",
+                     "'ᮕ' > 'pa';",
+                     "'ᮖ' > 'fa';",
+                     "'ᮗ' > 'va';",
+                     "'ᮘ' > 'ba';",
+                     "'ᮙ' > 'ma';",
+                     "'ᮚ' > 'ya';",
+                     "'ᮛ' > 'ra';",
+                     "'ᮜ' > 'la';",
+                     "'ᮝ' > 'wa';",
+                     "'ᮞ' > 'sa';",
+                     "'ᮟ' > 'xa';",
+                     "'ᮠ' > 'ha';",
+                     "'ᮮ' > 'kha';",
+                     "'ᮯ' > 'sya';",
+                     "'ᰀ' > 'ka';",
+                     "'ᰁ' > 'kla';",
+                     "'ᰂ' > 'kha';",
+                     "'ᰃ' > 'ga';",
+                     "'ᰄ' > 'gla';",
+                     "'ᰅ' > 'nga';",
+                     "'ᰆ' > 'ca';",
+                     "'ᰇ' > 'cha';",
+                     "'ᰈ' > 'ja';",
+                     "'ᰉ' > 'nya';",
+                     "'ᰊ' > 'ta';",
+                     "'ᰋ' > 'tha';",
+                     "'ᰌ' > 'da';",
+                     "'ᰍ' > 'na';",
+                     "'ᰎ' > 'pa';",
+                     "'ᰏ' > 'pla';",
+                     "'ᰐ' > 'pha';",
+                     "'ᰑ' > 'fa';",
+                     "'ᰒ' > 'fla';",
+                     "'ᰓ' > 'ba';",
+                     "'ᰔ' > 'bla';",
+                     "'ᰕ' > 'ma';",
+                     "'ᰖ' > 'mla';",
+                     "'ᰗ' > 'tsa';",
+                     "'ᰘ' > 'tsha';",
+                     "'ᰙ' > 'dza';",
+                     "'ᰚ' > 'ya';",
+                     "'ᰛ' > 'ra';",
+                     "'ᰜ' > 'la';",
+                     "'ᰝ' > 'ha';",
+                     "'ᰞ' > 'hla';",
+                     "'ᰟ' > 'va';",
+                     "'ᰠ' > 'sa';",
+                     "'ᰡ' > 'sha';",
+                     "'ᰢ' > 'wa';",
+                     "'ᰣ' > 'a';",
+                     "'ᱍ' > 'tta';",
+                     "'ᱎ' > 'ttha';",
+                     "'ᱏ' > 'dda';",
+                     "'ᱚ' > 'la';",
+                     "'ᱛ' > 'at';",
+                     "'ᱜ' > 'ag';",
+                     "'ᱝ' > 'ang';",
+                     "'ᱞ' > 'al';",
+                     "'ᱟ' > 'laa';",
+                     "'ᱠ' > 'aak';",
+                     "'ᱡ' > 'aaj';",
+                     "'ᱢ' > 'aam';",
+                     "'ᱣ' > 'aaw';",
+                     "'ᱤ' > 'li';",
+                     "'ᱥ' > 'is';",
+                     "'ᱦ' > 'ih';",
+                     "'ᱧ' > 'iny';",
+                     "'ᱨ' > 'ir';",
+                     "'ᱩ' > 'lu';",
+                     "'ᱪ' > 'uc';",
+                     "'ᱫ' > 'ud';",
+                     "'ᱬ' > 'unn';",
+                     "'ᱭ' > 'uy';",
+                     "'ᱮ' > 'le';",
+                     "'ᱯ' > 'ep';",
+                     "'ᱰ' > 'edd';",
+                     "'ᱱ' > 'en';",
+                     "'ᱲ' > 'err';",
+                     "'ᱳ' > 'lo';",
+                     "'ᱴ' > 'ott';",
+                     "'ᱵ' > 'ob';",
+                     "'ᱶ' > 'ov';",
+                     "'ᱷ' > 'oh';",
+                     "'ᴂ' > 'ae';",
+                     "'ᴉ' > 'i';",
+                     "'ᴔ' > 'oe';",
+                     "'ᴥ' > 'ain';",
+                     "'ᵃ' > 'a';",
+                     "'ᵇ' > 'b';",
+                     "'ᵈ' > 'd';",
+                     "'ᵉ' > 'e';",
+                     "'ᵍ' > 'g';",
+                     "'ᵏ' > 'k';",
+                     "'ᵐ' > 'm';",
+                     "'ᵑ' > 'eng';",
+                     "'ᵒ' > 'o';",
+                     "'ᵖ' > 'p';",
+                     "'ᵗ' > 't';",
+                     "'ᵘ' > 'u';",
+                     "'ᵛ' > 'v';",
+                     "'ᵜ' > 'ain';",
+                     "'ᵝ' > 'beta';",
+                     "'ᵞ' > 'greek';",
+                     "'ᵟ' > 'delta';",
+                     "'ᵠ' > 'greek';",
+                     "'ᵡ' > 'chi';",
+                     "'ᵢ' > 'i';",
+                     "'ᵣ' > 'r';",
+                     "'ᵤ' > 'u';",
+                     "'ᵥ' > 'v';",
+                     "'ᵦ' > 'beta';",
+                     "'ᵧ' > 'gamma';",
+                     "'ᵨ' > 'rho';",
+                     "'ᵩ' > 'phi';",
+                     "'ᵪ' > 'chi';",
+                     "'ᵷ' > 'g';",
+                     "'ᵿ' > 'upsilon';",
+                     "'ᶋ' > 'esh';",
+                     "'ᶐ' > 'alpha';",
+                     "'ᶗ' > 'o';",
+                     "'ᶘ' > 'esh';",
+                     "'ᶚ' > 'ezh';",
+                     "'ᶜ' > 'c';",
+                     "'ᶝ' > 'c';",
+                     "'ᶞ' > 'eth';",
+                     "'ᶠ' > 'f';",
+                     "'ᶤ' > 'i';",
+                     "'ᶥ' > 'iota';",
+                     "'ᶨ' > 'j';",
+                     "'ᶩ' > 'l';",
+                     "'ᶪ' > 'l';",
+                     "'ᶬ' > 'm';",
+                     "'ᶮ' > 'n';",
+                     "'ᶯ' > 'n';",
+                     "'ᶲ' > 'phi';",
+                     "'ᶳ' > 's';",
+                     "'ᶴ' > 'esh';",
+                     "'ᶵ' > 't';",
+                     "'ᶶ' > 'u';",
+                     "'ᶷ' > 'upsilon';",
+                     "'ᶹ' > 'v';",
+                     "'ᶻ' > 'z';",
+                     "'ᶼ' > 'z';",
+                     "'ᶽ' > 'z';",
+                     "'ᶾ' > 'ezh';",
+                     "'ᶿ' > 'theta';",
+                     "'ẟ' > 'ddh';",
+                     "'ⁱ' > 'i';",
+                     "'ⁿ' > 'n';",
+                     "'ₐ' > 'a';",
+                     "'ₑ' > 'e';",
+                     "'ₒ' > 'o';",
+                     "'ₓ' > 'x';",
+                     "'ↄ' > 'c';",
+                     "'Ⰰ' > 'azu';",
+                     "'Ⰱ' > 'buky';",
+                     "'Ⰲ' > 'vede';",
+                     "'Ⰳ' > 'glagoli';",
+                     "'Ⰴ' > 'dobro';",
+                     "'Ⰵ' > 'yestu';",
+                     "'Ⰶ' > 'zhivete';",
+                     "'Ⰷ' > 'dzelo';",
+                     "'Ⰸ' > 'zemlja';",
+                     "'Ⰹ' > 'izhe';",
+                     "'Ⰺ' > 'initial';",
+                     "'Ⰻ' > 'i';",
+                     "'Ⰼ' > 'djervi';",
+                     "'Ⰽ' > 'kako';",
+                     "'Ⰾ' > 'ljudije';",
+                     "'Ⰿ' > 'myslite';",
+                     "'Ⱀ' > 'nashi';",
+                     "'Ⱁ' > 'onu';",
+                     "'Ⱂ' > 'pokoji';",
+                     "'Ⱃ' > 'ritsi';",
+                     "'Ⱄ' > 'slovo';",
+                     "'Ⱅ' > 'tvrido';",
+                     "'Ⱆ' > 'uku';",
+                     "'Ⱇ' > 'fritu';",
+                     "'Ⱈ' > 'heru';",
+                     "'Ⱉ' > 'otu';",
+                     "'Ⱊ' > 'pe';",
+                     "'Ⱋ' > 'shta';",
+                     "'Ⱌ' > 'tsi';",
+                     "'Ⱍ' > 'chrivi';",
+                     "'Ⱎ' > 'sha';",
+                     "'Ⱏ' > 'yeru';",
+                     "'Ⱐ' > 'yeri';",
+                     "'Ⱑ' > 'yati';",
+                     "'Ⱓ' > 'yu';",
+                     "'Ⱔ' > 'yus';",
+                     "'Ⱕ' > 'yus';",
+                     "'Ⱖ' > 'yo';",
+                     "'Ⱚ' > 'fita';",
+                     "'Ⱛ' > 'izhitsa';",
+                     "'Ⱜ' > 'shtapic';",
+                     "'Ⱝ' > 'trokutasti';",
+                     "'Ⱞ' > 'latinate';",
+                     "'ⰰ' > 'azu';",
+                     "'ⰱ' > 'buky';",
+                     "'ⰲ' > 'vede';",
+                     "'ⰳ' > 'glagoli';",
+                     "'ⰴ' > 'dobro';",
+                     "'ⰵ' > 'yestu';",
+                     "'ⰶ' > 'zhivete';",
+                     "'ⰷ' > 'dzelo';",
+                     "'ⰸ' > 'zemlja';",
+                     "'ⰹ' > 'izhe';",
+                     "'ⰺ' > 'initial';",
+                     "'ⰻ' > 'i';",
+                     "'ⰼ' > 'djervi';",
+                     "'ⰽ' > 'kako';",
+                     "'ⰾ' > 'ljudije';",
+                     "'ⰿ' > 'myslite';",
+                     "'ⱀ' > 'nashi';",
+                     "'ⱁ' > 'onu';",
+                     "'ⱂ' > 'pokoji';",
+                     "'ⱃ' > 'ritsi';",
+                     "'ⱄ' > 'slovo';",
+                     "'ⱅ' > 'tvrido';",
+                     "'ⱆ' > 'uku';",
+                     "'ⱇ' > 'fritu';",
+                     "'ⱈ' > 'heru';",
+                     "'ⱉ' > 'otu';",
+                     "'ⱊ' > 'pe';",
+                     "'ⱋ' > 'shta';",
+                     "'ⱌ' > 'tsi';",
+                     "'ⱍ' > 'chrivi';",
+                     "'ⱎ' > 'sha';",
+                     "'ⱏ' > 'yeru';",
+                     "'ⱐ' > 'yeri';",
+                     "'ⱑ' > 'yati';",
+                     "'ⱓ' > 'yu';",
+                     "'ⱔ' > 'yus';",
+                     "'ⱕ' > 'yus';",
+                     "'ⱖ' > 'yo';",
+                     "'ⱚ' > 'fita';",
+                     "'ⱛ' > 'izhitsa';",
+                     "'ⱜ' > 'shtapic';",
+                     "'ⱝ' > 'trokutasti';",
+                     "'ⱞ' > 'latinate';",
+                     "'Ⱡ' > 'l';",
+                     "'ⱡ' > 'l';",
+                     "'Ɫ' > 'l';",
+                     "'Ᵽ' > 'p';",
+                     "'Ɽ' > 'r';",
+                     "'ⱥ' > 'a';",
+                     "'ⱦ' > 't';",
+                     "'Ⱨ' > 'h';",
+                     "'ⱨ' > 'h';",
+                     "'Ⱪ' > 'k';",
+                     "'ⱪ' > 'k';",
+                     "'Ⱬ' > 'z';",
+                     "'ⱬ' > 'z';",
+                     "'Ɑ' > 'alpha';",
+                     "'Ɱ' > 'm';",
+                     "'Ɐ' > 'a';",
+                     "'ⱱ' > 'v';",
+                     "'Ⱳ' > 'w';",
+                     "'ⱳ' > 'w';",
+                     "'ⱴ' > 'v';",
+                     "'ⱸ' > 'e';",
+                     "'ⱹ' > 'r';",
+                     "'ⱺ' > 'o';",
+                     "'ⱼ' > 'j';",
+                     "'Ⲁ' > 'alfa';",
+                     "'ⲁ' > 'alfa';",
+                     "'Ⲃ' > 'vida';",
+                     "'ⲃ' > 'vida';",
+                     "'Ⲅ' > 'gamma';",
+                     "'ⲅ' > 'gamma';",
+                     "'Ⲇ' > 'dalda';",
+                     "'ⲇ' > 'dalda';",
+                     "'Ⲉ' > 'eie';",
+                     "'ⲉ' > 'eie';",
+                     "'Ⲋ' > 'sou';",
+                     "'ⲋ' > 'sou';",
+                     "'Ⲍ' > 'zata';",
+                     "'ⲍ' > 'zata';",
+                     "'Ⲏ' > 'hate';",
+                     "'ⲏ' > 'hate';",
+                     "'Ⲑ' > 'thethe';",
+                     "'ⲑ' > 'thethe';",
+                     "'Ⲓ' > 'iauda';",
+                     "'ⲓ' > 'iauda';",
+                     "'Ⲕ' > 'kapa';",
+                     "'ⲕ' > 'kapa';",
+                     "'Ⲗ' > 'laula';",
+                     "'ⲗ' > 'laula';",
+                     "'Ⲙ' > 'mi';",
+                     "'ⲙ' > 'mi';",
+                     "'Ⲛ' > 'ni';",
+                     "'ⲛ' > 'ni';",
+                     "'Ⲝ' > 'ksi';",
+                     "'ⲝ' > 'ksi';",
+                     "'Ⲟ' > 'o';",
+                     "'ⲟ' > 'o';",
+                     "'Ⲡ' > 'pi';",
+                     "'ⲡ' > 'pi';",
+                     "'Ⲣ' > 'ro';",
+                     "'ⲣ' > 'ro';",
+                     "'Ⲥ' > 'sima';",
+                     "'ⲥ' > 'sima';",
+                     "'Ⲧ' > 'tau';",
+                     "'ⲧ' > 'tau';",
+                     "'Ⲩ' > 'ua';",
+                     "'ⲩ' > 'ua';",
+                     "'Ⲫ' > 'fi';",
+                     "'ⲫ' > 'fi';",
+                     "'Ⲭ' > 'khi';",
+                     "'ⲭ' > 'khi';",
+                     "'Ⲯ' > 'psi';",
+                     "'ⲯ' > 'psi';",
+                     "'Ⲱ' > 'oou';",
+                     "'ⲱ' > 'oou';",
+                     "'Ⳁ' > 'sampi';",
+                     "'ⳁ' > 'sampi';",
+                     "'ⴀ' > 'an';",
+                     "'ⴁ' > 'ban';",
+                     "'ⴂ' > 'gan';",
+                     "'ⴃ' > 'don';",
+                     "'ⴄ' > 'en';",
+                     "'ⴅ' > 'vin';",
+                     "'ⴆ' > 'zen';",
+                     "'ⴇ' > 'tan';",
+                     "'ⴈ' > 'in';",
+                     "'ⴉ' > 'kan';",
+                     "'ⴊ' > 'las';",
+                     "'ⴋ' > 'man';",
+                     "'ⴌ' > 'nar';",
+                     "'ⴍ' > 'on';",
+                     "'ⴎ' > 'par';",
+                     "'ⴏ' > 'zhar';",
+                     "'ⴐ' > 'rae';",
+                     "'ⴑ' > 'san';",
+                     "'ⴒ' > 'tar';",
+                     "'ⴓ' > 'un';",
+                     "'ⴔ' > 'phar';",
+                     "'ⴕ' > 'khar';",
+                     "'ⴖ' > 'ghan';",
+                     "'ⴗ' > 'qar';",
+                     "'ⴘ' > 'shin';",
+                     "'ⴙ' > 'chin';",
+                     "'ⴚ' > 'can';",
+                     "'ⴛ' > 'jil';",
+                     "'ⴜ' > 'cil';",
+                     "'ⴝ' > 'char';",
+                     "'ⴞ' > 'xan';",
+                     "'ⴟ' > 'jhan';",
+                     "'ⴠ' > 'hae';",
+                     "'ⴡ' > 'he';",
+                     "'ⴢ' > 'hie';",
+                     "'ⴣ' > 'we';",
+                     "'ⴤ' > 'har';",
+                     "'ⴥ' > 'hoe';",
+                     "'ⴰ' > 'ya';",
+                     "'ⴱ' > 'yab';",
+                     "'ⴲ' > 'yabh';",
+                     "'ⴳ' > 'yag';",
+                     "'ⴴ' > 'yaghh';",
+                     "'ⴶ' > 'yaj';",
+                     "'ⴷ' > 'yad';",
+                     "'ⴸ' > 'yadh';",
+                     "'ⴹ' > 'yadd';",
+                     "'ⴺ' > 'yaddh';",
+                     "'ⴻ' > 'yey';",
+                     "'ⴼ' > 'yaf';",
+                     "'ⴽ' > 'yak';",
+                     "'ⴿ' > 'yakhh';",
+                     "'ⵀ' > 'yah';",
+                     "'ⵃ' > 'yahh';",
+                     "'ⵄ' > 'yaa';",
+                     "'ⵅ' > 'yakh';",
+                     "'ⵇ' > 'yaq';",
+                     "'ⵉ' > 'yi';",
+                     "'ⵊ' > 'yazh';",
+                     "'ⵋ' > 'ahaggar';",
+                     "'ⵍ' > 'yal';",
+                     "'ⵎ' > 'yam';",
+                     "'ⵏ' > 'yan';",
+                     "'ⵒ' > 'yap';",
+                     "'ⵓ' > 'yu';",
+                     "'ⵔ' > 'yar';",
+                     "'ⵕ' > 'yarr';",
+                     "'ⵖ' > 'yagh';",
+                     "'ⵘ' > 'ayer';",
+                     "'ⵙ' > 'yas';",
+                     "'ⵚ' > 'yass';",
+                     "'ⵛ' > 'yash';",
+                     "'ⵜ' > 'yat';",
+                     "'ⵝ' > 'yath';",
+                     "'ⵞ' > 'yach';",
+                     "'ⵟ' > 'yatt';",
+                     "'ⵠ' > 'yav';",
+                     "'ⵡ' > 'yaw';",
+                     "'ⵢ' > 'yay';",
+                     "'ⵣ' > 'yaz';",
+                     "'ⵤ' > 'tawellemet';",
+                     "'ⵥ' > 'yazz';",
+                     "'ⶀ' > 'loa';",
+                     "'ⶁ' > 'moa';",
+                     "'ⶂ' > 'roa';",
+                     "'ⶃ' > 'soa';",
+                     "'ⶄ' > 'shoa';",
+                     "'ⶅ' > 'boa';",
+                     "'ⶆ' > 'toa';",
+                     "'ⶇ' > 'coa';",
+                     "'ⶈ' > 'noa';",
+                     "'ⶉ' > 'nyoa';",
+                     "'ⶊ' > 'oa';",
+                     "'ⶋ' > 'zoa';",
+                     "'ⶌ' > 'doa';",
+                     "'ⶍ' > 'ddoa';",
+                     "'ⶎ' > 'joa';",
+                     "'ⶏ' > 'thoa';",
+                     "'ⶐ' > 'choa';",
+                     "'ⶑ' > 'phoa';",
+                     "'ⶒ' > 'poa';",
+                     "'ⶓ' > 'ggwa';",
+                     "'ⶔ' > 'ggwi';",
+                     "'ⶕ' > 'ggwee';",
+                     "'ⶖ' > 'ggwe';",
+                     "'ⶠ' > 'ssa';",
+                     "'ⶡ' > 'ssu';",
+                     "'ⶢ' > 'ssi';",
+                     "'ⶣ' > 'ssaa';",
+                     "'ⶤ' > 'ssee';",
+                     "'ⶥ' > 'sse';",
+                     "'ⶦ' > 'sso';",
+                     "'ⶨ' > 'cca';",
+                     "'ⶩ' > 'ccu';",
+                     "'ⶪ' > 'cci';",
+                     "'ⶫ' > 'ccaa';",
+                     "'ⶬ' > 'ccee';",
+                     "'ⶭ' > 'cce';",
+                     "'ⶮ' > 'cco';",
+                     "'ⶰ' > 'zza';",
+                     "'ⶱ' > 'zzu';",
+                     "'ⶲ' > 'zzi';",
+                     "'ⶳ' > 'zzaa';",
+                     "'ⶴ' > 'zzee';",
+                     "'ⶵ' > 'zze';",
+                     "'ⶶ' > 'zzo';",
+                     "'ⶸ' > 'ccha';",
+                     "'ⶹ' > 'cchu';",
+                     "'ⶺ' > 'cchi';",
+                     "'ⶻ' > 'cchaa';",
+                     "'ⶼ' > 'cchee';",
+                     "'ⶽ' > 'cche';",
+                     "'ⶾ' > 'ccho';",
+                     "'ⷀ' > 'qya';",
+                     "'ⷁ' > 'qyu';",
+                     "'ⷂ' > 'qyi';",
+                     "'ⷃ' > 'qyaa';",
+                     "'ⷄ' > 'qyee';",
+                     "'ⷅ' > 'qye';",
+                     "'ⷆ' > 'qyo';",
+                     "'ⷈ' > 'kya';",
+                     "'ⷉ' > 'kyu';",
+                     "'ⷊ' > 'kyi';",
+                     "'ⷋ' > 'kyaa';",
+                     "'ⷌ' > 'kyee';",
+                     "'ⷍ' > 'kye';",
+                     "'ⷎ' > 'kyo';",
+                     "'ⷐ' > 'xya';",
+                     "'ⷑ' > 'xyu';",
+                     "'ⷒ' > 'xyi';",
+                     "'ⷓ' > 'xyaa';",
+                     "'ⷔ' > 'xyee';",
+                     "'ⷕ' > 'xye';",
+                     "'ⷖ' > 'xyo';",
+                     "'ⷘ' > 'gya';",
+                     "'ⷙ' > 'gyu';",
+                     "'ⷚ' > 'gyi';",
+                     "'ⷛ' > 'gyaa';",
+                     "'ⷜ' > 'gyee';",
+                     "'ⷝ' > 'gye';",
+                     "'ⷞ' > 'gyo';",
+                     "'ゕ' > 'ka';",
+                     "'ゖ' > 'ke';",
+                     "'ㄪ' > 'v';",
+                     "'ㄫ' > 'ng';",
+                     "'ㄬ' > 'gn';",
+                     "'ㄭ' > 'ih';",
+                     "'ㅀ' > 'rieul-hieuh';",
+                     "'ㅄ' > 'pieup-sios';",
+                     "'ㅥ' > 'ssangnieun';",
+                     "'ㅦ' > 'nieun-tikeut';",
+                     "'ㅧ' > 'nieun-sios';",
+                     "'ㅨ' > 'nieun-pansios';",
+                     "'ㅩ' > 'rieul-kiyeok-sios';",
+                     "'ㅪ' > 'rieul-tikeut';",
+                     "'ㅫ' > 'rieul-pieup-sios';",
+                     "'ㅬ' > 'rieul-pansios';",
+                     "'ㅭ' > 'rieul-yeorinhieuh';",
+                     "'ㅮ' > 'mieum-pieup';",
+                     "'ㅯ' > 'mieum-sios';",
+                     "'ㅰ' > 'mieum-pansios';",
+                     "'ㅱ' > 'kapyeounmieum';",
+                     "'ㅲ' > 'pieup-kiyeok';",
+                     "'ㅳ' > 'pieup-tikeut';",
+                     "'ㅴ' > 'pieup-sios-kiyeok';",
+                     "'ㅵ' > 'pieup-sios-tikeut';",
+                     "'ㅶ' > 'pieup-cieuc';",
+                     "'ㅷ' > 'pieup-thieuth';",
+                     "'ㅸ' > 'kapyeounpieup';",
+                     "'ㅹ' > 'kapyeounssangpieup';",
+                     "'ㅺ' > 'sios-kiyeok';",
+                     "'ㅻ' > 'sios-nieun';",
+                     "'ㅼ' > 'sios-tikeut';",
+                     "'ㅽ' > 'sios-pieup';",
+                     "'ㅾ' > 'sios-cieuc';",
+                     "'ㅿ' > 'pansios';",
+                     "'ㆀ' > 'ssangieung';",
+                     "'ㆁ' > 'yesieung';",
+                     "'ㆂ' > 'yesieung-sios';",
+                     "'ㆃ' > 'yesieung-pansios';",
+                     "'ㆄ' > 'kapyeounphieuph';",
+                     "'ㆅ' > 'ssanghieuh';",
+                     "'ㆆ' > 'yeorinhieuh';",
+                     "'ㆇ' > 'yo-ya';",
+                     "'ㆈ' > 'yo-yae';",
+                     "'ㆉ' > 'yo-i';",
+                     "'ㆊ' > 'yu-yeo';",
+                     "'ㆋ' > 'yu-ye';",
+                     "'ㆌ' > 'yu-i';",
+                     "'ㆍ' > 'araea';",
+                     "'ㆎ' > 'araeae';",
+                     "'ㆠ' > 'bu';",
+                     "'ㆡ' > 'zi';",
+                     "'ㆢ' > 'ji';",
+                     "'ㆣ' > 'gu';",
+                     "'ㆤ' > 'ee';",
+                     "'ㆥ' > 'enn';",
+                     "'ㆦ' > 'oo';",
+                     "'ㆧ' > 'onn';",
+                     "'ㆨ' > 'ir';",
+                     "'ㆩ' > 'ann';",
+                     "'ㆪ' > 'inn';",
+                     "'ㆫ' > 'unn';",
+                     "'ㆬ' > 'im';",
+                     "'ㆭ' > 'ngg';",
+                     "'ㆮ' > 'ainn';",
+                     "'ㆯ' > 'aunn';",
+                     "'ㆰ' > 'am';",
+                     "'ㆱ' > 'om';",
+                     "'ㆲ' > 'ong';",
+                     "'ㆳ' > 'innn';",
+                     "'ㆴ' > 'p';",
+                     "'ㆵ' > 't';",
+                     "'ㆶ' > 'k';",
+                     "'ㆷ' > 'h';",
+                     "'ㇰ' > 'ku';",
+                     "'ㇱ' > 'si';",
+                     "'ㇲ' > 'su';",
+                     "'ㇳ' > 'to';",
+                     "'ㇴ' > 'nu';",
+                     "'ㇵ' > 'ha';",
+                     "'ㇶ' > 'hi';",
+                     "'ㇷ' > 'hu';",
+                     "'ㇸ' > 'he';",
+                     "'ㇹ' > 'ho';",
+                     "'ㇺ' > 'mu';",
+                     "'ㇻ' > 'ra';",
+                     "'ㇼ' > 'ri';",
+                     "'ㇽ' > 'ru';",
+                     "'ㇾ' > 're';",
+                     "'ㇿ' > 'ro';",
+                     "'兙' > ' shi';",
+                     "'兡' > ' bai';",
+                     "'嗧' > ' jia';",
+                     "'瓧' > ' seng';",
+                     "'瓰' > ' bo';",
+                     "'瓱' > ' gu';",
+                     "'瓼' > ' feng';",
+                     "'甅' > ' dang';",
+                     "'龦' > ' ze';",
+                     "'龧' > ' qie';",
+                     "'龨' > ' tuo';",
+                     "'龩' > ' luo';",
+                     "'龪' > ' dan';",
+                     "'龫' > ' xiao';",
+                     "'龬' > ' ruo';",
+                     "'龭' > ' jian';",
+                     "'龮' > ' xuan';",
+                     "'龯' > ' bian';",
+                     "'龰' > ' sun';",
+                     "'龱' > ' xiang';",
+                     "'龲' > ' xian';",
+                     "'龳' > ' ping';",
+                     "'龴' > ' zhen';",
+                     "'龵' > ' sheng';",
+                     "'龶' > ' hu';",
+                     "'龷' > ' shi';",
+                     "'龸' > ' zhu';",
+                     "'龹' > ' yue';",
+                     "'龺' > ' chun';",
+                     "'龻' > ' lu';",
+                     "'龼' > ' wu';",
+                     "'龽' > ' dong';",
+                     "'龾' > ' xiao';",
+                     "'龿' > ' ji';",
+                     "'鿀' > ' jie';",
+                     "'鿁' > ' huang';",
+                     "'鿂' > ' xing';",
+                     "'鿄' > ' fan';",
+                     "'鿅' > ' chui';",
+                     "'鿆' > ' zhuan';",
+                     "'鿇' > ' pian';",
+                     "'鿈' > ' feng';",
+                     "'鿉' > ' zhu';",
+                     "'鿊' > ' hong';",
+                     "'鿋' > ' qie';",
+                     "'鿌' > ' hou';",
+                     "'鿑' > ' kui';",
+                     "'鿒' > ' sik';",
+                     "'鿓' > ' lou';",
+                     "'鿖' > ' tang';",
+                     "'鿗' > ' yue';",
+                     "'鿘' > ' chou';",
+                     "'鿙' > ' gao';",
+                     "'鿚' > ' fei';",
+                     "'鿛' > ' ruo';",
+                     "'鿜' > ' zheng';",
+                     "'鿝' > ' gou';",
+                     "'鿞' > ' nie';",
+                     "'鿟' > ' qian';",
+                     "'鿠' > ' xiao';",
+                     "'鿡' > ' cuan';",
+                     "'鿢' > ' gong';",
+                     "'鿣' > ' pang';",
+                     "'鿤' > ' du';",
+                     "'鿥' > ' li';",
+                     "'鿦' > ' bi';",
+                     "'鿧' > ' zhuo';",
+                     "'鿨' > ' chu';",
+                     "'鿩' > ' shai';",
+                     "'鿪' > ' chi';",
+                     "'鿮' > ' lan';",
+                     "'鿯' > ' jian';",
+                     "'ꀀ' > ' ze';",
+                     "'ꀁ' > ' xi';",
+                     "'ꀂ' > ' guo';",
+                     "'ꀃ' > ' yi';",
+                     "'ꀄ' > ' hu';",
+                     "'ꀅ' > ' chan';",
+                     "'ꀆ' > ' kou';",
+                     "'ꀇ' > ' cu';",
+                     "'ꀈ' > ' ping';",
+                     "'ꀉ' > ' chou';",
+                     "'ꀊ' > ' ji';",
+                     "'ꀋ' > ' gui';",
+                     "'ꀌ' > ' su';",
+                     "'ꀍ' > ' lou';",
+                     "'ꀎ' > ' zha';",
+                     "'ꀏ' > ' lu';",
+                     "'ꀐ' > ' nian';",
+                     "'ꀑ' > ' suo';",
+                     "'ꀒ' > ' cuan';",
+                     "'ꀓ' > ' sasara';",
+                     "'ꀔ' > ' suo';",
+                     "'ꀕ' > ' le';",
+                     "'ꀖ' > ' duan';",
+                     "'ꀗ' > ' yana';",
+                     "'ꀘ' > ' xiao';",
+                     "'ꀙ' > ' bo';",
+                     "'ꀚ' > ' mi';",
+                     "'ꀛ' > ' si';",
+                     "'ꀜ' > ' dang';",
+                     "'ꀝ' > ' liao';",
+                     "'ꀞ' > ' dan';",
+                     "'ꀟ' > ' dian';",
+                     "'ꀠ' > ' fu';",
+                     "'ꀡ' > ' jian';",
+                     "'ꀢ' > ' min';",
+                     "'ꀣ' > ' kui';",
+                     "'ꀤ' > ' dai';",
+                     "'ꀥ' > ' qiao';",
+                     "'ꀦ' > ' deng';",
+                     "'ꀧ' > ' huang';",
+                     "'ꀨ' > ' sun';",
+                     "'ꀩ' > ' lao';",
+                     "'ꀪ' > ' zan';",
+                     "'ꀫ' > ' xiao';",
+                     "'ꀬ' > ' du';",
+                     "'ꀭ' > ' shi';",
+                     "'ꀮ' > ' zan';",
+                     "'ꀯ' > 'bup';",
+                     "'ꀰ' > ' pai';",
+                     "'ꀱ' > ' hata';",
+                     "'ꀲ' > ' pai';",
+                     "'ꀳ' > ' gan';",
+                     "'ꀴ' > ' ju';",
+                     "'ꀵ' > ' du';",
+                     "'ꀶ' > ' lu';",
+                     "'ꀷ' > ' yan';",
+                     "'ꀸ' > ' bo';",
+                     "'ꀹ' > ' dang';",
+                     "'ꀺ' > ' sai';",
+                     "'ꀻ' > ' ke';",
+                     "'ꀼ' > ' long';",
+                     "'ꀽ' > ' qian';",
+                     "'ꀾ' > ' lian';",
+                     "'ꀿ' > ' bo';",
+                     "'ꁀ' > ' zhou';",
+                     "'ꁁ' > ' lai';",
+                     "'ꁂ' > 'pap';",
+                     "'ꁃ' > ' lan';",
+                     "'ꁄ' > ' kui';",
+                     "'ꁅ' > ' yu';",
+                     "'ꁆ' > ' yue';",
+                     "'ꁇ' > ' hao';",
+                     "'ꁈ' > ' zhen';",
+                     "'ꁉ' > ' tai';",
+                     "'ꁊ' > ' ti';",
+                     "'ꁋ' > ' mi';",
+                     "'ꁌ' > ' chou';",
+                     "'ꁍ' > ' ji';",
+                     "'ꁎ' > 'purx';",
+                     "'ꁏ' > ' hata';",
+                     "'ꁐ' > ' teng';",
+                     "'ꁑ' > ' zhuan';",
+                     "'ꁒ' > ' zhou';",
+                     "'ꁓ' > ' fan';",
+                     "'ꁔ' > ' sou';",
+                     "'ꁕ' > ' zhou';",
+                     "'ꁖ' > ' kuji';",
+                     "'ꁗ' > ' zhuo';",
+                     "'ꁘ' > ' teng';",
+                     "'ꁙ' > ' lu';",
+                     "'ꁚ' > ' lu';",
+                     "'ꁛ' > ' jian';",
+                     "'ꁜ' > ' tuo';",
+                     "'ꁝ' > ' ying';",
+                     "'ꁞ' > ' yu';",
+                     "'ꁟ' > ' lai';",
+                     "'ꁠ' > ' long';",
+                     "'ꁡ' > ' shinshi';",
+                     "'ꁢ' > ' lian';",
+                     "'ꁣ' > ' lan';",
+                     "'ꁤ' > ' qian';",
+                     "'ꁥ' > ' yue';",
+                     "'ꁦ' > ' zhong';",
+                     "'ꁧ' > ' qu';",
+                     "'ꁨ' > ' lian';",
+                     "'ꁩ' > ' bian';",
+                     "'ꁪ' > ' duan';",
+                     "'ꁫ' > ' zuan';",
+                     "'ꁬ' > ' li';",
+                     "'ꁭ' > ' si';",
+                     "'ꁮ' > ' luo';",
+                     "'ꁯ' > ' ying';",
+                     "'ꁰ' > ' yue';",
+                     "'ꁱ' > ' zhuo';",
+                     "'ꁲ' > ' xu';",
+                     "'ꁳ' > ' mi';",
+                     "'ꁴ' > ' di';",
+                     "'ꁵ' > ' fan';",
+                     "'ꁶ' > ' shen';",
+                     "'ꁷ' > ' zhe';",
+                     "'ꁸ' > ' shen';",
+                     "'ꁹ' > ' nu';",
+                     "'ꁺ' > ' xie';",
+                     "'ꁻ' > ' lei';",
+                     "'ꁼ' > ' xian';",
+                     "'ꁽ' > ' zi';",
+                     "'ꁾ' > ' ni';",
+                     "'ꁿ' > ' cun';",
+                     "'ꂀ' > 'nbap';",
+                     "'ꂁ' > ' qian';",
+                     "'ꂂ' > ' kume';",
+                     "'ꂃ' > ' bi';",
+                     "'ꂄ' > ' ban';",
+                     "'ꂅ' > ' wu';",
+                     "'ꂆ' > ' sha';",
+                     "'ꂇ' > ' kang';",
+                     "'ꂈ' > ' rou';",
+                     "'ꂉ' > ' fen';",
+                     "'ꂊ' > ' bi';",
+                     "'ꂋ' > ' cui';",
+                     "'ꂌ' > 'nbyx';",
+                     "'ꂍ' > ' li';",
+                     "'ꂎ' > ' chi';",
+                     "'ꂏ' > ' nukamiso';",
+                     "'ꂐ' > ' ro';",
+                     "'ꂑ' > ' ba';",
+                     "'ꂒ' > ' li';",
+                     "'ꂓ' > ' gan';",
+                     "'ꂔ' > ' ju';",
+                     "'ꂕ' > ' po';",
+                     "'ꂖ' > ' mo';",
+                     "'ꂗ' > ' cu';",
+                     "'ꂘ' > ' nian';",
+                     "'ꂙ' > ' zhou';",
+                     "'ꂚ' > ' li';",
+                     "'ꂛ' > ' su';",
+                     "'ꂜ' > ' tiao';",
+                     "'ꂝ' > ' li';",
+                     "'ꂞ' > ' qi';",
+                     "'ꂟ' > ' su';",
+                     "'ꂠ' > ' hong';",
+                     "'ꂡ' > ' tong';",
+                     "'ꂢ' > ' zi';",
+                     "'ꂣ' > ' ce';",
+                     "'ꂤ' > ' yue';",
+                     "'ꂥ' > ' zhou';",
+                     "'ꂦ' > ' lin';",
+                     "'ꂧ' > ' zhuang';",
+                     "'ꂨ' > ' bai';",
+                     "'ꂩ' > 'hmyx';",
+                     "'ꂪ' > ' fen';",
+                     "'ꂫ' > ' ji';",
+                     "'ꂬ' > 'hmyrx';",
+                     "'ꂭ' > ' sukumo';",
+                     "'ꂮ' > ' liang';",
+                     "'ꂯ' > ' xian';",
+                     "'ꂰ' > ' fu';",
+                     "'ꂱ' > ' liang';",
+                     "'ꂲ' > ' can';",
+                     "'ꂳ' > ' geng';",
+                     "'ꂴ' > ' li';",
+                     "'ꂵ' > ' yue';",
+                     "'ꂶ' > ' lu';",
+                     "'ꂷ' > ' ju';",
+                     "'ꂸ' > ' qi';",
+                     "'ꂹ' > ' cui';",
+                     "'ꂺ' > ' bai';",
+                     "'ꂻ' > ' zhang';",
+                     "'ꂼ' > ' lin';",
+                     "'ꂽ' > ' zong';",
+                     "'ꂾ' > ' jing';",
+                     "'ꂿ' > ' guo';",
+                     "'ꃀ' > ' kouji';",
+                     "'ꃁ' > ' san';",
+                     "'ꃂ' > ' san';",
+                     "'ꃃ' > ' tang';",
+                     "'ꃄ' > ' bian';",
+                     "'ꃅ' > ' rou';",
+                     "'ꃆ' > ' mian';",
+                     "'ꃇ' > ' hou';",
+                     "'ꃈ' > ' xu';",
+                     "'ꃉ' > ' zong';",
+                     "'ꃊ' > ' hu';",
+                     "'ꃋ' > ' jian';",
+                     "'ꃌ' > ' zan';",
+                     "'ꃍ' > ' ci';",
+                     "'ꃎ' > ' li';",
+                     "'ꃏ' > ' xie';",
+                     "'ꃐ' > ' fu';",
+                     "'ꃑ' > ' ni';",
+                     "'ꃒ' > ' bei';",
+                     "'ꃓ' > ' gu';",
+                     "'ꃔ' > ' xiu';",
+                     "'ꃕ' > ' gao';",
+                     "'ꃖ' > ' tang';",
+                     "'ꃗ' > ' qiu';",
+                     "'ꃘ' > ' sukumo';",
+                     "'ꃙ' > ' cao';",
+                     "'ꃚ' > ' zhuang';",
+                     "'ꃛ' > ' tang';",
+                     "'ꃜ' > ' mi';",
+                     "'ꃝ' > ' san';",
+                     "'ꃞ' > ' fen';",
+                     "'ꃟ' > ' zao';",
+                     "'ꃠ' > ' kang';",
+                     "'ꃡ' > ' jiang';",
+                     "'ꃢ' > ' mo';",
+                     "'ꃣ' > ' san';",
+                     "'ꃤ' > ' san';",
+                     "'ꃥ' > ' nuo';",
+                     "'ꃦ' > ' xi';",
+                     "'ꃧ' > ' liang';",
+                     "'ꃨ' > ' jiang';",
+                     "'ꃩ' > ' kuai';",
+                     "'ꃪ' > ' bo';",
+                     "'ꃫ' > ' huan';",
+                     "'ꃬ' > 'va';",
+                     "'ꃭ' > ' zong';",
+                     "'ꃮ' > ' xian';",
+                     "'ꃯ' > ' nuo';",
+                     "'ꃰ' > ' tuan';",
+                     "'ꃱ' > ' nie';",
+                     "'ꃲ' > ' li';",
+                     "'ꃳ' > ' zuo';",
+                     "'ꃴ' > ' di';",
+                     "'ꃵ' > ' nie';",
+                     "'ꃶ' > ' tiao';",
+                     "'ꃷ' > ' lan';",
+                     "'ꃸ' > ' mi';",
+                     "'ꃹ' > ' jiao';",
+                     "'ꃺ' > ' jiu';",
+                     "'ꃻ' > ' xi';",
+                     "'ꃼ' > ' gong';",
+                     "'ꃽ' > ' zheng';",
+                     "'ꃾ' > ' jiu';",
+                     "'ꃿ' > ' you';",
+                     "'ꄀ' > ' ji';",
+                     "'ꄁ' > ' cha';",
+                     "'ꄂ' > ' zhou';",
+                     "'ꄃ' > ' xun';",
+                     "'ꄄ' > ' yue';",
+                     "'ꄅ' > ' hong';",
+                     "'ꄆ' > ' yu';",
+                     "'ꄇ' > ' he';",
+                     "'ꄈ' > ' wan';",
+                     "'ꄉ' > ' ren';",
+                     "'ꄊ' > ' wen';",
+                     "'ꄋ' > ' wen';",
+                     "'ꄌ' > ' qiu';",
+                     "'ꄍ' > ' na';",
+                     "'ꄎ' > ' zi';",
+                     "'ꄏ' > ' tou';",
+                     "'ꄐ' > ' niu';",
+                     "'ꄑ' > ' fou';",
+                     "'ꄒ' > ' jie';",
+                     "'ꄓ' > ' shu';",
+                     "'ꄔ' > ' chun';",
+                     "'ꄕ' > ' pi';",
+                     "'ꄖ' > ' yin';",
+                     "'ꄗ' > ' sha';",
+                     "'ꄘ' > ' hong';",
+                     "'ꄙ' > ' zhi';",
+                     "'ꄚ' > ' ji';",
+                     "'ꄛ' > ' fen';",
+                     "'ꄜ' > ' yun';",
+                     "'ꄝ' > ' ren';",
+                     "'ꄞ' > ' dan';",
+                     "'ꄟ' > ' jin';",
+                     "'ꄠ' > ' su';",
+                     "'ꄡ' > ' fang';",
+                     "'ꄢ' > ' suo';",
+                     "'ꄣ' > ' cui';",
+                     "'ꄤ' > ' jiu';",
+                     "'ꄥ' > ' zha';",
+                     "'ꄦ' > ' kinu';",
+                     "'ꄧ' > ' jin';",
+                     "'ꄨ' > ' fu';",
+                     "'ꄩ' > ' zhi';",
+                     "'ꄪ' > ' ci';",
+                     "'ꄫ' > ' zi';",
+                     "'ꄬ' > ' chou';",
+                     "'ꄭ' > ' hong';",
+                     "'ꄮ' > ' zha';",
+                     "'ꄯ' > ' lei';",
+                     "'ꄰ' > ' xi';",
+                     "'ꄱ' > ' fu';",
+                     "'ꄲ' > ' xie';",
+                     "'ꄳ' > ' shen';",
+                     "'ꄴ' > ' bei';",
+                     "'ꄵ' > ' zhu';",
+                     "'ꄶ' > ' qu';",
+                     "'ꄷ' > ' ling';",
+                     "'ꄸ' > ' zhu';",
+                     "'ꄹ' > ' shao';",
+                     "'ꄺ' > ' gan';",
+                     "'ꄻ' > ' yang';",
+                     "'ꄼ' > ' fu';",
+                     "'ꄽ' > ' tuo';",
+                     "'ꄾ' > ' zhen';",
+                     "'ꄿ' > ' dai';",
+                     "'ꅀ' > ' zhuo';",
+                     "'ꅁ' > ' shi';",
+                     "'ꅂ' > ' zhong';",
+                     "'ꅃ' > ' xian';",
+                     "'ꅄ' > ' zu';",
+                     "'ꅅ' > ' jiong';",
+                     "'ꅆ' > ' ban';",
+                     "'ꅇ' > ' ju';",
+                     "'ꅈ' > ' mo';",
+                     "'ꅉ' > ' shu';",
+                     "'ꅊ' > ' zui';",
+                     "'ꅋ' > ' wata';",
+                     "'ꅌ' > ' jing';",
+                     "'ꅍ' > ' ren';",
+                     "'ꅎ' > ' heng';",
+                     "'ꅏ' > ' xie';",
+                     "'ꅐ' > ' jie';",
+                     "'ꅑ' > ' zhu';",
+                     "'ꅒ' > ' chou';",
+                     "'ꅓ' > ' gua';",
+                     "'ꅔ' > ' bai';",
+                     "'ꅕ' > ' jue';",
+                     "'ꅖ' > ' kuang';",
+                     "'ꅗ' > ' hu';",
+                     "'ꅘ' > ' ci';",
+                     "'ꅙ' > ' geng';",
+                     "'ꅚ' > ' geng';",
+                     "'ꅛ' > ' tao';",
+                     "'ꅜ' > ' xie';",
+                     "'ꅝ' > ' ku';",
+                     "'ꅞ' > ' jiao';",
+                     "'ꅟ' > ' quan';",
+                     "'ꅠ' > ' gai';",
+                     "'ꅡ' > ' luo';",
+                     "'ꅢ' > ' xuan';",
+                     "'ꅣ' > ' bing';",
+                     "'ꅤ' > ' xian';",
+                     "'ꅥ' > ' fu';",
+                     "'ꅦ' > ' gei';",
+                     "'ꅧ' > ' tong';",
+                     "'ꅨ' > ' rong';",
+                     "'ꅩ' > ' tiao';",
+                     "'ꅪ' > ' yin';",
+                     "'ꅫ' > ' lei';",
+                     "'ꅬ' > ' xie';",
+                     "'ꅭ' > ' quan';",
+                     "'ꅮ' > ' xu';",
+                     "'ꅯ' > ' lun';",
+                     "'ꅰ' > ' die';",
+                     "'ꅱ' > ' tong';",
+                     "'ꅲ' > ' si';",
+                     "'ꅳ' > ' jiang';",
+                     "'ꅴ' > ' xiang';",
+                     "'ꅵ' > ' hui';",
+                     "'ꅶ' > ' jue';",
+                     "'ꅷ' > ' zhi';",
+                     "'ꅸ' > ' jian';",
+                     "'ꅹ' > ' juan';",
+                     "'ꅺ' > ' chi';",
+                     "'ꅻ' > ' mian';",
+                     "'ꅼ' > ' zhen';",
+                     "'ꅽ' > ' lu';",
+                     "'ꅾ' > ' cheng';",
+                     "'ꅿ' > ' qiu';",
+                     "'ꆀ' > ' shu';",
+                     "'ꆁ' > ' bang';",
+                     "'ꆂ' > ' tong';",
+                     "'ꆃ' > ' xiao';",
+                     "'ꆄ' > ' wan';",
+                     "'ꆅ' > ' qin';",
+                     "'ꆆ' > ' geng';",
+                     "'ꆇ' > ' xiu';",
+                     "'ꆈ' > ' ti';",
+                     "'ꆉ' > ' xiu';",
+                     "'ꆊ' > ' xie';",
+                     "'ꆋ' > ' hong';",
+                     "'ꆌ' > ' xi';",
+                     "'ꆍ' > ' fu';",
+                     "'ꆎ' > ' ting';",
+                     "'ꆏ' > ' sui';",
+                     "'ꆐ' > ' dui';",
+                     "'ꆑ' > ' kun';",
+                     "'ꆒ' > ' fu';",
+                     "'ꆓ' > ' jing';",
+                     "'ꆔ' > ' hu';",
+                     "'ꆕ' > ' zhi';",
+                     "'ꆖ' > ' yan';",
+                     "'ꆗ' > ' jiong';",
+                     "'ꆘ' > ' feng';",
+                     "'ꆙ' > ' ji';",
+                     "'ꆚ' > ' sok';",
+                     "'ꆛ' > ' kase';",
+                     "'ꆜ' > ' zong';",
+                     "'ꆝ' > ' lin';",
+                     "'ꆞ' > ' duo';",
+                     "'ꆟ' > ' li';",
+                     "'ꆠ' > ' lu';",
+                     "'ꆡ' > ' liang';",
+                     "'ꆢ' > ' chou';",
+                     "'ꆣ' > ' quan';",
+                     "'ꆤ' > ' shao';",
+                     "'ꆥ' > ' qi';",
+                     "'ꆦ' > ' qi';",
+                     "'ꆧ' > ' zhun';",
+                     "'ꆨ' > ' qi';",
+                     "'ꆩ' > ' wan';",
+                     "'ꆪ' > ' qian';",
+                     "'ꆫ' > ' xian';",
+                     "'ꆬ' > ' shou';",
+                     "'ꆭ' > ' wei';",
+                     "'ꆮ' > ' qi';",
+                     "'ꆯ' > ' tao';",
+                     "'ꆰ' > ' wan';",
+                     "'ꆱ' > ' gang';",
+                     "'ꆲ' > ' wang';",
+                     "'ꆳ' > ' beng';",
+                     "'ꆴ' > ' zhui';",
+                     "'ꆵ' > ' cai';",
+                     "'ꆶ' > ' guo';",
+                     "'ꆷ' > ' cui';",
+                     "'ꆸ' > ' lun';",
+                     "'ꆹ' > ' liu';",
+                     "'ꆺ' > ' qi';",
+                     "'ꆻ' > ' zhan';",
+                     "'ꆼ' > ' bei';",
+                     "'ꆽ' > ' chuo';",
+                     "'ꆾ' > ' ling';",
+                     "'ꆿ' > ' mian';",
+                     "'ꇀ' > ' qi';",
+                     "'ꇁ' > ' qie';",
+                     "'ꇂ' > ' tan';",
+                     "'ꇃ' > ' zong';",
+                     "'ꇄ' > ' gun';",
+                     "'ꇅ' > ' zou';",
+                     "'ꇆ' > ' yi';",
+                     "'ꇇ' > ' zi';",
+                     "'ꇈ' > ' xing';",
+                     "'ꇉ' > ' liang';",
+                     "'ꇊ' > ' jin';",
+                     "'ꇋ' > ' fei';",
+                     "'ꇌ' > ' rui';",
+                     "'ꇍ' > ' min';",
+                     "'ꇎ' > ' yu';",
+                     "'ꇏ' > ' zong';",
+                     "'ꇐ' > ' fan';",
+                     "'ꇑ' > ' lu';",
+                     "'ꇒ' > ' xu';",
+                     "'ꇓ' > ' yingl';",
+                     "'ꇔ' > ' zhang';",
+                     "'ꇕ' > ' kasuri';",
+                     "'ꇖ' > ' xu';",
+                     "'ꇗ' > ' xiang';",
+                     "'ꇘ' > ' jian';",
+                     "'ꇙ' > ' ke';",
+                     "'ꇚ' > ' xian';",
+                     "'ꇛ' > ' ruan';",
+                     "'ꇜ' > ' mian';",
+                     "'ꇝ' > ' qi';",
+                     "'ꇞ' > ' duan';",
+                     "'ꇟ' > ' zhong';",
+                     "'ꇠ' > ' di';",
+                     "'ꇡ' > ' min';",
+                     "'ꇢ' > ' miao';",
+                     "'ꇣ' > ' yuan';",
+                     "'ꇤ' > ' xie';",
+                     "'ꇥ' > ' bao';",
+                     "'ꇦ' > ' si';",
+                     "'ꇧ' > ' qiu';",
+                     "'ꇨ' > ' bian';",
+                     "'ꇩ' > ' huan';",
+                     "'ꇪ' > ' geng';",
+                     "'ꇫ' > ' cong';",
+                     "'ꇬ' > ' mian';",
+                     "'ꇭ' > ' wei';",
+                     "'ꇮ' > ' fu';",
+                     "'ꇯ' > ' wei';",
+                     "'ꇰ' > ' yu';",
+                     "'ꇱ' > ' gou';",
+                     "'ꇲ' > ' miao';",
+                     "'ꇳ' > ' xie';",
+                     "'ꇴ' > ' lian';",
+                     "'ꇵ' > ' zong';",
+                     "'ꇶ' > ' bian';",
+                     "'ꇷ' > ' yun';",
+                     "'ꇸ' > ' yin';",
+                     "'ꇹ' > ' ti';",
+                     "'ꇺ' > ' gua';",
+                     "'ꇻ' > ' zhi';",
+                     "'ꇼ' > ' yun';",
+                     "'ꇽ' > ' cheng';",
+                     "'ꇾ' > ' chan';",
+                     "'ꇿ' > ' dai';",
+                     "'ꈀ' > ' xia';",
+                     "'ꈁ' > ' yuan';",
+                     "'ꈂ' > ' zong';",
+                     "'ꈃ' > ' xu';",
+                     "'ꈄ' > ' nawa';",
+                     "'ꈅ' > ' odoshi';",
+                     "'ꈆ' > ' geng';",
+                     "'ꈇ' > ' sen';",
+                     "'ꈈ' > ' ying';",
+                     "'ꈉ' > ' jin';",
+                     "'ꈊ' > ' yi';",
+                     "'ꈋ' > ' zhui';",
+                     "'ꈌ' > ' ni';",
+                     "'ꈍ' > ' bang';",
+                     "'ꈎ' > ' gu';",
+                     "'ꈏ' > ' pan';",
+                     "'ꈐ' > ' zhou';",
+                     "'ꈑ' > ' jian';",
+                     "'ꈒ' > ' cuo';",
+                     "'ꈓ' > ' quan';",
+                     "'ꈔ' > ' shuang';",
+                     "'ꈕ' > ' yun';",
+                     "'ꈖ' > ' xia';",
+                     "'ꈗ' > ' shuai';",
+                     "'ꈘ' > ' xi';",
+                     "'ꈙ' > ' rong';",
+                     "'ꈚ' > ' tao';",
+                     "'ꈛ' > ' fu';",
+                     "'ꈜ' > ' yun';",
+                     "'ꈝ' > ' zhen';",
+                     "'ꈞ' > ' gao';",
+                     "'ꈟ' > ' ru';",
+                     "'ꈠ' > ' hu';",
+                     "'ꈡ' > ' zai';",
+                     "'ꈢ' > ' teng';",
+                     "'ꈣ' > ' xian';",
+                     "'ꈤ' > ' su';",
+                     "'ꈥ' > ' zhen';",
+                     "'ꈦ' > ' zong';",
+                     "'ꈧ' > ' tao';",
+                     "'ꈨ' > ' horo';",
+                     "'ꈩ' > ' cai';",
+                     "'ꈪ' > ' bi';",
+                     "'ꈫ' > ' feng';",
+                     "'ꈬ' > ' cu';",
+                     "'ꈭ' > ' li';",
+                     "'ꈮ' > ' suo';",
+                     "'ꈯ' > ' yin';",
+                     "'ꈰ' > ' xi';",
+                     "'ꈱ' > ' zong';",
+                     "'ꈲ' > ' lei';",
+                     "'ꈳ' > ' zhuan';",
+                     "'ꈴ' > ' qian';",
+                     "'ꈵ' > ' man';",
+                     "'ꈶ' > ' zhi';",
+                     "'ꈷ' > ' lu';",
+                     "'ꈸ' > ' mo';",
+                     "'ꈹ' > ' piao';",
+                     "'ꈺ' > ' lian';",
+                     "'ꈻ' > ' mi';",
+                     "'ꈼ' > ' xuan';",
+                     "'ꈽ' > ' zong';",
+                     "'ꈾ' > ' ji';",
+                     "'ꈿ' > ' shan';",
+                     "'ꉀ' > ' sui';",
+                     "'ꉁ' > ' fan';",
+                     "'ꉂ' > ' shuai';",
+                     "'ꉃ' > ' beng';",
+                     "'ꉄ' > ' yi';",
+                     "'ꉅ' > ' sao';",
+                     "'ꉆ' > ' mou';",
+                     "'ꉇ' > ' zhou';",
+                     "'ꉈ' > ' qiang';",
+                     "'ꉉ' > ' hun';",
+                     "'ꉊ' > ' sem';",
+                     "'ꉋ' > ' xi';",
+                     "'ꉌ' > ' jung';",
+                     "'ꉍ' > ' xiu';",
+                     "'ꉎ' > ' ran';",
+                     "'ꉏ' > ' xuan';",
+                     "'ꉐ' > ' hui';",
+                     "'ꉑ' > ' qiao';",
+                     "'ꉒ' > ' zeng';",
+                     "'ꉓ' > ' zuo';",
+                     "'ꉔ' > ' zhi';",
+                     "'ꉕ' > ' shan';",
+                     "'ꉖ' > ' san';",
+                     "'ꉗ' > ' lin';",
+                     "'ꉘ' > ' yu';",
+                     "'ꉙ' > ' fan';",
+                     "'ꉚ' > ' liao';",
+                     "'ꉛ' > ' chuo';",
+                     "'ꉜ' > ' zun';",
+                     "'ꉝ' > ' jian';",
+                     "'ꉞ' > ' rao';",
+                     "'ꉟ' > ' chan';",
+                     "'ꉠ' > ' rui';",
+                     "'ꉡ' > ' xiu';",
+                     "'ꉢ' > ' hui';",
+                     "'ꉣ' > ' hua';",
+                     "'ꉤ' > ' zuan';",
+                     "'ꉥ' > ' xi';",
+                     "'ꉦ' > ' qiang';",
+                     "'ꉧ' > ' un';",
+                     "'ꉨ' > ' da';",
+                     "'ꉩ' > ' sheng';",
+                     "'ꉪ' > ' hui';",
+                     "'ꉫ' > ' xi';",
+                     "'ꉬ' > ' se';",
+                     "'ꉭ' > ' jian';",
+                     "'ꉮ' > ' jiang';",
+                     "'ꉯ' > ' huan';",
+                     "'ꉰ' > ' zao';",
+                     "'ꉱ' > ' cong';",
+                     "'ꉲ' > ' jie';",
+                     "'ꉳ' > ' jiao';",
+                     "'ꉴ' > ' bo';",
+                     "'ꉵ' > ' chan';",
+                     "'ꉶ' > ' yi';",
+                     "'ꉷ' > ' nao';",
+                     "'ꉸ' > ' sui';",
+                     "'ꉹ' > ' yi';",
+                     "'ꉺ' > ' shai';",
+                     "'ꉻ' > ' xu';",
+                     "'ꉼ' > ' ji';",
+                     "'ꉽ' > ' bin';",
+                     "'ꉾ' > ' qian';",
+                     "'ꉿ' > ' lan';",
+                     "'ꊀ' > ' pu';",
+                     "'ꊁ' > ' xun';",
+                     "'ꊂ' > ' zuan';",
+                     "'ꊃ' > ' qi';",
+                     "'ꊄ' > ' peng';",
+                     "'ꊅ' > ' li';",
+                     "'ꊆ' > ' mo';",
+                     "'ꊇ' > ' lei';",
+                     "'ꊈ' > ' xie';",
+                     "'ꊉ' > ' zuan';",
+                     "'ꊊ' > ' kuang';",
+                     "'ꊋ' > ' you';",
+                     "'ꊌ' > ' xu';",
+                     "'ꊍ' > ' lei';",
+                     "'ꊎ' > ' xian';",
+                     "'ꊏ' > ' chan';",
+                     "'ꊐ' > ' kou';",
+                     "'ꊑ' > ' lu';",
+                     "'ꊒ' > ' chan';",
+                     "'ꊓ' > ' ying';",
+                     "'ꊔ' > ' cai';",
+                     "'ꊕ' > ' xiang';",
+                     "'ꊖ' > ' xian';",
+                     "'ꊗ' > ' zui';",
+                     "'ꊘ' > ' zuan';",
+                     "'ꊙ' > ' luo';",
+                     "'ꊚ' > ' xi';",
+                     "'ꊛ' > ' dao';",
+                     "'ꊜ' > ' lan';",
+                     "'ꊝ' > ' lei';",
+                     "'ꊞ' > ' lian';",
+                     "'ꊟ' > ' si';",
+                     "'ꊠ' > ' jiu';",
+                     "'ꊡ' > ' yu';",
+                     "'ꊢ' > ' hong';",
+                     "'ꊣ' > ' zhou';",
+                     "'ꊤ' > ' xian';",
+                     "'ꊥ' > ' he';",
+                     "'ꊦ' > ' yue';",
+                     "'ꊧ' > ' ji';",
+                     "'ꊨ' > ' wan';",
+                     "'ꊩ' > ' kuang';",
+                     "'ꊪ' > ' ji';",
+                     "'ꊫ' > ' ren';",
+                     "'ꊬ' > ' wei';",
+                     "'ꊭ' > ' yun';",
+                     "'ꊮ' > ' hong';",
+                     "'ꊯ' > ' chun';",
+                     "'ꊰ' > ' pi';",
+                     "'ꊱ' > ' sha';",
+                     "'ꊲ' > ' gang';",
+                     "'ꊳ' > ' na';",
+                     "'ꊴ' > ' ren';",
+                     "'ꊵ' > ' zong';",
+                     "'ꊶ' > ' lun';",
+                     "'ꊷ' > ' fen';",
+                     "'ꊸ' > ' zhi';",
+                     "'ꊹ' > ' wen';",
+                     "'ꊺ' > ' fang';",
+                     "'ꊻ' > ' zhu';",
+                     "'ꊼ' > ' yin';",
+                     "'ꊽ' > ' niu';",
+                     "'ꊾ' > ' shu';",
+                     "'ꊿ' > ' xian';",
+                     "'ꋀ' > ' gan';",
+                     "'ꋁ' > ' xie';",
+                     "'ꋂ' > ' fu';",
+                     "'ꋃ' > ' lian';",
+                     "'ꋄ' > ' zu';",
+                     "'ꋅ' > ' shen';",
+                     "'ꋆ' > ' xi';",
+                     "'ꋇ' > ' zhi';",
+                     "'ꋈ' > ' zhong';",
+                     "'ꋉ' > ' zhou';",
+                     "'ꋊ' > ' ban';",
+                     "'ꋋ' > ' fu';",
+                     "'ꋌ' > ' zhuo';",
+                     "'ꋍ' > ' shao';",
+                     "'ꋎ' > ' yi';",
+                     "'ꋏ' > ' jing';",
+                     "'ꋐ' > ' dai';",
+                     "'ꋑ' > ' bang';",
+                     "'ꋒ' > ' rong';",
+                     "'ꋓ' > ' jie';",
+                     "'ꋔ' > ' ku';",
+                     "'ꋕ' > ' rao';",
+                     "'ꋖ' > ' die';",
+                     "'ꋗ' > ' heng';",
+                     "'ꋘ' > ' hui';",
+                     "'ꋙ' > ' gei';",
+                     "'ꋚ' > ' xuan';",
+                     "'ꋛ' > ' jiang';",
+                     "'ꋜ' > ' luo';",
+                     "'ꋝ' > ' jue';",
+                     "'ꋞ' > ' jiao';",
+                     "'ꋟ' > ' tong';",
+                     "'ꋠ' > ' geng';",
+                     "'ꋡ' > ' xiao';",
+                     "'ꋢ' > ' juan';",
+                     "'ꋣ' > ' xiu';",
+                     "'ꋤ' > ' xi';",
+                     "'ꋥ' > ' sui';",
+                     "'ꋦ' > ' tao';",
+                     "'ꋧ' > ' ji';",
+                     "'ꋨ' > ' ti';",
+                     "'ꋩ' > ' ji';",
+                     "'ꋪ' > ' xu';",
+                     "'ꋫ' > ' ling';",
+                     "'ꋬ' > 'zzyr';",
+                     "'ꋭ' > ' xu';",
+                     "'ꋮ' > ' qi';",
+                     "'ꋯ' > ' fei';",
+                     "'ꋰ' > ' chuo';",
+                     "'ꋱ' > ' zhang';",
+                     "'ꋲ' > ' gun';",
+                     "'ꋳ' > ' sheng';",
+                     "'ꋴ' > ' wei';",
+                     "'ꋵ' > ' mian';",
+                     "'ꋶ' > ' shou';",
+                     "'ꋷ' > ' beng';",
+                     "'ꋸ' > ' chou';",
+                     "'ꋹ' > ' tao';",
+                     "'ꋺ' > ' liu';",
+                     "'ꋻ' > ' quan';",
+                     "'ꋼ' > ' zong';",
+                     "'ꋽ' > ' zhan';",
+                     "'ꋾ' > ' wan';",
+                     "'ꋿ' > ' lu';",
+                     "'ꌀ' > ' zhui';",
+                     "'ꌁ' > ' zi';",
+                     "'ꌂ' > ' ke';",
+                     "'ꌃ' > ' xiang';",
+                     "'ꌄ' > ' jian';",
+                     "'ꌅ' > ' mian';",
+                     "'ꌆ' > ' lan';",
+                     "'ꌇ' > ' ti';",
+                     "'ꌈ' > ' miao';",
+                     "'ꌉ' > ' qi';",
+                     "'ꌊ' > ' yun';",
+                     "'ꌋ' > ' hui';",
+                     "'ꌌ' > ' si';",
+                     "'ꌍ' > ' duo';",
+                     "'ꌎ' > ' duan';",
+                     "'ꌏ' > ' bian';",
+                     "'ꌐ' > ' xian';",
+                     "'ꌑ' > ' gou';",
+                     "'ꌒ' > ' zhui';",
+                     "'ꌓ' > ' huan';",
+                     "'ꌔ' > ' di';",
+                     "'ꌕ' > ' lu';",
+                     "'ꌖ' > ' bian';",
+                     "'ꌗ' > ' min';",
+                     "'ꌘ' > ' yuan';",
+                     "'ꌙ' > ' jin';",
+                     "'ꌚ' > ' fu';",
+                     "'ꌛ' > ' ru';",
+                     "'ꌜ' > ' zhen';",
+                     "'ꌝ' > ' feng';",
+                     "'ꌞ' > ' shuai';",
+                     "'ꌟ' > ' gao';",
+                     "'ꌠ' > ' chan';",
+                     "'ꌡ' > ' li';",
+                     "'ꌢ' > ' yi';",
+                     "'ꌣ' > ' jian';",
+                     "'ꌤ' > ' bin';",
+                     "'ꌥ' > ' piao';",
+                     "'ꌦ' > ' man';",
+                     "'ꌧ' > ' lei';",
+                     "'ꌨ' > ' ying';",
+                     "'ꌩ' > ' suo';",
+                     "'ꌪ' > ' mou';",
+                     "'ꌫ' > ' sao';",
+                     "'ꌬ' > ' xie';",
+                     "'ꌭ' > ' liao';",
+                     "'ꌮ' > ' shan';",
+                     "'ꌯ' > ' zeng';",
+                     "'ꌰ' > ' jiang';",
+                     "'ꌱ' > ' qian';",
+                     "'ꌲ' > ' zao';",
+                     "'ꌳ' > ' huan';",
+                     "'ꌴ' > ' jiao';",
+                     "'ꌵ' > ' zuan';",
+                     "'ꌶ' > ' fou';",
+                     "'ꌷ' > ' xie';",
+                     "'ꌸ' > ' gang';",
+                     "'ꌹ' > ' fou';",
+                     "'ꌺ' > ' que';",
+                     "'ꌻ' > ' fou';",
+                     "'ꌼ' > ' kaakeru';",
+                     "'ꌽ' > ' bo';",
+                     "'ꌾ' > ' ping';",
+                     "'ꌿ' > ' hou';",
+                     "'ꍀ' > 'ssyt';",
+                     "'ꍁ' > ' gang';",
+                     "'ꍂ' > ' ying';",
+                     "'ꍃ' > ' ying';",
+                     "'ꍄ' > ' qing';",
+                     "'ꍅ' > ' xia';",
+                     "'ꍆ' > ' guan';",
+                     "'ꍇ' > ' zun';",
+                     "'ꍈ' > ' tan';",
+                     "'ꍉ' > ' chang';",
+                     "'ꍊ' > ' qi';",
+                     "'ꍋ' > ' weng';",
+                     "'ꍌ' > ' ying';",
+                     "'ꍍ' > ' lei';",
+                     "'ꍎ' > ' tan';",
+                     "'ꍏ' > ' lu';",
+                     "'ꍐ' > ' guan';",
+                     "'ꍑ' > ' wang';",
+                     "'ꍒ' > ' wang';",
+                     "'ꍓ' > ' gang';",
+                     "'ꍔ' > ' wang';",
+                     "'ꍕ' > ' han';",
+                     "'ꍖ' > 'zhux';",
+                     "'ꍗ' > ' luo';",
+                     "'ꍘ' > ' fu';",
+                     "'ꍙ' > ' mi';",
+                     "'ꍚ' > ' fa';",
+                     "'ꍛ' > ' gu';",
+                     "'ꍜ' > ' zhu';",
+                     "'ꍝ' > ' ju';",
+                     "'ꍞ' > ' mao';",
+                     "'ꍟ' > ' gu';",
+                     "'ꍠ' > ' min';",
+                     "'ꍡ' > ' gang';",
+                     "'ꍢ' > ' ba';",
+                     "'ꍣ' > ' gua';",
+                     "'ꍤ' > ' ti';",
+                     "'ꍥ' > ' juan';",
+                     "'ꍦ' > ' fu';",
+                     "'ꍧ' > ' lin';",
+                     "'ꍨ' > ' yan';",
+                     "'ꍩ' > ' zhao';",
+                     "'ꍪ' > ' zui';",
+                     "'ꍫ' > ' gua';",
+                     "'ꍬ' > ' zhuo';",
+                     "'ꍭ' > ' yu';",
+                     "'ꍮ' > ' zhi';",
+                     "'ꍯ' > ' an';",
+                     "'ꍰ' > ' fa';",
+                     "'ꍱ' > ' nan';",
+                     "'ꍲ' > ' shu';",
+                     "'ꍳ' > ' si';",
+                     "'ꍴ' > ' pi';",
+                     "'ꍵ' > ' ma';",
+                     "'ꍶ' > ' liu';",
+                     "'ꍷ' > ' ba';",
+                     "'ꍸ' > ' fa';",
+                     "'ꍹ' > ' li';",
+                     "'ꍺ' > ' chao';",
+                     "'ꍻ' > ' wei';",
+                     "'ꍼ' > ' bi';",
+                     "'ꍽ' > ' ji';",
+                     "'ꍾ' > ' zeng';",
+                     "'ꍿ' > ' tong';",
+                     "'ꎀ' > ' liu';",
+                     "'ꎁ' > ' ji';",
+                     "'ꎂ' > ' juan';",
+                     "'ꎃ' > ' mi';",
+                     "'ꎄ' > ' zhao';",
+                     "'ꎅ' > ' luo';",
+                     "'ꎆ' > ' pi';",
+                     "'ꎇ' > ' ji';",
+                     "'ꎈ' > ' ji';",
+                     "'ꎉ' > ' luan';",
+                     "'ꎊ' > ' yang';",
+                     "'ꎋ' > ' mie';",
+                     "'ꎌ' > ' qiang';",
+                     "'ꎍ' > ' ta';",
+                     "'ꎎ' > ' mei';",
+                     "'ꎏ' > ' yang';",
+                     "'ꎐ' > ' you';",
+                     "'ꎑ' > ' you';",
+                     "'ꎒ' > ' fen';",
+                     "'ꎓ' > ' ba';",
+                     "'ꎔ' > ' gao';",
+                     "'ꎕ' > ' yang';",
+                     "'ꎖ' > ' gu';",
+                     "'ꎗ' > ' qiang';",
+                     "'ꎘ' > ' zang';",
+                     "'ꎙ' > ' gao';",
+                     "'ꎚ' > ' ling';",
+                     "'ꎛ' > ' yi';",
+                     "'ꎜ' > ' zhu';",
+                     "'ꎝ' > ' di';",
+                     "'ꎞ' > ' xiu';",
+                     "'ꎟ' > ' qian';",
+                     "'ꎠ' > ' yi';",
+                     "'ꎡ' > ' xian';",
+                     "'ꎢ' > ' rong';",
+                     "'ꎣ' > ' qun';",
+                     "'ꎤ' > ' qun';",
+                     "'ꎥ' > ' qian';",
+                     "'ꎦ' > ' huan';",
+                     "'ꎧ' > ' zui';",
+                     "'ꎨ' > ' xian';",
+                     "'ꎩ' > ' yi';",
+                     "'ꎪ' > ' yashinau';",
+                     "'ꎫ' > ' qiang';",
+                     "'ꎬ' > ' xian';",
+                     "'ꎭ' > ' yu';",
+                     "'ꎮ' > ' geng';",
+                     "'ꎯ' > ' jie';",
+                     "'ꎰ' > ' tang';",
+                     "'ꎱ' > ' yuan';",
+                     "'ꎲ' > ' xi';",
+                     "'ꎳ' > ' fan';",
+                     "'ꎴ' > ' shan';",
+                     "'ꎵ' > ' fen';",
+                     "'ꎶ' > ' shan';",
+                     "'ꎷ' > ' lian';",
+                     "'ꎸ' > ' lei';",
+                     "'ꎹ' > ' geng';",
+                     "'ꎺ' > ' nou';",
+                     "'ꎻ' > ' qiang';",
+                     "'ꎼ' > ' chan';",
+                     "'ꎽ' > ' yu';",
+                     "'ꎾ' > ' gong';",
+                     "'ꎿ' > ' yi';",
+                     "'ꏀ' > ' chong';",
+                     "'ꏁ' > ' weng';",
+                     "'ꏂ' > ' fen';",
+                     "'ꏃ' > ' hong';",
+                     "'ꏄ' > ' chi';",
+                     "'ꏅ' > ' chi';",
+                     "'ꏆ' > ' cui';",
+                     "'ꏇ' > ' fu';",
+                     "'ꏈ' > ' xia';",
+                     "'ꏉ' > ' pen';",
+                     "'ꏊ' > ' yi';",
+                     "'ꏋ' > ' la';",
+                     "'ꏌ' > ' yi';",
+                     "'ꏍ' > ' pi';",
+                     "'ꏎ' > ' ling';",
+                     "'ꏏ' > ' liu';",
+                     "'ꏐ' > ' zhi';",
+                     "'ꏑ' > ' qu';",
+                     "'ꏒ' > ' xi';",
+                     "'ꏓ' > ' xie';",
+                     "'ꏔ' > ' xiang';",
+                     "'ꏕ' > ' xi';",
+                     "'ꏖ' > ' xi';",
+                     "'ꏗ' > ' qi';",
+                     "'ꏘ' > ' qiao';",
+                     "'ꏙ' > ' hui';",
+                     "'ꏚ' > ' hui';",
+                     "'ꏛ' > ' xiao';",
+                     "'ꏜ' > ' se';",
+                     "'ꏝ' > ' hong';",
+                     "'ꏞ' > ' jiang';",
+                     "'ꏟ' > ' di';",
+                     "'ꏠ' > ' cui';",
+                     "'ꏡ' > ' fei';",
+                     "'ꏢ' > ' tao';",
+                     "'ꏣ' > ' sha';",
+                     "'ꏤ' > ' chi';",
+                     "'ꏥ' > ' zhu';",
+                     "'ꏦ' > ' jian';",
+                     "'ꏧ' > ' xuan';",
+                     "'ꏨ' > ' shi';",
+                     "'ꏩ' > ' pian';",
+                     "'ꏪ' > ' zong';",
+                     "'ꏫ' > ' wan';",
+                     "'ꏬ' > ' hui';",
+                     "'ꏭ' > ' hou';",
+                     "'ꏮ' > ' he';",
+                     "'ꏯ' > ' he';",
+                     "'ꏰ' > ' han';",
+                     "'ꏱ' > ' ao';",
+                     "'ꏲ' > ' piao';",
+                     "'ꏳ' > ' yi';",
+                     "'ꏴ' > ' lian';",
+                     "'ꏵ' > ' qu';",
+                     "'ꏶ' > 'jyt';",
+                     "'ꏷ' > ' lin';",
+                     "'ꏸ' > ' pen';",
+                     "'ꏹ' > ' qiao';",
+                     "'ꏺ' > ' ao';",
+                     "'ꏻ' > ' fan';",
+                     "'ꏼ' > ' yi';",
+                     "'ꏽ' > ' hui';",
+                     "'ꏾ' > ' xuan';",
+                     "'ꏿ' > ' dao';",
+                     "'ꐀ' > ' yao';",
+                     "'ꐁ' > ' lao';",
+                     "'ꐂ' > 'qie';",
+                     "'ꐃ' > ' kao';",
+                     "'ꐄ' > ' mao';",
+                     "'ꐅ' > ' zhe';",
+                     "'ꐆ' > ' qi';",
+                     "'ꐇ' > ' gou';",
+                     "'ꐈ' > ' gou';",
+                     "'ꐉ' > ' gou';",
+                     "'ꐊ' > ' die';",
+                     "'ꐋ' > ' die';",
+                     "'ꐌ' > ' er';",
+                     "'ꐍ' > ' shua';",
+                     "'ꐎ' > ' ruan';",
+                     "'ꐏ' > ' er';",
+                     "'ꐐ' > ' nai';",
+                     "'ꐑ' > ' zhuan';",
+                     "'ꐒ' > ' lei';",
+                     "'ꐓ' > ' ting';",
+                     "'ꐔ' > ' zi';",
+                     "'ꐕ' > ' geng';",
+                     "'ꐖ' > ' chao';",
+                     "'ꐗ' > ' hao';",
+                     "'ꐘ' > ' yun';",
+                     "'ꐙ' > ' pa';",
+                     "'ꐚ' > ' pi';",
+                     "'ꐛ' > ' chi';",
+                     "'ꐜ' > ' si';",
+                     "'ꐝ' > ' chu';",
+                     "'ꐞ' > ' jia';",
+                     "'ꐟ' > ' ju';",
+                     "'ꐠ' > ' he';",
+                     "'ꐡ' > ' chu';",
+                     "'ꐢ' > ' lao';",
+                     "'ꐣ' > ' lun';",
+                     "'ꐤ' > ' ji';",
+                     "'ꐥ' > ' tang';",
+                     "'ꐦ' > ' ou';",
+                     "'ꐧ' > ' lou';",
+                     "'ꐨ' > ' nou';",
+                     "'ꐩ' > ' gou';",
+                     "'ꐪ' > ' pang';",
+                     "'ꐫ' > ' ze';",
+                     "'ꐬ' > ' lou';",
+                     "'ꐭ' > ' ji';",
+                     "'ꐮ' > ' lao';",
+                     "'ꐯ' > ' huo';",
+                     "'ꐰ' > ' you';",
+                     "'ꐱ' > ' mo';",
+                     "'ꐲ' > ' huai';",
+                     "'ꐳ' > ' er';",
+                     "'ꐴ' > ' zhe';",
+                     "'ꐵ' > ' ting';",
+                     "'ꐶ' > ' ye';",
+                     "'ꐷ' > ' da';",
+                     "'ꐸ' > ' song';",
+                     "'ꐹ' > ' qin';",
+                     "'ꐺ' > ' yun';",
+                     "'ꐻ' > ' chi';",
+                     "'ꐼ' > ' dan';",
+                     "'ꐽ' > ' dan';",
+                     "'ꐾ' > ' hong';",
+                     "'ꐿ' > ' geng';",
+                     "'ꑀ' > ' zhi';",
+                     "'ꑁ' > 'njup';",
+                     "'ꑂ' > ' nie';",
+                     "'ꑃ' > ' dan';",
+                     "'ꑄ' > ' zhen';",
+                     "'ꑅ' > ' che';",
+                     "'ꑆ' > ' ling';",
+                     "'ꑇ' > ' zheng';",
+                     "'ꑈ' > ' you';",
+                     "'ꑉ' > ' wa';",
+                     "'ꑊ' > ' liao';",
+                     "'ꑋ' > ' long';",
+                     "'ꑌ' > ' zhi';",
+                     "'ꑍ' > ' ning';",
+                     "'ꑎ' > ' tiao';",
+                     "'ꑏ' > ' er';",
+                     "'ꑐ' > ' ya';",
+                     "'ꑑ' > ' die';",
+                     "'ꑒ' > ' gua';",
+                     "'ꑓ' > 'nyuo';",
+                     "'ꑔ' > ' lian';",
+                     "'ꑕ' > ' hao';",
+                     "'ꑖ' > ' sheng';",
+                     "'ꑗ' > ' lie';",
+                     "'ꑘ' > ' pin';",
+                     "'ꑙ' > ' jing';",
+                     "'ꑚ' > ' ju';",
+                     "'ꑛ' > ' bi';",
+                     "'ꑜ' > ' di';",
+                     "'ꑝ' > ' guo';",
+                     "'ꑞ' > ' wen';",
+                     "'ꑟ' > ' xu';",
+                     "'ꑠ' > ' ping';",
+                     "'ꑡ' > ' cong';",
+                     "'ꑢ' > ' shikato';",
+                     "'ꑣ' > 'xie';",
+                     "'ꑤ' > ' ting';",
+                     "'ꑥ' > ' yu';",
+                     "'ꑦ' > ' cong';",
+                     "'ꑧ' > ' kui';",
+                     "'ꑨ' > ' tsuraneru';",
+                     "'ꑩ' > ' kui';",
+                     "'ꑪ' > ' cong';",
+                     "'ꑫ' > ' lian';",
+                     "'ꑬ' > ' weng';",
+                     "'ꑭ' > ' kui';",
+                     "'ꑮ' > ' lian';",
+                     "'ꑯ' > ' lian';",
+                     "'ꑰ' > ' cong';",
+                     "'ꑱ' > ' ao';",
+                     "'ꑲ' > ' sheng';",
+                     "'ꑳ' > ' song';",
+                     "'ꑴ' > ' ting';",
+                     "'ꑵ' > ' kui';",
+                     "'ꑶ' > ' nie';",
+                     "'ꑷ' > ' zhi';",
+                     "'ꑸ' > ' dan';",
+                     "'ꑹ' > ' ning';",
+                     "'ꑺ' > ' qie';",
+                     "'ꑻ' > ' ji';",
+                     "'ꑼ' > ' ting';",
+                     "'ꑽ' > ' ting';",
+                     "'ꑾ' > ' long';",
+                     "'ꑿ' > ' yu';",
+                     "'ꒀ' > ' yu';",
+                     "'ꒁ' > ' zhao';",
+                     "'ꒂ' > ' si';",
+                     "'ꒃ' > ' su';",
+                     "'ꒄ' > ' yi';",
+                     "'ꒅ' > ' su';",
+                     "'ꒆ' > ' si';",
+                     "'ꒇ' > ' zhao';",
+                     "'ꒈ' > ' zhao';",
+                     "'ꒉ' > ' rou';",
+                     "'ꒊ' > ' yi';",
+                     "'ꒋ' > ' le';",
+                     "'ꒌ' > ' ji';",
+                     "'ꓐ' > ' ku';",
+                     "'ꓑ' > ' zhi';",
+                     "'ꓒ' > ' ni';",
+                     "'ꓓ' > ' ping';",
+                     "'ꓔ' > ' zi';",
+                     "'ꓕ' > ' fu';",
+                     "'ꓖ' > ' pang';",
+                     "'ꓗ' > ' zhen';",
+                     "'ꓘ' > ' xian';",
+                     "'ꓙ' > ' zuo';",
+                     "'ꓚ' > ' pei';",
+                     "'ꓛ' > ' jia';",
+                     "'ꓜ' > ' sheng';",
+                     "'ꓝ' > ' zhi';",
+                     "'ꓞ' > ' bao';",
+                     "'ꓟ' > ' mu';",
+                     "'ꓠ' > ' qu';",
+                     "'ꓡ' > ' hu';",
+                     "'ꓢ' > ' ke';",
+                     "'ꓣ' > ' yi';",
+                     "'ꓤ' > ' yin';",
+                     "'ꓥ' > ' xu';",
+                     "'ꓦ' > ' yang';",
+                     "'ꓧ' > ' long';",
+                     "'ꓨ' > ' dong';",
+                     "'ꓩ' > ' ka';",
+                     "'ꓪ' > ' lu';",
+                     "'ꓫ' > ' jing';",
+                     "'ꓬ' > ' nu';",
+                     "'ꓭ' > ' yan';",
+                     "'ꓮ' > ' pang';",
+                     "'ꓯ' > ' kua';",
+                     "'ꓰ' > ' yi';",
+                     "'ꓱ' > ' guang';",
+                     "'ꓲ' > ' gai';",
+                     "'ꓳ' > ' ge';",
+                     "'ꓴ' > ' dong';",
+                     "'ꓵ' > ' zhi';",
+                     "'ꓶ' > ' xiao';",
+                     "'ꓷ' > ' xiong';",
+                     "'ꓸ' > ' xiong';",
+                     "'ꓹ' > ' er';",
+                     "'ꓺ' > ' e';",
+                     "'ꓻ' > ' xing';",
+                     "'ꓼ' > ' pian';",
+                     "'ꓽ' > ' neng';",
+                     "'ꔀ' > 'ee';",
+                     "'ꔁ' > 'een';",
+                     "'ꔂ' > 'hee';",
+                     "'ꔃ' > 'wee';",
+                     "'ꔄ' > 'ween';",
+                     "'ꔅ' > 'pee';",
+                     "'ꔆ' > 'bhee';",
+                     "'ꔇ' > 'bee';",
+                     "'ꔈ' > 'mbee';",
+                     "'ꔉ' > 'kpee';",
+                     "'ꔊ' > 'mgbee';",
+                     "'ꔋ' > 'gbee';",
+                     "'ꔌ' > 'fee';",
+                     "'ꔍ' > 'vee';",
+                     "'ꔎ' > 'tee';",
+                     "'ꔏ' > 'thee';",
+                     "'ꔐ' > 'dhee';",
+                     "'ꔑ' > 'dhhee';",
+                     "'ꔒ' > 'lee';",
+                     "'ꔓ' > 'ree';",
+                     "'ꔔ' > 'dee';",
+                     "'ꔕ' > 'ndee';",
+                     "'ꔖ' > 'see';",
+                     "'ꔗ' > 'shee';",
+                     "'ꔘ' > 'zee';",
+                     "'ꔙ' > 'zhee';",
+                     "'ꔚ' > 'cee';",
+                     "'ꔛ' > 'jee';",
+                     "'ꔜ' > 'njee';",
+                     "'ꔝ' > 'yee';",
+                     "'ꔞ' > 'kee';",
+                     "'ꔟ' > 'nggee';",
+                     "'ꔠ' > 'gee';",
+                     "'ꔡ' > 'mee';",
+                     "'ꔢ' > 'nee';",
+                     "'ꔣ' > 'nyee';",
+                     "'ꔤ' > 'i';",
+                     "'ꔥ' > 'in';",
+                     "'ꔦ' > 'hi';",
+                     "'ꔧ' > 'hin';",
+                     "'ꔨ' > 'wi';",
+                     "'ꔩ' > 'win';",
+                     "'ꔪ' > 'pi';",
+                     "'ꔫ' > 'bhi';",
+                     "'ꔬ' > 'bi';",
+                     "'ꔭ' > 'mbi';",
+                     "'ꔮ' > 'kpi';",
+                     "'ꔯ' > 'mgbi';",
+                     "'ꔰ' > 'gbi';",
+                     "'ꔱ' > 'fi';",
+                     "'ꔲ' > 'vi';",
+                     "'ꔳ' > 'ti';",
+                     "'ꔴ' > 'thi';",
+                     "'ꔵ' > 'dhi';",
+                     "'ꔶ' > 'dhhi';",
+                     "'ꔷ' > 'li';",
+                     "'ꔸ' > 'ri';",
+                     "'ꔹ' > 'di';",
+                     "'ꔺ' > 'ndi';",
+                     "'ꔻ' > 'si';",
+                     "'ꔼ' > 'shi';",
+                     "'ꔽ' > 'zi';",
+                     "'ꔾ' > 'zhi';",
+                     "'ꔿ' > 'ci';",
+                     "'ꕀ' > 'ji';",
+                     "'ꕁ' > 'nji';",
+                     "'ꕂ' > 'yi';",
+                     "'ꕃ' > 'ki';",
+                     "'ꕄ' > 'nggi';",
+                     "'ꕅ' > 'gi';",
+                     "'ꕆ' > 'mi';",
+                     "'ꕇ' > 'ni';",
+                     "'ꕈ' > 'nyi';",
+                     "'ꕉ' > 'a';",
+                     "'ꕊ' > 'an';",
+                     "'ꕋ' > 'ngan';",
+                     "'ꕌ' > 'ha';",
+                     "'ꕍ' > 'han';",
+                     "'ꕎ' > 'wa';",
+                     "'ꕏ' > 'wan';",
+                     "'ꕐ' > 'pa';",
+                     "'ꕑ' > 'bha';",
+                     "'ꕒ' > 'ba';",
+                     "'ꕓ' > 'mba';",
+                     "'ꕔ' > 'kpa';",
+                     "'ꕕ' > 'kpan';",
+                     "'ꕖ' > 'mgba';",
+                     "'ꕗ' > 'gba';",
+                     "'ꕘ' > 'fa';",
+                     "'ꕙ' > 'va';",
+                     "'ꕚ' > 'ta';",
+                     "'ꕛ' > 'tha';",
+                     "'ꕜ' > 'dha';",
+                     "'ꕝ' > 'dhha';",
+                     "'ꕞ' > 'la';",
+                     "'ꕟ' > 'ra';",
+                     "'ꕠ' > 'da';",
+                     "'ꕡ' > 'nda';",
+                     "'ꕢ' > 'sa';",
+                     "'ꕣ' > 'sha';",
+                     "'ꕤ' > 'za';",
+                     "'ꕥ' > 'zha';",
+                     "'ꕦ' > 'ca';",
+                     "'ꕧ' > 'ja';",
+                     "'ꕨ' > 'nja';",
+                     "'ꕩ' > 'ya';",
+                     "'ꕪ' > 'ka';",
+                     "'ꕫ' > 'kan';",
+                     "'ꕬ' > 'ngga';",
+                     "'ꕭ' > 'ga';",
+                     "'ꕮ' > 'ma';",
+                     "'ꕯ' > 'na';",
+                     "'ꕰ' > 'nya';",
+                     "'ꕱ' > 'oo';",
+                     "'ꕲ' > 'oon';",
+                     "'ꕳ' > 'hoo';",
+                     "'ꕴ' > 'woo';",
+                     "'ꕵ' > 'woon';",
+                     "'ꕶ' > 'poo';",
+                     "'ꕷ' > 'bhoo';",
+                     "'ꕸ' > 'boo';",
+                     "'ꕹ' > 'mboo';",
+                     "'ꕺ' > 'kpoo';",
+                     "'ꕻ' > 'mgboo';",
+                     "'ꕼ' > 'gboo';",
+                     "'ꕽ' > 'foo';",
+                     "'ꕾ' > 'voo';",
+                     "'ꕿ' > 'too';",
+                     "'ꖀ' > 'thoo';",
+                     "'ꖁ' > 'dhoo';",
+                     "'ꖂ' > 'dhhoo';",
+                     "'ꖃ' > 'loo';",
+                     "'ꖄ' > 'roo';",
+                     "'ꖅ' > 'doo';",
+                     "'ꖆ' > 'ndoo';",
+                     "'ꖇ' > 'soo';",
+                     "'ꖈ' > 'shoo';",
+                     "'ꖉ' > 'zoo';",
+                     "'ꖊ' > 'zhoo';",
+                     "'ꖋ' > 'coo';",
+                     "'ꖌ' > 'joo';",
+                     "'ꖍ' > 'njoo';",
+                     "'ꖎ' > 'yoo';",
+                     "'ꖏ' > 'koo';",
+                     "'ꖐ' > 'nggoo';",
+                     "'ꖑ' > 'goo';",
+                     "'ꖒ' > 'moo';",
+                     "'ꖓ' > 'noo';",
+                     "'ꖔ' > 'nyoo';",
+                     "'ꖕ' > 'u';",
+                     "'ꖖ' > 'un';",
+                     "'ꖗ' > 'hu';",
+                     "'ꖘ' > 'hun';",
+                     "'ꖙ' > 'wu';",
+                     "'ꖚ' > 'wun';",
+                     "'ꖛ' > 'pu';",
+                     "'ꖜ' > 'bhu';",
+                     "'ꖝ' > 'bu';",
+                     "'ꖞ' > 'mbu';",
+                     "'ꖟ' > 'kpu';",
+                     "'ꖠ' > 'mgbu';",
+                     "'ꖡ' > 'gbu';",
+                     "'ꖢ' > 'fu';",
+                     "'ꖣ' > 'vu';",
+                     "'ꖤ' > 'tu';",
+                     "'ꖥ' > 'thu';",
+                     "'ꖦ' > 'dhu';",
+                     "'ꖧ' > 'dhhu';",
+                     "'ꖨ' > 'lu';",
+                     "'ꖩ' > 'ru';",
+                     "'ꖪ' > 'du';",
+                     "'ꖫ' > 'ndu';",
+                     "'ꖬ' > 'su';",
+                     "'ꖭ' > 'shu';",
+                     "'ꖮ' > 'zu';",
+                     "'ꖯ' > 'zhu';",
+                     "'ꖰ' > 'cu';",
+                     "'ꖱ' > 'ju';",
+                     "'ꖲ' > 'nju';",
+                     "'ꖳ' > 'yu';",
+                     "'ꖴ' > 'ku';",
+                     "'ꖵ' > 'nggu';",
+                     "'ꖶ' > 'gu';",
+                     "'ꖷ' > 'mu';",
+                     "'ꖸ' > 'nu';",
+                     "'ꖹ' > 'nyu';",
+                     "'ꖺ' > 'o';",
+                     "'ꖻ' > 'on';",
+                     "'ꖼ' > 'ngon';",
+                     "'ꖽ' > 'ho';",
+                     "'ꖾ' > 'hon';",
+                     "'ꖿ' > 'wo';",
+                     "'ꗀ' > 'won';",
+                     "'ꗁ' > 'po';",
+                     "'ꗂ' > 'bho';",
+                     "'ꗃ' > 'bo';",
+                     "'ꗄ' > 'mbo';",
+                     "'ꗅ' > 'kpo';",
+                     "'ꗆ' > 'mgbo';",
+                     "'ꗇ' > 'gbo';",
+                     "'ꗈ' > 'gbon';",
+                     "'ꗉ' > 'fo';",
+                     "'ꗊ' > 'vo';",
+                     "'ꗋ' > 'to';",
+                     "'ꗌ' > 'tho';",
+                     "'ꗍ' > 'dho';",
+                     "'ꗎ' > 'dhho';",
+                     "'ꗏ' > 'lo';",
+                     "'ꗐ' > 'ro';",
+                     "'ꗑ' > 'do';",
+                     "'ꗒ' > 'ndo';",
+                     "'ꗓ' > 'so';",
+                     "'ꗔ' > 'sho';",
+                     "'ꗕ' > 'zo';",
+                     "'ꗖ' > 'zho';",
+                     "'ꗗ' > 'co';",
+                     "'ꗘ' > 'jo';",
+                     "'ꗙ' > 'njo';",
+                     "'ꗚ' > 'yo';",
+                     "'ꗛ' > 'ko';",
+                     "'ꗜ' > 'nggo';",
+                     "'ꗝ' > 'go';",
+                     "'ꗞ' > 'mo';",
+                     "'ꗟ' > 'no';",
+                     "'ꗠ' > 'nyo';",
+                     "'ꗡ' > 'e';",
+                     "'ꗢ' > 'en';",
+                     "'ꗣ' > 'ngen';",
+                     "'ꗤ' > 'he';",
+                     "'ꗥ' > 'hen';",
+                     "'ꗦ' > 'we';",
+                     "'ꗧ' > 'wen';",
+                     "'ꗨ' > 'pe';",
+                     "'ꗩ' > 'bhe';",
+                     "'ꗪ' > 'be';",
+                     "'ꗫ' > 'mbe';",
+                     "'ꗬ' > 'kpe';",
+                     "'ꗭ' > 'kpen';",
+                     "'ꗮ' > 'mgbe';",
+                     "'ꗯ' > 'gbe';",
+                     "'ꗰ' > 'gben';",
+                     "'ꗱ' > 'fe';",
+                     "'ꗲ' > 've';",
+                     "'ꗳ' > 'te';",
+                     "'ꗴ' > 'the';",
+                     "'ꗵ' > 'dhe';",
+                     "'ꗶ' > 'dhhe';",
+                     "'ꗷ' > 'le';",
+                     "'ꗸ' > 're';",
+                     "'ꗹ' > 'de';",
+                     "'ꗺ' > 'nde';",
+                     "'ꗻ' > 'se';",
+                     "'ꗼ' > 'she';",
+                     "'ꗽ' > 'ze';",
+                     "'ꗾ' > 'zhe';",
+                     "'ꗿ' > 'ce';",
+                     "'ꘀ' > 'je';",
+                     "'ꘁ' > 'nje';",
+                     "'ꘂ' > 'ye';",
+                     "'ꘃ' > 'ke';",
+                     "'ꘄ' > 'ngge';",
+                     "'ꘅ' > 'nggen';",
+                     "'ꘆ' > 'ge';",
+                     "'ꘇ' > 'gen';",
+                     "'ꘈ' > 'me';",
+                     "'ꘉ' > 'ne';",
+                     "'ꘊ' > 'nye';",
+                     "'ꘋ' > 'ng';",
+                     "'ꘐ' > 'ndole';",
+                     "'ꘑ' > 'ndole';",
+                     "'ꘒ' > 'ndole';",
+                     "'ꘪ' > 'ndole';",
+                     "'ꘫ' > 'ndole';",
+                     "'Ꙁ' > 'zemlya';",
+                     "'ꙁ' > 'zemlya';",
+                     "'Ꙃ' > 'dzelo';",
+                     "'ꙃ' > 'dzelo';",
+                     "'Ꙅ' > 'dze';",
+                     "'ꙅ' > 'dze';",
+                     "'Ꙇ' > 'iota';",
+                     "'ꙇ' > 'iota';",
+                     "'Ꙉ' > 'djerv';",
+                     "'ꙉ' > 'djerv';",
+                     "'Ꙑ' > 'yeru';",
+                     "'ꙑ' > 'yeru';",
+                     "'Ꙕ' > 'yu';",
+                     "'ꙕ' > 'yu';",
+                     "'Ꙟ' > 'yn';",
+                     "'ꙟ' > 'yn';",
+                     "'Ꚁ' > 'dwe';",
+                     "'ꚁ' > 'dwe';",
+                     "'Ꚃ' > 'dzwe';",
+                     "'ꚃ' > 'dzwe';",
+                     "'Ꚅ' > 'zhwe';",
+                     "'ꚅ' > 'zhwe';",
+                     "'Ꚇ' > 'cche';",
+                     "'ꚇ' > 'cche';",
+                     "'Ꚉ' > 'dzze';",
+                     "'ꚉ' > 'dzze';",
+                     "'Ꚋ' > 'te';",
+                     "'ꚋ' > 'te';",
+                     "'Ꚍ' > 'twe';",
+                     "'ꚍ' > 'twe';",
+                     "'Ꚏ' > 'tswe';",
+                     "'ꚏ' > 'tswe';",
+                     "'Ꚑ' > 'tsse';",
+                     "'ꚑ' > 'tsse';",
+                     "'Ꚓ' > 'tche';",
+                     "'ꚓ' > 'tche';",
+                     "'Ꚕ' > 'hwe';",
+                     "'ꚕ' > 'hwe';",
+                     "'Ꚗ' > 'shwe';",
+                     "'ꚗ' > 'shwe';",
+                     "'Ꜧ' > 'heng';",
+                     "'ꜧ' > 'heng';",
+                     "'Ꜩ' > 'tz';",
+                     "'ꜩ' > 'tz';",
+                     "'Ꜫ' > 'tresillo';",
+                     "'ꜫ' > 'tresillo';",
+                     "'Ꜭ' > 'cuatrillo';",
+                     "'ꜭ' > 'cuatrillo';",
+                     "'Ꜯ' > 'cuatrillo';",
+                     "'ꜯ' > 'cuatrillo';",
+                     "'Ꜳ' > 'aa';",
+                     "'ꜳ' > 'aa';",
+                     "'Ꜵ' > 'ao';",
+                     "'ꜵ' > 'ao';",
+                     "'Ꜷ' > 'au';",
+                     "'ꜷ' > 'au';",
+                     "'Ꜹ' > 'av';",
+                     "'ꜹ' > 'av';",
+                     "'Ꜻ' > 'av';",
+                     "'ꜻ' > 'av';",
+                     "'Ꜽ' > 'ay';",
+                     "'ꜽ' > 'ay';",
+                     "'Ꜿ' > 'c';",
+                     "'ꜿ' > 'c';",
+                     "'Ꝁ' > 'k';",
+                     "'ꝁ' > 'k';",
+                     "'Ꝃ' > 'k';",
+                     "'ꝃ' > 'k';",
+                     "'Ꝅ' > 'k';",
+                     "'ꝅ' > 'k';",
+                     "'Ꝉ' > 'l';",
+                     "'ꝉ' > 'l';",
+                     "'Ꝋ' > 'o';",
+                     "'ꝋ' > 'o';",
+                     "'Ꝍ' > 'o';",
+                     "'ꝍ' > 'o';",
+                     "'Ꝏ' > 'oo';",
+                     "'ꝏ' > 'oo';",
+                     "'Ꝑ' > 'p';",
+                     "'ꝑ' > 'p';",
+                     "'Ꝓ' > 'p';",
+                     "'ꝓ' > 'p';",
+                     "'Ꝕ' > 'p';",
+                     "'ꝕ' > 'p';",
+                     "'Ꝗ' > 'q';",
+                     "'ꝗ' > 'q';",
+                     "'Ꝙ' > 'q';",
+                     "'ꝙ' > 'q';",
+                     "'Ꝛ' > 'r';",
+                     "'ꝛ' > 'r';",
+                     "'Ꝝ' > 'rum';",
+                     "'ꝝ' > 'rum';",
+                     "'Ꝟ' > 'v';",
+                     "'ꝟ' > 'v';",
+                     "'Ꝡ' > 'vy';",
+                     "'ꝡ' > 'vy';",
+                     "'Ꝥ' > 'thorn';",
+                     "'ꝥ' > 'thorn';",
+                     "'Ꝧ' > 'thorn';",
+                     "'ꝧ' > 'thorn';",
+                     "'Ꝩ' > 'vend';",
+                     "'ꝩ' > 'vend';",
+                     "'Ꝫ' > 'et';",
+                     "'ꝫ' > 'et';",
+                     "'Ꝭ' > 'is';",
+                     "'ꝭ' > 'is';",
+                     "'Ꝯ' > 'con';",
+                     "'ꝯ' > 'con';",
+                     "'ꝰ' > 'us';",
+                     "'ꝱ' > 'dum';",
+                     "'ꝲ' > 'lum';",
+                     "'ꝳ' > 'mum';",
+                     "'ꝴ' > 'num';",
+                     "'ꝵ' > 'rum';",
+                     "'ꝷ' > 'tum';",
+                     "'ꝸ' > 'um';",
+                     "'Ꞁ' > 'l';",
+                     "'ꞁ' > 'l';",
+                     "'ꟻ' > 'f';",
+                     "'ꟼ' > 'p';",
+                     "'ꟽ' > 'm';",
+                     "'ꟾ' > 'i';",
+                     "'ꟿ' > 'm';",
+                     "'ꠀ' > 'a';",
+                     "'ꠁ' > 'i';",
+                     "'ꠃ' > 'u';",
+                     "'ꠄ' > 'e';",
+                     "'ꠅ' > 'o';",
+                     "'ꠇ' > 'ko';",
+                     "'ꠈ' > 'kho';",
+                     "'ꠉ' > 'go';",
+                     "'ꠊ' > 'gho';",
+                     "'ꠌ' > 'co';",
+                     "'ꠍ' > 'cho';",
+                     "'ꠎ' > 'jo';",
+                     "'ꠏ' > 'jho';",
+                     "'ꠐ' > 'tto';",
+                     "'ꠑ' > 'ttho';",
+                     "'ꠒ' > 'ddo';",
+                     "'ꠓ' > 'ddho';",
+                     "'ꠔ' > 'to';",
+                     "'ꠕ' > 'tho';",
+                     "'ꠖ' > 'do';",
+                     "'ꠗ' > 'dho';",
+                     "'ꠘ' > 'no';",
+                     "'ꠙ' > 'po';",
+                     "'ꠚ' > 'pho';",
+                     "'ꠛ' > 'bo';",
+                     "'ꠜ' > 'bho';",
+                     "'ꠝ' > 'mo';",
+                     "'ꠞ' > 'ro';",
+                     "'ꠟ' > 'lo';",
+                     "'ꠠ' > 'rro';",
+                     "'ꠡ' > 'so';",
+                     "'ꠢ' > 'ho';",
+                     "'ꡀ' > 'ka';",
+                     "'ꡁ' > 'kha';",
+                     "'ꡂ' > 'ga';",
+                     "'ꡃ' > 'nga';",
+                     "'ꡄ' > 'ca';",
+                     "'ꡅ' > 'cha';",
+                     "'ꡆ' > 'ja';",
+                     "'ꡇ' > 'nya';",
+                     "'ꡈ' > 'ta';",
+                     "'ꡉ' > 'tha';",
+                     "'ꡊ' > 'da';",
+                     "'ꡋ' > 'na';",
+                     "'ꡌ' > 'pa';",
+                     "'ꡍ' > 'pha';",
+                     "'ꡎ' > 'ba';",
+                     "'ꡏ' > 'ma';",
+                     "'ꡐ' > 'tsa';",
+                     "'ꡑ' > 'tsha';",
+                     "'ꡒ' > 'dza';",
+                     "'ꡓ' > 'wa';",
+                     "'ꡔ' > 'zha';",
+                     "'ꡕ' > 'za';",
+                     "'ꡖ' > 'a';",
+                     "'ꡗ' > 'ya';",
+                     "'ꡘ' > 'ra';",
+                     "'ꡙ' > 'la';",
+                     "'ꡚ' > 'sha';",
+                     "'ꡛ' > 'sa';",
+                     "'ꡜ' > 'ha';",
+                     "'ꡝ' > 'a';",
+                     "'ꡞ' > 'i';",
+                     "'ꡟ' > 'u';",
+                     "'ꡠ' > 'e';",
+                     "'ꡡ' > 'o';",
+                     "'ꡢ' > 'qa';",
+                     "'ꡣ' > 'xa';",
+                     "'ꡤ' > 'fa';",
+                     "'ꡥ' > 'gga';",
+                     "'ꡦ' > 'ee';",
+                     "'ꡧ' > 'wa';",
+                     "'ꡨ' > 'ya';",
+                     "'ꡩ' > 'tta';",
+                     "'ꡪ' > 'ttha';",
+                     "'ꡫ' > 'dda';",
+                     "'ꡬ' > 'nna';",
+                     "'ꡱ' > 'ra';",
+                     "'ꡲ' > 'ra';",
+                     "'ꡳ' > 'candrabindu';",
+                     "'ꢂ' > 'a';",
+                     "'ꢃ' > 'aa';",
+                     "'ꢄ' > 'i';",
+                     "'ꢅ' > 'ii';",
+                     "'ꢆ' > 'u';",
+                     "'ꢇ' > 'uu';",
+                     "'ꢈ' > 'r';",
+                     "'ꢉ' > 'rr';",
+                     "'ꢊ' > 'l';",
+                     "'ꢋ' > 'll';",
+                     "'ꢌ' > 'e';",
+                     "'ꢍ' > 'ee';",
+                     "'ꢎ' > 'ai';",
+                     "'ꢏ' > 'o';",
+                     "'ꢐ' > 'oo';",
+                     "'ꢑ' > 'au';",
+                     "'ꢒ' > 'ka';",
+                     "'ꢓ' > 'kha';",
+                     "'ꢔ' > 'ga';",
+                     "'ꢕ' > 'gha';",
+                     "'ꢖ' > 'nga';",
+                     "'ꢗ' > 'ca';",
+                     "'ꢘ' > 'cha';",
+                     "'ꢙ' > 'ja';",
+                     "'ꢚ' > 'jha';",
+                     "'ꢛ' > 'nya';",
+                     "'ꢜ' > 'tta';",
+                     "'ꢝ' > 'ttha';",
+                     "'ꢞ' > 'dda';",
+                     "'ꢟ' > 'ddha';",
+                     "'ꢠ' > 'nna';",
+                     "'ꢡ' > 'ta';",
+                     "'ꢢ' > 'tha';",
+                     "'ꢣ' > 'da';",
+                     "'ꢤ' > 'dha';",
+                     "'ꢥ' > 'na';",
+                     "'ꢦ' > 'pa';",
+                     "'ꢧ' > 'pha';",
+                     "'ꢨ' > 'ba';",
+                     "'ꢩ' > 'bha';",
+                     "'ꢪ' > 'ma';",
+                     "'ꢫ' > 'ya';",
+                     "'ꢬ' > 'ra';",
+                     "'ꢭ' > 'la';",
+                     "'ꢮ' > 'va';",
+                     "'ꢯ' > 'sha';",
+                     "'ꢰ' > 'ssa';",
+                     "'ꢱ' > 'sa';",
+                     "'ꢲ' > 'ha';",
+                     "'ꢳ' > 'lla';",
+                     "'ꤊ' > 'ka';",
+                     "'ꤋ' > 'kha';",
+                     "'ꤌ' > 'ga';",
+                     "'ꤍ' > 'nga';",
+                     "'ꤎ' > 'sa';",
+                     "'ꤏ' > 'sha';",
+                     "'ꤐ' > 'za';",
+                     "'ꤑ' > 'nya';",
+                     "'ꤒ' > 'ta';",
+                     "'ꤓ' > 'hta';",
+                     "'ꤔ' > 'na';",
+                     "'ꤕ' > 'pa';",
+                     "'ꤖ' > 'pha';",
+                     "'ꤗ' > 'ma';",
+                     "'ꤘ' > 'da';",
+                     "'ꤙ' > 'ba';",
+                     "'ꤚ' > 'ra';",
+                     "'ꤛ' > 'ya';",
+                     "'ꤜ' > 'la';",
+                     "'ꤝ' > 'wa';",
+                     "'ꤞ' > 'tha';",
+                     "'ꤟ' > 'ha';",
+                     "'ꤠ' > 'va';",
+                     "'ꤡ' > 'ca';",
+                     "'ꤢ' > 'a';",
+                     "'ꤣ' > 'oe';",
+                     "'ꤤ' > 'i';",
+                     "'ꤥ' > 'oo';",
+                     "'ꤰ' > 'ka';",
+                     "'ꤱ' > 'ga';",
+                     "'ꤲ' > 'nga';",
+                     "'ꤳ' > 'ta';",
+                     "'ꤴ' > 'da';",
+                     "'ꤵ' > 'na';",
+                     "'ꤶ' > 'pa';",
+                     "'ꤷ' > 'ba';",
+                     "'ꤸ' > 'ma';",
+                     "'ꤹ' > 'ca';",
+                     "'ꤺ' > 'ja';",
+                     "'ꤻ' > 'nya';",
+                     "'ꤼ' > 'sa';",
+                     "'ꤽ' > 'ra';",
+                     "'ꤾ' > 'la';",
+                     "'ꤿ' > 'ya';",
+                     "'ꥀ' > 'wa';",
+                     "'ꥁ' > 'ha';",
+                     "'ꥂ' > 'mba';",
+                     "'ꥃ' > 'ngga';",
+                     "'ꥄ' > 'nda';",
+                     "'ꥅ' > 'nyja';",
+                     "'ꥆ' > 'a';",
+                     "'ꨀ' > 'a';",
+                     "'ꨁ' > 'i';",
+                     "'ꨂ' > 'u';",
+                     "'ꨃ' > 'e';",
+                     "'ꨄ' > 'ai';",
+                     "'ꨅ' > 'o';",
+                     "'ꨆ' > 'ka';",
+                     "'ꨇ' > 'kha';",
+                     "'ꨈ' > 'ga';",
+                     "'ꨉ' > 'gha';",
+                     "'ꨊ' > 'ngue';",
+                     "'ꨋ' > 'nga';",
+                     "'ꨌ' > 'cha';",
+                     "'ꨍ' > 'chha';",
+                     "'ꨎ' > 'ja';",
+                     "'ꨏ' > 'jha';",
+                     "'ꨐ' > 'nhue';",
+                     "'ꨑ' > 'nha';",
+                     "'ꨒ' > 'nhja';",
+                     "'ꨓ' > 'ta';",
+                     "'ꨔ' > 'tha';",
+                     "'ꨕ' > 'da';",
+                     "'ꨖ' > 'dha';",
+                     "'ꨗ' > 'nue';",
+                     "'ꨘ' > 'na';",
+                     "'ꨙ' > 'dda';",
+                     "'ꨚ' > 'pa';",
+                     "'ꨛ' > 'ppa';",
+                     "'ꨜ' > 'pha';",
+                     "'ꨝ' > 'ba';",
+                     "'ꨞ' > 'bha';",
+                     "'ꨟ' > 'mue';",
+                     "'ꨠ' > 'ma';",
+                     "'ꨡ' > 'bba';",
+                     "'ꨢ' > 'ya';",
+                     "'ꨣ' > 'ra';",
+                     "'ꨤ' > 'la';",
+                     "'ꨥ' > 'va';",
+                     "'ꨦ' > 'ssa';",
+                     "'ꨧ' > 'sa';",
+                     "'ꨨ' > 'ha';",
+                     "'ힰ' > 'gyeol';",
+                     "'ힱ' > 'gyeolg';",
+                     "'ힲ' > 'gyeolm';",
+                     "'ힳ' > 'gyeolb';",
+                     "'ힴ' > 'gyeols';",
+                     "'ힵ' > 'gyeolt';",
+                     "'ힶ' > 'gyeolp';",
+                     "'ힷ' > 'gyeolh';",
+                     "'ힸ' > 'gyeom';",
+                     "'ힹ' > 'gyeob';",
+                     "'ힺ' > 'gyeobs';",
+                     "'ힻ' > 'gyeos';",
+                     "'ힼ' > 'gyeoss';",
+                     "'ힽ' > 'gyeong';",
+                     "'ힾ' > 'gyeoj';",
+                     "'ힿ' > 'gyeoc';",
+                     "'ퟀ' > 'gyeok';",
+                     "'ퟁ' > 'gyeot';",
+                     "'ퟂ' > 'gyeop';",
+                     "'ퟃ' > 'gyeoh';",
+                     "'ퟄ' > 'gye';",
+                     "'ퟅ' > 'gyeg';",
+                     "'ퟆ' > 'gyegg';",
+                     "'ퟋ' > 'gyed';",
+                     "'ퟌ' > 'gyel';",
+                     "'ퟍ' > 'gyelg';",
+                     "'ퟎ' > 'gyelm';",
+                     "'ퟏ' > 'gyelb';",
+                     "'ퟐ' > 'gyels';",
+                     "'ퟑ' > 'gyelt';",
+                     "'ퟒ' > 'gyelp';",
+                     "'ퟓ' > 'gyelh';",
+                     "'ퟔ' > 'gyem';",
+                     "'ퟕ' > 'gyeb';",
+                     "'ퟖ' > 'gyebs';",
+                     "'ퟗ' > 'gyes';",
+                     "'ퟘ' > 'gyess';",
+                     "'ퟙ' > 'gyeng';",
+                     "'ퟚ' > 'gyej';",
+                     "'ퟛ' > 'gyec';",
+                     "'ퟜ' > 'gyek';",
+                     "'ퟝ' > 'gyet';",
+                     "'ퟞ' > 'gyep';",
+                     "'ퟟ' > 'gyeh';",
+                     "'ퟠ' > 'go';",
+                     "'ퟡ' > 'gog';",
+                     "'ퟢ' > 'gogg';",
+                     "'ퟣ' > 'gogs';",
+                     "'ퟤ' > 'gon';",
+                     "'ퟥ' > 'gonj';",
+                     "'ퟦ' > 'gonh';",
+                     "'ퟧ' > 'god';",
+                     "'ퟨ' > 'gol';",
+                     "'ퟩ' > 'golg';",
+                     "'ퟪ' > 'golm';",
+                     "'ퟫ' > 'golb';",
+                     "'ퟬ' > 'gols';",
+                     "'ퟭ' > 'golt';",
+                     "'ퟮ' > 'golp';",
+                     "'ퟯ' > 'golh';",
+                     "'ퟰ' > 'gom';",
+                     "'ퟱ' > 'gob';",
+                     "'ퟲ' > 'gobs';",
+                     "'ퟳ' > 'gos';",
+                     "'ퟴ' > 'goss';",
+                     "'ퟵ' > 'gong';",
+                     "'ퟶ' > 'goj';",
+                     "'ퟷ' > 'goc';",
+                     "'ퟸ' > 'gok';",
+                     "'ퟹ' > 'got';",
+                     "'ퟺ' > 'gop';",
+                     "'ퟻ' > 'goh';",
+                     "'﨎' > 'geuj';",
+                     "'﨏' > 'geuc';",
+                     "'﨑' > 'geut';",
+                     "'﨓' > 'geuh';",
+                     "'﨔' > 'gyi';",
+                     "'﨟' > 'gyilb';",
+                     "'﨡' > 'gyilt';",
+                     "'﨣' > 'gyilh';",
+                     "'﨤' > 'gyim';",
+                     "'﨧' > 'gyis';",
+                     "'﨨' > 'gyiss';",
+                     "'﨩' > 'gying';",
+                     "'ﬓ' > 'ggyegs';",
+                     "'ﬔ' > 'ggyen';",
+                     "'ﬕ' > 'ggyenj';",
+                     "'ﬖ' > 'ggyenh';",
+                     "'ﬗ' > 'ggyed';",
+                     "'ﹳ' > 'nwih';",
+                     "'ー' > 'de';",
+                     "'゙' > 'dyeobs';",
+                     "'゚' > 'dyeos';",
+                     "'ᅠ' > 'dyeoss';",
+                     "'ᄚ' > 'dyel';",
+                     "'ᄡ' > 'dyels';",
+                     ":: Ascii ()",
+                     ":: NFD ()",
+                     "'' >",
+                     "[[:Nonspacing Mark:] [:Cf:]] >",
+                     "[^[:Ascii:]] >",
+                     ":: lower ()",
+                     "[[:Punctuation:][:Space:]]+ > ' '",
+                     ":: NFC ()"
+                   ],
+  "abbreviations": [
+    [" national wildlife refuge area ", " nwra "],
+    [" national recreation area ", " nra "],
+    [" air national guard base ", " angb "],
+    [" zhilishchien komplieks ", " zh k "],
+    [" trung tam thuong mdhi ", " tttm "],
+    [" poligono industrial ", " pgind "],
+    [" trung hoc pho thong ", " thpt "],
+    [" onze lieve vrouw e ", " olv "],
+    [" strada provinciale ", " sp "],
+    ["onze lieve vrouw e ", " olv "],
+    [" punto kilometrico ", " pk "],
+    [" cong vien van hoa ", " cvvh "],
+    [" can cu khong quan ", " cckq "],
+    ["strada provinciale ", " sp "],
+    [" strada regionale ", " sr "],
+    [" strada comunale ", " sc "],
+    ["strada regionale ", " sr "],
+    [" trung hoc co so ", " thcs "],
+    [" san bay quoc te ", " sbqt "],
+    [" cong ty co phyn ", " ctcp "],
+    [" khu cong nghiep ", " kcn "],
+    [" air force base ", " afb "],
+    [" strada statale ", " ss "],
+    [" vien bcyo tang ", " vbt "],
+    ["strada comunale ", " sc "],
+    [" circunvalacion ", " ccvcn "],
+    [" paseo maritimo ", " psmar "],
+    [" wielkopolskie ", " wlkp "],
+    [" national park ", " np "],
+    [" middle school ", " ms "],
+    [" international ", " intl "],
+    [" burgermeister ", " bgm "],
+    [" vuon quoc gia ", " vqg "],
+    [" qucyng truong ", " qt "],
+    ["strada statale ", " ss "],
+    [" state highway ", " sh "],
+    ["burgermeister ", " bgm "],
+    [" right of way ", " rowy "],
+    [" hauptbahnhof ", " hbf "],
+    [" apartamentos ", " aptos "],
+    [" wielkopolski ", " wlkp "],
+    [" burgemeester ", " bg "],
+    [" camino nuevo ", " c n "],
+    [" camino hondo ", " c h "],
+    [" urbanizacion ", " urb "],
+    [" camino viejo ", " c v "],
+    [" wielkopolska ", " wlkp "],
+    [" wojewodztwie ", " woj "],
+    [" county route ", " cr "],
+    [" prolongacion ", " prol "],
+    [" thoroughfare ", " thor "],
+    [" san van dong ", " svd "],
+    [" tong cong ty ", " tct "],
+    [" khu nghi mat ", " knm "],
+    [" nha thi dzu ", " ntd "],
+    [" khu du lich ", " kdl "],
+    [" demarcacion ", " demar "],
+    [" cau ldhc bo ", " clb "],
+    [" interchange ", " intg "],
+    [" distributor ", " dstr "],
+    [" state route ", " sr "],
+    [" wojewodztwo ", " woj "],
+    [" reservation ", " res "],
+    [" monseigneur ", " mgr "],
+    [" transversal ", " trval "],
+    [" extrarradio ", " extrr "],
+    [" high school ", " hs "],
+    [" mazowieckie ", " maz "],
+    [" residencial ", " resid "],
+    [" cong truong ", " ct "],
+    [" cooperativa ", " coop "],
+    [" diseminado ", " disem "],
+    [" barranquil ", " bqllo "],
+    [" fire track ", " ftrk "],
+    [" south east ", " se "],
+    [" north east ", " ne "],
+    [" university ", " univ "],
+    [" south west ", " sw "],
+    [" monasterio ", " mtrio "],
+    [" vecindario ", " vecin "],
+    [" carreterin ", " ctrin "],
+    [" callejuela ", " cjla "],
+    [" north-east ", " ne "],
+    [" south-west ", " sw "],
+    [" gebroeders ", " gebr "],
+    [" serviceway ", " swy "],
+    [" quadrangle ", " qdgl "],
+    [" commandant ", " cmdt "],
+    [" extramuros ", " extrm "],
+    [" escalinata ", " escal "],
+    [" north-west ", " n "],
+    [" bulevardul ", " bd "],
+    [" particular ", " parti "],
+    [" mazowiecka ", " maz "],
+    [" mazowiecki ", " maz "],
+    [" north west ", " n "],
+    [" industrial ", " ind "],
+    [" costanilla ", " cstan "],
+    [" khach sdhn ", " ks "],
+    [" south-east ", " se "],
+    [" phi truong ", " pt "],
+    [" expressway ", " exp "],
+    [" fondamenta ", " f ta "],
+    [" apartments ", " apts "],
+    [" cul de sac ", " cds "],
+    [" corralillo ", " crrlo "],
+    [" mitropolit ", " mit "],
+    [" etorbidea ", " etorb "],
+    [" ploshchad ", " pl "],
+    [" cobertizo ", " cbtiz "],
+    [" underpass ", " upas "],
+    [" crossroad ", " crd "],
+    [" fundatura ", " fnd "],
+    [" foreshore ", " fshr "],
+    [" parklands ", " pkld "],
+    [" esplanade ", " esp "],
+    [" centreway ", " cnwy "],
+    [" formation ", " form "],
+    [" explanada ", " expla "],
+    [" viviendas ", " vvdas "],
+    [" northeast ", " ne "],
+    [" cong vien ", " cv "],
+    [" northwest ", " n "],
+    [" buildings ", " bldgs "],
+    [" errepidea ", " err "],
+    [" extension ", " ex "],
+    [" municipal ", " mun "],
+    [" southeast ", " se "],
+    [" sanatorio ", " sanat "],
+    [" thanh pho ", " tp "],
+    [" firetrail ", " fit "],
+    [" santuario ", " santu "],
+    [" southwest ", " sw "],
+    [" autopista ", " auto "],
+    [" president ", " pres "],
+    [" rinconada ", " rcda "],
+    [" kardinaal ", " kard "],
+    [" plazoleta ", " pzta "],
+    [" duong sat ", " ds "],
+    [" trung tam ", " tt "],
+    [" piazzetta ", " pta "],
+    [" boardwalk ", " bwlk "],
+    [" bulievard ", " bd "],
+    [" luitenant ", " luit "],
+    [" courtyard ", " ctyd "],
+    [" reservoir ", " res "],
+    [" bulevardu ", " bd "],
+    [" community ", " comm "],
+    [" concourse ", " con "],
+    [" profiesor ", " prof "],
+    [" promenade ", " prom "],
+    [" gienieral ", " ghien "],
+    [" puistikko ", " pko "],
+    [" balneario ", " balnr "],
+    [" carretera ", " ctra "],
+    [" ingenieur ", " ir "],
+    [" boulevard ", " bd "],
+    [" deviation ", " devn "],
+    [" hipodromo ", " hipod "],
+    [" professor ", " prof "],
+    [" triangle ", " tri "],
+    [" dotsient ", " dots "],
+    [" boundary ", " bdy "],
+    [" salizada ", " s da "],
+    [" trunkway ", " tkwy "],
+    [" cinturon ", " cint "],
+    ["president ", " pres "],
+    [" military ", " mil "],
+    [" jonkheer ", " jhr "],
+    [" motorway ", " mwy "],
+    [" steenweg ", " stwg "],
+    [" crescent ", " cr "],
+    [" kanunnik ", " kan "],
+    [" koningin ", " kon "],
+    [" crossing ", " xing "],
+    [" callejon ", " cjon "],
+    [" pasadizo ", " pzo "],
+    [" crossway ", " cowy "],
+    [" cottages ", " cotts "],
+    [" mountain ", " mtn "],
+    [" business ", " bus "],
+    [" pierwszy ", " 1 "],
+    [" pierwsza ", " 1 "],
+    [" pierwsze ", " 1 "],
+    [" barriada ", " barda "],
+    [" entrance ", " ent "],
+    [" causeway ", " cway "],
+    [" generaal ", " gen "],
+    [" driveway ", " dvwy "],
+    [" township ", " twp "],
+    [" stazione ", " staz "],
+    [" broadway ", " bway "],
+    [" alleyway ", " alwy "],
+    [" quadrant ", " qdrt "],
+    [" apeadero ", " apdro "],
+    [" arboleda ", " arb "],
+    [" escalera ", " esca "],
+    [" rdhp hat ", " rh "],
+    [" transito ", " trans "],
+    [" ddhi hoc ", " dh "],
+    [" travesia ", " trva "],
+    [" barranco ", " branc "],
+    [" namestie ", " nam "],
+    [" viaducto ", " vcto "],
+    [" convento ", " cnvto "],
+    [" estacion ", " estcn "],
+    ["puistikko ", " pko "],
+    [" precinct ", " pct "],
+    [" heiligen ", " hl "],
+    [" edificio ", " edifc "],
+    [" prazuela ", " przla "],
+    [" thi trzn ", " tt "],
+    [" ridgeway ", " rgwy "],
+    [" riverway ", " rvwy "],
+    [" corredor ", " crrdo "],
+    [" passatge ", " ptge "],
+    [" junction ", " jnc "],
+    [" hospital ", " hosp "],
+    [" highroad ", " hrd "],
+    [" torrente ", " trrnt "],
+    [" avinguda ", " av "],
+    [" portillo ", " ptilo "],
+    [" diagonal ", " diag "],
+    [" buu dien ", " bd "],
+    [" alqueria ", " alque "],
+    [" poligono ", " polig "],
+    [" roadside ", " rdsd "],
+    [" glorieta ", " gta "],
+    [" fundacul ", " fdc "],
+    [" cao dang ", " cd "],
+    [" rosebowl ", " rsbl "],
+    [" complejo ", " compj "],
+    [" carretil ", " crtil "],
+    [" intrarea ", " int "],
+    [" gran via ", " g v "],
+    [" approach ", " app "],
+    [" stradela ", " sdla "],
+    [" conjunto ", " cjto "],
+    [" arterial ", " artl "],
+    [" plazuela ", " plzla "],
+    [" frontage ", " frtg "],
+    [" faubourg ", " fg "],
+    [" mansions ", " mans "],
+    [" turnpike ", " tpk "],
+    [" piazzale ", " p le "],
+    [" tieu hoc ", " th "],
+    [" bulevard ", " bd "],
+    [" sendera ", " sedra "],
+    [" cutting ", " cutt "],
+    [" cantina ", " canti "],
+    [" cantera ", " cantr "],
+    [" rotonda ", " rtda "],
+    [" pasillo ", " psllo "],
+    [" landing ", " ldg "],
+    [" kolonel ", " kol "],
+    [" cong ty ", " cty "],
+    [" fairway ", " fawy "],
+    [" highway ", " hwy "],
+    [" lookout ", " lkt "],
+    [" meander ", " mr "],
+    [" carrera ", " cra "],
+    [" station ", " stn "],
+    [" kapitan ", " kap "],
+    [" medical ", " med "],
+    [" broeder ", " br "],
+    [" poblado ", " pbdo "],
+    [" impasse ", " imp "],
+    [" gardens ", " gdn "],
+    [" nha tho ", " nt "],
+    [" nha hat ", " nh "],
+    [" freeway ", " fwy "],
+    [" trasera ", " tras "],
+    [" portico ", " prtco "],
+    [" terrace ", " ter "],
+    [" heights ", " hts "],
+    [" camping ", " campg "],
+    [" callizo ", " cllzo "],
+    [" footway ", " ftwy "],
+    [" calzada ", " czada "],
+    [" dominee ", " ds "],
+    [" meadows ", " mdws "],
+    [" sendero ", " send "],
+    [" osiedle ", " os "],
+    [" estrada ", " estda "],
+    [" avenida ", " av "],
+    [" zgornji ", " zg "],
+    [" zgornje ", " zg "],
+    [" zgornja ", " zg "],
+    [" arrabal ", " arral "],
+    [" espalda ", " eslda "],
+    [" entrada ", " entd "],
+    [" kleiner ", " kl "],
+    [" kleines ", " kl "],
+    [" viaduct ", " via "],
+    [" roadway ", " rdwy "],
+    [" strasse ", " st "],
+    [" spodnje ", " sp "],
+    [" spodnji ", " sp "],
+    [" spodnja ", " sp "],
+    [" fabrica ", " fca "],
+    [" muntele ", " mt "],
+    [" maantee ", " mt "],
+    [" srednje ", " sr "],
+    [" unterer ", " u "],
+    [" unteres ", " u "],
+    [" plateau ", " plat "],
+    [" srednji ", " sr "],
+    [" empresa ", " empr "],
+    [" angosta ", " angta "],
+    [" costera ", " coste "],
+    [" tinh lo ", " tl "],
+    [" quoc lo ", " ql "],
+    [" auf der ", " a d "],
+    [" bulvari ", " bl "],
+    [" ddhi lo ", " dl "],
+    [" namesti ", " nam "],
+    [" passeig ", " pg "],
+    [" carrero ", " cro "],
+    [" cortijo ", " crtjo "],
+    [" san bay ", " sb "],
+    [" riviera ", " rvra "],
+    [" caddesi ", " cd "],
+    [" andador ", " andad "],
+    [" walkway ", " wkwy "],
+    [" granden ", " gr "],
+    [" grosser ", " gr "],
+    [" grosses ", " gr "],
+    [" reserve ", " res "],
+    [" alameda ", " alam "],
+    [" retreat ", " rtt "],
+    [" acequia ", " aceq "],
+    [" platsen ", " pl "],
+    [" bahnhof ", " bf "],
+    [" autovia ", " autov "],
+    [" srednja ", " sr "],
+    [" galeria ", " gale "],
+    [" circuit ", " cct "],
+    [" svingen ", " sv "],
+    [" plassen ", " pl "],
+    [" mirador ", " mrdor "],
+    [" laneway ", " lnwy "],
+    [" kolonia ", " kol "],
+    [" outlook ", " otlk "],
+    [" caravan ", " cvn "],
+    [" osiedlu ", " os "],
+    [" palacio ", " palac "],
+    [" pantano ", " pant "],
+    [" partida ", " ptda "],
+    [" calleja ", " cllja "],
+    [" mevrouw ", " mevr "],
+    [" meester ", " mr "],
+    [" pastoor ", " past "],
+    [" prinses ", " pr "],
+    [" bulevar ", " bd "],
+    [" tollway ", " tlwy "],
+    ["steenweg ", " stwg "],
+    [" caserio ", " csrio "],
+    [" mercado ", " merc "],
+    [" alejach ", " al "],
+    [" kvartal ", " kv "],
+    [" parkway ", " pwy "],
+    [" passage ", " ps "],
+    [" pathway ", " pway "],
+    [" splaiul ", " sp "],
+    [" soseaua ", " sos "],
+    [" colonia ", " col "],
+    [" wielkie ", " wlk "],
+    [" trzecie ", " 3 "],
+    [" llanura ", " llnra "],
+    [" malecon ", " malec "],
+    [" trzecia ", " 3 "],
+    [" trailer ", " trlr "],
+    [" cuadra ", " cuadr "],
+    [" cty cp ", " ctcp "],
+    [" paraje ", " praje "],
+    [" parque ", " pque "],
+    [" piazza ", " p za "],
+    [" puerta ", " pta "],
+    [" little ", " lt "],
+    [" pueblo ", " pblo "],
+    [" puente ", " pnte "],
+    [" jardin ", " jdin "],
+    [" granja ", " granj "],
+    [" market ", " mkt "],
+    [" pasaje ", " psaje "],
+    [" rotary ", " rty "],
+    [" corral ", " crral "],
+    [" siding ", " sdng "],
+    [" nucleo ", " ncleo "],
+    [" muelle ", " muell "],
+    [" carril ", " crril "],
+    [" portal ", " prtal "],
+    [" ramble ", " rmbl "],
+    [" pocket ", " pkt "],
+    [" chalet ", " chlet "],
+    [" canton ", " cant "],
+    [" ladera ", " ldera "],
+    [" parade ", " pde "],
+    [" dehesa ", " dhsa "],
+    [" museum ", " mus "],
+    [" middle ", " mid "],
+    [" cuesta ", " custa "],
+    [" gracht ", " gr "],
+    [" virful ", " vf "],
+    [" m tele ", " mt "],
+    [" varful ", " vf "],
+    [" str la ", " sdla "],
+    [" arcade ", " arc "],
+    [" strada ", " st "],
+    [" access ", " accs "],
+    [" bajada ", " bjada "],
+    [" veliki ", " v "],
+    ["strasse ", " st "],
+    [" velike ", " v "],
+    [" untere ", " u "],
+    [" velika ", " v "],
+    [" artery ", " arty "],
+    [" avenue ", " av "],
+    [" miasto ", " m "],
+    [" bypass ", " byp "],
+    [" placem ", " pl "],
+    [" barrio ", " bo "],
+    [" center ", " ctr "],
+    [" bldngs ", " bldgs "],
+    [" puerto ", " pto "],
+    [" wielka ", " wlk "],
+    [" tunnel ", " tun "],
+    [" wielki ", " wlk "],
+    [" bridge ", " bri "],
+    [" trzeci ", " 3 "],
+    [" veliko ", " v "],
+    [" quelle ", " qu "],
+    [" acceso ", " acces "],
+    [" bulvar ", " bl "],
+    [" sokagi ", " sk "],
+    ["platsen ", " pl "],
+    [" stigen ", " st "],
+    [" brucke ", " br "],
+    [" an der ", " a d "],
+    [" thi xa ", " tx "],
+    [" nordre ", " ndr "],
+    [" rambla ", " rbla "],
+    [" sondre ", " sdr "],
+    ["quoc lo ", " ql "],
+    [" phuong ", " p "],
+    [" vastra ", " v "],
+    [" carrer ", " c "],
+    [" oberes ", " o "],
+    [" raitti ", " r "],
+    [" puisto ", " ps "],
+    [" arroyo ", " arry "],
+    [" penger ", " pgr "],
+    [" oberer ", " o "],
+    [" kleine ", " kl "],
+    [" grosse ", " gr "],
+    ["granden ", " gr "],
+    [" villas ", " vlls "],
+    [" taival ", " tvl "],
+    [" in der ", " i d "],
+    [" centre ", " ctr "],
+    [" drugie ", " 2 "],
+    [" dokter ", " dr "],
+    [" grange ", " gra "],
+    [" doctor ", " dr "],
+    [" vicolo ", " v lo "],
+    [" kort e ", " k "],
+    [" koning ", " kon "],
+    [" straat ", " st "],
+    [" svieti ", " sv "],
+    [" callej ", " cjon "],
+    [" ground ", " grnd "],
+    [" vereda ", " vreda "],
+    [" chemin ", " ch "],
+    [" street ", " st "],
+    [" strand ", " st "],
+    [" sainte ", " ste "],
+    [" camino ", " cno "],
+    [" garden ", " gdn "],
+    [" follow ", " folw "],
+    [" estate ", " est "],
+    [" doktor ", " d r "],
+    [" subway ", " sbwy "],
+    [" ulitsa ", " ul "],
+    [" square ", " sq "],
+    [" towers ", " twrs "],
+    ["plassen ", " pl "],
+    [" county ", " co "],
+    [" brazal ", " brzal "],
+    [" circus ", " crcs "],
+    ["svingen ", " sv "],
+    [" rampla ", " rampa "],
+    [" bloque ", " blque "],
+    [" circle ", " cir "],
+    [" island ", " is "],
+    [" common ", " comm "],
+    [" ribera ", " rbra "],
+    [" sector ", " sect "],
+    [" rincon ", " rcon "],
+    [" van de ", " vd "],
+    [" corner ", " cnr "],
+    [" subida ", " sbida "],
+    [" banda ", " b "],
+    [" bulev ", " bd "],
+    [" barro ", " bo "],
+    [" cllon ", " cjon "],
+    [" p zza ", " p za "],
+    [" drugi ", " 2 "],
+    [" druga ", " 2 "],
+    [" placu ", " pl "],
+    [" aleji ", " al "],
+    [" aleja ", " al "],
+    [" aleje ", " al "],
+    [" stary ", " st "],
+    [" stara ", " st "],
+    [" dolny ", " dln "],
+    [" dolna ", " dln "],
+    [" gorne ", " gn "],
+    [" gorna ", " gn "],
+    [" stare ", " st "],
+    [" gorny ", " gn "],
+    [" ulicy ", " ul "],
+    [" ulica ", " ul "],
+    [" o l v ", " olv "],
+    [" plein ", " pln "],
+    [" markt ", " mkt "],
+    [" lange ", " l "],
+    [" viale ", " v le "],
+    ["gracht ", " gr "],
+    [" prins ", " pr "],
+    ["straat ", " st "],
+    [" plass ", " pl "],
+    [" sving ", " sv "],
+    [" gaten ", " g "],
+    [" veien ", " v "],
+    [" vliet ", " vlt "],
+    [" dolne ", " dln "],
+    [" b dul ", " bd "],
+    [" sodra ", " s "],
+    [" norra ", " n "],
+    [" gamla ", " gla "],
+    [" grand ", " gr "],
+    [" vagen ", " v "],
+    [" gatan ", " g "],
+    [" ostra ", " o "],
+    ["vastra ", " v "],
+    [" cadde ", " cd "],
+    [" duong ", " d "],
+    [" sokak ", " sk "],
+    [" plats ", " pl "],
+    ["stigen ", " st "],
+    [" vayla ", " vla "],
+    ["taival ", " tvl "],
+    [" sveti ", " sv "],
+    [" aukio ", " auk "],
+    [" sveta ", " sv "],
+    [" cesta ", " c "],
+    [" piata ", " pta "],
+    [" aleea ", " al "],
+    [" kaari ", " kri "],
+    ["penger ", " pgr "],
+    [" ranta ", " rt "],
+    [" rinne ", " rn "],
+    ["raitti ", " r "],
+    ["puisto ", " ps "],
+    [" polku ", " p "],
+    [" porta ", " pta "],
+    [" ponte ", " p te "],
+    [" paseo ", " po "],
+    [" fbrca ", " fca "],
+    [" allee ", " al "],
+    [" cours ", " crs "],
+    ["sainte ", " ste "],
+    ["square ", " sq "],
+    [" largo ", " l go "],
+    [" wharf ", " whrf "],
+    [" corte ", " c te "],
+    [" corso ", " c so "],
+    [" campo ", " c po "],
+    [" santa ", " sta "],
+    [" calle ", " c "],
+    [" strip ", " strp "],
+    [" alley ", " al "],
+    [" north ", " n "],
+    [" block ", " blk "],
+    [" gully ", " gly "],
+    [" sielo ", " s "],
+    [" brace ", " br "],
+    [" ronde ", " rnde "],
+    [" grove ", " gr "],
+    [" break ", " brk "],
+    [" roads ", " rds "],
+    [" track ", " trk "],
+    [" house ", " ho "],
+    [" trail ", " trl "],
+    [" mount ", " mt "],
+    [" cross ", " crss "],
+    [" beach ", " bch "],
+    [" point ", " pt "],
+    [" basin ", " basn "],
+    [" green ", " gn "],
+    [" plaza ", " pl "],
+    [" lille ", " ll "],
+    [" slope ", " slpe "],
+    [" placa ", " pl "],
+    [" place ", " pl "],
+    [" shunt ", " shun "],
+    [" saint ", " st "],
+    [" ulice ", " ul "],
+    [" amble ", " ambl "],
+    [" route ", " rt "],
+    [" sound ", " snd "],
+    [" store ", " st "],
+    [" front ", " frnt "],
+    [" elbow ", " elb "],
+    [" glade ", " gl "],
+    [" south ", " s "],
+    [" round ", " rnd "],
+    [" drive ", " dr "],
+    [" croft ", " cft "],
+    [" platz ", " pl "],
+    [" ferry ", " fy "],
+    [" ridge ", " rdge "],
+    [" tanav ", " tn "],
+    [" banan ", " ba "],
+    [" quays ", " qys "],
+    [" sankt ", " st "],
+    [" vkhod ", " vkh "],
+    [" chase ", " ch "],
+    [" vista ", " vsta "],
+    [" rhein ", " rh "],
+    [" court ", " ct "],
+    ["brucke ", " br "],
+    [" upper ", " up "],
+    [" river ", " r "],
+    [" range ", " rnge "],
+    [" lower ", " lr "],
+    [" kalea ", " k "],
+    [" crest ", " crst "],
+    [" obere ", " o "],
+    [" manor ", " mnr "],
+    [" byway ", " bywy "],
+    [" reach ", " rch "],
+    [" copse ", " cps "],
+    ["quelle ", " qu "],
+    [" creek ", " cr "],
+    [" close ", " c "],
+    [" fort ", " ft "],
+    [" apch ", " app "],
+    [" mont ", " mt "],
+    [" bdul ", " bd "],
+    ["saint ", " st "],
+    [" back ", " bk "],
+    [" c le ", " c "],
+    ["place ", " pl "],
+    [" frwy ", " fwy "],
+    [" quai ", " qu "],
+    [" ally ", " al "],
+    [" m te ", " mt "],
+    [" lane ", " ln "],
+    ["aukio ", " auk "],
+    [" loop ", " lp "],
+    [" line ", " ln "],
+    [" alue ", " al "],
+    [" link ", " lk "],
+    [" glde ", " gl "],
+    [" alea ", " al "],
+    [" gate ", " g "],
+    [" intr ", " int "],
+    [" gdns ", " gdn "],
+    [" hird ", " hrd "],
+    [" varf ", " vf "],
+    [" virf ", " vf "],
+    [" hgts ", " hts "],
+    [" expy ", " exp "],
+    ["markt ", " mkt "],
+    [" bypa ", " byp "],
+    ["o l v ", " olv "],
+    [" cres ", " cr "],
+    [" bdwy ", " bway "],
+    [" csac ", " cds "],
+    [" nowy ", " n "],
+    [" laan ", " ln "],
+    [" crsg ", " xing "],
+    ["vliet ", " vlt "],
+    [" city ", " cty "],
+    ["sving ", " sv "],
+    ["plass ", " pl "],
+    ["gaten ", " g "],
+    ["veien ", " v "],
+    [" gata ", " g "],
+    [" sint ", " st "],
+    [" caus ", " cway "],
+    [" cove ", " cv "],
+    ["plein ", " pln "],
+    [" cswy ", " cway "],
+    [" plac ", " pl "],
+    [" nowa ", " n "],
+    [" kolo ", " k "],
+    [" katu ", " k "],
+    [" duze ", " dz "],
+    [" blvd ", " bd "],
+    [" p ta ", " pta "],
+    [" maly ", " ml "],
+    [" mala ", " ml "],
+    [" bdge ", " bri "],
+    [" nowe ", " n "],
+    [" brdg ", " bri "],
+    [" male ", " ml "],
+    [" drwy ", " dvwy "],
+    [" duza ", " dz "],
+    [" utca ", " u "],
+    [" east ", " e "],
+    [" duzy ", " dz "],
+    ["kaari ", " kri "],
+    [" quan ", " q "],
+    [" svwy ", " swy "],
+    [" shwy ", " sh "],
+    [" road ", " rd "],
+    ["sankt ", " st "],
+    [" quay ", " qy "],
+    ["plats ", " pl "],
+    [" rise ", " ri "],
+    [" berg ", " bg "],
+    [" tcty ", " tct "],
+    [" viad ", " via "],
+    [" view ", " vw "],
+    [" vdct ", " via "],
+    [" vale ", " v "],
+    [" avda ", " av "],
+    [" grad ", " ghr "],
+    [" walk ", " wlk "],
+    [" west ", " w "],
+    [" yard ", " yd "],
+    [" blok ", " bl "],
+    [" terr ", " ter "],
+    [" cmno ", " cno "],
+    [" stra ", " st "],
+    [" thfr ", " thor "],
+    [" turn ", " tn "],
+    [" tpke ", " tpk "],
+    [" burg ", " bg "],
+    ["vayla ", " vla "],
+    ["vagen ", " v "],
+    [" tori ", " tr "],
+    ["gatan ", " g "],
+    ["grand ", " gr "],
+    [" pass ", " ps "],
+    [" pkwy ", " pwy "],
+    [" park ", " pk "],
+    ["rinne ", " rn "],
+    [" mtwy ", " mwy "],
+    [" mndr ", " mr "],
+    [" kyla ", " kl "],
+    [" kuja ", " kj "],
+    ["platz ", " pl "],
+    ["ranta ", " rt "],
+    [" mile ", " mi "],
+    [" pfad ", " p "],
+    [" mews ", " m "],
+    ["polku ", " p "],
+    [" psge ", " ps "],
+    [" plza ", " pl "],
+    ["ostra ", " o "],
+    ["gamla ", " gla "],
+    [" stig ", " st "],
+    ["norra ", " n "],
+    ["sodra ", " s "],
+    [" pike ", " pk "],
+    [" dorf ", " df "],
+    [" piaz ", " p za "],
+    [" phwy ", " pway "],
+    ["pfad ", " p "],
+    [" mnt ", " mt "],
+    ["gata ", " g "],
+    [" bhf ", " bf "],
+    [" bad ", " b "],
+    ["gate ", " g "],
+    [" zum ", " z "],
+    ["stig ", " st "],
+    [" blv ", " bd "],
+    ["kuja ", " kj "],
+    [" bul ", " bd "],
+    [" str ", " st "],
+    ["alue ", " al "],
+    [" cen ", " ctr "],
+    [" ave ", " av "],
+    ["kyla ", " kl "],
+    [" ale ", " al "],
+    [" spl ", " sp "],
+    [" all ", " al "],
+    [" k s ", " ks "],
+    [" aly ", " al "],
+    ["dorf ", " df "],
+    [" bvd ", " bd "],
+    [" vag ", " v "],
+    [" iii ", " 3 "],
+    [" tie ", " t "],
+    [" sok ", " sk "],
+    ["burg ", " bg "],
+    ["katu ", " k "],
+    ["berg ", " bg "],
+    ["tori ", " tr "],
+    [" kte ", " k "],
+    [" gro ", " gr "],
+    [" grn ", " gn "],
+    [" gld ", " gl "],
+    [" san ", " s "],
+    [" hse ", " ho "],
+    [" gte ", " g "],
+    [" rte ", " rt "],
+    [" rue ", " r "],
+    [" che ", " ch "],
+    [" pas ", " ps "],
+    [" plz ", " pl "],
+    [" pnt ", " pt "],
+    [" pky ", " pwy "],
+    [" pza ", " pl "],
+    [" rvr ", " r "],
+    [" riv ", " r "],
+    [" lit ", " lt "],
+    [" p k ", " pk "],
+    [" lwr ", " lr "],
+    [" low ", " lr "],
+    [" sth ", " s "],
+    [" crk ", " cr "],
+    ["pres ", " pres "],
+    ["laan ", " ln "],
+    [" bda ", " b "],
+    [" vei ", " v "],
+    [" via ", " v "],
+    [" way ", " wy "],
+    [" upr ", " up "],
+    [" avd ", " av "],
+    [" crt ", " ct "],
+    ["stwg ", " stwg "],
+    ["sint ", " st "],
+    [" v d ", " vd "],
+    [" van ", " v "],
+    [" drv ", " dr "],
+    [" tce ", " ter "],
+    [" va ", " v "],
+    [" oa ", " o "],
+    [" sa ", " s "],
+    [" na ", " n "],
+    ["bgm ", " bgm "],
+    [" nw ", " n "],
+    ["vag ", " v "],
+    [" im ", " 1 "],
+    ["vla ", " vla "],
+    ["gla ", " gla "],
+    [" am ", " a "],
+    [" ph ", " p "],
+    ["rue ", " r "],
+    [" ga ", " g "],
+    ["ste ", " ste "],
+    ["str ", " st "],
+    [" cl ", " c "],
+    [" vn ", " v "],
+    [" gt ", " g "],
+    ["vei ", " v "],
+    ["vlt ", " vlt "],
+    [" ce ", " cv "],
+    [" ii ", " 2 "],
+    ["pln ", " pln "],
+    ["olv ", " olv "],
+    ["mkt ", " mkt "],
+    ["tvl ", " tvl "],
+    [" ob ", " o "],
+    ["pgr ", " pgr "],
+    [" in ", " 1 "],
+    [" mw ", " m "],
+    ["kri ", " kri "],
+    ["pko ", " pko "],
+    ["auk ", " auk "],
+    ["tie ", " t "],
+    [" i ", " 1 "]
+  ]
+}
index ea353f4568ad46f83f8d62d7511ef400e7234e67..6d697ef96fe7ea3a3cded93f086d8e4f96ab1174 100644 (file)
@@ -163,7 +163,7 @@ Feature: Search queries
         Then exactly 0 results are returned
 
     Scenario: Ignore country searches when query is restricted to countries
-        When sending json search query "de"
+        When sending json search query "fr"
             | countrycodes |
             | li  |
         Then exactly 0 results are returned
index f3019e2a52fe9e6e03b65e190443da5633e5a51a..bb29d2a314a46f7f917e298c4cf37fa311eb340c 100644 (file)
@@ -37,3 +37,24 @@ Feature: Import and search of names
         Then placex contains
           | object | country_code | name   | name+name:fi | name+name:de |
           | N1     | de           | german | finnish      | local        |
+
+    Scenario Outline: Names in any script can be found
+        Given the places
+            | osm | class | type   | name   |
+            | N1  | place | hamlet | <name> |
+        When importing
+        And sending search query "<name>"
+        Then results contain
+            | osm |
+            | N1  |
+
+     Examples:
+        | name |
+        | Berlin |
+        | 北京 |
+        | Вологда |
+        | Αθήνα |
+        | القاهرة |
+        | រាជធានីភ្នំពេញ |
+        | 東京都 |
+        | ပုဗ္ဗသီရိ |
index 0fe440ce835210144b9046afd7c7e641f8f70e9b..c8b5de5cf9c7d6b9b137c24baa586a2e5c18e838 100644 (file)
@@ -4,22 +4,22 @@ Feature: Rank assignment
 
     Scenario: Ranks for place nodes are assigned according to their type
         Given the named places
-          | osm  | class     | type      |
-          | N1   | foo       | bar       |
-          | N11  | place     | Continent |
-          | N12  | place     | continent |
-          | N13  | place     | sea       |
-          | N14  | place     | country   |
-          | N15  | place     | state     |
-          | N16  | place     | region    |
-          | N17  | place     | county    |
-          | N18  | place     | city      |
-          | N19  | place     | island    |
-          | N36  | place     | house               |
-          | N38  | place     | houses              |
+          | osm  | class     | type      | geometry |
+          | N1   | foo       | bar       | 0 0 |
+          | N11  | place     | Continent | 0 0 |
+          | N12  | place     | continent | 0 0 |
+          | N13  | place     | sea       | 0 0 |
+          | N14  | place     | country   | 0 0 |
+          | N15  | place     | state     | 0 0 |
+          | N16  | place     | region    | 0 0 |
+          | N17  | place     | county    | 0 0 |
+          | N18  | place     | city      | 0 0 |
+          | N19  | place     | island    | 0 0 |
+          | N36  | place     | house     | 0 0 |
+          | N38  | place     | houses    | 0 0 |
         And the named places
-          | osm  | class     | type      | extra+capital |
-          | N101 | place     | city      | yes |
+          | osm  | class     | type      | extra+capital | geometry |
+          | N101 | place     | city      | yes           | 0 0 |
         When importing
         Then placex contains
           | object | rank_search | rank_address |
index 0e922e1d6b503716f649ff8b13409adf21bd9a1f..fd207059408fc96092b537149c0f83e807a5260c 100644 (file)
@@ -24,7 +24,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | nameaddress_vector |
-         | N1     | Rose, Street, Walltown |
+         | N1     | #Rose Street, Walltown |
         When searching for "23 Rose Street, Walltown"
         Then results contain
          | osm_type | osm_id | name |
@@ -248,7 +248,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | name_vector | nameaddress_vector |
-         | N1     | #Green Moss | Rose, Street, Walltown |
+         | N1     | #Green Moss | #Rose Street, Walltown |
         When searching for "Green Moss, Rose Street, Walltown"
         Then results contain
          | osm_type | osm_id | name |
@@ -299,7 +299,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | name_vector | nameaddress_vector |
-         | N1     | foo         | the road |
+         | N1     | foo         | #the road |
 
     Scenario: Some addr: tags are added to address
         Given the scene roads-with-pois
index 30ea30a261c873cd10fecf334ce1dca9a5d90cf6..f179c8f13da343283b0aaf4deb855587a471cd6f 100644 (file)
@@ -20,6 +20,7 @@ userconfig = {
     'API_TEST_DB' : 'test_api_nominatim',
     'API_TEST_FILE'  : (TEST_BASE_DIR / 'testdb' / 'apidb-test-data.pbf').resolve(),
     'SERVER_MODULE_PATH' : None,
+    'TOKENIZER' : None, # Test with a custom tokenizer
     'PHPCOV' : False, # set to output directory to enable code coverage
 }
 
index beafcd9e1ee16773294f8279d8dd26ce080a3d6e..247a397bd47b1ed7c7d5e886e4c95e1811759973 100644 (file)
@@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET
 
 from check_functions import Almost
 
+OSM_TYPE = {'N' : 'node', 'W' : 'way', 'R' : 'relation'}
+
 def _geojson_result_to_json_result(geojson_result):
     result = geojson_result['properties']
     result['geojson'] = geojson_result['geometry']
@@ -131,7 +133,11 @@ class GenericResponse:
                 if name == 'ID':
                     pass
                 elif name == 'osm':
-                    self.assert_field(i, 'osm_type', value[0])
+                    assert 'osm_type' in self.result[i], \
+                           "Result row {} has no field 'osm_type'.\nFull row: {}"\
+                               .format(i, json.dumps(self.result[i], indent=4))
+                    assert self.result[i]['osm_type'] in (OSM_TYPE[value[0]], value[0]), \
+                           BadRowValueAssert(self, i, 'osm_type', value)
                     self.assert_field(i, 'osm_id', value[1:])
                 elif name == 'centroid':
                     lon, lat = value.split(' ')
index 6381e4b4a2da8865c89e711910458a2a7fb13d74..de02e3460b420a8991c49be208b5171b339463c1 100644 (file)
@@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
 from nominatim import cli
 from nominatim.config import Configuration
 from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
 from steps.utils import run_script
 
 class NominatimEnvironment:
@@ -27,6 +28,7 @@ class NominatimEnvironment:
         self.test_db = config['TEST_DB']
         self.api_test_db = config['API_TEST_DB']
         self.api_test_file = config['API_TEST_FILE']
+        self.tokenizer = config['TOKENIZER']
         self.server_module_path = config['SERVER_MODULE_PATH']
         self.reuse_template = not config['REMOVE_TEMPLATE']
         self.keep_scenario_db = config['KEEP_TEST_DB']
@@ -95,6 +97,8 @@ class NominatimEnvironment:
         self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve())
         self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve())
         self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve())
+        if self.tokenizer is not None:
+            self.test_env['NOMINATIM_TOKENIZER'] = self.tokenizer
 
         if self.server_module_path:
             self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path
@@ -106,9 +110,19 @@ class NominatimEnvironment:
             self.website_dir.cleanup()
 
         self.website_dir = tempfile.TemporaryDirectory()
-        cfg = Configuration(None, self.src_dir / 'settings', environ=self.test_env)
-        cfg.lib_dir.php = self.src_dir / 'lib-php'
-        refresh.setup_website(Path(self.website_dir.name) / 'website', cfg)
+        refresh.setup_website(Path(self.website_dir.name) / 'website',
+                              self.get_test_config())
+
+
+    def get_test_config(self):
+        cfg = Configuration(Path(self.website_dir.name), self.src_dir / 'settings',
+                            environ=self.test_env)
+        cfg.set_libdirs(module=self.build_dir / 'module',
+                        osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql',
+                        php=self.src_dir / 'lib-php',
+                        sql=self.src_dir / 'lib-sql',
+                        data=self.src_dir / 'data')
+        return cfg
 
     def get_libpq_dsn(self):
         dsn = self.test_env['NOMINATIM_DATABASE_DSN']
@@ -169,33 +183,49 @@ class NominatimEnvironment:
         """
         self.write_nominatim_config(self.api_test_db)
 
-        if self.api_db_done:
-            return
+        if not self.api_db_done:
+            self.api_db_done = True
 
-        self.api_db_done = True
-
-        if self._reuse_or_drop_db(self.api_test_db):
-            return
+            if not self._reuse_or_drop_db(self.api_test_db):
+                testdata = Path('__file__') / '..' / '..' / 'testdb'
+                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
 
-        testdata = Path('__file__') / '..' / '..' / 'testdb'
-        self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+                try:
+                    self.run_nominatim('import', '--osm-file', str(self.api_test_file))
+                    if self.tokenizer != 'legacy_icu':
+                        self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    self.run_nominatim('freeze')
 
-        try:
-            self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-            self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
-            self.run_nominatim('freeze')
+                    if self.tokenizer != 'legacy_icu':
+                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                        run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                    else:
+                        # XXX Temporary use the wiki while there is no CSV import
+                        # available.
+                        self.test_env['NOMINATIM_LANGUAGES'] = 'en'
+                        self.run_nominatim('special-phrases', '--import-from-wiki')
+                        del self.test_env['NOMINATIM_LANGUAGES']
+                except:
+                    self.db_drop_database(self.api_test_db)
+                    raise
 
-            phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-            run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
-        except:
-            self.db_drop_database(self.api_test_db)
-            raise
+        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
 
 
     def setup_unknown_db(self):
         """ Setup a test against a non-existing database.
         """
-        self.write_nominatim_config('UNKNOWN_DATABASE_NAME')
+        # The tokenizer needs an existing database to function.
+        # So start with the usual database
+        class _Context:
+            db = None
+
+        context = _Context()
+        self.setup_db(context)
+        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
+
+        # Then drop the DB again
+        self.teardown_db(context, force_drop=True)
 
     def setup_db(self, context):
         """ Setup a test against a fresh, empty test database.
@@ -212,13 +242,13 @@ class NominatimEnvironment:
         context.db.autocommit = True
         psycopg2.extras.register_hstore(context.db, globally=False)
 
-    def teardown_db(self, context):
+    def teardown_db(self, context, force_drop=False):
         """ Remove the test database, if it exists.
         """
-        if 'db' in context:
+        if hasattr(context, 'db'):
             context.db.close()
 
-        if not self.keep_scenario_db:
+        if force_drop or not self.keep_scenario_db:
             self.db_drop_database(self.test_db)
 
     def _reuse_or_drop_db(self, name):
index 72a610eb123733db313ee74d510b633afdac5fb3..6d7bc188905c597211110c13b92952e336bab088 100644 (file)
@@ -7,6 +7,7 @@ from place_inserter import PlaceColumn
 from table_compare import NominatimID, DBRow
 
 from nominatim.indexer import indexer
+from nominatim.tokenizer import factory as tokenizer_factory
 
 def check_database_integrity(context):
     """ Check some generic constraints on the tables.
@@ -86,6 +87,9 @@ def add_data_to_planet_ways(context):
 def import_and_index_data_from_place_table(context):
     """ Import data previously set up in the place table.
     """
+    nctx = context.nominatim
+
+    tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
     context.nominatim.copy_from_place(context.db)
 
     # XXX use tool function as soon as it is ported
@@ -105,7 +109,7 @@ def import_and_index_data_from_place_table(context):
 
     # Call directly as the refresh function does not include postcodes.
     indexer.LOG.setLevel(logging.ERROR)
-    indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False)
+    indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
 
     check_database_integrity(context)
 
@@ -195,44 +199,35 @@ def check_search_name_contents(context, exclude):
         have an identifier of the form '<NRW><osm id>[:<class>]'. All
         expected rows are expected to be present with at least one database row.
     """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            nid = NominatimID(row['object'])
-            nid.row_by_place_id(cur, 'search_name',
-                                ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
-            assert cur.rowcount > 0, "No rows found for " + row['object']
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config())
+
+    with tokenizer.name_analyzer() as analyzer:
+        with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            for row in context.table:
+                nid = NominatimID(row['object'])
+                nid.row_by_place_id(cur, 'search_name',
+                                    ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
+                assert cur.rowcount > 0, "No rows found for " + row['object']
+
+                for res in cur:
+                    db_row = DBRow(nid, res, context)
+                    for name, value in zip(row.headings, row.cells):
+                        if name in ('name_vector', 'nameaddress_vector'):
+                            items = [x.strip() for x in value.split(',')]
+                            tokens = analyzer.get_word_token_info(context.db, items)
 
-            for res in cur:
-                db_row = DBRow(nid, res, context)
-                for name, value in zip(row.headings, row.cells):
-                    if name in ('name_vector', 'nameaddress_vector'):
-                        items = [x.strip() for x in value.split(',')]
-                        with context.db.cursor() as subcur:
-                            subcur.execute(""" SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                              UNION
-                                               SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = ' ' || make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                           """,
-                                           (list(filter(lambda x: not x.startswith('#'), items)),
-                                            list(filter(lambda x: x.startswith('#'), items))))
                             if not exclude:
-                                assert subcur.rowcount >= len(items), \
-                                    "No word entry found for {}. Entries found: {!s}".format(value, subcur.rowcount)
-                            for wid in subcur:
-                                present = wid[0] in res[name]
+                                assert len(tokens) >= len(items), \
+                                       "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
+                            for word, token, wid in tokens:
                                 if exclude:
-                                    assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
+                                    assert wid not in res[name], \
+                                           "Found term for {}/{}: {}".format(nid, name, wid)
                                 else:
-                                    assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1])
-                    elif name != 'object':
-                        assert db_row.contains(name, value), db_row.assert_msg(name, value)
+                                    assert wid in res[name], \
+                                           "Missing term for {}/{}: {}".format(nid, name, wid)
+                        elif name != 'object':
+                            assert db_row.contains(name, value), db_row.assert_msg(name, value)
 
 @then("search_name has no entry for (?P<oid>.*)")
 def check_search_name_has_entry(context, oid):
index 42166e34f330da72a06f2089e6e99d0a4b73db21..e4c2bbd1be0f80130b66a018e0ee0790f217ab4f 100644 (file)
@@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testEmptyPhrase()
     {
         $oPhrase = new Phrase('', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array(), new TokensFullSet());
 
-        $this->assertEquals(
-            array(array('')),
-            $oPhrase->getWordSets()
-        );
+        $this->assertNull($oPhrase->getWordSets());
     }
 
 
     public function testSingleWordPhrase()
     {
         $oPhrase = new Phrase('a', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a'), new TokensFullSet());
 
         $this->assertEquals(
             '(a)',
@@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testMultiWordPhrase()
     {
         $oPhrase = new Phrase('a b', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
         $this->assertEquals(
             '(a b),(a|b)',
             $this->serializeSets($oPhrase->getWordSets())
         );
 
         $oPhrase = new Phrase('a b c', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
         $this->assertEquals(
             '(a b c),(a|b c),(a b|c),(a|b|c)',
             $this->serializeSets($oPhrase->getWordSets())
         );
 
         $oPhrase = new Phrase('a b c d', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
         $this->assertEquals(
             '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
             $this->serializeSets($oPhrase->getWordSets())
@@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testInverseWordSets()
     {
         $oPhrase = new Phrase('a b c', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
         $oPhrase->invertWordSets();
 
         $this->assertEquals(
@@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
 
     public function testMaxWordSets()
     {
-        $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $aWords = array_fill(0, 4, 'a');
+        $oPhrase = new Phrase(join(' ', $aWords), '');
+        $oPhrase->computeWordSets($aWords, new TokensFullSet());
         $this->assertEquals(8, count($oPhrase->getWordSets()));
         $oPhrase->invertWordSets();
         $this->assertEquals(8, count($oPhrase->getWordSets()));
 
-        $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $aWords = array_fill(0, 18, 'a');
+        $oPhrase = new Phrase(join(' ', $aWords), '');
+        $oPhrase->computeWordSets($aWords, new TokensFullSet());
         $this->assertEquals(100, count($oPhrase->getWordSets()));
         $oPhrase->invertWordSets();
         $this->assertEquals(100, count($oPhrase->getWordSets()));
@@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testPartialTokensShortTerm()
     {
         $oPhrase = new Phrase('a b c d', '');
-        $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
+        $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
         $this->assertEquals(
             '(a|b c d),(a|b c|d)',
             $this->serializeSets($oPhrase->getWordSets())
@@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
 
     public function testPartialTokensLongTerm()
     {
-        $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
-        $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
+        $aWords = array_fill(0, 18, 'a');
+        $oPhrase = new Phrase(join(' ', $aWords), '');
+        $oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
         $this->assertEquals(80, count($oPhrase->getWordSets()));
     }
 }
index 8cb8a703a07f7d186603eb58a6d0862e0b7cf47c..9e03a970fd2aa1211c234460918dd119cf6a7cc0 100644 (file)
@@ -2,6 +2,8 @@
 
 namespace Nominatim;
 
+@define('CONST_TokenizerDir', dirname(__FILE__));
+
 require_once(CONST_LibDir.'/DB.php');
 require_once(CONST_LibDir.'/Status.php');
 
@@ -40,45 +42,6 @@ class StatusTest extends \PHPUnit\Framework\TestCase
         $this->assertEquals('No database', $oStatus->status());
     }
 
-
-    public function testModuleFail()
-    {
-        $this->expectException(\Exception::class);
-        $this->expectExceptionMessage('Module call failed');
-        $this->expectExceptionCode(702);
-
-        // stub has getOne method but doesn't return anything
-        $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
-                        ->setMethods(array('connect', 'getOne'))
-                        ->getMock();
-
-        $oStatus = new Status($oDbStub);
-        $this->assertNull($oStatus->status());
-    }
-
-
-    public function testWordIdQueryFail()
-    {
-        $this->expectException(\Exception::class);
-        $this->expectExceptionMessage('No value');
-        $this->expectExceptionCode(704);
-
-        $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
-                        ->setMethods(array('connect', 'getOne'))
-                        ->getMock();
-
-        // return no word_id
-        $oDbStub->method('getOne')
-                ->will($this->returnCallback(function ($sql) {
-                    if (preg_match("/make_standard_name\('a'\)/", $sql)) return 'a';
-                    if (preg_match('/SELECT word_id, word_token/', $sql)) return null;
-                }));
-
-        $oStatus = new Status($oDbStub);
-        $this->assertNull($oStatus->status());
-    }
-
-
     public function testOK()
     {
         $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
@@ -100,7 +63,7 @@ class StatusTest extends \PHPUnit\Framework\TestCase
         $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
                         ->setMethods(array('getOne'))
                         ->getMock();
-     
+
         $oDbStub->method('getOne')
                 ->willReturn(1519430221);
 
index 14a595ea9f3c115da1e8063e3436739174699e18..f0139d7670b4dd246bf3797aa2b49de6704918a6 100644 (file)
@@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase
         $this->assertFalse($TL->contains('unknownword'));
         $this->assertEquals(array(), $TL->get('unknownword'));
     }
-
-    public function testAddress()
-    {
-        $this->expectOutputRegex('/<p><tt>/');
-
-        $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
-                        ->setMethods(array('getAll', 'getDBQuotedList'))
-                        ->getMock();
-
-        $oDbStub->method('getDBQuotedList')
-                ->will($this->returnCallback(function ($aVals) {
-                    return array_map(function ($sVal) {
-                        return "'".$sVal."'";
-                    }, $aVals);
-                }));
-
-
-        $oDbStub->method('getAll')
-                ->will($this->returnCallback(function ($sql) {
-                    $aResults = array();
-                    if (preg_match('/1051/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => '1051',
-                                                         'class' => 'place',
-                                                         'type' => 'house'
-                                                        ));
-                    }
-                    if (preg_match('/hauptstr/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'hauptstr',
-                                                         'class' => 'place',
-                                                         'type' => 'street',
-                                                         'operator' => true
-                                                        ));
-                    }
-                    if (preg_match('/64286/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => '64286',
-                                                         'word' => '64286',
-                                                         'class' => 'place',
-                                                         'type' => 'postcode'
-                                                        ));
-                    }
-                    if (preg_match('/darmstadt/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'darmstadt',
-                                                         'count' => 533
-                                                        ));
-                    }
-                    if (preg_match('/alemagne/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'alemagne',
-                                                         'country_code' => 'de',
-                                                        ));
-                    }
-                    if (preg_match('/mexico/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'mexico',
-                                                         'country_code' => 'mx',
-                                                        ));
-                    }
-                    return $aResults;
-                }));
-
-        $aCountryCodes = array('de', 'fr');
-        $sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
-        $aTokens = explode(' ', $sNormQuery);
-
-        $TL = new TokenList;
-        $TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
-        $this->assertEquals(5, $TL->count());
-
-        $this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
-        $this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
-        $this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
-        $this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
-        $this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
-    }
 }
diff --git a/test/php/Nominatim/tokenizer.php b/test/php/Nominatim/tokenizer.php
new file mode 100644 (file)
index 0000000..0735e66
--- /dev/null
@@ -0,0 +1,17 @@
+<?php
+
+namespace Nominatim;
+
+class Tokenizer
+{
+    private $oDB;
+
+    public function __construct(&$oDB)
+    {
+        $this->oDB =& $oDB;
+    }
+
+    public function checkStatus()
+    {
+    }
+}
index 4b9749c01f4c2f2d9159f30fc53961107705ce90..493620c45ece19c5abf3a7477e41886f7a36c192 100644 (file)
@@ -1,3 +1,4 @@
+import importlib
 import itertools
 import sys
 from pathlib import Path
@@ -15,6 +16,9 @@ sys.path.insert(0, str(SRC_DIR.resolve()))
 from nominatim.config import Configuration
 from nominatim.db import connection
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.db import properties
+
+import dummy_tokenizer
 
 class _TestingCursor(psycopg2.extras.DictCursor):
     """ Extension to the DictCursor class that provides execution
@@ -117,9 +121,8 @@ def table_factory(temp_db_cursor):
     def mk_table(name, definition='id INT', content=None):
         temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
         if content is not None:
-            if not isinstance(content, str):
-                content = '),('.join([str(x) for x in content])
-            temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
+            psycopg2.extras.execute_values(
+                temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
 
     return mk_table
 
@@ -144,6 +147,11 @@ def tmp_phplib_dir():
 
         yield Path(phpdir)
 
+
+@pytest.fixture
+def property_table(table_factory):
+    table_factory('nominatim_properties', 'property TEXT, value TEXT')
+
 @pytest.fixture
 def status_table(temp_db_conn):
     """ Create an empty version of the status table and
@@ -281,10 +289,29 @@ def osm2pgsql_options(temp_db):
 
 @pytest.fixture
 def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+    table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
     cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
     cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
                     sql=tmp_path, data=SRC_DIR / 'data')
 
     return SQLPreprocessor(temp_db_conn, cfg)
+
+
+@pytest.fixture
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
+    """ Sets up the configuration so that the test dummy tokenizer will be
+        loaded when the tokenizer factory is used. Also returns a factory
+        with which a new dummy tokenizer may be created.
+    """
+    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+    def _import_dummy(module, *args, **kwargs):
+        return dummy_tokenizer
+
+    monkeypatch.setattr(importlib, "import_module", _import_dummy)
+    properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
+
+    def _create_tokenizer():
+        return dummy_tokenizer.DummyTokenizer(None, None)
+
+    return _create_tokenizer
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py
new file mode 100644 (file)
index 0000000..6352a64
--- /dev/null
@@ -0,0 +1,64 @@
+"""
+Tokenizer for testing.
+"""
+
+def create(dsn, data_dir):
+    """ Create a new instance of the tokenizer provided by this module.
+    """
+    return DummyTokenizer(dsn, data_dir)
+
+class DummyTokenizer:
+
+    def __init__(self, dsn, data_dir):
+        self.dsn = dsn
+        self.data_dir = data_dir
+        self.init_state = None
+        self.analyser_cache = {}
+
+
+    def init_new_db(self, *args, **kwargs):
+        assert self.init_state == None
+        self.init_state = "new"
+
+
+    def init_from_project(self):
+        assert self.init_state == None
+        self.init_state = "loaded"
+
+
+    def finalize_import(self, _):
+        pass
+
+
+    def name_analyzer(self):
+        return DummyNameAnalyzer(self.analyser_cache)
+
+
+class DummyNameAnalyzer:
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def __init__(self, cache):
+        self.analyser_cache = cache
+        cache['countries'] = []
+
+
+    def close(self):
+        pass
+
+    def add_postcodes_from_db(self):
+        pass
+
+    def update_special_phrases(self, phrases):
+        self.analyser_cache['special_phrases'] = phrases
+
+    def add_country_names(self, code, names):
+        self.analyser_cache['countries'].append((code, names))
+
+    def process_place(self, place):
+        return {}
index afa01e575c497ca70805e6c26835ef32da3f57cd..a286995611ddf524c54ae5c1657b3e3e15bf55c2 100644 (file)
@@ -22,6 +22,7 @@ import nominatim.tools.database_import
 import nominatim.tools.freeze
 import nominatim.tools.refresh
 import nominatim.tools.postcodes
+import nominatim.tokenizer.factory
 
 from mocks import MockParamCapture
 
@@ -56,6 +57,28 @@ def mock_func_factory(monkeypatch):
     return get_mock
 
 
+@pytest.fixture
+def tokenizer_mock(monkeypatch):
+    class DummyTokenizer:
+        def __init__(self, *args, **kwargs):
+            self.update_sql_functions_called = False
+            self.finalize_import_called = False
+
+        def update_sql_functions(self, *args):
+            self.update_sql_functions_called = True
+
+        def finalize_import(self, *args):
+            self.finalize_import_called = True
+
+    tok = DummyTokenizer()
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
+                        lambda *args: tok)
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer' ,
+                        lambda *args: tok)
+
+    return tok
+
+
 def test_cli_help(capsys):
     """ Running nominatim tool without arguments prints help.
     """
@@ -84,10 +107,9 @@ def test_import_bad_file(temp_db):
     assert 1 == call_nominatim('import', '--osm-file', '.')
 
 
-def test_import_full(temp_db, mock_func_factory):
+def test_import_full(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
-        mock_func_factory(nominatim.tools.database_import, 'install_module'),
         mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
         mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
         mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
@@ -107,6 +129,7 @@ def test_import_full(temp_db, mock_func_factory):
     cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
 
     assert 0 == call_nominatim('import', '--osm-file', __file__)
+    assert tokenizer_mock.finalize_import_called
 
     assert cf_mock.called > 1
 
@@ -114,7 +137,7 @@ def test_import_full(temp_db, mock_func_factory):
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
 
-def test_import_continue_load_data(temp_db, mock_func_factory):
+def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
         mock_func_factory(nominatim.tools.database_import, 'load_data'),
@@ -127,12 +150,14 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
     ]
 
     assert 0 == call_nominatim('import', '--continue', 'load-data')
+    assert tokenizer_mock.finalize_import_called
 
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
 
-def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn):
+def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
+                                  temp_db_conn, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@@ -153,7 +178,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
     assert temp_db_conn.index_exists('idx_placex_pendingsector')
 
 
-def test_import_continue_postprocess(temp_db, mock_func_factory):
+def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@@ -163,6 +188,8 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
 
     assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
 
+    assert tokenizer_mock.finalize_import_called
+
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
@@ -217,7 +244,8 @@ def test_add_data_command(mock_run_legacy, name, oid):
                           (['--boundaries-only'], 1, 0),
                           (['--no-boundaries'], 0, 1),
                           (['--boundaries-only', '--no-boundaries'], 0, 0)])
-def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks):
+def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
+                       params, do_bnds, do_ranks):
     temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
     bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries')
     rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank')
@@ -227,7 +255,7 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
     assert bnd_mock.called == do_bnds
     assert rank_mock.called == do_ranks
 
-def test_special_phrases_command(temp_db, mock_func_factory):
+def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock):
     func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki')
 
     call_nominatim('special-phrases', '--import-from-wiki')
@@ -238,7 +266,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
                          ('postcodes', 'update_postcodes'),
                          ('word-counts', 'recompute_word_counts'),
                          ('address-levels', 'load_address_levels_from_file'),
-                         ('functions', 'create_functions'),
                          ('wiki-data', 'import_wikipedia_articles'),
                          ('importance', 'recompute_importance'),
                          ('website', 'setup_website'),
@@ -250,6 +277,14 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
     assert func_mock.called == 1
 
 
+def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock):
+    func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
+
+    assert 0 == call_nominatim('refresh', '--functions')
+    assert func_mock.called == 1
+    assert tokenizer_mock.update_sql_functions_called
+
+
 def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
     calls = []
     monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
index a62ad1a4a69be28887755ea301aeb747788d4e4d..b95e6ede1787dd7619bee512e95a5694c1e30b02 100644 (file)
@@ -27,7 +27,29 @@ def call_nominatim(*args):
                                    cli_args=['replication'] + list(args))
 
 @pytest.fixture
-def index_mock(monkeypatch):
+def tokenizer_mock(monkeypatch):
+    class DummyTokenizer:
+        def __init__(self, *args, **kwargs):
+            self.update_sql_functions_called = False
+            self.finalize_import_called = False
+
+        def update_sql_functions(self, *args):
+            self.update_sql_functions_called = True
+
+        def finalize_import(self, *args):
+            self.finalize_import_called = True
+
+    tok = DummyTokenizer()
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' ,
+                        lambda *args: tok)
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer' ,
+                        lambda *args: tok)
+
+    return tok
+
+
+@pytest.fixture
+def index_mock(monkeypatch, tokenizer_mock):
     mock = MockParamCapture()
     monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock)
     monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock)
@@ -52,7 +74,7 @@ def init_status(temp_db_conn, status_table):
 
 
 @pytest.fixture
-def update_mock(mock_func_factory, init_status):
+def update_mock(mock_func_factory, init_status, tokenizer_mock):
     return mock_func_factory(nominatim.tools.replication, 'update')
 
 @pytest.mark.parametrize("params,func", [
index 08a195bd2d794d9c6f9f01cd18116df64d466b42..6a254ef3b8e6943342168349dc04aa43598af80e 100644 (file)
@@ -24,7 +24,6 @@ def sql_factory(tmp_path):
     ("'{{db.partitions|join}}'", '012'),
     ("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
     ("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
-    ("'{{config.DATABASE_MODULE_PATH}}'", '.')
     ])
 def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
     sqlfile = sql_factory("RETURN {};".format(expr))
index c659147148d4b2d970a5008e984e8fce02cc7242..9f0327637d561314e05819198f6473eef4c54cb1 100644 (file)
@@ -19,6 +19,11 @@ OSM_NODE_DATA = """\
 </osm>
 """
 
+def iso_date(date):
+    return dt.datetime.strptime(date, nominatim.db.status.ISODATE_FORMAT)\
+               .replace(tzinfo=dt.timezone.utc)
+
+
 def test_compute_database_date_valid(monkeypatch, status_table, place_row, temp_db_conn):
     place_row(osm_type='N', osm_id=45673)
 
@@ -32,7 +37,7 @@ def test_compute_database_date_valid(monkeypatch, status_table, place_row, temp_
     date = nominatim.db.status.compute_database_date(temp_db_conn)
 
     assert requested_url == ['https://www.openstreetmap.org/api/0.6/node/45673/1']
-    assert date == dt.datetime.fromisoformat('2006-01-27T22:09:10').replace(tzinfo=dt.timezone.utc)
+    assert date == iso_date('2006-01-27T22:09:10')
 
 
 def test_compute_database_broken_api(monkeypatch, status_table, place_row, temp_db_conn):
index ee9c6c7e99f8cc87b7ce159dfe0c5e6f9fee4b39..ff84e37964b660a33be2faa0908ba2ad0c1519a7 100644 (file)
@@ -5,7 +5,8 @@ import itertools
 import psycopg2
 import pytest
 
-from nominatim.indexer.indexer import Indexer
+from nominatim.indexer import indexer
+from nominatim.tokenizer import factory
 
 class IndexerTestDB:
 
@@ -17,6 +18,7 @@ class IndexerTestDB:
         self.conn = conn
         self.conn.set_isolation_level(0)
         with self.conn.cursor() as cur:
+            cur.execute('CREATE EXTENSION hstore')
             cur.execute("""CREATE TABLE placex (place_id BIGINT,
                                                 class TEXT,
                                                 type TEXT,
@@ -26,9 +28,14 @@ class IndexerTestDB:
                                                 indexed_date TIMESTAMP,
                                                 partition SMALLINT,
                                                 admin_level SMALLINT,
+                                                address HSTORE,
+                                                token_info JSONB,
                                                 geometry_sector INTEGER)""")
             cur.execute("""CREATE TABLE location_property_osmline (
                                place_id BIGINT,
+                               osm_id BIGINT,
+                               address HSTORE,
+                               token_info JSONB,
                                indexed_status SMALLINT,
                                indexed_date TIMESTAMP,
                                geometry_sector INTEGER)""")
@@ -46,6 +53,25 @@ class IndexerTestDB:
                              END IF;
                              RETURN NEW;
                            END; $$ LANGUAGE plpgsql;""")
+            cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
+                                                      OUT name HSTORE,
+                                                      OUT address HSTORE,
+                                                      OUT country_feature VARCHAR)
+                           AS $$
+                           BEGIN
+                            address := p.address;
+                            name := p.address;
+                           END;
+                           $$ LANGUAGE plpgsql STABLE;
+                        """)
+            cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+                           RETURNS HSTORE AS $$
+                           BEGIN
+                             RETURN in_address;
+                           END;
+                           $$ LANGUAGE plpgsql STABLE;
+                        """)
+
             for table in ('placex', 'location_property_osmline', 'location_postcode'):
                 cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
                                FOR EACH ROW EXECUTE PROCEDURE date_update()
@@ -76,9 +102,9 @@ class IndexerTestDB:
         next_id = next(self.osmline_id)
         with self.conn.cursor() as cur:
             cur.execute("""INSERT INTO location_property_osmline
-                              (place_id, indexed_status, geometry_sector)
-                              VALUES (%s, 1, %s)""",
-                        (next_id, sector))
+                              (place_id, osm_id, indexed_status, geometry_sector)
+                              VALUES (%s, %s, 1, %s)""",
+                        (next_id, next_id, sector))
         return next_id
 
     def add_postcode(self, country, postcode):
@@ -102,8 +128,14 @@ def test_db(temp_db_conn):
     yield IndexerTestDB(temp_db_conn)
 
 
+@pytest.fixture
+def test_tokenizer(tokenizer_mock, def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    return factory.create_tokenizer(def_config)
+
+
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_all_by_rank(test_db, threads):
+def test_index_all_by_rank(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -111,7 +143,7 @@ def test_index_all_by_rank(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(0, 30)
 
     assert 0 == test_db.placex_unindexed()
@@ -142,7 +174,7 @@ def test_index_all_by_rank(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_without_30(test_db, threads):
+def test_index_partial_without_30(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -150,7 +182,8 @@ def test_index_partial_without_30(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest',
+                          test_tokenizer, threads)
     idx.index_by_rank(4, 15)
 
     assert 19 == test_db.placex_unindexed()
@@ -162,7 +195,7 @@ def test_index_partial_without_30(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_with_30(test_db, threads):
+def test_index_partial_with_30(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -170,7 +203,7 @@ def test_index_partial_with_30(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(28, 30)
 
     assert 27 == test_db.placex_unindexed()
@@ -181,7 +214,7 @@ def test_index_partial_with_30(test_db, threads):
                       WHERE indexed_status = 0 AND rank_address between 1 and 27""")
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_boundaries(test_db, threads):
+def test_index_boundaries(test_db, threads, test_tokenizer):
     for rank in range(4, 10):
         test_db.add_admin(rank_address=rank, rank_search=rank)
     for rank in range(31):
@@ -191,7 +224,7 @@ def test_index_boundaries(test_db, threads):
     assert 37 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_boundaries(0, 30)
 
     assert 31 == test_db.placex_unindexed()
@@ -203,20 +236,21 @@ def test_index_boundaries(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_postcodes(test_db, threads):
+def test_index_postcodes(test_db, threads, test_tokenizer):
     for postcode in range(1000):
         test_db.add_postcode('de', postcode)
     for postcode in range(32000, 33000):
         test_db.add_postcode('us', postcode)
 
-    idx = Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_postcodes()
 
     assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
                                   WHERE indexed_status != 0""")
 
 
-def test_index_full(test_db):
+@pytest.mark.parametrize("analyse", [True, False])
+def test_index_full(test_db, analyse, test_tokenizer):
     for rank in range(4, 10):
         test_db.add_admin(rank_address=rank, rank_search=rank)
     for rank in range(31):
@@ -225,10 +259,23 @@ def test_index_full(test_db):
     for postcode in range(1000):
         test_db.add_postcode('de', postcode)
 
-    idx = Indexer('dbname=test_nominatim_python_unittest', 4)
-    idx.index_full()
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4)
+    idx.index_full(analyse=analyse)
 
     assert 0 == test_db.placex_unindexed()
     assert 0 == test_db.osmline_unindexed()
     assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
                                   WHERE indexed_status != 0""")
+
+
+@pytest.mark.parametrize("threads", [1, 15])
+def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
+    monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)
+
+    for _ in range(1000):
+        test_db.add_place(rank_address=30, rank_search=30)
+
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
+    idx.index_by_rank(28, 30)
+
+    assert 0 == test_db.placex_unindexed()
diff --git a/test/python/test_tokenizer_factory.py b/test/python/test_tokenizer_factory.py
new file mode 100644 (file)
index 0000000..69517e9
--- /dev/null
@@ -0,0 +1,77 @@
+"""
+Tests for creating new tokenizers.
+"""
+import importlib
+import pytest
+
+from nominatim.db import properties
+from nominatim.tokenizer import factory
+from nominatim.errors import UsageError
+from dummy_tokenizer import DummyTokenizer
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    return def_config
+
+
+def test_setup_dummy_tokenizer(temp_db_conn, test_config,
+                               tokenizer_mock, property_table):
+    tokenizer = factory.create_tokenizer(test_config)
+
+    assert isinstance(tokenizer, DummyTokenizer)
+    assert tokenizer.init_state == "new"
+    assert (test_config.project_dir / 'tokenizer').is_dir()
+
+    assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
+
+
+def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table):
+    (test_config.project_dir / 'tokenizer').mkdir()
+
+    tokenizer = factory.create_tokenizer(test_config)
+
+    assert isinstance(tokenizer, DummyTokenizer)
+    assert tokenizer.init_state == "new"
+
+
+def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table):
+    (test_config.project_dir / 'tokenizer').write_text("foo")
+
+    with pytest.raises(UsageError):
+        factory.create_tokenizer(test_config)
+
+
+def test_setup_bad_tokenizer_name(test_config, monkeypatch):
+    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+    with pytest.raises(UsageError):
+        factory.create_tokenizer(test_config)
+
+def test_load_tokenizer(temp_db_conn, test_config,
+                        tokenizer_mock, property_table):
+    factory.create_tokenizer(test_config)
+
+    tokenizer = factory.get_tokenizer_for_db(test_config)
+
+    assert isinstance(tokenizer, DummyTokenizer)
+    assert tokenizer.init_state == "loaded"
+
+
+def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
+    factory.create_tokenizer(test_config)
+
+    test_config.project_dir = test_config.project_dir / 'foo'
+
+    with pytest.raises(UsageError):
+        factory.get_tokenizer_for_db(test_config)
+
+
+def test_load_missing_propoerty(temp_db_cursor, test_config, tokenizer_mock, property_table):
+    factory.create_tokenizer(test_config)
+
+    temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")
+
+    with pytest.raises(UsageError):
+        factory.get_tokenizer_for_db(test_config)
+
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py
new file mode 100644 (file)
index 0000000..c567a4c
--- /dev/null
@@ -0,0 +1,299 @@
+"""
+Test for legacy tokenizer.
+"""
+import shutil
+
+import pytest
+
+from nominatim.tokenizer import legacy_tokenizer
+from nominatim.db import properties
+from nominatim.errors import UsageError
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+    def_config.project_dir = tmp_path / 'project'
+    def_config.project_dir.mkdir()
+
+    module_dir = tmp_path / 'module_src'
+    module_dir.mkdir()
+    (module_dir / 'nominatim.so').write_text('TEST nomiantim.so')
+
+    def_config.lib_dir.module = module_dir
+
+    sqldir = tmp_path / 'sql'
+    sqldir.mkdir()
+    (sqldir / 'tokenizer').mkdir()
+    (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'words.sql').write_text("SELECT 'a'")
+    shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
+                str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
+
+    def_config.lib_dir.sql = sqldir
+    def_config.lib_dir.data = sqldir
+
+    return def_config
+
+
+@pytest.fixture
+def tokenizer_factory(dsn, tmp_path, property_table):
+    (tmp_path / 'tokenizer').mkdir()
+
+    def _maker():
+        return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
+
+    return _maker
+
+
+@pytest.fixture
+def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
+    monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+
+@pytest.fixture
+def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
+             word_table, temp_db_with_extensions, tmp_path):
+    sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
+    sql.write_text("""
+        CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
+          RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
+        """)
+
+    monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
+    with tok.name_analyzer() as analyzer:
+        yield analyzer
+
+
+@pytest.fixture
+def make_standard_name(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
+                              RETURNS TEXT AS $$ SELECT ' ' || name; $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def create_postcode_id(table_factory, temp_db_cursor):
+    table_factory('out_postcode_table', 'postcode TEXT')
+
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+                              RETURNS BOOLEAN AS $$
+                              INSERT INTO out_postcode_table VALUES (postcode) RETURNING True;
+                              $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def create_housenumbers(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers(
+                                  housenumbers TEXT[],
+                                  OUT tokens TEXT, OUT normtext TEXT)
+                              AS $$
+                              SELECT housenumbers::TEXT, array_to_string(housenumbers, ';')
+                              $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def make_keywords(temp_db_cursor, temp_db_with_extensions):
+    temp_db_cursor.execute(
+        """CREATE OR REPLACE FUNCTION make_keywords(names HSTORE)
+           RETURNS INTEGER[] AS $$ SELECT ARRAY[1, 2, 3] $$ LANGUAGE SQL""")
+
+def test_init_new(tokenizer_factory, test_config, monkeypatch,
+                  temp_db_conn, sql_preprocessor):
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
+    monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
+
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
+
+    outfile = test_config.project_dir / 'module' / 'nominatim.so'
+
+    assert outfile.exists()
+    assert outfile.read_text() == 'TEST nomiantim.so'
+    assert outfile.stat().st_mode == 33261
+
+
+def test_init_module_load_failed(tokenizer_factory, test_config,
+                                 monkeypatch, temp_db_conn):
+    tok = tokenizer_factory()
+
+    with pytest.raises(UsageError):
+        tok.init_new_db(test_config)
+
+
+def test_init_module_custom(tokenizer_factory, test_config,
+                            monkeypatch, tmp_path, sql_preprocessor):
+    module_dir = (tmp_path / 'custom').resolve()
+    module_dir.mkdir()
+    (module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
+
+    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
+    monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
+
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+    assert not (test_config.project_dir / 'module').exists()
+
+
+def test_init_from_project(tokenizer_setup, tokenizer_factory):
+    tok = tokenizer_factory()
+
+    tok.init_from_project()
+
+    assert tok.normalization is not None
+
+
+def test_update_sql_functions(sql_preprocessor, temp_db_conn,
+                              tokenizer_factory, test_config, table_factory,
+                              monkeypatch, temp_db_cursor):
+    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
+    monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+
+    table_factory('test', 'txt TEXT')
+
+    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
+    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
+                                                   ('{{modulepath}}')""")
+
+    tok.update_sql_functions(test_config)
+
+    test_content = temp_db_cursor.row_set('SELECT * FROM test')
+    assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
+
+
+def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
+    monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
+    tok = tokenizer_factory()
+    tok.migrate_database(test_config)
+
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
+
+    outfile = test_config.project_dir / 'module' / 'nominatim.so'
+
+    assert outfile.exists()
+    assert outfile.read_text() == 'TEST nomiantim.so'
+    assert outfile.stat().st_mode == 33261
+
+
+def test_normalize(analyzer):
+    assert analyzer.normalize('TEsT') == 'test'
+
+
+def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor,
+                               create_postcode_id):
+    table_factory('location_postcode', 'postcode TEXT',
+                  content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
+
+    analyzer.add_postcodes_from_db()
+
+    assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
+               == set((('1234', ), ('12 34', ), ('AB23',)))
+
+
+def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
+                                           make_standard_name):
+    analyzer.update_special_phrases([
+        ("König bei", "amenity", "royal", "near"),
+        ("Könige", "amenity", "royal", "-"),
+        ("strasse", "highway", "primary", "in")
+    ])
+
+    assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+                                     FROM word WHERE class != 'place'""") \
+               == set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'),
+                       (' könige', 'könige', 'amenity', 'royal', None),
+                       (' strasse', 'strasse', 'highway', 'primary', 'in')))
+
+
+def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
+                                          make_standard_name):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+                                     (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    analyzer.update_special_phrases([])
+
+    assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+
+def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
+                                      make_standard_name):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+                                     (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    analyzer.update_special_phrases([
+      ('prison', 'amenity', 'prison', 'in'),
+      ('bar', 'highway', 'road', '-'),
+      ('garden', 'leisure', 'garden', 'near')
+    ])
+
+    assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+                                     FROM word WHERE class != 'place'""") \
+               == set(((' prison', 'prison', 'amenity', 'prison', 'in'),
+                       (' bar', 'bar', 'highway', 'road', None),
+                       (' garden', 'garden', 'leisure', 'garden', 'near')))
+
+
+def test_process_place_names(analyzer, make_keywords):
+
+    info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+
+    assert info['names'] == '{1,2,3}'
+
+
+@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345'])
+def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc):
+
+    info = analyzer.process_place({'address': {'postcode' : pc}})
+
+    assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
+               == set(((pc, ),))
+
+
+@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836'])
+def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id,
+                                    pc):
+
+    info = analyzer.process_place({'address': {'postcode' : pc}})
+
+    assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table")
+
+
+@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
+def test_process_place_housenumbers_simple(analyzer, create_housenumbers, hnr):
+    info = analyzer.process_place({'address': {'housenumber' : hnr}})
+
+    assert info['hnr'] == hnr
+    assert info['hnr_tokens'].startswith("{")
+
+
+def test_process_place_housenumbers_lists(analyzer, create_housenumbers):
+    info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+
+    assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
+
+
+def test_process_place_housenumbers_duplicates(analyzer, create_housenumbers):
+    info = analyzer.process_place({'address': {'housenumber' : '134',
+                                               'conscriptionnumber' : '134',
+                                               'streetnumber' : '99a'}})
+
+    assert set(info['hnr'].split(';')) == set(('134', '99a'))
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py
new file mode 100644 (file)
index 0000000..836f15b
--- /dev/null
@@ -0,0 +1,256 @@
+"""
+Tests for Legacy ICU tokenizer.
+"""
+import shutil
+
+import pytest
+
+from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.db import properties
+
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+    def_config.project_dir = tmp_path / 'project'
+    def_config.project_dir.mkdir()
+
+    sqldir = tmp_path / 'sql'
+    sqldir.mkdir()
+    (sqldir / 'tokenizer').mkdir()
+    (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
+    shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
+                str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
+
+    def_config.lib_dir.sql = sqldir
+
+    return def_config
+
+
+@pytest.fixture
+def tokenizer_factory(dsn, tmp_path, property_table,
+                      sql_preprocessor, place_table, word_table):
+    (tmp_path / 'tokenizer').mkdir()
+
+    def _maker():
+        return legacy_icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+
+    return _maker
+
+
+@pytest.fixture
+def db_prop(temp_db_conn):
+    def _get_db_property(name):
+        return properties.get_property(temp_db_conn,
+                                       getattr(legacy_icu_tokenizer, name))
+
+    return _get_db_property
+
+@pytest.fixture
+def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+
+@pytest.fixture
+def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
+             word_table, temp_db_with_extensions, tmp_path):
+    sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    sql.write_text("SELECT 'a';")
+
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
+    def _mk_analyser(trans=':: upper();', abbr=(('STREET', 'ST'), )):
+        tok.transliteration = trans
+        tok.abbreviations = abbr
+
+        return tok.name_analyzer()
+
+    return _mk_analyser
+
+
+@pytest.fixture
+def getorcreate_term_id(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
+                              RETURNS INTEGER AS $$ SELECT nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def getorcreate_hnr_id(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
+                              RETURNS INTEGER AS $$ SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")
+
+
+def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop,
+                  sql_preprocessor, place_table, word_table):
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+    assert db_prop('DBCFG_NORMALIZATION') == ':: lower();'
+    assert db_prop('DBCFG_TRANSLITERATION') is not None
+    assert db_prop('DBCFG_ABBREVIATIONS') is not None
+
+
+def test_init_from_project(tokenizer_setup, tokenizer_factory):
+    tok = tokenizer_factory()
+
+    tok.init_from_project()
+
+    assert tok.normalization is not None
+    assert tok.transliteration is not None
+    assert tok.abbreviations is not None
+
+
+def test_update_sql_functions(temp_db_conn, db_prop, temp_db_cursor,
+                              tokenizer_factory, test_config, table_factory,
+                              monkeypatch,
+                              sql_preprocessor, place_table, word_table):
+    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
+    assert db_prop('DBCFG_MAXWORDFREQ') == '1133'
+
+    table_factory('test', 'txt TEXT')
+
+    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
+
+    tok.update_sql_functions(test_config)
+
+    test_content = temp_db_cursor.row_set('SELECT * FROM test')
+    assert test_content == set((('1133', ), ))
+
+
+def test_make_standard_word(analyzer):
+    with analyzer(abbr=(('STREET', 'ST'), ('tiny', 't'))) as a:
+        assert a.make_standard_word('tiny street') == 'TINY ST'
+
+    with analyzer(abbr=(('STRASSE', 'STR'), ('STR', 'ST'))) as a:
+        assert a.make_standard_word('Hauptstrasse') == 'HAUPTST'
+
+
+def test_make_standard_hnr(analyzer):
+    with analyzer(abbr=(('IV', '4'),)) as a:
+        assert a._make_standard_hnr('345') == '345'
+        assert a._make_standard_hnr('iv') == 'IV'
+
+
+def test_add_postcodes_from_db(analyzer, word_table, table_factory, temp_db_cursor):
+    table_factory('location_postcode', 'postcode TEXT',
+                  content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
+
+    with analyzer() as a:
+        a.add_postcodes_from_db()
+
+    assert temp_db_cursor.row_set("""SELECT word, word_token from word
+                                     """) \
+               == set((('1234', ' 1234'), ('12 34', ' 12 34'), ('AB23', ' AB23')))
+
+
+def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor):
+    with analyzer() as a:
+        a.update_special_phrases([
+            ("König bei", "amenity", "royal", "near"),
+            ("Könige", "amenity", "royal", "-"),
+            ("street", "highway", "primary", "in")
+        ])
+
+    assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+                                     FROM word WHERE class != 'place'""") \
+               == set(((' KÖNIG BEI', 'könig bei', 'amenity', 'royal', 'near'),
+                       (' KÖNIGE', 'könige', 'amenity', 'royal', None),
+                       (' ST', 'street', 'highway', 'primary', 'in')))
+
+
+def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'),
+                                     (' BAR', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    with analyzer() as a:
+        a.update_special_phrases([])
+
+    assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+
+def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'),
+                                     (' BAR', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    with analyzer() as a:
+        a.update_special_phrases([
+          ('prison', 'amenity', 'prison', 'in'),
+          ('bar', 'highway', 'road', '-'),
+          ('garden', 'leisure', 'garden', 'near')
+        ])
+
+    assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+                                     FROM word WHERE class != 'place'""") \
+               == set(((' PRISON', 'prison', 'amenity', 'prison', 'in'),
+                       (' BAR', 'bar', 'highway', 'road', None),
+                       (' GARDEN', 'garden', 'leisure', 'garden', 'near')))
+
+
+def test_process_place_names(analyzer, getorcreate_term_id):
+
+    with analyzer() as a:
+        info = a.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+
+    assert info['names'] == '{1,2,3,4,5,6}'
+
+
+@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345'])
+def test_process_place_postcode(analyzer, temp_db_cursor, pc):
+    with analyzer() as a:
+        info = a.process_place({'address': {'postcode' : pc}})
+
+    assert temp_db_cursor.row_set("""SELECT word FROM word
+                                     WHERE class = 'place' and type = 'postcode'""") \
+               == set(((pc, ),))
+
+
+@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836'])
+def test_process_place_bad_postcode(analyzer, temp_db_cursor, pc):
+    with analyzer() as a:
+        info = a.process_place({'address': {'postcode' : pc}})
+
+    assert 0 == temp_db_cursor.scalar("""SELECT count(*) FROM word
+                                         WHERE class = 'place' and type = 'postcode'""")
+
+
+@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
+def test_process_place_housenumbers_simple(analyzer, hnr, getorcreate_hnr_id):
+    with analyzer() as a:
+        info = a.process_place({'address': {'housenumber' : hnr}})
+
+    assert info['hnr'] == hnr.upper()
+    assert info['hnr_tokens'] == "{-1}"
+
+
+def test_process_place_housenumbers_lists(analyzer, getorcreate_hnr_id):
+    with analyzer() as a:
+        info = a.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+
+    assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
+    assert info['hnr_tokens'] == "{-1,-2,-3}"
+
+
+def test_process_place_housenumbers_duplicates(analyzer, getorcreate_hnr_id):
+    with analyzer() as a:
+        info = a.process_place({'address': {'housenumber' : '134',
+                                               'conscriptionnumber' : '134',
+                                               'streetnumber' : '99a'}})
+
+    assert set(info['hnr'].split(';')) == set(('134', '99A'))
+    assert info['hnr_tokens'] == "{-1,-2}"
index 68b376a781c585b417c09a441e3f7485d7d231fa..53001c271691ed5d1d39571c25346203db73a4aa 100644 (file)
@@ -43,8 +43,22 @@ def test_check_placex_table_size_bad(temp_db_cursor, temp_db_conn, def_config):
     assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL
 
 
-def test_check_module_bad(temp_db_conn, def_config):
-    assert chkdb.check_module(temp_db_conn, def_config) == chkdb.CheckState.FAIL
+def test_check_tokenizer_missing(temp_db_conn, def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    assert chkdb.check_tokenizer(temp_db_conn, def_config) == chkdb.CheckState.FAIL
+
+
+@pytest.mark.parametrize("check_result,state", [(None, chkdb.CheckState.OK),
+                                                ("Something wrong", chkdb.CheckState.FAIL)])
+def test_check_tokenizer(tokenizer_mock, temp_db_conn, def_config, monkeypatch,
+                         check_result, state):
+    class _TestTokenizer:
+        def check_database(self):
+            return check_result
+
+    monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
+                         lambda *a, **k: _TestTokenizer())
+    assert chkdb.check_tokenizer(temp_db_conn, def_config) == state
 
 
 def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config):
index e2852acb45adae34d5761b0e70ac6636341c9ea2..ceac7a2421dc43afd23ecc1cf3a4c3066c7b02f1 100644 (file)
@@ -80,39 +80,6 @@ def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch):
         database_import.setup_extensions(temp_db_conn)
 
 
-def test_install_module(tmp_path):
-    src_dir = tmp_path / 'source'
-    src_dir.mkdir()
-    (src_dir / 'nominatim.so').write_text('TEST nomiantim.so')
-
-    project_dir = tmp_path / 'project'
-    project_dir.mkdir()
-
-    database_import.install_module(src_dir, project_dir, '')
-
-    outfile = project_dir / 'module' / 'nominatim.so'
-
-    assert outfile.exists()
-    assert outfile.read_text() == 'TEST nomiantim.so'
-    assert outfile.stat().st_mode == 33261
-
-
-def test_install_module_custom(tmp_path):
-    (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
-
-    database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve()))
-
-    assert not (tmp_path / 'module').exists()
-
-
-def test_install_module_fail_access(temp_db_conn, tmp_path):
-    (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
-
-    with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
-        database_import.install_module(tmp_path, tmp_path, '',
-                                       conn=temp_db_conn)
-
-
 def test_import_base_data(src_dir, temp_db, temp_db_cursor):
     temp_db_cursor.execute('CREATE EXTENSION hstore')
     temp_db_cursor.execute('CREATE EXTENSION postgis')
@@ -171,14 +138,15 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
 
 
 def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
-    tables = ('word', 'placex', 'place_addressline', 'location_area',
-              'location_area_country', 'location_property',
+    tables = ('placex', 'place_addressline', 'location_area',
+              'location_area_country',
               'location_property_tiger', 'location_property_osmline',
               'location_postcode', 'search_name', 'location_road_23')
     for table in tables:
-        table_factory(table, content=(1, 2, 3))
+        table_factory(table, content=((1, ), (2, ), (3, )))
+        assert temp_db_cursor.table_rows(table) == 3
 
-    database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
+    database_import.truncate_data_tables(temp_db_conn)
 
     for table in tables:
         assert temp_db_cursor.table_rows(table) == 0
@@ -187,7 +155,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
 @pytest.mark.parametrize("threads", (1, 5))
 def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_table,
                    temp_db_cursor, threads):
-    for func in ('make_keywords', 'getorcreate_housenumber_id', 'make_standard_name'):
+    for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
         temp_db_cursor.execute("""CREATE FUNCTION {} (src TEXT)
                                   RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
                                """.format(func))
@@ -196,36 +164,33 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
     place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
               geom='SRID=4326;LINESTRING(0 0, 10 10)')
 
-    database_import.load_data(dsn, src_dir / 'data', threads)
+    database_import.load_data(dsn, threads)
 
     assert temp_db_cursor.table_rows('placex') == 30
     assert temp_db_cursor.table_rows('location_property_osmline') == 1
 
-@pytest.mark.parametrize("languages", (False, True))
-def test_create_country_names(temp_db_conn, temp_db_cursor, def_config,
-                              temp_db_with_extensions, monkeypatch, languages):
-    if languages:
-        monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en')
-    temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT)
-                                  RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
-                               """)
-    temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
-    temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
-    temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
-                              '"name"=>"us","name:af"=>"us"')""")
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
-                            lookup_country_code varchar(2))
-                            RETURNS INTEGER
-                            AS $$
-                            BEGIN
-                                INSERT INTO word VALUES (lookup_country_code);
-                                RETURN 5;
-                            END;
-                            $$
-                            LANGUAGE plpgsql;
-                               """)
-    database_import.create_country_names(temp_db_conn, def_config)
+
+@pytest.mark.parametrize("languages", (None, ' fr,en'))
+def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
+                              table_factory, tokenizer_mock, languages):
+
+    table_factory('country_name', 'country_code varchar(2), name hstore',
+                  content=(('us', '"name"=>"us1","name:af"=>"us2"'),
+                           ('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
+
+    assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
+
+    tokenizer = tokenizer_mock()
+
+    database_import.create_country_names(temp_db_conn, tokenizer, languages)
+
+    assert len(tokenizer.analyser_cache['countries']) == 2
+
+    result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
+
     if languages:
-        assert temp_db_cursor.table_rows('word') == 4
+        assert result_set == {'us' : set(('us', 'us1', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
     else:
-        assert temp_db_cursor.table_rows('word') == 5
+        assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
index 4890e0b22be519e4c53a94880eeacec7e4075d3e..24b3318da37d86db31535dbb427f8081dcb5f44b 100644 (file)
@@ -2,51 +2,15 @@
     Tests for import special phrases methods
     of the class SpecialPhrasesImporter.
 """
-from mocks import MockParamCapture
 from nominatim.errors import UsageError
 from pathlib import Path
 import tempfile
 from shutil import copyfile
 import pytest
-from nominatim.tools.special_phrases import SpecialPhrasesImporter
+from nominatim.tools import SpecialPhrasesImporter
 
 TEST_BASE_DIR = Path(__file__) / '..' / '..'
 
-def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
-                                            temp_db_cursor):
-    """
-        Check for the fetch_existing_words_phrases() method.
-        It should return special phrase term added to the word
-        table.
-    """
-    query ="""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'class', 'type', null, 0, 'near');
-    """
-    temp_db_cursor.execute(query)
-
-    assert not special_phrases_importer.words_phrases_to_delete
-    special_phrases_importer._fetch_existing_words_phrases()
-    contained_phrase = special_phrases_importer.words_phrases_to_delete.pop()
-    assert contained_phrase == ('normalized_word', 'class', 'type', 'near')
-
-@pytest.mark.parametrize("house_type", ['house', 'postcode'])
-def test_fetch_existing_words_phrases_special_cases(special_phrases_importer, word_table,
-                                                    house_type, temp_db_cursor):
-    """
-        Check for the fetch_existing_words_phrases() method.
-        It should return nothing as the terms added correspond
-        to a housenumber and postcode term.
-    """
-    query ="""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'place', %s, null, 0, 'near');
-    """
-    temp_db_cursor.execute(query, (house_type,))
-
-    special_phrases_importer._fetch_existing_words_phrases()
-    assert not special_phrases_importer.words_phrases_to_delete
-
 def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor):
     """
         Check for the fetch_existing_place_classtype_tables() method.
@@ -119,41 +83,11 @@ def test_convert_settings_giving_json(special_phrases_importer):
         the same path is directly returned
     """
     json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve()
-    
+
     returned = special_phrases_importer._convert_php_settings_if_needed(json_file)
 
     assert returned == json_file
 
-def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
-                                       temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the 
-        getorcreate_amenityoperator() SQL function and that
-        the 2 differents operators are well handled.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', 'near')
-    special_phrases_importer._process_amenity('', '', '', '', 'in')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='near' OR operator='in'")
-        results = temp_db_cursor.fetchall()
-
-    assert len(results) == 2
-
-def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs,
-                                          temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the
-        getorcreate_amenity() SQL function.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', '')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='no_operator'")
-        result = temp_db_cursor.fetchone()
-
-    assert result
-
 def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
     """
         Test that _create_place_classtype_indexes() create the
@@ -216,8 +150,7 @@ def test_create_place_classtype_table_and_indexes(
         assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1])
         assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1])
 
-def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, word_table,
-                             getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer):
     """
         Test that _process_xml_content() process the given xml content right
         by executing the right SQL functions for amenities and 
@@ -229,11 +162,9 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
     #Converted output set to a dict for easy assert further.
     results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en'))
 
-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert results[class_test] and type_test in results.values()
 
-def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
+def test_remove_non_existent_tables_from_db(special_phrases_importer, default_phrases,
                                              temp_db_conn):
     """
         Check for the remove_non_existent_phrases_from_db() method.
@@ -246,22 +177,10 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
         be deleted.
     """
     with temp_db_conn.cursor() as temp_db_cursor:
-        to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near')
-        to_keep_phrase_tuple = (
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        )
-        special_phrases_importer.words_phrases_to_delete = {
-            to_delete_phrase_tuple,
-            to_keep_phrase_tuple
-        }
-        special_phrases_importer.words_phrases_still_exist = {
-            to_keep_phrase_tuple
-        }
         special_phrases_importer.table_phrases_to_delete = {
             'place_classtype_testclasstypetable_to_delete'
         }
 
-        query_words = 'SELECT word, class, type, operator FROM word;'
         query_tables = """
             SELECT table_name
             FROM information_schema.tables
@@ -269,21 +188,16 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
             AND table_name like 'place_classtype_%';
         """
 
-        special_phrases_importer._remove_non_existent_phrases_from_db()
+        special_phrases_importer._remove_non_existent_tables_from_db()
 
-        temp_db_cursor.execute(query_words)
-        words_result = temp_db_cursor.fetchall()
         temp_db_cursor.execute(query_tables)
         tables_result = temp_db_cursor.fetchall()
-        assert len(words_result) == 1 and words_result[0] == [
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        ]
         assert (len(tables_result) == 1 and
             tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
         )
 
-def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table, 
-                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
+def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer,
+                          placex_table, tokenizer_mock):
     """
         Check that the main import_from_wiki() method is well executed.
         It should create the place_classtype table, the place_id and centroid indexes,
@@ -295,17 +209,14 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     #what is deleted and what is preserved.
     with temp_db_conn.cursor() as temp_db_cursor:
         temp_db_cursor.execute("""
-            INSERT INTO word VALUES(99999, ' animal shelter', 'animal shelter',
-            'amenity', 'animal_shelter', null, 0, null);
-
-            INSERT INTO word VALUES(99999, ' wrong_lookup_token', 'wrong_normalized_word',
-            'wrong_class', 'wrong_type', null, 0, 'near');
-
             CREATE TABLE place_classtype_amenity_animal_shelter();
             CREATE TABLE place_classtype_wrongclass_wrongtype();""")
 
-    monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
-    special_phrases_importer.import_from_wiki(['en'])
+    monkeypatch.setattr('nominatim.tools.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
+    tokenizer = tokenizer_mock()
+    special_phrases_importer.import_from_wiki(tokenizer, ['en'])
+
+    assert len(tokenizer.analyser_cache['special_phrases']) == 18
 
     class_test = 'aerialway'
     type_test = 'zip_line'
@@ -313,22 +224,12 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     assert check_table_exist(temp_db_conn, class_test, type_test)
     assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
     assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
     assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')
 
     #Format (query, should_return_something_bool) use to easily execute all asserts
     queries_tests = set()
 
-    #Used to check that the correct phrase already in the word table before is still there.
-    query_correct_word = "SELECT * FROM word WHERE word = 'animal shelter'"
-    queries_tests.add((query_correct_word, True))
-
-    #Used to check if wrong phrase was deleted from the word table of the database.
-    query_wrong_word = "SELECT word FROM word WHERE word = 'wrong_normalized_word'"
-    queries_tests.add((query_wrong_word, False))
-
     #Used to check that correct place_classtype table already in the datase before is still there.
     query_existing_table = """
         SELECT table_name
@@ -413,24 +314,6 @@ def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type):
         temp_db_conn.index_exists(index_prefix + 'place_id')
     )
 
-def check_amenities_with_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenityoperator()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator != 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
-def check_amenities_without_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenity()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator = 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
 @pytest.fixture
 def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
     """
@@ -454,48 +337,7 @@ def temp_phplib_dir_with_migration():
         yield Path(phpdir)
 
 @pytest.fixture
-def default_phrases(word_table, temp_db_cursor):
+def default_phrases(temp_db_cursor):
     temp_db_cursor.execute("""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'class', 'type', null, 0, 'near');
-
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
-        'class_exists', 'type_exists', null, 0, 'near');
-
         CREATE TABLE place_classtype_testclasstypetable_to_delete();
         CREATE TABLE place_classtype_testclasstypetable_to_keep();""")
-
-@pytest.fixture
-def make_strandard_name_func(temp_db_cursor):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$
-        BEGIN
-        RETURN trim(name); --Basically return only the trimed name for the tests
-        END;
-        $$ LANGUAGE plpgsql IMMUTABLE;""")
-        
-@pytest.fixture
-def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
-                                                    lookup_class text, lookup_type text)
-        RETURNS void as $$
-        BEGIN
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, 'no_operator');
-        END;
-        $$ LANGUAGE plpgsql""")
-
-@pytest.fixture
-def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE TABLE temp_with_operator(op TEXT);
-
-        CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT,
-                                                    lookup_class text, lookup_type text, op text)
-        RETURNS void as $$
-        BEGIN 
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, op);
-        END;
-        $$ LANGUAGE plpgsql""")
\ No newline at end of file
index 1fc060b0c6439677e592aa90d4d48d3db51a8ea6..37b47dfa680258e5587fa40e5057d7fd85904a96 100644 (file)
@@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
 import pytest
 
 from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return dummy_tokenizer.DummyTokenizer(None, None)
 
 @pytest.fixture
 def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                       postcode TEXT,
                       geometry GEOMETRY(Geometry, 4326)""")
     temp_db_cursor.execute('CREATE SEQUENCE seq_place')
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                            """)
 
 
-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
-    postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     assert temp_db_cursor.table_exists('gb_postcode')
     assert temp_db_cursor.table_exists('us_postcode')
     assert temp_db_cursor.table_rows('location_postcode') == 0
 
 
-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
     temp_db_cursor.execute("""
         INSERT INTO placex (place_id, country_code, address, geometry)
           VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
     """)
 
-    postcodes.import_postcodes(dsn, tmp_path)
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                       ST_X(geometry), ST_Y(geometry)
index 53ea2b520a2cb207c0d9ac85ddada67604be5d5e..3f9bccbdd4162f81165dcdddd64f88879b179ad7 100644 (file)
@@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
     return tmp_path
 
 @pytest.fixture
-def conn(temp_db_conn, table_factory, monkeypatch):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+def conn(sql_preprocessor, temp_db_conn):
     return temp_db_conn
 
 
index 18b146fc22b4b07b5c4696bf2f0c8a7cfea30bd6..dc822e3c166051bf6812b75d41fb64703427ae97 100644 (file)
@@ -26,6 +26,7 @@ def test_script(envdir):
 
 def run_website_script(envdir, config):
     config.lib_dir.php = envdir / 'php'
+    config.project_dir = envdir
     refresh.setup_website(envdir, config)
 
     proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',
index 156385ad8bb337e1403b1a55e737c7ef766d90ef..affe13174a6256dbb8c960758c636b9bfd7d397e 100644 (file)
@@ -41,7 +41,8 @@ def test_init_replication_success(monkeypatch, status_table, place_row, temp_db_
 
     temp_db_cursor.execute("SELECT * FROM import_status")
 
-    expected_date = dt.datetime.fromisoformat('2006-01-27T19:09:10').replace(tzinfo=dt.timezone.utc)
+    expected_date = dt.datetime.strptime('2006-01-27T19:09:10', status.ISODATE_FORMAT)\
+                        .replace(tzinfo=dt.timezone.utc)
     assert temp_db_cursor.rowcount == 1
     assert temp_db_cursor.fetchone() == [expected_date, 234, True]
 
index b3b5d76d770d9c3e311181f8419296c5359d137e..7e72076e574783c6b9831a2f2db6f3c3a555b58b 100644 (file)
-SELECT getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
-SELECT getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
-SELECT getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
-SELECT getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
-SELECT getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
-SELECT getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
-SELECT getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
-SELECT getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
+CREATE OR REPLACE FUNCTION test_getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
+                                               lookup_class text, lookup_type text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+  WHERE word_token = lookup_token and word = normalized_word
+        and class = lookup_class and type = lookup_type
+  INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION test_getorcreate_amenityoperator(lookup_word TEXT,
+                                                       normalized_word TEXT,
+                                                       lookup_class text,
+                                                       lookup_type text,
+                                                       op text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+  WHERE word_token = lookup_token and word = normalized_word
+        and class = lookup_class and type = lookup_type and operator = op
+  INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0, op);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+SELECT test_getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
+SELECT test_getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
+SELECT test_getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
 CREATE INDEX idx_placex_classtype ON placex (class, type);CREATE TABLE place_classtype_aeroway_aerodrome AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex WHERE class = 'aeroway' AND type = 'aerodrome';
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_centroid ON place_classtype_aeroway_aerodrome USING GIST (centroid);
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_place_id ON place_classtype_aeroway_aerodrome USING btree(place_id);
@@ -175,4 +225,7 @@ CREATE TABLE place_classtype_memorial_war_memorial AS SELECT place_id AS place_i
 CREATE INDEX idx_place_classtype_memorial_war_memorial_centroid ON place_classtype_memorial_war_memorial USING GIST (centroid);
 CREATE INDEX idx_place_classtype_memorial_war_memorial_place_id ON place_classtype_memorial_war_memorial USING btree(place_id);
 GRANT SELECT ON place_classtype_memorial_war_memorial TO "www-data";
-DROP INDEX idx_placex_classtype;
\ No newline at end of file
+DROP INDEX idx_placex_classtype;
+
+DROP FUNCTION test_getorcreate_amenity;
+DROP FUNCTION test_getorcreate_amenityoperator;
index 32cd3a308962fdf4d8e8a6b183a4efcc23c51fcf..10684f20bd8b51a83c69721200a27139c2aecac8 100755 (executable)
@@ -42,7 +42,7 @@
                         python3-pip python3-setuptools python3-devel \
                         expat-devel zlib-devel libicu-dev
 
-    pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU argparse-manpage
+    pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU
 
 
 #
index 1e028b65ee3bd9b8676b020b68915df9cf01e7a5..788f5aa22f1f033f6161db5c04dc9f41a19d3930 100755 (executable)
@@ -35,7 +35,7 @@
                         python3-pip python3-setuptools python3-devel \
                         expat-devel zlib-devel libicu-dev
 
-    pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU argparse-manpage
+    pip3 install --user psycopg2 python-dotenv psutil Jinja2 PyICU
 
 
 #
index 36e28ca1071fb6b43c85bb0d8ef9273d21e472e0..33075baba7dfc9b171f14df8e00e21bc491e3a1c 100755 (executable)
@@ -30,8 +30,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
                         postgresql-server-dev-10 postgresql-10-postgis-2.4 \
                         postgresql-contrib-10 postgresql-10-postgis-scripts \
                         php php-pgsql php-intl libicu-dev python3-pip \
-                        python3-psycopg2 python3-psutil python3-jinja2 python3-icu git \
-                        python3-argparse-manpage
+                        python3-psycopg2 python3-psutil python3-jinja2 python3-icu git
 
 # The python-dotenv package that comes with Ubuntu 18.04 is too old, so
 # install the latest version from pip:
index 1e15f850c5c24bb90170e7ee3c8fe5fd3984c1a6..1e10f0412abd5fc3052f04123902316e2a4b7483 100755 (executable)
@@ -33,8 +33,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
                         postgresql-server-dev-12 postgresql-12-postgis-3 \
                         postgresql-contrib-12 postgresql-12-postgis-3-scripts \
                         php php-pgsql php-intl libicu-dev python3-dotenv \
-                        python3-psycopg2 python3-psutil python3-jinja2 python3-icu git \
-                        python3-argparse-manpage
+                        python3-psycopg2 python3-psutil python3-jinja2 python3-icu git
 
 #
 # System Configuration