git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
authorSarah Hoffmann <lonvia@denofr.de>
Thu, 15 Apr 2021 08:17:45 +0000 (10:17 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Thu, 15 Apr 2021 08:17:45 +0000 (10:17 +0200)
24 files changed:
.github/actions/setup-postgresql/action.yml
.github/workflows/ci-tests.yml
CMakeLists.txt
ChangeLog
README.md
codecov.yml [new file with mode: 0644]
docs/admin/Import.md
docs/admin/Migration.md
lib-php/Geocode.php
lib-php/admin/query.php
lib-sql/functions/placex_triggers.sql
lib-sql/indices.sql
lib-sql/tables.sql
nominatim/tools/check_database.py
nominatim/tools/migration.py
nominatim/tools/special_phrases.py
nominatim/tools/tiger_data.py
nominatim/version.py
test/python/test_tools_import_special_phrases.py
test/python/test_tools_tiger_data.py
vagrant/Install-on-Centos-7.sh
vagrant/Install-on-Centos-8.sh
vagrant/Install-on-Ubuntu-18.sh
vagrant/Install-on-Ubuntu-20.sh

index 0742e66314a10a1afc6288c7e40391fc166b8ee6..060a678941b2610c5bfc29c4bada275812a7ccd3 100644 (file)
@@ -14,8 +14,10 @@ runs:
     steps:
         - name: Remove existing PostgreSQL
           run: |
-              sudo apt-get update -qq
               sudo apt-get purge -yq postgresql*
+              sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
+              sudo apt-get update -qq
+
           shell: bash
 
         - name: Install PostgreSQL
index 2f920a660f95c3f19fd7b8ad4ca1fcd542c243ca..a1a4344a1105c17c81582298da361f4f25a4731c 100644 (file)
@@ -25,7 +25,7 @@ jobs:
               uses: shivammathur/setup-php@v2
               with:
                   php-version: '7.4'
-                  tools: phpunit, phpcs
+                  tools: phpunit, phpcs, composer
 
             - name: Get Date
               id: get-date
@@ -46,7 +46,7 @@ jobs:
             - uses: ./Nominatim/.github/actions/build-nominatim
 
            - name: Install test prerequisites
-              run: sudo apt-get install -y -qq php-codesniffer pylint python3-pytest python3-behave
+              run: sudo apt-get install -y -qq php-codesniffer pylint python3-pytest python3-behave python3-pytest-cov php-codecoverage php-xdebug
 
             - name: PHP linting
               run: phpcs --report-width=120 .
@@ -57,17 +57,30 @@ jobs:
               working-directory: Nominatim
 
             - name: PHP unit tests
-              run: phpunit ./
+              run: phpunit --coverage-clover ../../coverage-php.xml ./
               working-directory: Nominatim/test/php
 
             - name: Python unit tests
-              run: py.test-3 test/python
+              run: py.test-3 --cov=nominatim --cov-report=xml test/python
               working-directory: Nominatim
 
             - name: BDD tests
-              run: behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build --format=progress3
+              run: |
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build --format=progress3 -DPHPCOV=./cov
+                  composer require phpunit/phpcov:7.0.2
+                  vendor/bin/phpcov merge --clover ../../coverage-bdd.xml ./cov
               working-directory: Nominatim/test/bdd
 
+            - name: Upload coverage to Codecov
+              uses: codecov/codecov-action@v1
+              with:
+                files: ./Nominatim/coverage*.xml
+                directory: ./
+                name: codecov-umbrella
+                fail_ci_if_error: true
+                path_to_write_report: ./coverage/codecov_report.txt
+                verbose: true
+
     import:
         runs-on: ubuntu-20.04
 
index 7c2d7bb38a3ffcc7d61b341376902e1342988e51..33b6ae649b4a507abf1cf9aea4cd3466e1f3817b 100644 (file)
@@ -19,7 +19,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 project(nominatim)
 
 set(NOMINATIM_VERSION_MAJOR 3)
-set(NOMINATIM_VERSION_MINOR 6)
+set(NOMINATIM_VERSION_MINOR 7)
 set(NOMINATIM_VERSION_PATCH 0)
 
 set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")
index 633d0c538f74fa1ab44f68323c659cbbe691117a..4d66ee068423b83bb4c641fb2615de422918b3ef 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+3.7.0
+
+ * switch to dotenv for configuration file
+ * introduce 'make install' (reorganising most of the code)
+ * introduce nominatim tool as replacement for various php scripts
+ * introduce project directories and allow multiple installations from same build
+ * clean up BDD tests: drop nose, reorganise step code
+ * simplify test database for API BDD tests and autoinstall database
+ * port most of the code for command-line tools to Python
+   (thanks to @darkshredder and @AntoJvlt)
+ * add tests for all tooling
+ * replace pyosmium-get-changes with custom internal implementation using
+   pyosmium
+ * improve search for queries with housenumber and partial terms
+ * add database versioning
+ * use jinja2 for preprocessing SQL files
+ * introduce automatic migrations
+ * reverse fix preference of interpolations over housenumbers
+ * parallelize indexing of postcodes
+ * add non-key indexes to speed up housenumber + street searches
+ * switch housenumber field in placex to save transliterated names
+
+
 3.6.0
 
  * add full support for searching by and displaying of addr:* tags
index 6fd0cd4595fbce93ecd499e1758a8f92cb2b67ec..b643088b3636db3e777c6a3d4bce681f17f02043 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 [![Build Status](https://github.com/osm-search/Nominatim/workflows/CI%20Tests/badge.svg)](https://github.com/osm-search/Nominatim/actions?query=workflow%3A%22CI+Tests%22)
+[![codecov](https://codecov.io/gh/osm-search/Nominatim/branch/master/graph/badge.svg?token=8P1LXrhCMy)](https://codecov.io/gh/osm-search/Nominatim)
 
 Nominatim
 =========
diff --git a/codecov.yml b/codecov.yml
new file mode 100644 (file)
index 0000000..0404bdb
--- /dev/null
@@ -0,0 +1,14 @@
+codecov:
+  require_ci_to_pass: yes
+
+coverage:
+  status:
+    project: off
+    patch: off
+
+comment:
+  require_changes: true
+  after_n_builds: 2
+
+fixes:
+  - "Nominatim/::"
\ No newline at end of file
index e3a32481819d6a6b0cc93528f8c77b91eea309ab..6b417ff28c0589e7ab93e06133f80f6b06f194b2 100644 (file)
@@ -40,9 +40,9 @@ all commands from the project directory.
 
 ### Configuration setup in `.env`
 
-The Nominatim server can be customized via a `.env` in the project directory.
-This is a file in [dotenv](https://github.com/theskumar/python-dotenv) format
-which looks the same as variable settings in a standard shell environment.
+The Nominatim server can be customized via an `.env` configuration file in the
+project directory. This is a file in [dotenv](https://github.com/theskumar/python-dotenv)
+format which looks the same as variable settings in a standard shell environment.
 You can also set the same configuration via environment variables. All
 settings have a `NOMINATIM_` prefix to avoid conflicts with other environment
 variables.
@@ -283,16 +283,14 @@ address set to complement the OSM house number data in the US. You can add
 TIGER data to your own Nominatim instance by following these steps. The
 entire US adds about 10GB to your database.
 
-  1. Get preprocessed TIGER 2020 data and unpack it into your project
-     directory:
+  1. Get preprocessed TIGER 2020 data:
 
         cd $PROJECT_DIR
         wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.tar.gz
-        tar xf tiger2020-nominatim-preprocessed.tar.gz
 
   2. Import the data into your Nominatim database:
 
-        nominatim add-data --tiger-data tiger
+        nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.tar.gz
 
   3. Enable use of the Tiger data in your `.env` by adding:
 
index 52970a33d65c078189aabf4a0537e1e15abe3385..0ca6ebf29b660399ba42528607a84c2addfa45cf 100644 (file)
@@ -4,18 +4,29 @@ Since version 3.7.0 Nominatim offers automatic migrations. Please follow
 the following steps:
 
 * stop any updates that are potentially running
-* update Nominatim to the nwer version
-* goto your project directory and run `nominatim admin --migrate`
+* update Nominatim to the newer version
+* go to your project directory and run `nominatim admin --migrate`
 * (optionally) restart updates
 
 Below you find additional migrations and hints about other structural and
-breaking changes.
+breaking changes. **Please read them before running the migration.**
 
 !!! note
     If you are migrating from a version <3.6, then you still have to follow
     the manual migration steps up to 3.6.
 
-## 3.6.0 -> master
+## 3.6.0 -> 3.7.0
+
+### New format and name of configuration file
+
+The configuration for an import is now saved in a `.env` file in the project
+directory. This file follows the dotenv format. For more information, see
+the [installation chapter](Import.md#configuration-setup-in-env).
+
+To migrate to the new system, create a new project directory, add the `.env`
+file and port your custom configuration from `settings/local.php`. Most
+settings are named similarly and have only received a `NOMINATIM_` prefix.
+Use the default settings in `settings/env.defaults` as a reference.
 
 ### New location for data files
 
@@ -45,6 +56,12 @@ Try `nominatim <command> --help` for more information about each subcommand.
 `./utils/query.php` no longer exists in its old form. `nominatim search`
 provides a replacement but returns different output.
 
+### Switch to normalized house numbers
+
+The housenumber column in the placex table now uses the normalized version.
+The automatic migration step will convert the column but this may take a
+very long time. It is advisable to take the machine offline while doing that.
+
 ## 3.5.0 -> 3.6.0
 
 ### Change of layout of search_name_* tables
index f638af9a9a300a927f066a59690c23b6ee58e966..ec6876faa51bbd4b64402e1abaf3450993cdc81b 100644 (file)
@@ -18,7 +18,6 @@ class Geocode
     protected $aLangPrefOrder = array();
 
     protected $aExcludePlaceIDs = array();
-    protected $bReverseInPlan = true;
 
     protected $iLimit = 20;
     protected $iFinalLimit = 10;
@@ -61,11 +60,6 @@ class Geocode
         return $this->oNormalizer->transliterate($sTerm);
     }
 
-    public function setReverseInPlan($bReverse)
-    {
-        $this->bReverseInPlan = $bReverse;
-    }
-
     public function setLanguagePreference($aLangPref)
     {
         $this->aLangPrefOrder = $aLangPref;
@@ -262,7 +256,6 @@ class Geocode
                 $oParams->getString('country'),
                 $oParams->getString('postalcode')
             );
-            $this->setReverseInPlan(false);
         } else {
             $this->setQuery($sQuery);
         }
@@ -330,7 +323,7 @@ class Geocode
         return false;
     }
 
-    public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bIsStructured)
+    public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens)
     {
         /*
              Calculate all searches using oValidTokens i.e.
@@ -345,7 +338,7 @@ class Geocode
          */
         foreach ($aPhrases as $iPhrase => $oPhrase) {
             $aNewPhraseSearches = array();
-            $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
+            $sPhraseType = $oPhrase->getPhraseType();
 
             foreach ($oPhrase->getWordSets() as $aWordset) {
                 $aWordsetSearches = $aSearches;
@@ -388,7 +381,7 @@ class Geocode
                                 $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
                                     $sToken,
                                     $oSearchTerm,
-                                    $bIsStructured,
+                                    (bool) $sPhraseType,
                                     $iPhrase,
                                     $oValidTokens->get(' '.$sToken)
                                 );
@@ -607,10 +600,8 @@ class Geocode
             // Commas are used to reduce the search space by indicating where phrases split
             if ($this->aStructuredQuery) {
                 $aInPhrases = $this->aStructuredQuery;
-                $bStructuredPhrases = true;
             } else {
                 $aInPhrases = explode(',', $sQuery);
-                $bStructuredPhrases = false;
             }
 
             Debug::printDebugArray('Search context', $oCtx);
@@ -684,9 +675,9 @@ class Geocode
 
                 Debug::newSection('Search candidates');
 
-                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases);
+                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens);
 
-                if ($this->bReverseInPlan) {
+                if (!$this->aStructuredQuery) {
                     // Reverse phrase array and also reverse the order of the wordsets in
                     // the first and final phrase. Don't bother about phrases in the middle
                     // because order in the address doesn't matter.
@@ -695,7 +686,7 @@ class Geocode
                     if (count($aPhrases) > 1) {
                         $aPhrases[count($aPhrases)-1]->invertWordSets();
                     }
-                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, false);
+                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens);
 
                     foreach ($aGroupedSearches as $aSearches) {
                         foreach ($aSearches as $aSearch) {
@@ -999,7 +990,6 @@ class Geocode
                 'Structured query' => $this->aStructuredQuery,
                 'Name keys' => Debug::fmtArrayVals($this->aLangPrefOrder),
                 'Excluded place IDs' => Debug::fmtArrayVals($this->aExcludePlaceIDs),
-                'Try reversed query'=> $this->bReverseInPlan,
                 'Limit (for searches)' => $this->iLimit,
                 'Limit (for results)'=> $this->iFinalLimit,
                 'Country codes' => Debug::fmtArrayVals($this->aCountryCodes),
index beb2f9ef32c034e516aaad8266b932d685b03745..35fd1184a579e7ebc0219b93a32c240465323c94 100644 (file)
@@ -79,7 +79,6 @@ if (!$oParams->hasSetAny($aSearchParams)) {
 $oGeocode = new Nominatim\Geocode($oDB);
 
 $oGeocode->setLanguagePreference($oParams->getPreferredLanguages(false));
-$oGeocode->setReverseInPlan(true);
 $oGeocode->loadParamArray($oParams);
 
 if ($oParams->getBool('search')) {
index 6998224e7b851893e590d9f63ca47ca6acd1b18e..812bc79ff5c44314858af1ae350ec7d2c8cd0597 100644 (file)
@@ -169,7 +169,7 @@ BEGIN
   END IF;
 
   IF bnd.name ? 'name' THEN
-    bnd_name := make_standard_name(bnd.name->'name');
+    bnd_name := lower(bnd.name->'name');
     IF bnd_name = '' THEN
       bnd_name := NULL;
     END IF;
@@ -180,12 +180,14 @@ BEGIN
   IF bnd.extratags ? 'place' and bnd_name is not null THEN
     FOR linked_placex IN
       SELECT * FROM placex
-      WHERE make_standard_name(name->'name') = bnd_name
+      WHERE (position(lower(name->'name') in bnd_name) > 0
+             OR position(bnd_name in lower(name->'name')) > 0)
         AND placex.class = 'place' AND placex.type = bnd.extratags->'place'
         AND placex.osm_type = 'N'
         AND placex.linked_place_id is null
         AND placex.rank_search < 26 -- needed to select the right index
-        AND _st_covers(bnd.geometry, placex.geometry)
+        AND placex.type != 'postcode'
+        AND ST_Covers(bnd.geometry, placex.geometry)
     LOOP
       {% if debug %}RAISE WARNING 'Found type-matching place node %', linked_placex.osm_id;{% endif %}
       RETURN linked_placex;
@@ -201,7 +203,7 @@ BEGIN
         AND placex.linked_place_id is null
         AND placex.rank_search < 26
         AND _st_covers(bnd.geometry, placex.geometry)
-      ORDER BY make_standard_name(name->'name') = bnd_name desc
+      ORDER BY lower(name->'name') = bnd_name desc
     LOOP
       {% if debug %}RAISE WARNING 'Found wikidata-matching place node %', linked_placex.osm_id;{% endif %}
       RETURN linked_placex;
@@ -213,7 +215,7 @@ BEGIN
     {% if debug %}RAISE WARNING 'Looking for nodes with matching names';{% endif %}
     FOR linked_placex IN
       SELECT placex.* from placex
-      WHERE make_standard_name(name->'name') = bnd_name
+      WHERE lower(name->'name') = bnd_name
         AND ((bnd.rank_address > 0
               and bnd.rank_address = (compute_place_rank(placex.country_code,
                                                          'N', placex.class,
@@ -221,9 +223,11 @@ BEGIN
                                                          false, placex.postcode)).address_rank)
              OR (bnd.rank_address = 0 and placex.rank_search = bnd.rank_search))
         AND placex.osm_type = 'N'
+        AND placex.class = 'place'
         AND placex.linked_place_id is null
         AND placex.rank_search < 26 -- needed to select the right index
-        AND _st_covers(bnd.geometry, placex.geometry)
+        AND placex.type != 'postcode'
+        AND ST_Covers(bnd.geometry, placex.geometry)
     LOOP
       {% if debug %}RAISE WARNING 'Found matching place node %', linked_placex.osm_id;{% endif %}
       RETURN linked_placex;
index c121a9631b84b5d758dc552f9d45e84695bdc8be..a6f7cf95fcb6c7e0346ee0cdc8ca54fe2a77be81 100644 (file)
@@ -23,12 +23,6 @@ CREATE INDEX {{sql.if_index_not_exists}} idx_placex_geometry_reverse_lookupPolyg
     AND rank_address between 4 and 25 AND type != 'postcode'
     AND name is not null AND indexed_status = 0 AND linked_place_id is null;
 
-CREATE INDEX {{sql.if_index_not_exists}} idx_placex_geometry_reverse_placeNode
-  ON placex USING gist (geometry) {{db.tablespace.search_index}}
-  WHERE osm_type = 'N' AND rank_search between 5 and 25
-    AND class = 'place' AND type != 'postcode'
-    AND name is not null AND indexed_status = 0 AND linked_place_id is null;
-
 CREATE INDEX {{sql.if_index_not_exists}} idx_osmline_parent_place_id
   ON location_property_osmline USING BTREE (parent_place_id) {{db.tablespace.search_index}};
 
index 329eb7a1ab9491f8fb063b92747dd0abfcf9c2d2..aa213dbaccb4ebd49436fc68853ed0ab8d9c6789 100644 (file)
@@ -184,7 +184,10 @@ CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tabl
 CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
 CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
 CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
-CREATE INDEX idx_placex_adminname on placex USING BTREE (make_standard_name(name->'name')) {{db.tablespace.address_index}} WHERE osm_type='N' and rank_search < 26;
+CREATE INDEX idx_placex_geometry_placenode ON placex
+  USING GIST (geometry) {{db.tablespace.search_index}}
+  WHERE osm_type = 'N' and rank_search < 26
+        and class = 'place' and type != 'postcode' and linked_place_id is null;
 CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26;
 
 DROP SEQUENCE IF EXISTS seq_place;
index d8ab08ccd7a8593bdf149235a9e91c36482bebc7..5b39085d76f72bc52c31677fa191f8f6b0ace673 100644 (file)
@@ -84,7 +84,8 @@ def _get_indexes(conn):
                'idx_placex_rank_address',
                'idx_placex_parent_place_id',
                'idx_placex_geometry_reverse_lookuppolygon',
-               'idx_placex_geometry_reverse_placenode',
+               'idx_placex_geometry_placenode',
+               'idx_placex_housenumber',
                'idx_osmline_parent_place_id',
                'idx_osmline_parent_osm_id',
                'idx_postcode_id',
index b5f0b80e7786328955c910e7b58ffa6b7190c71c..5484834182d876f540eb919b605abff458bbabc0 100644 (file)
@@ -156,3 +156,20 @@ def change_housenumber_transliteration(conn, **_):
         cur.execute("""UPDATE placex
                        SET housenumber = create_housenumber_id(housenumber)
                        WHERE housenumber is not null""")
+
+
+@_migration(3, 7, 0, 0)
+def switch_placenode_geometry_index(conn, **_):
+    """ Replace idx_placex_geometry_reverse_placeNode index.
+
+        Make the index slightly more permissive, so that it can also be used
+        when matching up boundaries and place nodes. It makes the
+        idx_placex_adminname index unnecessary.
+    """
+    with conn.cursor() as cur:
+        cur.execute(""" CREATE INDEX IF NOT EXISTS idx_placex_geometry_placenode ON placex
+                        USING GIST (geometry)
+                        WHERE osm_type = 'N' and rank_search < 26
+                              and class = 'place' and type != 'postcode'
+                              and linked_place_id is null""")
+        cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
index fd46a18d827810aaf11a380a9cee659d3187a3fb..0c1258fefb0b9cece4c4e7ca88869b39fd97ab0e 100644 (file)
@@ -32,6 +32,17 @@ class SpecialPhrasesImporter():
         self.sanity_check_pattern = re.compile(r'^\w+$')
         self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                              self.config.TERM_NORMALIZATION)
+        #This set will contain all existing phrases from the word table which
+        #no longer exist on the wiki.
+        #It contains tuples with the following format: (normalized_word, class, type, operator)
+        self.words_phrases_to_delete = set()
+        #This set will contain the phrases which still exist from the wiki.
+        #It is used to prevent duplicates on the wiki by removing them from
+        #the words_phrases_to_delete set only at the end.
+        self.words_phrases_still_exist = set()
+        #This set will contain all existing place_classtype tables which don't match any
+        #special phrases class/type on the wiki.
+        self.table_phrases_to_delete = set()
 
     def import_from_wiki(self, languages=None):
         """
@@ -41,6 +52,9 @@ class SpecialPhrasesImporter():
         if languages is not None and not isinstance(languages, list):
             raise TypeError('The \'languages\' argument should be of type list.')
 
+        self._fetch_existing_words_phrases()
+        self._fetch_existing_place_classtype_tables()
+
         #Get all languages to process.
         languages = self._load_languages() if not languages else languages
 
@@ -53,9 +67,46 @@ class SpecialPhrasesImporter():
             class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
 
         self._create_place_classtype_table_and_indexes(class_type_pairs)
+        self._remove_non_existent_phrases_from_db()
         self.db_connection.commit()
         LOG.warning('Import done.')
 
+    def _fetch_existing_words_phrases(self):
+        """
+            Fetch existing special phrases from the word table.
+            Fill the word_phrases_to_delete set of the class.
+        """
+        #Only extract special phrases terms:
+        #If class=place and type=house then it is a housenumber term.
+        #If class=place and type=postcode then it is a postcode term.
+        word_query = """
+            SELECT word, class, type, operator FROM word
+            WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
+        """
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute(SQL(word_query))
+            for row in db_cursor:
+                row[3] = '-' if row[3] is None else row[3]
+                self.words_phrases_to_delete.add(
+                    (row[0], row[1], row[2], row[3])
+                )
+
+    def _fetch_existing_place_classtype_tables(self):
+        """
+            Fetch existing place_classtype tables.
+            Fill the table_phrases_to_delete set of the class.
+        """
+        query = """
+            SELECT table_name
+            FROM information_schema.tables
+            WHERE table_schema='public'
+            AND table_name like 'place_classtype_%';
+        """
+        with self.db_connection.cursor() as db_cursor:
+            db_cursor.execute(SQL(query))
+            for row in db_cursor:
+                self.table_phrases_to_delete.add(row[0])
+
     def _load_white_and_black_lists(self):
         """
             Load white and black lists from phrases-settings.json.
@@ -80,7 +131,7 @@ class SpecialPhrasesImporter():
             'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
             'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
             'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
-        return self.config.LANGUAGES or default_languages
+        return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
 
     @staticmethod
     def _get_wiki_content(lang):
@@ -122,12 +173,11 @@ class SpecialPhrasesImporter():
             phrase_class = match[1].strip()
             phrase_type = match[2].strip()
             phrase_operator = match[3].strip()
+            #Needed if some operators in the wiki are not written in English
+            phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
             #hack around a bug where building=yes was imported with quotes into the wiki
             phrase_type = re.sub(r'\"|&quot;', '', phrase_type)
 
-            #sanity check, in case somebody added garbage in the wiki
-            self._check_sanity(lang, phrase_class, phrase_type)
-
             #blacklisting: disallow certain class/type combinations
             if (
                     phrase_class in self.black_list.keys() and
@@ -141,7 +191,22 @@ class SpecialPhrasesImporter():
             ):
                 continue
 
-            #add class/type to the pairs dict
+            #Check if the phrase already exists in the database.
+            if (
+                    (normalized_label, phrase_class, phrase_type, phrase_operator)
+                    in self.words_phrases_to_delete
+            ):
+                #Remove this phrase from the ones to delete as it still exists on the wiki.
+                self.words_phrases_still_exist.add(
+                    (normalized_label, phrase_class, phrase_type, phrase_operator)
+                )
+                class_type_pairs.add((phrase_class, phrase_type))
+                #Don't need to add this phrase as it already exists in the word table.
+                continue
+
+            #sanity check, in case somebody added garbage in the wiki
+            self._check_sanity(lang, phrase_class, phrase_type)
+
             class_type_pairs.add((phrase_class, phrase_type))
 
             self._process_amenity(
@@ -191,6 +256,15 @@ class SpecialPhrasesImporter():
             phrase_class = pair[0]
             phrase_type = pair[1]
 
+            table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+
+            if table_name in self.table_phrases_to_delete:
+                #Remove this table from the ones to delete as it matches a class/type
+                #still existing on the special phrases of the wiki.
+                self.table_phrases_to_delete.remove(table_name)
+                #So we don't need to create the table and indexes.
+                continue
+
             #Table creation
             self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
 
@@ -251,6 +325,41 @@ class SpecialPhrasesImporter():
                               .format(Identifier(table_name),
                                       Identifier(self.config.DATABASE_WEBUSER)))
 
+    def _remove_non_existent_phrases_from_db(self):
+        """
+            Remove special phrases which no longer exist on the wiki.
+            Delete from the word table and delete the place_classtype tables.
+        """
+        LOG.warning('Cleaning database...')
+        self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
+        #Array containing all queries to execute. Contains tuples of format (query, parameters)
+        queries_parameters = []
+
+        #Delete phrases from the word table which are not on the wiki anymore.
+        for phrase_to_delete in self.words_phrases_to_delete:
+            if phrase_to_delete[3] == '-':
+                query = """
+                    DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
+                """
+                parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
+                queries_parameters.append((query, parameters))
+            else:
+                query = """
+                    DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
+                """
+                parameters = (phrase_to_delete[0], phrase_to_delete[1],
+                              phrase_to_delete[2], phrase_to_delete[3], )
+                queries_parameters.append((query, parameters))
+
+        #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
+        for table in self.table_phrases_to_delete:
+            query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
+            queries_parameters.append((query, ()))
+
+        with self.db_connection.cursor() as db_cursor:
+            for query, parameters in queries_parameters:
+                db_cursor.execute(query, parameters)
+
     def _convert_php_settings_if_needed(self, file_path):
         """
             Convert php settings file of special phrases to json file if it is still in php format.
index 9b960e2d54ee82df6f05c921ccb44062f33d9ec6..c655f91d73bf096f0af4fcf7cbc7ce75c4415bd0 100644 (file)
@@ -61,6 +61,20 @@ def handle_threaded_sql_statements(sel, file):
             except Exception as exc: # pylint: disable=broad-except
                 LOG.info('Wrong SQL statement: %s', exc)
 
+def handle_unregister_connection_pool(sel, place_threads):
+    """ Handles unregistering pool of connections
+    """
+
+    while place_threads > 0:
+        for key, _ in sel.select(1):
+            conn = key.data
+            sel.unregister(conn)
+            try:
+                conn.wait()
+            except Exception as exc: # pylint: disable=broad-except
+                LOG.info('Wrong SQL statement: %s', exc)
+            conn.close()
+            place_threads -= 1
 
 def add_tiger_data(dsn, data_dir, threads, config, sqllib_dir):
     """ Import tiger data from directory or tar file
@@ -95,13 +109,7 @@ def add_tiger_data(dsn, data_dir, threads, config, sqllib_dir):
         handle_threaded_sql_statements(sel, file)
 
     # Unregistering pool of database connections
-    while place_threads > 0:
-        for key, _ in sel.select(1):
-            conn = key.data
-            sel.unregister(conn)
-            conn.wait()
-            conn.close()
-            place_threads -= 1
+    handle_unregister_connection_pool(sel, place_threads)
 
     if tar:
         tar.close()
index 352e6e55d79c39e455db9d003cd863f2bc337ad9..9670ea6062b2c7a14971fa39993e16d165f99a1e 100644 (file)
@@ -10,7 +10,7 @@ Version information for Nominatim.
 # and must always be increased when there is a change to the database or code
 # that requires a migration.
 # Released versions always have a database patch level of 0.
-NOMINATIM_VERSION = (3, 6, 0, 1)
+NOMINATIM_VERSION = (3, 7, 0, 1)
 
 POSTGRESQL_REQUIRED_VERSION = (9, 3)
 POSTGIS_REQUIRED_VERSION = (2, 2)
index b77ae10dc1233f0ede3a3535a2924a22b89251db..9ca6bbebd5793d9c872fdbaa22e2454d16dc0d2a 100644 (file)
@@ -2,6 +2,7 @@
     Tests for import special phrases methods
     of the class SpecialPhrasesImporter.
 """
+from mocks import MockParamCapture
 from nominatim.errors import UsageError
 from pathlib import Path
 import tempfile
@@ -11,6 +12,72 @@ from nominatim.tools.special_phrases import SpecialPhrasesImporter
 
 TEST_BASE_DIR = Path(__file__) / '..' / '..'
 
+def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
+                                            temp_db_conn):
+    """
+        Check for the fetch_existing_words_phrases() method.
+        It should return the special phrase term added to the
+        word table.
+    """
+    with temp_db_conn.cursor() as temp_db_cursor:
+        query ="""
+            INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
+            'class', 'type', null, 0, 'near');
+        """
+        temp_db_cursor.execute(query)
+
+    assert not special_phrases_importer.words_phrases_to_delete
+    special_phrases_importer._fetch_existing_words_phrases()
+    contained_phrase = special_phrases_importer.words_phrases_to_delete.pop()
+    assert contained_phrase == ('normalized_word', 'class', 'type', 'near')
+
+def test_fetch_existing_words_phrases_housenumber(special_phrases_importer, word_table,
+                                                  temp_db_conn):
+    """
+        Check for the fetch_existing_words_phrases() method.
+        It should return nothing as the term added corresponds
+        to a housenumber term.
+    """
+    with temp_db_conn.cursor() as temp_db_cursor:
+        query ="""
+            INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
+            'place', 'house', null, 0, 'near');
+        """
+        temp_db_cursor.execute(query)
+
+    special_phrases_importer._fetch_existing_words_phrases()
+    assert not special_phrases_importer.words_phrases_to_delete
+
+def test_fetch_existing_words_phrases_postcode(special_phrases_importer, word_table,
+                                               temp_db_conn):
+    """
+        Check for the fetch_existing_words_phrases() method.
+        It should return nothing as the term added corresponds
+        to a postcode term.
+    """
+    with temp_db_conn.cursor() as temp_db_cursor:
+        query ="""
+            INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
+            'place', 'postcode', null, 0, 'near');
+        """
+        temp_db_cursor.execute(query)
+
+    special_phrases_importer._fetch_existing_words_phrases()
+    assert not special_phrases_importer.words_phrases_to_delete
+
+def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_conn):
+    """
+        Check for the fetch_existing_place_classtype_tables() method.
+        It should return the table just created.
+    """
+    with temp_db_conn.cursor() as temp_db_cursor:
+        query = 'CREATE TABLE place_classtype_testclasstypetable()'
+        temp_db_cursor.execute(query)
+
+    special_phrases_importer._fetch_existing_place_classtype_tables()
+    contained_table = special_phrases_importer.table_phrases_to_delete.pop()
+    assert contained_table == 'place_classtype_testclasstypetable'
+
 def test_check_sanity_class(special_phrases_importer):
     """
         Check for _check_sanity() method.
@@ -80,7 +147,7 @@ def test_convert_settings_giving_json(special_phrases_importer):
     assert returned == json_file
 
 def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
-                                       word_table, temp_db_conn):
+                                       temp_db_conn):
     """
         Test that _process_amenity() execute well the 
         getorcreate_amenityoperator() SQL function and that
@@ -188,13 +255,72 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
     assert check_amenities_without_op(temp_db_conn)
     assert results[class_test] and type_test in results.values()
 
+def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
+                                             temp_db_conn):
+    """
+        Check for the remove_non_existent_phrases_from_db() method.
+
+        It should remove entries from the word table which are contained
+        in the words_phrases_to_delete set and not those also contained
+        in the words_phrases_still_exist set.
+
+        place_classtype tables contained in table_phrases_to_delete should
+        be deleted.
+    """
+    with temp_db_conn.cursor() as temp_db_cursor:
+        to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near')
+        to_keep_phrase_tuple = (
+            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
+        )
+        special_phrases_importer.words_phrases_to_delete = {
+            to_delete_phrase_tuple,
+            to_keep_phrase_tuple
+        }
+        special_phrases_importer.words_phrases_still_exist = {
+            to_keep_phrase_tuple
+        }
+        special_phrases_importer.table_phrases_to_delete = {
+            'place_classtype_testclasstypetable_to_delete'
+        }
+
+        query_words = 'SELECT word, class, type, operator FROM word;'
+        query_tables = """
+            SELECT table_name
+            FROM information_schema.tables
+            WHERE table_schema='public'
+            AND table_name like 'place_classtype_%';
+        """
+
+        special_phrases_importer._remove_non_existent_phrases_from_db()
+
+        temp_db_cursor.execute(query_words)
+        words_result = temp_db_cursor.fetchall()
+        temp_db_cursor.execute(query_tables)
+        tables_result = temp_db_cursor.fetchall()
+        assert len(words_result) == 1 and words_result[0] == [
+            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
+        ]
+        assert (len(tables_result) == 1 and
+            tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
+        )
+
 def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table, 
-                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
     """
        Check that the main import_from_wiki() method is well executed.
        It should create the place_classtype table, the place_id and centroid indexes,
        grant access to the web user and execute the SQL functions for amenities.
     """
+    mock_fetch_existing_words_phrases = MockParamCapture()
+    mock_fetch_existing_place_classtype_tables = MockParamCapture()
+    mock_remove_non_existent_phrases_from_db = MockParamCapture()
+
+    monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_words_phrases',
+                        mock_fetch_existing_words_phrases)
+    monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_place_classtype_tables',
+                        mock_fetch_existing_place_classtype_tables)
+    monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._remove_non_existent_phrases_from_db',
+                        mock_remove_non_existent_phrases_from_db)
     monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
     special_phrases_importer.import_from_wiki(['en'])
 
@@ -206,6 +332,9 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
     assert check_amenities_with_op(temp_db_conn)
     assert check_amenities_without_op(temp_db_conn)
+    assert mock_fetch_existing_words_phrases.called == 1
+    assert mock_fetch_existing_place_classtype_tables.called == 1
+    assert mock_remove_non_existent_phrases_from_db.called == 1
 
 def mock_get_wiki_content(lang):
     """
@@ -305,6 +434,18 @@ def temp_phplib_dir_with_migration():
 
         yield Path(phpdir)
 
+@pytest.fixture
+def default_phrases(word_table, temp_db_cursor):
+    temp_db_cursor.execute("""
+        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
+        'class', 'type', null, 0, 'near');
+
+        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
+        'class_exists', 'type_exists', null, 0, 'near');
+
+        CREATE TABLE place_classtype_testclasstypetable_to_delete();
+        CREATE TABLE place_classtype_testclasstypetable_to_keep();""")
+
 @pytest.fixture
 def make_strandard_name_func(temp_db_cursor):
     temp_db_cursor.execute("""
index 5029a1320c9d36c5df426660a2e84e143c9d0024..1366fe3ec06327997ef90598c86ef5ecda051450 100644 (file)
@@ -16,11 +16,24 @@ def test_add_tiger_data(dsn, src_dir, def_config, tmp_path, sql_preprocessor,
     temp_db_cursor.execute('CREATE EXTENSION postgis')
     temp_db_cursor.execute('CREATE TABLE place (id INT)')
     sqlfile = tmp_path / '1010.sql'
-    sqlfile.write_text("""INSERT INTO place values (1)""")
+    sqlfile.write_text("""INSERT INTO place values (1);
+                          INSERT INTO non_existant_table values (1);""")
     tiger_data.add_tiger_data(dsn, str(tmp_path), threads, def_config, src_dir / 'lib-sql')
 
     assert temp_db_cursor.table_rows('place') == 1
 
+@pytest.mark.parametrize("threads", (1, 5))
+def test_add_tiger_data_bad_file(dsn, src_dir, def_config, tmp_path, sql_preprocessor,
+                        temp_db_cursor, threads, temp_db):
+    temp_db_cursor.execute('CREATE EXTENSION hstore')
+    temp_db_cursor.execute('CREATE EXTENSION postgis')
+    temp_db_cursor.execute('CREATE TABLE place (id INT)')
+    sqlfile = tmp_path / '1010.txt'
+    sqlfile.write_text("""Random text""")
+    tiger_data.add_tiger_data(dsn, str(tmp_path), threads, def_config, src_dir / 'lib-sql')
+
+    assert temp_db_cursor.table_rows('place') == 0
+
 @pytest.mark.parametrize("threads", (1, 5))
 def test_add_tiger_data_tarfile(dsn, src_dir, def_config, tmp_path,
                         temp_db_cursor, threads, temp_db, sql_preprocessor):
@@ -28,10 +41,26 @@ def test_add_tiger_data_tarfile(dsn, src_dir, def_config, tmp_path,
     temp_db_cursor.execute('CREATE EXTENSION postgis')
     temp_db_cursor.execute('CREATE TABLE place (id INT)')
     sqlfile = tmp_path / '1010.sql'
-    sqlfile.write_text("""INSERT INTO place values (1)""")
+    sqlfile.write_text("""INSERT INTO place values (1);
+                          INSERT INTO non_existant_table values (1);""")
+    tar = tarfile.open("sample.tar.gz", "w:gz")
+    tar.add(sqlfile)
+    tar.close()
+    tiger_data.add_tiger_data(dsn, str(src_dir / 'sample.tar.gz'), threads, def_config, src_dir / 'lib-sql')
+    
+    assert temp_db_cursor.table_rows('place') == 1
+
+@pytest.mark.parametrize("threads", (1, 5))
+def test_add_tiger_data_bad_tarfile(dsn, src_dir, def_config, tmp_path,
+                        temp_db_cursor, threads, temp_db, sql_preprocessor):
+    temp_db_cursor.execute('CREATE EXTENSION hstore')
+    temp_db_cursor.execute('CREATE EXTENSION postgis')
+    temp_db_cursor.execute('CREATE TABLE place (id INT)')
+    sqlfile = tmp_path / '1010.txt'
+    sqlfile.write_text("""Random text""")
     tar = tarfile.open("sample.tar.gz", "w:gz")
     tar.add(sqlfile)
     tar.close()
     tiger_data.add_tiger_data(dsn, str(src_dir / 'sample.tar.gz'), threads, def_config, src_dir / 'lib-sql')
     
-    assert temp_db_cursor.table_rows('place') == 1
\ No newline at end of file
+    assert temp_db_cursor.table_rows('place') == 0
\ No newline at end of file
index fae961f1795afd94aac6b7348ff660a4d368e07e..32cd3a308962fdf4d8e8a6b183a4efcc23c51fcf 100755 (executable)
@@ -145,7 +145,7 @@ fi                                 #DOCS:
 # -------------------------------
 #
 # The webserver should serve the php scripts from the website directory of your
-# [project directory](../admin/import.md#creating-the-project-directory).
+# [project directory](../admin/Import.md#creating-the-project-directory).
 # Therefore set up a project directory and populate the website directory:
 #
     mkdir $USERHOME/nominatim-project
index ac3a1b6087f08b09cc4877675a327a3071bb4dca..1e028b65ee3bd9b8676b020b68915df9cf01e7a5 100755 (executable)
@@ -139,7 +139,7 @@ fi                                 #DOCS:
 # -------------------------------
 #
 # The webserver should serve the php scripts from the website directory of your
-# [project directory](../admin/import.md#creating-the-project-directory).
+# [project directory](../admin/Import.md#creating-the-project-directory).
 # Therefore set up a project directory and populate the website directory:
 #
     mkdir $USERHOME/nominatim-project
index 9fda122ee8f969ada75738321053aee0f70d2118..36e28ca1071fb6b43c85bb0d8ef9273d21e472e0 100755 (executable)
@@ -133,7 +133,7 @@ fi                                 #DOCS:
 # ======================
 #
 # The webserver should serve the php scripts from the website directory of your
-# [project directory](../admin/import.md#creating-the-project-directory).
+# [project directory](../admin/Import.md#creating-the-project-directory).
 # Therefore set up a project directory and populate the website directory:
 
     mkdir $USERHOME/nominatim-project
index 292947d179ec91ecd12a265e852c98be5711a215..1e15f850c5c24bb90170e7ee3c8fe5fd3984c1a6 100755 (executable)
@@ -130,7 +130,7 @@ fi                                 #DOCS:
 # ======================
 #
 # The webserver should serve the php scripts from the website directory of your
-# [project directory](../admin/import.md#creating-the-project-directory).
+# [project directory](../admin/Import.md#creating-the-project-directory).
 # Therefore set up a project directory and populate the website directory:
 
     mkdir $USERHOME/nominatim-project