From: Sarah Hoffmann Date: Thu, 15 Apr 2021 08:17:45 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~171 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/0d567291d668dcb912375d0e8ec779525c7c0bba?hp=e8caf8d78d046008b7a951f6b1ba5d015dafb6bc Merge remote-tracking branch 'upstream/master' --- diff --git a/.github/actions/setup-postgresql/action.yml b/.github/actions/setup-postgresql/action.yml index 0742e663..060a6789 100644 --- a/.github/actions/setup-postgresql/action.yml +++ b/.github/actions/setup-postgresql/action.yml @@ -14,8 +14,10 @@ runs: steps: - name: Remove existing PostgreSQL run: | - sudo apt-get update -qq sudo apt-get purge -yq postgresql* + sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' + sudo apt-get update -qq + shell: bash - name: Install PostgreSQL diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 2f920a66..a1a4344a 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -25,7 +25,7 @@ jobs: uses: shivammathur/setup-php@v2 with: php-version: '7.4' - tools: phpunit, phpcs + tools: phpunit, phpcs, composer - name: Get Date id: get-date @@ -46,7 +46,7 @@ jobs: - uses: ./Nominatim/.github/actions/build-nominatim - name: Install test prerequsites - run: sudo apt-get install -y -qq php-codesniffer pylint python3-pytest python3-behave + run: sudo apt-get install -y -qq php-codesniffer pylint python3-pytest python3-behave python3-pytest-cov php-codecoverage php-xdebug - name: PHP linting run: phpcs --report-width=120 . @@ -57,17 +57,30 @@ jobs: working-directory: Nominatim - name: PHP unit tests - run: phpunit ./ + run: phpunit --coverage-clover ../../coverage-php.xml ./ working-directory: Nominatim/test/php - name: Python unit tests - run: py.test-3 test/python + run: py.test-3 --cov=nominatim --cov-report=xml test/python working-directory: Nominatim - name: BDD tests - run: behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build --format=progress3 + run: | + behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build --format=progress3 -DPHPCOV=./cov + composer require phpunit/phpcov:7.0.2 + vendor/bin/phpcov merge --clover ../../coverage-bdd.xml ./cov working-directory: Nominatim/test/bdd + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + files: ./Nominatim/coverage*.xml + directory: ./ + name: codecov-umbrella + fail_ci_if_error: true + path_to_write_report: ./coverage/codecov_report.txt + verbose: true + import: runs-on: ubuntu-20.04 diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c2d7bb3..33b6ae64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") project(nominatim) set(NOMINATIM_VERSION_MAJOR 3) -set(NOMINATIM_VERSION_MINOR 6) +set(NOMINATIM_VERSION_MINOR 7) set(NOMINATIM_VERSION_PATCH 0) set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}") diff --git a/ChangeLog b/ChangeLog index 633d0c53..4d66ee06 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +3.7.0 + + * switch to dotenv for configuration file + * introduce 'make install' (reorganising most of the code) + * introduce nominatim tool as replacement for various php scripts + * introduce project directories and allow multiple installations from same build + * clean up BDD tests: drop nose, reorganise step code + * simplify test database for API 
BDD tests and autoinstall database + * port most of the code for command-line tools to Python + (thanks to @darkshredder and @AntoJvlt) + * add tests for all tooling + * replace pyosmium-get-changes with custom internal implementation using + pyosmium + * improve search for queries with housenumber and partial terms + * add database versioning + * use jinja2 for preprocessing SQL files + * introduce automatic migrations + * reverse fix preference of interpolations over housenumbers + * parallelize indexing of postcodes + * add non-key indexes to speed up housenumber + street searches + * switch housenumber field in placex to save transliterated names + + 3.6.0 * add full support for searching by and displaying of addr:* tags diff --git a/README.md b/README.md index 6fd0cd45..b643088b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ [![Build Status](https://github.com/osm-search/Nominatim/workflows/CI%20Tests/badge.svg)](https://github.com/osm-search/Nominatim/actions?query=workflow%3A%22CI+Tests%22) +[![codecov](https://codecov.io/gh/osm-search/Nominatim/branch/master/graph/badge.svg?token=8P1LXrhCMy)](https://codecov.io/gh/osm-search/Nominatim) Nominatim ========= diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..0404bdb5 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,14 @@ +codecov: + require_ci_to_pass: yes + +coverage: + status: + project: off + patch: off + +comment: + require_changes: true + after_n_builds: 2 + +fixes: + - "Nominatim/::" \ No newline at end of file diff --git a/docs/admin/Import.md b/docs/admin/Import.md index e3a32481..6b417ff2 100644 --- a/docs/admin/Import.md +++ b/docs/admin/Import.md @@ -40,9 +40,9 @@ all commands from the project directory. ### Configuration setup in `.env` -The Nominatim server can be customized via a `.env` in the project directory. -This is a file in [dotenv](https://github.com/theskumar/python-dotenv) format -which looks the same as variable settings in a standard shell environment. +The Nominatim server can be customized via an `.env` configuration file in the +project directory. This is a file in [dotenv](https://github.com/theskumar/python-dotenv) +format which looks the same as variable settings in a standard shell environment. You can also set the same configuration via environment variables. All settings have a `NOMINATIM_` prefix to avoid conflicts with other environment variables. @@ -283,16 +283,14 @@ address set to complement the OSM house number data in the US. You can add TIGER data to your own Nominatim instance by following these steps. The entire US adds about 10GB to your database. - 1. Get preprocessed TIGER 2020 data and unpack it into your project - directory: + 1. Get preprocessed TIGER 2020 data: cd $PROJECT_DIR wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.tar.gz - tar xf tiger2020-nominatim-preprocessed.tar.gz 2. Import the data into your Nominatim database: - nominatim add-data --tiger-data tiger + nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.tar.gz 3. Enable use of the Tiger data in your `.env` by adding: diff --git a/docs/admin/Migration.md b/docs/admin/Migration.md index 52970a33..0ca6ebf2 100644 --- a/docs/admin/Migration.md +++ b/docs/admin/Migration.md @@ -4,18 +4,29 @@ Since version 3.7.0 Nominatim offers automatic migrations. 
Please follow the following steps: * stop any updates that are potentially running -* update Nominatim to the nwer version -* goto your project directory and run `nominatim admin --migrate` +* update Nominatim to the newer version +* go to your project directory and run `nominatim admin --migrate` * (optionally) restart updates Below you find additional migrations and hints about other structural and -breaking changes. +breaking changes. **Please read them before running the migration.** !!! note If you are migrating from a version <3.6, then you still have to follow the manual migration steps up to 3.6. -## 3.6.0 -> master +## 3.6.0 -> 3.7.0 + +### New format and name of configuration file + +The configuration for an import is now saved in a `.env` file in the project +directory. This file follows the dotenv format. For more information, see +the [installation chapter](Import.md#configuration-setup-in-env). + +To migrate to the new system, create a new project directory, add the `.env` +file and port your custom configuration from `settings/local.php`. Most +settings are named similar and only have received a `NOMINATIM_` prefix. +Use the default settings in `settings/env.defaults` as a reference. ### New location for data files @@ -45,6 +56,12 @@ Try `nominatim --help` for more information about each subcommand. `./utils/query.php` no longer exists in its old form. `nominatim search` provides a replacement but returns different output. +### Switch to normalized house numbers + +The housenumber column in the placex table uses now normalized version. +The automatic migration step will convert the column but this may take a +very long time. It is advisable to take the machine offline while doing that. + ## 3.5.0 -> 3.6.0 ### Change of layout of search_name_* tables diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php index f638af9a..ec6876fa 100644 --- a/lib-php/Geocode.php +++ b/lib-php/Geocode.php @@ -18,7 +18,6 @@ class Geocode protected $aLangPrefOrder = array(); protected $aExcludePlaceIDs = array(); - protected $bReverseInPlan = true; protected $iLimit = 20; protected $iFinalLimit = 10; @@ -61,11 +60,6 @@ class Geocode return $this->oNormalizer->transliterate($sTerm); } - public function setReverseInPlan($bReverse) - { - $this->bReverseInPlan = $bReverse; - } - public function setLanguagePreference($aLangPref) { $this->aLangPrefOrder = $aLangPref; @@ -262,7 +256,6 @@ class Geocode $oParams->getString('country'), $oParams->getString('postalcode') ); - $this->setReverseInPlan(false); } else { $this->setQuery($sQuery); } @@ -330,7 +323,7 @@ class Geocode return false; } - public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bIsStructured) + public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens) { /* Calculate all searches using oValidTokens i.e. @@ -345,7 +338,7 @@ class Geocode */ foreach ($aPhrases as $iPhrase => $oPhrase) { $aNewPhraseSearches = array(); - $sPhraseType = $bIsStructured ? 
$oPhrase->getPhraseType() : ''; + $sPhraseType = $oPhrase->getPhraseType(); foreach ($oPhrase->getWordSets() as $aWordset) { $aWordsetSearches = $aSearches; @@ -388,7 +381,7 @@ class Geocode $aNewSearches = $oCurrentSearch->extendWithPartialTerm( $sToken, $oSearchTerm, - $bIsStructured, + (bool) $sPhraseType, $iPhrase, $oValidTokens->get(' '.$sToken) ); @@ -607,10 +600,8 @@ class Geocode // Commas are used to reduce the search space by indicating where phrases split if ($this->aStructuredQuery) { $aInPhrases = $this->aStructuredQuery; - $bStructuredPhrases = true; } else { $aInPhrases = explode(',', $sQuery); - $bStructuredPhrases = false; } Debug::printDebugArray('Search context', $oCtx); @@ -684,9 +675,9 @@ class Geocode Debug::newSection('Search candidates'); - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens); - if ($this->bReverseInPlan) { + if (!$this->aStructuredQuery) { // Reverse phrase array and also reverse the order of the wordsets in // the first and final phrase. Don't bother about phrases in the middle // because order in the address doesn't matter. @@ -695,7 +686,7 @@ class Geocode if (count($aPhrases) > 1) { $aPhrases[count($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, false); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { @@ -999,7 +990,6 @@ class Geocode 'Structured query' => $this->aStructuredQuery, 'Name keys' => Debug::fmtArrayVals($this->aLangPrefOrder), 'Excluded place IDs' => Debug::fmtArrayVals($this->aExcludePlaceIDs), - 'Try reversed query'=> $this->bReverseInPlan, 'Limit (for searches)' => $this->iLimit, 'Limit (for results)'=> $this->iFinalLimit, 'Country codes' => Debug::fmtArrayVals($this->aCountryCodes), diff --git a/lib-php/admin/query.php b/lib-php/admin/query.php index beb2f9ef..35fd1184 100644 --- a/lib-php/admin/query.php +++ b/lib-php/admin/query.php @@ -79,7 +79,6 @@ if (!$oParams->hasSetAny($aSearchParams)) { $oGeocode = new Nominatim\Geocode($oDB); $oGeocode->setLanguagePreference($oParams->getPreferredLanguages(false)); -$oGeocode->setReverseInPlan(true); $oGeocode->loadParamArray($oParams); if ($oParams->getBool('search')) { diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 6998224e..812bc79f 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -169,7 +169,7 @@ BEGIN END IF; IF bnd.name ? 'name' THEN - bnd_name := make_standard_name(bnd.name->'name'); + bnd_name := lower(bnd.name->'name'); IF bnd_name = '' THEN bnd_name := NULL; END IF; @@ -180,12 +180,14 @@ BEGIN IF bnd.extratags ? 
'place' and bnd_name is not null THEN FOR linked_placex IN SELECT * FROM placex - WHERE make_standard_name(name->'name') = bnd_name + WHERE (position(lower(name->'name') in bnd_name) > 0 + OR position(bnd_name in lower(name->'name')) > 0) AND placex.class = 'place' AND placex.type = bnd.extratags->'place' AND placex.osm_type = 'N' AND placex.linked_place_id is null AND placex.rank_search < 26 -- needed to select the right index - AND _st_covers(bnd.geometry, placex.geometry) + AND placex.type != 'postcode' + AND ST_Covers(bnd.geometry, placex.geometry) LOOP {% if debug %}RAISE WARNING 'Found type-matching place node %', linked_placex.osm_id;{% endif %} RETURN linked_placex; @@ -201,7 +203,7 @@ BEGIN AND placex.linked_place_id is null AND placex.rank_search < 26 AND _st_covers(bnd.geometry, placex.geometry) - ORDER BY make_standard_name(name->'name') = bnd_name desc + ORDER BY lower(name->'name') = bnd_name desc LOOP {% if debug %}RAISE WARNING 'Found wikidata-matching place node %', linked_placex.osm_id;{% endif %} RETURN linked_placex; @@ -213,7 +215,7 @@ BEGIN {% if debug %}RAISE WARNING 'Looking for nodes with matching names';{% endif %} FOR linked_placex IN SELECT placex.* from placex - WHERE make_standard_name(name->'name') = bnd_name + WHERE lower(name->'name') = bnd_name AND ((bnd.rank_address > 0 and bnd.rank_address = (compute_place_rank(placex.country_code, 'N', placex.class, @@ -221,9 +223,11 @@ BEGIN false, placex.postcode)).address_rank) OR (bnd.rank_address = 0 and placex.rank_search = bnd.rank_search)) AND placex.osm_type = 'N' + AND placex.class = 'place' AND placex.linked_place_id is null AND placex.rank_search < 26 -- needed to select the right index - AND _st_covers(bnd.geometry, placex.geometry) + AND placex.type != 'postcode' + AND ST_Covers(bnd.geometry, placex.geometry) LOOP {% if debug %}RAISE WARNING 'Found matching place node %', linked_placex.osm_id;{% endif %} RETURN linked_placex; diff --git a/lib-sql/indices.sql b/lib-sql/indices.sql index c121a963..a6f7cf95 100644 --- a/lib-sql/indices.sql +++ b/lib-sql/indices.sql @@ -23,12 +23,6 @@ CREATE INDEX {{sql.if_index_not_exists}} idx_placex_geometry_reverse_lookupPolyg AND rank_address between 4 and 25 AND type != 'postcode' AND name is not null AND indexed_status = 0 AND linked_place_id is null; -CREATE INDEX {{sql.if_index_not_exists}} idx_placex_geometry_reverse_placeNode - ON placex USING gist (geometry) {{db.tablespace.search_index}} - WHERE osm_type = 'N' AND rank_search between 5 and 25 - AND class = 'place' AND type != 'postcode' - AND name is not null AND indexed_status = 0 AND linked_place_id is null; - CREATE INDEX {{sql.if_index_not_exists}} idx_osmline_parent_place_id ON location_property_osmline USING BTREE (parent_place_id) {{db.tablespace.search_index}}; diff --git a/lib-sql/tables.sql b/lib-sql/tables.sql index 329eb7a1..aa213dba 100644 --- a/lib-sql/tables.sql +++ b/lib-sql/tables.sql @@ -184,7 +184,10 @@ CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tabl CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL; CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}}; CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}}; -CREATE INDEX idx_placex_adminname on placex USING BTREE (make_standard_name(name->'name')) {{db.tablespace.address_index}} WHERE osm_type='N' and 
rank_search < 26; +CREATE INDEX idx_placex_geometry_placenode ON placex + USING GIST (geometry) {{db.tablespace.search_index}} + WHERE osm_type = 'N' and rank_search < 26 + and class = 'place' and type != 'postcode' and linked_place_id is null; CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26; DROP SEQUENCE IF EXISTS seq_place; diff --git a/nominatim/tools/check_database.py b/nominatim/tools/check_database.py index d8ab08cc..5b39085d 100644 --- a/nominatim/tools/check_database.py +++ b/nominatim/tools/check_database.py @@ -84,7 +84,8 @@ def _get_indexes(conn): 'idx_placex_rank_address', 'idx_placex_parent_place_id', 'idx_placex_geometry_reverse_lookuppolygon', - 'idx_placex_geometry_reverse_placenode', + 'idx_placex_geometry_placenode', + 'idx_placex_housenumber', 'idx_osmline_parent_place_id', 'idx_osmline_parent_osm_id', 'idx_postcode_id', diff --git a/nominatim/tools/migration.py b/nominatim/tools/migration.py index b5f0b80e..54848341 100644 --- a/nominatim/tools/migration.py +++ b/nominatim/tools/migration.py @@ -156,3 +156,20 @@ def change_housenumber_transliteration(conn, **_): cur.execute("""UPDATE placex SET housenumber = create_housenumber_id(housenumber) WHERE housenumber is not null""") + + +@_migration(3, 7, 0, 0) +def switch_placenode_geometry_index(conn, **_): + """ Replace idx_placex_geometry_reverse_placeNode index. + + Make the index slightly more permissive, so that it can also be used + when matching up boundaries and place nodes. It makes the index + idx_placex_adminname index unnecessary. + """ + with conn.cursor() as cur: + cur.execute(""" CREATE INDEX IF NOT EXISTS idx_placex_geometry_placenode ON placex + USING GIST (geometry) + WHERE osm_type = 'N' and rank_search < 26 + and class = 'place' and type != 'postcode' + and linked_place_id is null""") + cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """) diff --git a/nominatim/tools/special_phrases.py b/nominatim/tools/special_phrases.py index fd46a18d..0c1258fe 100644 --- a/nominatim/tools/special_phrases.py +++ b/nominatim/tools/special_phrases.py @@ -32,6 +32,17 @@ class SpecialPhrasesImporter(): self.sanity_check_pattern = re.compile(r'^\w+$') self.transliterator = Transliterator.createFromRules("special-phrases normalizer", self.config.TERM_NORMALIZATION) + #This set will contain all existing phrases from the word table which + #no longer exist on the wiki. + #It contain tuples with the following format: (normalized_word, class, type, operator) + self.words_phrases_to_delete = set() + #This set will contain the phrases which still exist from the wiki. + #It is used to prevent duplicates on the wiki by removing them from + #the word_phrases_to_delete only at the end. + self.words_phrases_still_exist = set() + #This set will contain all existing place_classtype tables which doesn't match any + #special phrases class/type on the wiki. + self.table_phrases_to_delete = set() def import_from_wiki(self, languages=None): """ @@ -41,6 +52,9 @@ class SpecialPhrasesImporter(): if languages is not None and not isinstance(languages, list): raise TypeError('The \'languages\' argument should be of type list.') + self._fetch_existing_words_phrases() + self._fetch_existing_place_classtype_tables() + #Get all languages to process. 
languages = self._load_languages() if not languages else languages @@ -53,9 +67,46 @@ class SpecialPhrasesImporter(): class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang)) self._create_place_classtype_table_and_indexes(class_type_pairs) + self._remove_non_existent_phrases_from_db() self.db_connection.commit() LOG.warning('Import done.') + def _fetch_existing_words_phrases(self): + """ + Fetch existing special phrases from the word table. + Fill the word_phrases_to_delete set of the class. + """ + #Only extract special phrases terms: + #If class=place and type=house then it is a housenumber term. + #If class=place and type=postcode then it is a postcode term. + word_query = """ + SELECT word, class, type, operator FROM word + WHERE class != 'place' OR (type != 'house' AND type != 'postcode') + """ + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(word_query)) + for row in db_cursor: + row[3] = '-' if row[3] is None else row[3] + self.words_phrases_to_delete.add( + (row[0], row[1], row[2], row[3]) + ) + + def _fetch_existing_place_classtype_tables(self): + """ + Fetch existing place_classtype tables. + Fill the table_phrases_to_delete set of the class. + """ + query = """ + SELECT table_name + FROM information_schema.tables + WHERE table_schema='public' + AND table_name like 'place_classtype_%'; + """ + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(query)) + for row in db_cursor: + self.table_phrases_to_delete.add(row[0]) + def _load_white_and_black_lists(self): """ Load white and black lists from phrases-settings.json. @@ -80,7 +131,7 @@ class SpecialPhrasesImporter(): 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] - return self.config.LANGUAGES or default_languages + return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages @staticmethod def _get_wiki_content(lang): @@ -122,12 +173,11 @@ class SpecialPhrasesImporter(): phrase_class = match[1].strip() phrase_type = match[2].strip() phrase_operator = match[3].strip() + #Needed if some operator in the wiki are not written in english + phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator #hack around a bug where building=yes was imported with quotes into the wiki phrase_type = re.sub(r'\"|"', '', phrase_type) - #sanity check, in case somebody added garbage in the wiki - self._check_sanity(lang, phrase_class, phrase_type) - #blacklisting: disallow certain class/type combinations if ( phrase_class in self.black_list.keys() and @@ -141,7 +191,22 @@ class SpecialPhrasesImporter(): ): continue - #add class/type to the pairs dict + #Check if the phrase already exists in the database. + if ( + (normalized_label, phrase_class, phrase_type, phrase_operator) + in self.words_phrases_to_delete + ): + #Remove this phrase from the ones to delete as it still exist on the wiki. + self.words_phrases_still_exist.add( + (normalized_label, phrase_class, phrase_type, phrase_operator) + ) + class_type_pairs.add((phrase_class, phrase_type)) + #Dont need to add this phrase as it already exists in the word table. 
+ continue + + #sanity check, in case somebody added garbage in the wiki + self._check_sanity(lang, phrase_class, phrase_type) + class_type_pairs.add((phrase_class, phrase_type)) self._process_amenity( @@ -191,6 +256,15 @@ class SpecialPhrasesImporter(): phrase_class = pair[0] phrase_type = pair[1] + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + + if table_name in self.table_phrases_to_delete: + #Remove this table from the ones to delete as it match a class/type + #still existing on the special phrases of the wiki. + self.table_phrases_to_delete.remove(table_name) + #So dont need to create the table and indexes. + continue + #Table creation self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type) @@ -251,6 +325,41 @@ class SpecialPhrasesImporter(): .format(Identifier(table_name), Identifier(self.config.DATABASE_WEBUSER))) + def _remove_non_existent_phrases_from_db(self): + """ + Remove special phrases which doesn't exist on the wiki anymore. + Delete from the word table and delete the place_classtype tables. + """ + LOG.warning('Cleaning database...') + self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist + #Array containing all queries to execute. Contain tuples of format (query, parameters) + queries_parameters = [] + + #Delete phrases from the word table which are not on the wiki anymore. + for phrase_to_delete in self.words_phrases_to_delete: + if phrase_to_delete[3] == '-': + query = """ + DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null + """ + parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], ) + queries_parameters.append((query, parameters)) + else: + query = """ + DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s + """ + parameters = (phrase_to_delete[0], phrase_to_delete[1], + phrase_to_delete[2], phrase_to_delete[3], ) + queries_parameters.append((query, parameters)) + + #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore + for table in self.table_phrases_to_delete: + query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table)) + queries_parameters.append((query, ())) + + with self.db_connection.cursor() as db_cursor: + for query, parameters in queries_parameters: + db_cursor.execute(query, parameters) + def _convert_php_settings_if_needed(self, file_path): """ Convert php settings file of special phrases to json file if it is still in php format. 
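The cleanup added to SpecialPhrasesImporter above boils down to a handful of plain SQL statements. As a minimal sketch (the values 'pub', 'amenity' and 'near' are hypothetical placeholders, not taken from the patch), the generated statements look like this:

    -- phrase removed from the wiki, originally imported with an operator
    DELETE FROM word
     WHERE word = 'pub' AND class = 'amenity' AND type = 'pub' AND operator = 'near';

    -- phrase removed from the wiki, originally imported without an operator
    DELETE FROM word
     WHERE word = 'pub' AND class = 'amenity' AND type = 'pub' AND operator IS null;

    -- class/type combination that no longer appears on the wiki at all
    DROP TABLE IF EXISTS place_classtype_amenity_pub;

Phrases still found on the wiki are collected in words_phrases_still_exist and subtracted from words_phrases_to_delete before these statements run, so entries that merely remained unchanged are left untouched.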
diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py index 9b960e2d..c655f91d 100644 --- a/nominatim/tools/tiger_data.py +++ b/nominatim/tools/tiger_data.py @@ -61,6 +61,20 @@ def handle_threaded_sql_statements(sel, file): except Exception as exc: # pylint: disable=broad-except LOG.info('Wrong SQL statement: %s', exc) +def handle_unregister_connection_pool(sel, place_threads): + """ Handles unregistering pool of connections + """ + + while place_threads > 0: + for key, _ in sel.select(1): + conn = key.data + sel.unregister(conn) + try: + conn.wait() + except Exception as exc: # pylint: disable=broad-except + LOG.info('Wrong SQL statement: %s', exc) + conn.close() + place_threads -= 1 def add_tiger_data(dsn, data_dir, threads, config, sqllib_dir): """ Import tiger data from directory or tar file @@ -95,13 +109,7 @@ def add_tiger_data(dsn, data_dir, threads, config, sqllib_dir): handle_threaded_sql_statements(sel, file) # Unregistering pool of database connections - while place_threads > 0: - for key, _ in sel.select(1): - conn = key.data - sel.unregister(conn) - conn.wait() - conn.close() - place_threads -= 1 + handle_unregister_connection_pool(sel, place_threads) if tar: tar.close() diff --git a/nominatim/version.py b/nominatim/version.py index 352e6e55..9670ea60 100644 --- a/nominatim/version.py +++ b/nominatim/version.py @@ -10,7 +10,7 @@ Version information for Nominatim. # and must always be increased when there is a change to the database or code # that requires a migration. # Released versions always have a database patch level of 0. -NOMINATIM_VERSION = (3, 6, 0, 1) +NOMINATIM_VERSION = (3, 7, 0, 1) POSTGRESQL_REQUIRED_VERSION = (9, 3) POSTGIS_REQUIRED_VERSION = (2, 2) diff --git a/test/python/test_tools_import_special_phrases.py b/test/python/test_tools_import_special_phrases.py index b77ae10d..9ca6bbeb 100644 --- a/test/python/test_tools_import_special_phrases.py +++ b/test/python/test_tools_import_special_phrases.py @@ -2,6 +2,7 @@ Tests for import special phrases methods of the class SpecialPhrasesImporter. """ +from mocks import MockParamCapture from nominatim.errors import UsageError from pathlib import Path import tempfile @@ -11,6 +12,72 @@ from nominatim.tools.special_phrases import SpecialPhrasesImporter TEST_BASE_DIR = Path(__file__) / '..' / '..' +def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table, + temp_db_conn): + """ + Check for the fetch_existing_words_phrases() method. + It should return special phrase term added to the word + table. + """ + with temp_db_conn.cursor() as temp_db_cursor: + query =""" + INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word', + 'class', 'type', null, 0, 'near'); + """ + temp_db_cursor.execute(query) + + assert not special_phrases_importer.words_phrases_to_delete + special_phrases_importer._fetch_existing_words_phrases() + contained_phrase = special_phrases_importer.words_phrases_to_delete.pop() + assert contained_phrase == ('normalized_word', 'class', 'type', 'near') + +def test_fetch_existing_words_phrases_housenumber(special_phrases_importer, word_table, + temp_db_conn): + """ + Check for the fetch_existing_words_phrases() method. + It should return nothing as the term added correspond + to a housenumber term. 
+ """ + with temp_db_conn.cursor() as temp_db_cursor: + query =""" + INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word', + 'place', 'house', null, 0, 'near'); + """ + temp_db_cursor.execute(query) + + special_phrases_importer._fetch_existing_words_phrases() + assert not special_phrases_importer.words_phrases_to_delete + +def test_fetch_existing_words_phrases_postcode(special_phrases_importer, word_table, + temp_db_conn): + """ + Check for the fetch_existing_words_phrases() method. + It should return nothing as the term added correspond + to a postcode term. + """ + with temp_db_conn.cursor() as temp_db_cursor: + query =""" + INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word', + 'place', 'postcode', null, 0, 'near'); + """ + temp_db_cursor.execute(query) + + special_phrases_importer._fetch_existing_words_phrases() + assert not special_phrases_importer.words_phrases_to_delete + +def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_conn): + """ + Check for the fetch_existing_place_classtype_tables() method. + It should return the table just created. + """ + with temp_db_conn.cursor() as temp_db_cursor: + query = 'CREATE TABLE place_classtype_testclasstypetable()' + temp_db_cursor.execute(query) + + special_phrases_importer._fetch_existing_place_classtype_tables() + contained_table = special_phrases_importer.table_phrases_to_delete.pop() + assert contained_table == 'place_classtype_testclasstypetable' + def test_check_sanity_class(special_phrases_importer): """ Check for _check_sanity() method. @@ -80,7 +147,7 @@ def test_convert_settings_giving_json(special_phrases_importer): assert returned == json_file def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs, - word_table, temp_db_conn): + temp_db_conn): """ Test that _process_amenity() execute well the getorcreate_amenityoperator() SQL function and that @@ -188,13 +255,72 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, assert check_amenities_without_op(temp_db_conn) assert results[class_test] and type_test in results.values() +def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases, + temp_db_conn): + """ + Check for the remove_non_existent_phrases_from_db() method. + + It should removed entries from the word table which are contained + in the words_phrases_to_delete set and not those also contained + in the words_phrases_still_exist set. + + place_classtype tables contained in table_phrases_to_delete should + be deleted. 
+ """ + with temp_db_conn.cursor() as temp_db_cursor: + to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near') + to_keep_phrase_tuple = ( + 'normalized_word_exists', 'class_exists', 'type_exists', 'near' + ) + special_phrases_importer.words_phrases_to_delete = { + to_delete_phrase_tuple, + to_keep_phrase_tuple + } + special_phrases_importer.words_phrases_still_exist = { + to_keep_phrase_tuple + } + special_phrases_importer.table_phrases_to_delete = { + 'place_classtype_testclasstypetable_to_delete' + } + + query_words = 'SELECT word, class, type, operator FROM word;' + query_tables = """ + SELECT table_name + FROM information_schema.tables + WHERE table_schema='public' + AND table_name like 'place_classtype_%'; + """ + + special_phrases_importer._remove_non_existent_phrases_from_db() + + temp_db_cursor.execute(query_words) + words_result = temp_db_cursor.fetchall() + temp_db_cursor.execute(query_tables) + tables_result = temp_db_cursor.fetchall() + assert len(words_result) == 1 and words_result[0] == [ + 'normalized_word_exists', 'class_exists', 'type_exists', 'near' + ] + assert (len(tables_result) == 1 and + tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep' + ) + def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table, - getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs): + getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table): """ Check that the main import_from_wiki() method is well executed. It should create the place_classtype table, the place_id and centroid indexes, grand access to the web user and executing the SQL functions for amenities. """ + mock_fetch_existing_words_phrases = MockParamCapture() + mock_fetch_existing_place_classtype_tables = MockParamCapture() + mock_remove_non_existent_phrases_from_db = MockParamCapture() + + monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_words_phrases', + mock_fetch_existing_words_phrases) + monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._fetch_existing_place_classtype_tables', + mock_fetch_existing_place_classtype_tables) + monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._remove_non_existent_phrases_from_db', + mock_remove_non_existent_phrases_from_db) monkeypatch.setattr('nominatim.tools.special_phrases.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content) special_phrases_importer.import_from_wiki(['en']) @@ -206,6 +332,9 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test) assert check_amenities_with_op(temp_db_conn) assert check_amenities_without_op(temp_db_conn) + assert mock_fetch_existing_words_phrases.called == 1 + assert mock_fetch_existing_place_classtype_tables.called == 1 + assert mock_remove_non_existent_phrases_from_db.called == 1 def mock_get_wiki_content(lang): """ @@ -305,6 +434,18 @@ def temp_phplib_dir_with_migration(): yield Path(phpdir) +@pytest.fixture +def default_phrases(word_table, temp_db_cursor): + temp_db_cursor.execute(""" + INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word', + 'class', 'type', null, 0, 'near'); + + INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists', + 'class_exists', 'type_exists', null, 0, 'near'); + + CREATE TABLE place_classtype_testclasstypetable_to_delete(); + CREATE TABLE 
place_classtype_testclasstypetable_to_keep();""") + @pytest.fixture def make_strandard_name_func(temp_db_cursor): temp_db_cursor.execute(""" diff --git a/test/python/test_tools_tiger_data.py b/test/python/test_tools_tiger_data.py index 5029a132..1366fe3e 100644 --- a/test/python/test_tools_tiger_data.py +++ b/test/python/test_tools_tiger_data.py @@ -16,11 +16,24 @@ def test_add_tiger_data(dsn, src_dir, def_config, tmp_path, sql_preprocessor, temp_db_cursor.execute('CREATE EXTENSION postgis') temp_db_cursor.execute('CREATE TABLE place (id INT)') sqlfile = tmp_path / '1010.sql' - sqlfile.write_text("""INSERT INTO place values (1)""") + sqlfile.write_text("""INSERT INTO place values (1); + INSERT INTO non_existant_table values (1);""") tiger_data.add_tiger_data(dsn, str(tmp_path), threads, def_config, src_dir / 'lib-sql') assert temp_db_cursor.table_rows('place') == 1 +@pytest.mark.parametrize("threads", (1, 5)) +def test_add_tiger_data_bad_file(dsn, src_dir, def_config, tmp_path, sql_preprocessor, + temp_db_cursor, threads, temp_db): + temp_db_cursor.execute('CREATE EXTENSION hstore') + temp_db_cursor.execute('CREATE EXTENSION postgis') + temp_db_cursor.execute('CREATE TABLE place (id INT)') + sqlfile = tmp_path / '1010.txt' + sqlfile.write_text("""Random text""") + tiger_data.add_tiger_data(dsn, str(tmp_path), threads, def_config, src_dir / 'lib-sql') + + assert temp_db_cursor.table_rows('place') == 0 + @pytest.mark.parametrize("threads", (1, 5)) def test_add_tiger_data_tarfile(dsn, src_dir, def_config, tmp_path, temp_db_cursor, threads, temp_db, sql_preprocessor): @@ -28,10 +41,26 @@ def test_add_tiger_data_tarfile(dsn, src_dir, def_config, tmp_path, temp_db_cursor.execute('CREATE EXTENSION postgis') temp_db_cursor.execute('CREATE TABLE place (id INT)') sqlfile = tmp_path / '1010.sql' - sqlfile.write_text("""INSERT INTO place values (1)""") + sqlfile.write_text("""INSERT INTO place values (1); + INSERT INTO non_existant_table values (1);""") + tar = tarfile.open("sample.tar.gz", "w:gz") + tar.add(sqlfile) + tar.close() + tiger_data.add_tiger_data(dsn, str(src_dir / 'sample.tar.gz'), threads, def_config, src_dir / 'lib-sql') + + assert temp_db_cursor.table_rows('place') == 1 + +@pytest.mark.parametrize("threads", (1, 5)) +def test_add_tiger_data_bad_tarfile(dsn, src_dir, def_config, tmp_path, + temp_db_cursor, threads, temp_db, sql_preprocessor): + temp_db_cursor.execute('CREATE EXTENSION hstore') + temp_db_cursor.execute('CREATE EXTENSION postgis') + temp_db_cursor.execute('CREATE TABLE place (id INT)') + sqlfile = tmp_path / '1010.txt' + sqlfile.write_text("""Random text""") tar = tarfile.open("sample.tar.gz", "w:gz") tar.add(sqlfile) tar.close() tiger_data.add_tiger_data(dsn, str(src_dir / 'sample.tar.gz'), threads, def_config, src_dir / 'lib-sql') - assert temp_db_cursor.table_rows('place') == 1 \ No newline at end of file + assert temp_db_cursor.table_rows('place') == 0 \ No newline at end of file diff --git a/vagrant/Install-on-Centos-7.sh b/vagrant/Install-on-Centos-7.sh index fae961f1..32cd3a30 100755 --- a/vagrant/Install-on-Centos-7.sh +++ b/vagrant/Install-on-Centos-7.sh @@ -145,7 +145,7 @@ fi #DOCS: # ------------------------------- # # The webserver should serve the php scripts from the website directory of your -# [project directory](../admin/import.md#creating-the-project-directory). +# [project directory](../admin/Import.md#creating-the-project-directory). 
# Therefore set up a project directory and populate the website directory: # mkdir $USERHOME/nominatim-project diff --git a/vagrant/Install-on-Centos-8.sh b/vagrant/Install-on-Centos-8.sh index ac3a1b60..1e028b65 100755 --- a/vagrant/Install-on-Centos-8.sh +++ b/vagrant/Install-on-Centos-8.sh @@ -139,7 +139,7 @@ fi #DOCS: # ------------------------------- # # The webserver should serve the php scripts from the website directory of your -# [project directory](../admin/import.md#creating-the-project-directory). +# [project directory](../admin/Import.md#creating-the-project-directory). # Therefore set up a project directory and populate the website directory: # mkdir $USERHOME/nominatim-project diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh index 9fda122e..36e28ca1 100755 --- a/vagrant/Install-on-Ubuntu-18.sh +++ b/vagrant/Install-on-Ubuntu-18.sh @@ -133,7 +133,7 @@ fi #DOCS: # ====================== # # The webserver should serve the php scripts from the website directory of your -# [project directory](../admin/import.md#creating-the-project-directory). +# [project directory](../admin/Import.md#creating-the-project-directory). # Therefore set up a project directory and populate the website directory: mkdir $USERHOME/nominatim-project diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh index 292947d1..1e15f850 100755 --- a/vagrant/Install-on-Ubuntu-20.sh +++ b/vagrant/Install-on-Ubuntu-20.sh @@ -130,7 +130,7 @@ fi #DOCS: # ====================== # # The webserver should serve the php scripts from the website directory of your -# [project directory](../admin/import.md#creating-the-project-directory). +# [project directory](../admin/Import.md#creating-the-project-directory). # Therefore set up a project directory and populate the website directory: mkdir $USERHOME/nominatim-project
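Taken together, the documentation changes in this merge describe the upgrade path to 3.7.0. A condensed sketch of the commands from the updated Import.md and Migration.md (the project directory path is a placeholder; stop any running update process and update Nominatim itself first):

    cd $PROJECT_DIR
    nominatim admin --migrate    # run the automatic database migrations introduced in 3.7.0

    # optional: refresh the preprocessed TIGER data, now imported straight from the tarball
    wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.tar.gz
    nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.tar.gz

Afterwards, updates can be restarted as before.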