git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #742 from lonvia/compare-normalized
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 5 Jun 2017 06:44:04 +0000 (08:44 +0200)
committer GitHub <noreply@github.com>
Mon, 5 Jun 2017 06:44:04 +0000 (08:44 +0200)
Require more exact match for special search terms

.travis.yml
Vagrantfile
docs/Installation.md
lib/Geocode.php
settings/defaults.php
sql/functions.sql
utils/specialphrases.php
vagrant/install-on-centos-7.sh
vagrant/install-on-travis-ci.sh
vagrant/install-on-ubuntu-16.sh

index 68d5be4e1ea9e91e6c1af69ad064b0d7bc6b3f49..5efc9f083171489a6d391896047c52afb322cdda 100644 (file)
@@ -19,6 +19,7 @@ script:
   - cd $TRAVIS_BUILD_DIR/build
   - if [[ $TEST_SUITE == "monaco" ]]; then wget --no-verbose --output-document=../data/monaco.osm.pbf http://download.geofabrik.de/europe/monaco-latest.osm.pbf; fi
   - if [[ $TEST_SUITE == "monaco" ]]; then ./utils/setup.php --osm-file ../data/monaco.osm.pbf --osm2pgsql-cache 1000 --all 2>&1 | grep -v 'ETA (seconds)'; fi
+  - if [[ $TEST_SUITE == "monaco" ]]; then ./utils/specialphrases.php --wiki-import | psql -d test_api_nominatim >/dev/null; fi
   - cd $TRAVIS_BUILD_DIR/test/php
   - if [[ $TEST_SUITE == "tests" ]]; then phpunit ./ ; fi
   - if [[ $TEST_SUITE == "tests" ]]; then phpcs --report-width=120 */**.php ; fi
index 15d66e9ed72bfbb158edbbb90c8636ad29afbb8d..b9d618e20f8434c07a9c1acb51a683128cf843fc 100644 (file)
@@ -23,7 +23,16 @@ Vagrant.configure("2") do |config|
       end
   end
 
-  config.vm.define "centos" do |sub|
+  config.vm.define "travis" do |sub|
+      sub.vm.box = "bento/ubuntu-14.04"
+      sub.vm.provision :shell do |s|
+        s.path = "vagrant/install-on-travis-ci.sh"
+        s.privileged = false
+        s.args = [checkout]
+      end
+  end
+
+   config.vm.define "centos" do |sub|
       sub.vm.box = "bento/centos-7.2"
       sub.vm.provision :shell do |s|
         s.path = "vagrant/install-on-centos-7.sh"
index 41f76df1dba87a8190de25e2be81f251f4f61cc0..88f32ada55ea2e9b77051b9d4fff34ee37c7e410 100644 (file)
@@ -39,6 +39,7 @@ For running Nominatim:
   * [PostGIS](http://postgis.refractions.net) (2.0 or later)
   * [PHP](http://php.net) (5.4 or later)
   * PHP-pgsql
+  * PHP-intl (bundled with PHP)
   * [PEAR::DB](http://pear.php.net/package/DB)
   * a webserver (apache or nginx are recommended)
 
index ec8eb3489e6daaa15f11ba3d6aa0029284abb114..17aaf826e2963e2f9405561bafd04604a2fad651 100644 (file)
@@ -653,7 +653,7 @@ class Geocode
         return $aSearchResults;
     }
 
-    public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases)
+    public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery)
     {
         /*
              Calculate all searches using aValidTokens i.e.
@@ -752,13 +752,19 @@ class Geocode
                                          */
                                     }
                                 } elseif ($sPhraseType == '' && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null) {
-                                    if ($aSearch['sClass'] === '') {
-                                        $aSearch['sOperator'] = $aSearchTerm['operator'];
+                                    // require a normalized exact match of the term
+                                    // if we have the normalized version of the query
+                                    // available
+                                    if ($aSearch['sClass'] === ''
+                                        && ($sNormQuery === null || !($aSearchTerm['word'] && strpos($sNormQuery, $aSearchTerm['word']) === false))) {
                                         $aSearch['sClass'] = $aSearchTerm['class'];
                                         $aSearch['sType'] = $aSearchTerm['type'];
-                                        if (sizeof($aSearch['aName'])) $aSearch['sOperator'] = 'name';
-                                        else $aSearch['sOperator'] = 'near'; // near = in for the moment
-                                        if (strlen($aSearchTerm['operator']) == 0) $aSearch['iSearchRank'] += 1;
+                                        if ($aSearchTerm['operator'] == '') {
+                                            $aSearch['sOperator'] = sizeof($aSearch['aName']) ? 'name' :  'near';
+                                            $aSearch['iSearchRank'] += 2;
+                                        } else {
+                                            $aSearch['sOperator'] = 'near'; // near = in for the moment
+                                        }
 
                                         if ($aSearch['iSearchRank'] < $this->iMaxRank) $aNewWordsetSearches[] = $aSearch;
                                     }
@@ -913,6 +919,13 @@ class Geocode
     {
         if (!$this->sQuery && !$this->aStructuredQuery) return array();
 
+        $oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
+        if ($oNormalizer !== null) {
+            $sNormQuery = $oNormalizer->transliterate($this->sQuery);
+        } else {
+            $sNormQuery = null;
+        }
+
         $sLanguagePrefArraySQL = "ARRAY[".join(',', array_map("getDBQuoted", $this->aLangPrefOrder))."]";
         $sCountryCodesSQL = false;
         if ($this->aCountryCodes) {
@@ -1139,7 +1152,7 @@ class Geocode
                 // array with: placeid => -1 | tiger-housenumber
                 $aResultPlaceIDs = array();
 
-                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases);
+                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
 
                 if ($this->bReverseInPlan) {
                     // Reverse phrase array and also reverse the order of the wordsets in
@@ -1151,7 +1164,7 @@ class Geocode
                         $aFinalPhrase = end($aPhrases);
                         $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
                     }
-                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false);
+                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
 
                     foreach ($aGroupedSearches as $aSearches) {
                         foreach ($aSearches as $aSearch) {
index 16711542faee15872a176e6eb0f10f20185f293c..9f694c89fb24699a62ee1d05c09b20b57c98f59a 100644 (file)
@@ -17,6 +17,10 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true);
 // codes, to restrict import to a subset of languages.
 // Currently only affects the import of country names and special phrases.
 @define('CONST_Languages', false);
+// Rules for normalizing terms before comparing them.
+// The default is to remove accents and punctuation and to lower-case the
+// term. Spaces are kept but collapsed to one standard space.
+@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
 
 // Set to false to avoid importing extra postcodes for the US.
 @define('CONST_Use_Extra_US_Postcodes', true);
index 6cc4280344eab72da713d00ea5e9d263c2577fa4..da496a10c34cb9ed7ab5aa83e86bb5b06a713c1c 100644 (file)
@@ -101,7 +101,7 @@ END;
 $$
 LANGUAGE plpgsql;
 
-CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, lookup_class text, lookup_type text)
+CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT, lookup_class text, lookup_type text)
   RETURNS INTEGER
   AS $$
 DECLARE
@@ -109,17 +109,17 @@ DECLARE
   return_word_id INTEGER;
 BEGIN
   lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word WHERE word_token = lookup_token and class=lookup_class and type = lookup_type into return_word_id;
+  SELECT min(word_id) FROM word WHERE word_token = lookup_token and word=normalized_word and class=lookup_class and type = lookup_type into return_word_id;
   IF return_word_id IS NULL THEN
     return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null, lookup_class, lookup_type, null, 0);
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word, lookup_class, lookup_type, null, 0);
   END IF;
   RETURN return_word_id;
 END;
 $$
 LANGUAGE plpgsql;
 
-CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, lookup_class text, lookup_type text, op text)
+CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT, lookup_class text, lookup_type text, op text)
   RETURNS INTEGER
   AS $$
 DECLARE
@@ -127,10 +127,10 @@ DECLARE
   return_word_id INTEGER;
 BEGIN
   lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word WHERE word_token = lookup_token and class=lookup_class and type = lookup_type and operator = op into return_word_id;
+  SELECT min(word_id) FROM word WHERE word_token = lookup_token and word=normalized_word and class=lookup_class and type = lookup_type and operator = op into return_word_id;
   IF return_word_id IS NULL THEN
     return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null, lookup_class, lookup_type, null, 0, op);
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word, lookup_class, lookup_type, null, 0, op);
   END IF;
   RETURN return_word_id;
 END;
index 50522fc2a3975164a4794f4c315bac053a63564a..1a4a51d758e9b9ce7c1ece8280caeb11743f2c7a 100755 (executable)
@@ -19,6 +19,7 @@ getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
 include(CONST_InstallPath.'/settings/phrase_settings.php');
 
 if ($aCMDResult['wiki-import']) {
+    $oNormalizer = Transliterator::createFromRules(CONST_Term_Normalization_Rules);
     $aPairs = array();
 
     $sLanguageIn = CONST_Languages ? CONST_Languages :
@@ -31,6 +32,11 @@ if ($aCMDResult['wiki-import']) {
         if (preg_match_all('#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) {
             foreach ($aMatches as $aMatch) {
                 $sLabel = trim($aMatch[1]);
+                if ($oNormalizer !== null) {
+                    $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel));
+                } else {
+                    $sTrans = null;
+                }
                 $sClass = trim($aMatch[2]);
                 $sType = trim($aMatch[3]);
                 // hack around a bug where building=yes was imported with
@@ -57,13 +63,13 @@ if ($aCMDResult['wiki-import']) {
 
                 switch (trim($aMatch[4])) {
                     case 'near':
-                        echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType', 'near');\n";
+                        echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType', 'near');\n";
                         break;
                     case 'in':
-                        echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType', 'in');\n";
+                        echo "select getorcreate_amenityoperator(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType', 'in');\n";
                         break;
                     default:
-                        echo "select getorcreate_amenity(make_standard_name('".pg_escape_string($sLabel)."'), '$sClass', '$sType');\n";
+                        echo "select getorcreate_amenity(make_standard_name('".pg_escape_string($sLabel)."'), '$sTrans', '$sClass', '$sType');\n";
                         break;
                 }
             }
index 8aeedcc6c72ea5dffe38a06a16922f1c407c2e80..8b283ef63f1b7af7b953f54a91828f6459bb5ea5 100755 (executable)
@@ -21,7 +21,7 @@
 
     sudo yum install -y postgresql-server postgresql-contrib postgresql-devel postgis postgis-utils \
                         git cmake make gcc gcc-c++ libtool policycoreutils-python \
-                        php-pgsql php php-pear php-pear-DB libpqxx-devel proj-epsg \
+                        php-pgsql php php-pear php-pear-DB php-intl libpqxx-devel proj-epsg \
                         bzip2-devel proj-devel geos-devel libxml2-devel boost-devel expat-devel zlib-devel
 
 # If you want to run the test suite, you need to install the following
index 44faa614e5de7bcba05172c5f3fd349ab23592fa..ec0a92dabd4b6ab128c8a9e1c0e29f68e10247cb 100755 (executable)
@@ -16,7 +16,7 @@ sudo apt-get install -y -qq libboost-dev libboost-system-dev \
                             libboost-filesystem-dev libexpat1-dev zlib1g-dev libxml2-dev\
                             libbz2-dev libpq-dev libgeos-c1 libgeos++-dev libproj-dev \
                             postgresql-server-dev-9.6 postgresql-9.6-postgis-2.3 postgresql-contrib-9.6 \
-                            apache2 php5 php5-pgsql php-pear php-db
+                            apache2 php5 php5-pgsql php-pear php-db php5-intl
 
 sudo apt-get install -y -qq python3-dev python3-pip python3-psycopg2 phpunit php5-cgi
 
index c347923fcf0f53ca784138fb7d5d44d637d599f1..11f80a3e4b4ec8949f25ee0586499cc7e0fd6a1b 100755 (executable)
@@ -28,7 +28,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
                             libbz2-dev libpq-dev libgeos-dev libgeos++-dev libproj-dev \
                             postgresql-server-dev-9.5 postgresql-9.5-postgis-2.2 postgresql-contrib-9.5 \
                             apache2 php php-pgsql libapache2-mod-php php-pear php-db \
-                            git
+                            php-intl git
 
 # If you want to run the test suite, you need to install the following
 # additional packages: