git.openstreetmap.org Git - nominatim.git/commitdiff
move special hack for US states to legacy tokenizer
author: Sarah Hoffmann <lonvia@denofr.de>
Tue, 17 Aug 2021 12:28:55 +0000 (14:28 +0200)
committer: Sarah Hoffmann <lonvia@denofr.de>
Tue, 17 Aug 2021 12:28:55 +0000 (14:28 +0200)
The hack for IL, AL and LA is only needed because the legacy tokenizer
removes these abbreviations as stop words. There is no need to keep
the hack for future tokenizers, so move it to the token extraction
function.

lib-php/Geocode.php
lib-php/Phrase.php
lib-php/tokenizer/legacy_tokenizer.php

index 0f76a9c472749652823f8f4b8a8f0f306f8b2edb..43d10368eb9292ffcd90d2fcc8103b4a07ded395 100644 (file)
@@ -506,13 +506,6 @@ class Geocode
             userError('Query string is not UTF-8 encoded.');
         }
 
-        // Conflicts between US state abreviations and various words for 'the' in different languages
-        if (isset($this->aLangPrefOrder['name:en'])) {
-            $sQuery = preg_replace('/(^|,)\s*il\s*(,|$)/i', '\1illinois\2', $sQuery);
-            $sQuery = preg_replace('/(^|,)\s*al\s*(,|$)/i', '\1alabama\2', $sQuery);
-            $sQuery = preg_replace('/(^|,)\s*la\s*(,|$)/i', '\1louisiana\2', $sQuery);
-        }
-
         // Do we have anything that looks like a lat/lon pair?
         $sQuery = $oCtx->setNearPointFromQuery($sQuery);
 
index cdde6134768dd7e0af7501f9c5fb517ef0372931..4307a23022c640760fb08f92acddb3d01e8d70a0 100644 (file)
@@ -9,7 +9,8 @@ namespace Nominatim;
  */
 class Phrase
 {
-    // Complete phrase as a string.
+    // Complete phrase as a string (guaranteed to have no leading or trailing
+    // spaces).
     private $sPhrase;
     // Element type for structured searches.
     private $sPhraseType;
index e5ffbe025f05aabb268b886bc6c055174eb831e5..b508d220aca3f07e861fd0bbb6efc39a13cf5528 100644 (file)
@@ -87,6 +87,23 @@ class Tokenizer
             $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
             $sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
             $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
+
+            // Conflicts between US state abbreviations and various words
+            // for 'the' in different languages
+            switch (strtolower($oPhrase->getPhrase())) {
+                case 'il':
+                    $aParams[':'.$iPhrase] = 'illinois';
+                    break;
+                case 'al':
+                    $aParams[':'.$iPhrase] = 'alabama';
+                    break;
+                case 'la':
+                    $aParams[':'.$iPhrase] = 'louisiana';
+                    break;
+                default:
+                    $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
+                    break;
+            }
         }
         $sSQL = substr($sSQL, 0, -1);