From f00b8dd1c315e0b887df340b1dc4f6cced3b06c2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 17 Aug 2021 14:28:55 +0200 Subject: [PATCH] move special hack for US states to legacy tokenizer The hack for IL, AL and LA is only needed because these abbreviations are removed by the legacy tokenizer as stop words. There is no need to keep the hack for future tokenizers. Therefore, move it to the token extraction function. --- lib-php/Geocode.php | 7 ------- lib-php/Phrase.php | 3 ++- lib-php/tokenizer/legacy_tokenizer.php | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php index 0f76a9c4..43d10368 100644 --- a/lib-php/Geocode.php +++ b/lib-php/Geocode.php @@ -506,13 +506,6 @@ class Geocode userError('Query string is not UTF-8 encoded.'); } - // Conflicts between US state abreviations and various words for 'the' in different languages - if (isset($this->aLangPrefOrder['name:en'])) { - $sQuery = preg_replace('/(^|,)\s*il\s*(,|$)/i', '\1illinois\2', $sQuery); - $sQuery = preg_replace('/(^|,)\s*al\s*(,|$)/i', '\1alabama\2', $sQuery); - $sQuery = preg_replace('/(^|,)\s*la\s*(,|$)/i', '\1louisiana\2', $sQuery); - } - // Do we have anything that looks like a lat/lon pair? $sQuery = $oCtx->setNearPointFromQuery($sQuery); diff --git a/lib-php/Phrase.php b/lib-php/Phrase.php index cdde6134..4307a230 100644 --- a/lib-php/Phrase.php +++ b/lib-php/Phrase.php @@ -9,7 +9,8 @@ namespace Nominatim; */ class Phrase { - // Complete phrase as a string. + // Complete phrase as a string (guaranteed to have no leading or trailing + // spaces). private $sPhrase; // Element type for structured searches. 
private $sPhraseType; diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php index e5ffbe02..b508d220 100644 --- a/lib-php/tokenizer/legacy_tokenizer.php +++ b/lib-php/tokenizer/legacy_tokenizer.php @@ -87,6 +87,23 @@ class Tokenizer $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); $sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.','; $aParams[':'.$iPhrase] = $oPhrase->getPhrase(); + + // Conflicts between US state abbreviations and various words + // for 'the' in different languages + switch (strtolower($oPhrase->getPhrase())) { + case 'il': + $aParams[':'.$iPhrase] = 'illinois'; + break; + case 'al': + $aParams[':'.$iPhrase] = 'alabama'; + break; + case 'la': + $aParams[':'.$iPhrase] = 'louisiana'; + break; + default: + $aParams[':'.$iPhrase] = $oPhrase->getPhrase(); + break; + } } $sSQL = substr($sSQL, 0, -1); -- 2.39.5