From 54393addd38726e4f02643591e2579b5da7085fd Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 1 Jun 2017 21:40:23 +0200 Subject: [PATCH 1/1] disregard special phrases that do not match fully Compare the normalized terms imported with the special terms script with the normalized version of the query string. Disregard them if they cannot be found. This avoids a significant number of mismatches due to transliteration issues. The match will only be done when a normalized word has been set making this change backwards compatible with older databases. --- lib/Geocode.php | 29 +++++++++++++++++++++-------- settings/defaults.php | 4 ++++ utils/specialphrases.php | 8 ++++++-- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index ec8eb348..17aaf826 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -653,7 +653,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases) + public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) { /* Calculate all searches using aValidTokens i.e. @@ -752,13 +752,19 @@ class Geocode */ } } elseif ($sPhraseType == '' && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null) { - if ($aSearch['sClass'] === '') { - $aSearch['sOperator'] = $aSearchTerm['operator']; + // require a normalized exact match of the term + // if we have the normalizer version of the query + // available + if ($aSearch['sClass'] === '' + && ($sNormQuery === null || !($aSearchTerm['word'] && strpos($sNormQuery, $aSearchTerm['word']) === false))) { $aSearch['sClass'] = $aSearchTerm['class']; $aSearch['sType'] = $aSearchTerm['type']; - if (sizeof($aSearch['aName'])) $aSearch['sOperator'] = 'name'; - else $aSearch['sOperator'] = 'near'; // near = in for the moment - if (strlen($aSearchTerm['operator']) == 0) $aSearch['iSearchRank'] += 1; + if ($aSearchTerm['operator'] == '') { + $aSearch['sOperator'] = sizeof($aSearch['aName']) ? 'name' : 'near'; + $aSearch['iSearchRank'] += 2; + } else { + $aSearch['sOperator'] = 'near'; // near = in for the moment + } if ($aSearch['iSearchRank'] < $this->iMaxRank) $aNewWordsetSearches[] = $aSearch; } @@ -913,6 +919,13 @@ class Geocode { if (!$this->sQuery && !$this->aStructuredQuery) return array(); + $oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); + if ($oNormalizer !== null) { + $sNormQuery = $oNormalizer->transliterate($this->sQuery); + } else { + $sNormQuery = null; + } + $sLanguagePrefArraySQL = "ARRAY[".join(',', array_map("getDBQuoted", $this->aLangPrefOrder))."]"; $sCountryCodesSQL = false; if ($this->aCountryCodes) { @@ -1139,7 +1152,7 @@ class Geocode // array with: placeid => -1 | tiger-housenumber $aResultPlaceIDs = array(); - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in @@ -1151,7 +1164,7 @@ class Geocode $aFinalPhrase = end($aPhrases); $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { diff --git a/settings/defaults.php b/settings/defaults.php index 16711542..9f694c89 100644 --- a/settings/defaults.php +++ b/settings/defaults.php @@ -17,6 +17,10 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true); // codes, to restrict import to a subset of languages. // Currently only affects the import of country names and special phrases. @define('CONST_Languages', false); +// Rules for normalizing terms for comparison before doing comparisons. +// The default is to remove accents and punctuation and to lower-case the +// term. Spaces are kept but collapsed to one standard space. +@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); // Set to false to avoid importing extra postcodes for the US. @define('CONST_Use_Extra_US_Postcodes', true); diff --git a/utils/specialphrases.php b/utils/specialphrases.php index 15616976..1a4a51d7 100755 --- a/utils/specialphrases.php +++ b/utils/specialphrases.php @@ -19,7 +19,7 @@ getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true); include(CONST_InstallPath.'/settings/phrase_settings.php'); if ($aCMDResult['wiki-import']) { - $oNormalizer = Transliterator::createFromRules(":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); + $oNormalizer = Transliterator::createFromRules(CONST_Term_Normalization_Rules); $aPairs = array(); $sLanguageIn = CONST_Languages ? CONST_Languages : @@ -32,7 +32,11 @@ if ($aCMDResult['wiki-import']) { if (preg_match_all('#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) { foreach ($aMatches as $aMatch) { $sLabel = trim($aMatch[1]); - $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); + if ($oNormalizer !== null) { + $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel)); + } else { + $sTrans = null; + } $sClass = trim($aMatch[2]); $sType = trim($aMatch[3]); // hack around a bug where building=yes was imported with -- 2.39.5