3 * SPDX-License-Identifier: GPL-2.0-only
5 * This file is part of Nominatim. (https://nominatim.org)
7 * Copyright (C) 2022 by the Nominatim developer community.
8 * For a full list of authors see the git log.
13 require_once(CONST_LibDir.'/SimpleWordList.php');
// ICU transliterator: the constructor creates it from the CONST_Transliteration
// rules and makeStandardWord() uses it to convert terms.
20 private $oTransliterator;
/**
 * Create the two ICU transliterators used by this tokenizer.
 *
 * The normalizer is built from CONST_Term_Normalization_Rules and the
 * transliterator from CONST_Transliteration.
 *
 * NOTE(review): the query methods of this class read $this->oDB, but the
 * statement that stores $oDB is not visible in this excerpt - confirm that
 * the constructor assigns it.
 *
 * @param object $oDB Database connection, taken by reference.
 */
22 public function __construct(&$oDB)
// normalizeString() compares this property against null before using it.
25 $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
// Used by makeStandardWord() to convert terms.
26 $this->oTransliterator = \Transliterator::createFromRules(CONST_Transliteration);
/**
 * Probe the word table by fetching one non-null word_id and raise an
 * exception when the probe fails.
 *
 * @throws \Exception With code 703 ('Query failed') when getOne() yields
 *                    false, and with code 704 ('No value') in a second
 *                    failure case.
 */
29 public function checkStatus()
// A single row is enough for the probe, hence the "limit 1".
31 $sSQL = 'SELECT word_id FROM word WHERE word_id is not null limit 1';
32 $iWordID = $this->oDB->getOne($sSQL);
// Strict comparison: only a literal false is reported as a failed query.
33 if ($iWordID === false) {
34 throw new \Exception('Query failed', 703);
// NOTE(review): the condition guarding this throw is not visible in this
// excerpt; presumably it fires when the query produced no word_id - confirm.
37 throw new \Exception('No value', 704);
/**
 * Normalize a term with the normalizer transliterator.
 *
 * @param string $sTerm Term to normalize.
 *
 * @return mixed Result of transliterate() on $sTerm when a normalizer is set.
 */
42 public function normalizeString($sTerm)
// Guard for the case that no normalizer is available.
// NOTE(review): the body of this branch is not visible in this excerpt;
// presumably the term is returned unchanged - confirm.
44 if ($this->oNormalizer === null) {
48 return $this->oNormalizer->transliterate($sTerm);
/**
 * Return the most frequent full-word entries of the word table.
 *
 * Rows of type 'W' are ordered by the 'count' entry of their info column,
 * highest first, and the 'word' column of the leading rows is fetched.
 *
 * @param int $iNum Maximum number of words to return.
 *
 * @return mixed Whatever $this->oDB->getCol() yields for the query.
 */
public function mostFrequentWords($iNum)
{
    // The statement is assembled from fragments, so the ORDER BY part carries
    // its own leading space instead of relying on the parser to split
    // "'W'ORDER". The limit is cast to int so that nothing but a number can
    // end up in the SQL text.
    $sSQL = "SELECT word FROM word WHERE type = 'W'";
    $sSQL .= " ORDER BY info->'count' DESC LIMIT ".(int) $iNum;

    return $this->oDB->getCol($sSQL);
}
/**
 * Convert a term into the transliterated form that is matched against
 * word tokens.
 *
 * The term is wrapped in single spaces before it is handed to the
 * transliterator; surrounding whitespace is stripped from the result.
 *
 * @param string $sTerm Term to convert.
 *
 * @return string Trimmed transliteration of the term.
 */
private function makeStandardWord($sTerm)
{
    $sPadded = ' '.$sTerm.' ';
    $sConverted = $this->oTransliterator->transliterate($sPadded);

    return trim($sConverted);
}
/**
 * Look up the special-term tokens that match a search term.
 *
 * The term is converted with makeStandardWord() and compared against
 * word_token for rows of type 'S'. Each matching row becomes a
 * \Nominatim\Token\SpecialTerm appended to $aResults.
 *
 * NOTE(review): the initialisation of $aResults and the return statement are
 * not visible in this excerpt; presumably the collected tokens are
 * returned - confirm.
 *
 * @param string $sTerm Term to look up.
 */
66 public function tokensForSpecialTerm($sTerm)
// class and type are extracted from the info column with the ->> operator.
70 $sSQL = "SELECT word_id, info->>'class' as class, info->>'type' as type ";
71 $sSQL .= ' FROM word WHERE word_token = :term and type = \'S\'';
73 Debug::printVar('Term', $sTerm);
74 Debug::printSQL($sSQL);
// The converted term is passed as the :term parameter rather than being
// concatenated into the SQL text.
75 $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $this->makeStandardWord($sTerm)));
77 Debug::printVar('Results', $aSearchWords);
// One SpecialTerm token per matching row.
79 foreach ($aSearchWords as $aSearchTerm) {
80 $aResults[] = new \Nominatim\Token\SpecialTerm(
81 $aSearchTerm['word_id'],
82 $aSearchTerm['class'],
// NOTE(review): an argument between 'class' and the operator appears to be
// missing from this excerpt - confirm against the full file.
84 \Nominatim\Operator::TYPE
88 Debug::printVar('Special term tokens', $aResults);
/**
 * Collect the tokens of all phrases, determine the valid ones and hand each
 * phrase its word sets.
 *
 * For every phrase a SimpleWordList is built from the transliterated phrase
 * text. The tokens of all lists are merged and passed to
 * computeValidTokens() together with the comma-joined normalized phrases.
 *
 * @param mixed $aPhrases Collection of phrase objects, taken by reference;
 *                        setWordSets() is called on each of them.
 *
 * @return TokenList Valid tokens as returned by computeValidTokens().
 */
94 public function extractTokensFromPhrases(&$aPhrases)
// NOTE(review): $sNormQuery and $aTokens are used below, but their
// initialisation is not visible in this excerpt - confirm.
97 $aWordLists = array();
99 foreach ($aPhrases as $iPhrase => $oPhrase) {
// Each normalized phrase is appended with a leading comma.
100 $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
// Transliterated form of the phrase; the word list is built from it.
101 $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
102 Debug::printVar('Phrase', $sPhrase);
104 $oWordList = new SimpleWordList($sPhrase);
105 $aTokens = array_merge($aTokens, $oWordList->getTokens());
// Appended in iteration order; the second loop reads the list by $iPhrase.
106 $aWordLists[] = $oWordList;
109 Debug::printVar('Tokens', $aTokens);
110 Debug::printVar('WordLists', $aWordLists);
112 $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
// Give each phrase the word sets its word list derives from the valid tokens.
// NOTE(review): indexing $aWordLists by $iPhrase only matches the appended
// entries when $aPhrases has sequential keys starting at 0 - confirm.
114 foreach ($aPhrases as $iPhrase => $oPhrase) {
115 $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
118 return $oValidTokens;
/**
 * Build the list of valid tokens for a set of token strings.
 *
 * Tokens are looked up through addTokensFromDB(). Tokens that do not start
 * with a space and are not contained in the list afterwards get a fallback
 * interpretation: a US ZIP+4 code contributes its 5-digit ZIP as a postcode
 * token, a purely numeric token is taken as a house number.
 *
 * @param array  $aTokens    Token strings to check.
 * @param string $sNormQuery Normalized query, passed on to addTokensFromDB().
 *
 * @return TokenList The valid tokens found.
 */
122 private function computeValidTokens($aTokens, $sNormQuery)
124 $oValidTokens = new TokenList();
// The lookup is only made for a non-empty token list.
126 if (!empty($aTokens)) {
127 $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
129 // Try more interpretations for Tokens that could not be matched.
130 foreach ($aTokens as $sToken) {
// NOTE(review): $sToken[0] assumes the token is a non-empty string - confirm.
131 if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
// Five digits, a single space, four digits; group 1 captures the 5-digit part.
132 if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
133 // US ZIP+4 codes - merge in the 5-digit ZIP code
134 $oValidTokens->addToken(
// NOTE(review): the first argument of addToken() is not visible in this excerpt.
136 new Token\Postcode(null, $aData[1], 'us')
138 } elseif (preg_match('/^[0-9]+$/', $sToken)) {
139 // Unknown single word token with a number.
140 // Assume it is a house number.
141 $oValidTokens->addToken(
// NOTE(review): the first argument of addToken() is not visible in this excerpt.
143 new Token\HouseNumber(null, trim($sToken))
150 return $oValidTokens;
154 private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
156 // Check which tokens we have, get the ID numbers
157 $sSQL = 'SELECT word_id, word_token, type, word,';
158 $sSQL .= " info->>'op' as operator,";
159 $sSQL .= " info->>'class' as class, info->>'type' as ctype,";
160 $sSQL .= " info->>'count' as count,";
161 $sSQL .= " info->>'lookup' as lookup";
162 $sSQL .= ' FROM word WHERE word_token in (';
163 $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
165 Debug::printSQL($sSQL);
167 $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
169 foreach ($aDBWords as $aWord) {
170 $iId = (int) $aWord['word_id'];
171 $sTok = $aWord['word_token'];
173 switch ($aWord['type']) {
174 case 'C': // country name tokens
175 if ($aWord['word'] !== null) {
176 $oValidTokens->addToken(
178 new Token\Country($iId, $aWord['word'])
182 case 'H': // house number tokens
183 $sLookup = $aWord['lookup'] ?? $aWord['word_token'];
184 $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup));
186 case 'P': // postcode tokens
187 // Postcodes are not normalized, so they may have content
188 // that makes SQL injection possible. Reject postcodes
189 // that would need special escaping.
190 if ($aWord['word'] !== null
191 && pg_escape_string($aWord['word']) == $aWord['word']
193 $iSplitPos = strpos($aWord['word'], '@');
194 if ($iSplitPos === false) {
195 $sPostcode = $aWord['word'];
197 $sPostcode = substr($aWord['word'], 0, $iSplitPos);
200 $oValidTokens->addToken(
202 new Token\Postcode($iId, $sPostcode, null)
206 case 'S': // tokens for classification terms (special phrases)
207 if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
208 $oValidTokens->addToken($sTok, new Token\SpecialTerm(
212 (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
216 case 'W': // full-word tokens
217 $oValidTokens->addToken($sTok, new Token\Word(
219 (int) $aWord['count'],
220 substr_count($aWord['word_token'], ' ')
223 case 'w': // partial word terms
224 $oValidTokens->addToken($sTok, new Token\Partial(
226 $aWord['word_token'],
227 (int) $aWord['count']