3 * SPDX-License-Identifier: GPL-2.0-only
5 * This file is part of Nominatim. (https://nominatim.org)
7 * Copyright (C) 2022 by the Nominatim developer community.
8 * For a full list of authors see the git log.
13 require_once(CONST_LibDir.'/SimpleWordList.php');
// Transliterator built from CONST_Transliteration in the constructor;
// makeStandardWord() passes terms through it.
20 private $oTransliterator;
// Builds the two \Transliterator instances used by this class.
// $oDB is received by reference. NOTE(review): the statement that stores
// it is not visible in this excerpt; other methods read it as $this->oDB.
22 public function __construct(&$oDB)
// Normalizer, created from CONST_Term_Normalization_Rules. normalizeString()
// checks this property for null before using it.
25 $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
// Transliterator for token lookup, created from CONST_Transliteration.
26 $this->oTransliterator = \Transliterator::createFromRules(CONST_Transliteration);
// Checks the word table by fetching a single non-null word_id.
// Throws \Exception('Query failed', 703) when getOne() returns false and
// \Exception('No value', 704) in a second failure case (see note below).
29 public function checkStatus()
31 $sSQL = 'SELECT word_id FROM word WHERE word_id is not null limit 1';
32 $iWordID = $this->oDB->getOne($sSQL);
// Strict comparison: only a literal false is reported as a failed query.
33 if ($iWordID === false) {
34 throw new \Exception('Query failed', 703);
// NOTE(review): the condition guarding this throw is not visible in this
// excerpt; presumably it fires when no word_id was returned - confirm.
37 throw new \Exception('No value', 704);
// Runs $sTerm through the normalization transliterator and returns the
// result of transliterate().
42 public function normalizeString($sTerm)
// Guard for a missing normalizer. NOTE(review): the body of this branch is
// not visible in this excerpt; presumably the term is returned unchanged -
// confirm.
44 if ($this->oNormalizer === null) {
48 return $this->oNormalizer->transliterate($sTerm);
/**
 * Return the most frequent words of type 'W' from the word table.
 *
 * Rows are ordered by the 'count' entry of the info column, highest first.
 *
 * @param int $iNum Maximum number of words to return.
 *
 * @return mixed Whatever $this->oDB->getCol() yields for the query.
 */
public function mostFrequentWords($iNum)
{
    $sSQL = "SELECT word FROM word WHERE type = 'W'";
    // The leading space separates the clause from the preceding literal.
    // The limit is concatenated into the SQL text, so force it to an integer.
    $sSQL .= " ORDER BY info->'count' DESC LIMIT ".(int) $iNum;

    return $this->oDB->getCol($sSQL);
}
/**
 * Convert a term into its transliterated lookup form.
 *
 * The term is wrapped in single spaces before it is handed to the
 * transliterator; surrounding whitespace is trimmed from the result.
 *
 * @param string $sTerm Term to convert.
 *
 * @return string Transliterated and trimmed term.
 */
private function makeStandardWord($sTerm)
{
    $sPadded = ' '.$sTerm.' ';
    $sTransliterated = $this->oTransliterator->transliterate($sPadded);

    return trim($sTransliterated);
}
// Looks up word table rows of type 'S' whose word_token equals the
// standardized form of $sTerm and builds one Token\SpecialTerm per row.
// NOTE(review): the initialisation of $aResults and the return statement
// are not visible in this excerpt; presumably $aResults is returned.
66 public function tokensForSpecialTerm($sTerm)
// class and type are read as text from the info column (->> operator).
70 $sSQL = "SELECT word_id, info->>'class' as class, info->>'type' as type ";
71 $sSQL .= ' FROM word WHERE word_token = :term and type = \'S\'';
73 Debug::printVar('Term', $sTerm);
74 Debug::printSQL($sSQL);
// The term is bound as a query parameter after passing through
// makeStandardWord(), i.e. the lookup uses the transliterated form.
75 $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $this->makeStandardWord($sTerm)));
77 Debug::printVar('Results', $aSearchWords);
79 foreach ($aSearchWords as $aSearchTerm) {
// NOTE(review): one constructor argument line is missing from this excerpt
// between 'class' and the operator.
80 $aResults[] = new \Nominatim\Token\SpecialTerm(
81 $aSearchTerm['word_id'],
82 $aSearchTerm['class'],
84 \Nominatim\Operator::TYPE
88 Debug::printVar('Special term tokens', $aResults);
// Splits every phrase into tokens, resolves them via computeValidTokens()
// and stores the resulting word sets on each phrase object.
// $aPhrases is received by reference; its elements are called with
// getPhrase() and setWordSets(). Returns the token list produced by
// computeValidTokens().
// NOTE(review): the initialisation of $sNormQuery and $aTokens is not
// visible in this excerpt.
94 public function extractTokensFromPhrases(&$aPhrases)
97 $aWordLists = array();
99 foreach ($aPhrases as $iPhrase => $oPhrase) {
// Accumulate the normalized phrases, each prefixed with a comma, into one
// string that is handed to computeValidTokens() below.
100 $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
101 $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
102 Debug::printVar('Phrase', $sPhrase);
104 $oWordList = new SimpleWordList($sPhrase);
105 $aTokens = array_merge($aTokens, $oWordList->getTokens());
// Appended in iteration order but read back below by $iPhrase, which
// relies on $aPhrases having consecutive 0-based keys.
// NOTE(review): confirm against callers.
106 $aWordLists[] = $oWordList;
109 Debug::printVar('Tokens', $aTokens);
110 Debug::printVar('WordLists', $aWordLists);
112 $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
// Second pass: with the valid tokens known, compute the word sets of each
// phrase from its word list.
114 foreach ($aPhrases as $iPhrase => $oPhrase) {
115 $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
118 return $oValidTokens;
// Builds a TokenList for $aTokens: tokens are first resolved through
// addTokensFromDB(), then tokens still missing from the list get a
// fallback interpretation as US postcode or house number.
// $sNormQuery is only passed on to addTokensFromDB().
// Returns the TokenList.
122 private function computeValidTokens($aTokens, $sNormQuery)
124 $oValidTokens = new TokenList();
126 if (!empty($aTokens)) {
127 $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
129 // Try more interpretations for Tokens that could not be matched.
130 foreach ($aTokens as $sToken) {
// Skip tokens that begin with a space and tokens already in the list.
131 if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
// Exactly five digits, one space, four digits; the five-digit part is
// captured in $aData[1].
132 if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
133 // US ZIP+4 codes - merge in the 5-digit ZIP code
// NOTE(review): the first argument of addToken() is not visible in this
// excerpt.
134 $oValidTokens->addToken(
136 new Token\Postcode(null, $aData[1], 'us')
138 } elseif (preg_match('/^[0-9]+$/', $sToken)) {
139 // Unknown single word token with a number.
140 // Assume it is a house number.
// NOTE(review): the first argument of addToken() is not visible in this
// excerpt.
141 $oValidTokens->addToken(
143 new Token\HouseNumber(null, trim($sToken))
150 return $oValidTokens;
154 private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
156 // Check which tokens we have, get the ID numbers
157 $sSQL = 'SELECT word_id, word_token, type, word,';
158 $sSQL .= " info->>'op' as operator,";
159 $sSQL .= " info->>'class' as class, info->>'type' as ctype,";
160 $sSQL .= " info->>'count' as count";
161 $sSQL .= ' FROM word WHERE word_token in (';
162 $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
164 Debug::printSQL($sSQL);
166 $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
168 foreach ($aDBWords as $aWord) {
169 $iId = (int) $aWord['word_id'];
170 $sTok = $aWord['word_token'];
172 switch ($aWord['type']) {
173 case 'C': // country name tokens
174 if ($aWord['word'] !== null) {
175 $oValidTokens->addToken(
177 new Token\Country($iId, $aWord['word'])
181 case 'H': // house number tokens
182 $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
184 case 'P': // postcode tokens
185 // Postcodes are not normalized, so they may have content
186 // that makes SQL injection possible. Reject postcodes
187 // that would need special escaping.
188 if ($aWord['word'] !== null
189 && pg_escape_string($aWord['word']) == $aWord['word']
191 $sNormPostcode = $this->normalizeString($aWord['word']);
192 if (strpos($sNormQuery, $sNormPostcode) !== false) {
193 $oValidTokens->addToken(
195 new Token\Postcode($iId, $aWord['word'], null)
200 case 'S': // tokens for classification terms (special phrases)
201 if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
202 $oValidTokens->addToken($sTok, new Token\SpecialTerm(
206 (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
210 case 'W': // full-word tokens
211 $oValidTokens->addToken($sTok, new Token\Word(
213 (int) $aWord['count'],
214 substr_count($aWord['word_token'], ' ')
217 case 'w': // partial word terms
218 $oValidTokens->addToken($sTok, new Token\Partial(
220 $aWord['word_token'],
221 (int) $aWord['count']