]> git.openstreetmap.org Git - nominatim.git/blob - lib-php/tokenizer/legacy_icu_tokenizer.php
remove country restriction from tokenizer
[nominatim.git] / lib-php / tokenizer / legacy_icu_tokenizer.php
1 <?php
2
3 namespace Nominatim;
4
/**
 * Tokenizer working on the `word` table.
 *
 * Turns query phrases into transliterated word tokens, looks those tokens
 * up in the `word` table and converts the matching rows into Token objects
 * collected in a TokenList.
 */
class Tokenizer
{
    /** Database handle; must provide getOne(), getAll() and getDBQuotedList(). */
    private $oDB;

    /** ICU transliterator built from CONST_Term_Normalization_Rules. */
    private $oNormalizer;
    /** ICU transliterator built from CONST_Transliteration. */
    private $oTransliterator;

    /**
     * Create a tokenizer bound to the given database connection.
     *
     * @param object $oDB Database handle (held by reference).
     */
    public function __construct(&$oDB)
    {
        $this->oDB =& $oDB;
        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
        $this->oTransliterator = \Transliterator::createFromRules(CONST_Transliteration);
    }

    /**
     * Check that the word table is readable and contains at least one word.
     *
     * @return void
     *
     * @throws \Exception With code 703 when the query yields `false`,
     *                    with code 704 when it yields another empty value.
     */
    public function checkStatus()
    {
        $sSQL = 'SELECT word_id FROM word WHERE word_id is not null limit 1';
        $iWordID = $this->oDB->getOne($sSQL);
        if ($iWordID === false) {
            throw new \Exception('Query failed', 703);
        }
        if (!$iWordID) {
            throw new \Exception('No value', 704);
        }
    }


    /**
     * Apply the term normalization rules to a string.
     *
     * @param string $sTerm Term to normalize.
     *
     * @return string|false The normalized term (result of
     *                      Transliterator::transliterate()), or the input
     *                      unchanged when no normalizer is available.
     */
    public function normalizeString($sTerm)
    {
        if ($this->oNormalizer === null) {
            return $sTerm;
        }

        return $this->oNormalizer->transliterate($sTerm);
    }

    /**
     * Transliterate a term into the form used for `word_token` lookups.
     *
     * The term is padded with one blank on each side before transliteration
     * and the result is trimmed again.
     *
     * @param string $sTerm Term to convert.
     *
     * @return string Transliterated, trimmed term.
     */
    private function makeStandardWord($sTerm)
    {
        return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
    }


    /**
     * Look up the special-phrase tokens (word type 'S') matching a term.
     *
     * @param string $sTerm Term as entered by the user.
     *
     * @return \Nominatim\Token\SpecialTerm[] One token per matching row,
     *                                        all with Operator::TYPE.
     */
    public function tokensForSpecialTerm($sTerm)
    {
        $aResults = array();

        $sSQL = "SELECT word_id, info->>'class' as class, info->>'type' as type ";
        $sSQL .= '   FROM word WHERE word_token = :term and type = \'S\'';

        Debug::printVar('Term', $sTerm);
        Debug::printSQL($sSQL);
        $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $this->makeStandardWord($sTerm)));

        Debug::printVar('Results', $aSearchWords);

        foreach ($aSearchWords as $aSearchTerm) {
            $aResults[] = new \Nominatim\Token\SpecialTerm(
                $aSearchTerm['word_id'],
                $aSearchTerm['class'],
                $aSearchTerm['type'],
                \Nominatim\Operator::TYPE
            );
        }

        Debug::printVar('Special term tokens', $aResults);

        return $aResults;
    }


    /**
     * Tokenize the given phrases and compute their possible word sets.
     *
     * Every phrase is transliterated and split at blanks. All word
     * sequences of all phrases are looked up together, then each phrase is
     * handed its own word list and the shared list of valid tokens via
     * computeWordSets().
     *
     * @param object[] $aPhrases Phrases of the query; each must provide
     *                           getPhrase() and computeWordSets().
     *
     * @return TokenList Valid tokens found for the phrases.
     */
    public function extractTokensFromPhrases(&$aPhrases)
    {
        $sNormQuery = '';
        $aWordLists = array();
        $aTokens = array();
        foreach ($aPhrases as $iPhrase => $oPhrase) {
            $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
            $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
            Debug::printVar('Phrase', $sPhrase);
            // Store each word list under the key of its phrase, so that the
            // lookup in the second loop is correct even when $aPhrases is
            // not a gap-free, zero-based list.
            if (strlen($sPhrase) > 0) {
                $aWords = explode(' ', $sPhrase);
                self::addTokens($aTokens, $aWords);
                $aWordLists[$iPhrase] = $aWords;
            } else {
                $aWordLists[$iPhrase] = array();
            }
        }

        Debug::printVar('Tokens', $aTokens);
        Debug::printVar('WordLists', $aWordLists);

        $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);

        foreach ($aPhrases as $iPhrase => $oPhrase) {
            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
        }

        return $oValidTokens;
    }


    /**
     * Build the list of valid tokens for a set of candidate token strings.
     *
     * Tokens are first looked up in the database. Tokens without any match
     * get a fallback interpretation: "NNNNN NNNN" is treated as US ZIP+4
     * postcode and a purely numeric token as house number.
     *
     * @param string[] $aTokens    Candidate token strings.
     * @param string   $sNormQuery Normalized query, used to verify postcodes.
     *
     * @return TokenList Valid tokens, empty when $aTokens is empty.
     */
    private function computeValidTokens($aTokens, $sNormQuery)
    {
        $oValidTokens = new TokenList();

        if (!empty($aTokens)) {
            $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);

            // Try more interpretations for Tokens that could not be matched.
            foreach ($aTokens as $sToken) {
                // Empty tokens (from consecutive blanks) cannot match any of
                // the patterns below; skipping them also avoids reading
                // offset 0 of an empty string.
                if ($sToken === '' || $sToken[0] == ' ') {
                    continue;
                }
                if ($oValidTokens->contains($sToken)) {
                    continue;
                }

                if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                    // US ZIP+4 codes - merge in the 5-digit ZIP code
                    $oValidTokens->addToken(
                        $sToken,
                        new Token\Postcode(null, $aData[1], 'us')
                    );
                } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                    // Unknown single word token with a number.
                    // Assume it is a house number.
                    $oValidTokens->addToken(
                        $sToken,
                        new Token\HouseNumber(null, trim($sToken))
                    );
                }
            }
        }

        return $oValidTokens;
    }


    /**
     * Look up token strings in the word table and add the resulting tokens.
     *
     * @param TokenList $oValidTokens List the found tokens are added to.
     * @param string[]  $aTokens      Token strings to look up.
     * @param string    $sNormQuery   Normalized query; a postcode token is
     *                                only accepted when its normalized form
     *                                appears in this string.
     *
     * @return void
     */
    private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
    {
        // An empty list would produce the invalid SQL fragment 'in ()'.
        if (empty($aTokens)) {
            return;
        }

        // Check which tokens we have, get the ID numbers
        $sSQL = 'SELECT word_id, word_token, type, word,';
        $sSQL .= "      info->>'op' as operator,";
        $sSQL .= "      info->>'class' as class, info->>'type' as ctype,";
        $sSQL .= "      info->>'count' as count";
        $sSQL .= ' FROM word WHERE word_token in (';
        $sSQL .= implode(',', $this->oDB->getDBQuotedList($aTokens)).')';

        Debug::printSQL($sSQL);

        $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');

        foreach ($aDBWords as $aWord) {
            $iId = (int) $aWord['word_id'];
            $sTok = $aWord['word_token'];

            switch ($aWord['type']) {
                case 'C':  // country name tokens
                    if ($aWord['word'] !== null) {
                        $oValidTokens->addToken(
                            $sTok,
                            new Token\Country($iId, $aWord['word'])
                        );
                    }
                    break;
                case 'H':  // house number tokens
                    $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
                    break;
                case 'P':  // postcode tokens
                    // Postcodes are not normalized, so they may have content
                    // that makes SQL injection possible. Reject postcodes
                    // that would need special escaping.
                    // NOTE(review): pg_escape_string() is called without an
                    // explicit connection; PHP 8.1 deprecates that form.
                    if ($aWord['word'] !== null
                        && pg_escape_string($aWord['word']) == $aWord['word']
                    ) {
                        $sNormPostcode = $this->normalizeString($aWord['word']);
                        if (strpos($sNormQuery, $sNormPostcode) !== false) {
                            $oValidTokens->addToken(
                                $sTok,
                                new Token\Postcode($iId, $aWord['word'], null)
                            );
                        }
                    }
                    break;
                case 'S':  // tokens for classification terms (special phrases)
                    if ($aWord['class'] !== null && $aWord['ctype'] !== null) {
                        // Any non-null 'op' entry selects the NEAR operator.
                        $oValidTokens->addToken($sTok, new Token\SpecialTerm(
                            $iId,
                            $aWord['class'],
                            $aWord['ctype'],
                            (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
                        ));
                    }
                    break;
                case 'W': // full-word tokens
                    $oValidTokens->addToken($sTok, new Token\Word(
                        $iId,
                        (int) $aWord['count'],
                        substr_count($aWord['word_token'], ' ')
                    ));
                    break;
                case 'w':  // partial word terms
                    $oValidTokens->addToken($sTok, new Token\Partial(
                        $iId,
                        $aWord['word_token'],
                        (int) $aWord['count']
                    ));
                    break;
                default:
                    break;
            }
        }
    }


    /**
     * Add all token strings derivable from a word list to a token list.
     *
     * Every contiguous sequence of words, joined with single blanks, is
     * added. Tokens are stored under their own string as key, so a token
     * occurring more than once is only kept once.
     *
     * @param string[] $aTokens Token list to extend (modified in place).
     * @param string[] $aWords  Words of a single phrase, in order.
     *
     * @return void
     */
    private static function addTokens(&$aTokens, $aWords)
    {
        $iNumWords = count($aWords);

        for ($i = 0; $i < $iNumWords; $i++) {
            $sPhrase = $aWords[$i];
            $aTokens[$sPhrase] = $sPhrase;

            for ($j = $i + 1; $j < $iNumWords; $j++) {
                $sPhrase .= ' '.$aWords[$j];
                $aTokens[$sPhrase] = $sPhrase;
            }
        }
    }
}