private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
{
// Check which tokens we have, get the ID numbers
- $sSQL = 'SELECT word_id, word_token, type,';
- $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,";
+ $sSQL = 'SELECT word_id, word_token, type, word,';
$sSQL .= " info->>'op' as operator,";
$sSQL .= " info->>'class' as class, info->>'type' as ctype,";
$sSQL .= " info->>'count' as count";
switch ($aWord['type']) {
case 'C': // country name tokens
- if ($aWord['country'] !== null
+ if ($aWord['word'] !== null
&& (!$this->aCountryRestriction
- || in_array($aWord['country'], $this->aCountryRestriction))
+ || in_array($aWord['word'], $this->aCountryRestriction))
) {
- $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
+ $oValidTokens->addToken(
+ $sTok,
+ new Token\Country($iId, $aWord['word'])
+ );
}
break;
case 'H': // house number tokens
// Postcodes are not normalized, so they may have content
// that makes SQL injection possible. Reject postcodes
// that would need special escaping.
- if ($aWord['postcode'] !== null
- && pg_escape_string($aWord['postcode']) == $aWord['postcode']
+ if ($aWord['word'] !== null
+ && pg_escape_string($aWord['word']) == $aWord['word']
) {
- $sNormPostcode = $this->normalizeString($aWord['postcode']);
+ $sNormPostcode = $this->normalizeString($aWord['word']);
if (strpos($sNormQuery, $sNormPostcode) !== false) {
- $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
+ $oValidTokens->addToken(
+ $sTok,
+ new Token\Postcode($iId, $aWord['word'], null)
+ );
}
}
break;
$iId,
$aWord['class'],
$aWord['ctype'],
- (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
+ (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
));
}
break;
word_id INTEGER,
word_token text NOT NULL,
type text NOT NULL,
+ word text,
info jsonb
) {{db.tablespace.search_data}};
USING BTREE (word_token) {{db.tablespace.search_index}};
-- Used when updating country names from the boundary relation.
CREATE INDEX idx_word_country_names ON word
- USING btree((info->>'cc')) {{db.tablespace.address_index}}
+ USING btree(word) {{db.tablespace.address_index}}
WHERE type = 'C';
-- Used when inserting new postcodes on updates.
CREATE INDEX idx_word_postcodes ON word
- USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+ USING btree(word) {{db.tablespace.address_index}}
WHERE type = 'P';
-- Used when inserting full words.
CREATE INDEX idx_word_full_word ON word
- USING btree((info->>'word')) {{db.tablespace.address_index}}
+ USING btree(word) {{db.tablespace.address_index}}
WHERE type = 'W';
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
term_count INTEGER;
BEGIN
SELECT min(word_id) INTO full_token
- FROM word WHERE info->>'word' = norm_term and type = 'W';
+ FROM word WHERE word = norm_term and type = 'W';
IF full_token IS NULL THEN
full_token := nextval('seq_word');
- INSERT INTO word (word_id, word_token, type, info)
- SELECT full_token, lookup_term, 'W',
- json_build_object('word', norm_term, 'count', 0)
+ INSERT INTO word (word_id, word_token, type, word, info)
+ SELECT full_token, lookup_term, 'W', norm_term,
+ json_build_object('count', 0)
FROM unnest(lookup_terms) as lookup_term;
END IF;
(SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN
- (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+ (SELECT word FROM word WHERE type = 'P') w
ON pc = word) x
WHERE pc is null or word is null""")
to_delete.append(word)
else:
copystr.add(self.name_processor.get_search_normalized(postcode),
- 'P', json.dumps({'postcode': postcode}))
+ 'P', postcode)
if to_delete:
cur.execute("""DELETE FROM WORD
- WHERE type ='P' and info->>'postcode' = any(%s)
+ WHERE type ='P' and word = any(%s)
""", (to_delete, ))
copystr.copy_out(cur, 'word',
- columns=['word_token', 'type', 'info'])
+ columns=['word_token', 'type', 'word'])
def update_special_phrases(self, phrases, should_replace):
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
- cur.execute("SELECT info FROM word WHERE type = 'S'")
- for (info, ) in cur:
- existing_phrases.add((info['word'], info['class'], info['type'],
+ cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+ for word, info in cur:
+ existing_phrases.add((word, info['class'], info['type'],
info.get('op') or '-'))
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
for word, cls, typ, oper in to_add:
term = self.name_processor.get_search_normalized(word)
if term:
- copystr.add(term, 'S',
- json.dumps({'word': word, 'class': cls, 'type': typ,
+ copystr.add(term, 'S', word,
+ json.dumps({'class': cls, 'type': typ,
'op': oper if oper in ('in', 'near') else None}))
added += 1
copystr.copy_out(cursor, 'word',
- columns=['word_token', 'type', 'info'])
+ columns=['word_token', 'type', 'word', 'info'])
return added
if to_delete:
cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
- WHERE info->>'word' = name
+ WHERE type = 'S' and word = name
and info->>'class' = in_class and info->>'type' = in_type
and ((op = '-' and info->>'op' is null) or op = info->>'op')
""", to_delete)
with self.conn.cursor() as cur:
# Get existing names
cur.execute("""SELECT word_token FROM word
- WHERE type = 'C' and info->>'cc'= %s""",
+ WHERE type = 'C' and word = %s""",
(country_code, ))
word_tokens.difference_update((t[0] for t in cur))
# Only add those names that are not yet in the list.
if word_tokens:
- cur.execute("""INSERT INTO word (word_token, type, info)
- (SELECT token, 'C', json_build_object('cc', %s)
+ cur.execute("""INSERT INTO word (word_token, type, word)
+ (SELECT token, 'C', %s
FROM unnest(%s) as token)
""", (country_code, list(word_tokens)))
with self.conn.cursor() as cur:
# no word_id needed for postcodes
- cur.execute("""INSERT INTO word (word_token, type, info)
- (SELECT %s, 'P', json_build_object('postcode', pc)
- FROM (VALUES (%s)) as v(pc)
+ cur.execute("""INSERT INTO word (word_token, type, word)
+ (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
WHERE NOT EXISTS
(SELECT * FROM word
- WHERE type = 'P' and info->>'postcode' = pc))
+ WHERE type = 'P' and word = pc))
""", (term, postcode))
self._cache.postcodes.add(postcode)
db_row.assert_row(row, ('country', 'postcode'))
-@then("word contains(?P<exclude> not)?")
-def check_word_table(context, exclude):
- """ Check the contents of the word table. Each row represents a table row
- and all data must match. Data not present in the expected table, may
- be arbitry. The rows are identified via all given columns.
- """
- with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
- for row in context.table:
- wheres = ' AND '.join(["{} = %s".format(h) for h in row.headings])
- cur.execute("SELECT * from word WHERE " + wheres, list(row.cells))
- if exclude:
- assert cur.rowcount == 0, "Row still in word table: %s" % '/'.join(values)
- else:
- assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
-
-
@then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
def check_word_table_for_postcodes(context, exclude, postcodes):
""" Check that the tokenizer produces postcode tokens for the given
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
if nctx.tokenizer == 'legacy_icu':
- cur.execute("""SELECT info->>'postcode' FROM word
- WHERE type = 'P' and info->>'postcode' = any(%s)""",
+ cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
(plist,))
else:
cur.execute("""SELECT word FROM word WHERE word = any(%s)
cur.execute("""CREATE TABLE word (word_id INTEGER,
word_token text NOT NULL,
type text NOT NULL,
+ word text,
info jsonb)""")
conn.commit()
def add_special(self, word_token, word, cls, typ, oper):
with self.conn.cursor() as cur:
- cur.execute("""INSERT INTO word (word_token, type, info)
- VALUES (%s, 'S',
- json_build_object('word', %s,
- 'class', %s,
+ cur.execute("""INSERT INTO word (word_token, type, word, info)
+ VALUES (%s, 'S', %s,
+ json_build_object('class', %s,
'type', %s,
'op', %s))
""", (word_token, word, cls, typ, oper))
def add_country(self, country_code, word_token):
with self.conn.cursor() as cur:
- cur.execute("""INSERT INTO word (word_token, type, info)
- VALUES(%s, 'C', json_build_object('cc', %s))""",
+ cur.execute("""INSERT INTO word (word_token, type, word)
+ VALUES(%s, 'C', %s)""",
(word_token, country_code))
self.conn.commit()
def add_postcode(self, word_token, postcode):
with self.conn.cursor() as cur:
- cur.execute("""INSERT INTO word (word_token, type, info)
- VALUES (%s, 'P', json_build_object('postcode', %s))
+ cur.execute("""INSERT INTO word (word_token, type, word)
+ VALUES (%s, 'P', %s)
""", (word_token, postcode))
self.conn.commit()
def get_special(self):
with self.conn.cursor() as cur:
- cur.execute("SELECT word_token, info FROM word WHERE type = 'S'")
- result = set(((row[0], row[1]['word'], row[1]['class'],
+ cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
+ result = set(((row[0], row[2], row[1]['class'],
row[1]['type'], row[1]['op']) for row in cur))
assert len(result) == cur.rowcount, "Word table has duplicates."
return result
def get_country(self):
with self.conn.cursor() as cur:
- cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'")
+ cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
result = set((tuple(row) for row in cur))
assert len(result) == cur.rowcount, "Word table has duplicates."
return result
def get_postcodes(self):
with self.conn.cursor() as cur:
- cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'")
+ cur.execute("SELECT word FROM word WHERE type = 'P'")
return set((row[0] for row in cur))