# 'too-many-ancestors' is triggered already by deriving from UserDict
disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
-good-names=i,x,y,fd,db
+good-names=i,x,y,fd,db,cc
--- /dev/null
+# Customizing Per-Country Data
+
+Whenever an OSM object is imported into Nominatim, the object is first assigned
+a country. Nominatim can use this information to adapt various aspects of
+the address computation to the local customs of the country. This section
+explains how country assignment works and the principal per-country
+localizations.
+
+## Country assignment
+
+Countries are assigned on the basis of country data from the OpenStreetMap
+input data itself. Countries are expected to be tagged according to the
+[administrative boundary schema](https://wiki.openstreetmap.org/wiki/Tag:boundary%3Dadministrative):
+an OSM relation with `boundary=administrative` and `admin_level=2`. Nominatim
+uses the country code to distinguish the countries.
+
+If there is no country data available for a point, then Nominatim uses the
+fallback data imported from `data/country_osm_grid.sql.gz`. This was computed
+from OSM data as well but is guaranteed to cover all countries.
+
+Some OSM objects may also be located outside any country, for example a buoy
+in the middle of the ocean. These objects do not get any country assigned and
+get a default treatment when it comes to localized handling of data.
+
+## Per-country settings
+
+### Global country settings
+
+The main place to configure settings per country is the file
+`settings/country_settings.yaml`. This file has one section per country that
+is recognised by Nominatim. Each section is tagged with the country code
+(in lower case) and contains the different localization information. Only
+countries which are listed in this file are taken into account for computations.
+
+For example, the section for Andorra looks like this:
+
+```
+ad:
+    partition: 35
+    languages: ca
+    names: !include country-names/ad.yaml
+    postcode:
+      pattern: "(ddd)"
+      output: AD\1
+```
+
+The individual settings are described below.
+
+#### `partition`
+
+Nominatim internally splits the data into multiple tables to improve
+performance. The partition number tells Nominatim into which table to put
+the country. This is purely internal management and has no effect on the
+output data.
+
+The default is to have one partition per country.
+
+#### `languages`
+
+A comma-separated list of ISO-639 language codes of default languages in the
+country. These are the languages used in name tags without a language suffix.
+Note that this is not necessarily the same as the list of official languages
+in the country. There may be officially recognised languages in a country
+which are only ever used in name tags with the appropriate language suffixes.
+Conversely, a non-official language may appear a lot in the name tags, for
+example when used as an unofficial Lingua Franca.
+
+List the languages in order of frequency of appearance with the most frequently
+used language first. It is not recommended to add languages when there are only
+very few occurrences.
+
+If only one language is listed, then Nominatim will 'auto-complete' the
+language of names without an explicit language-suffix.
+
+#### `names`
+
+List of names of the country and its translations. These names are used as
+a baseline. It is always possible to search countries by the given names, no
+matter what other names are in the OSM data. They are also used as a fallback
+when a needed translation is not available.
+
+!!! Note
+ The list of names per country is currently fairly large because Nominatim
+ supports translations in many languages per default. That is why the
+ name lists have been separated out into extra files. You can find the
+ name lists in the file `settings/country-names/<country code>.yaml`.
+ The names section in the main country settings file only refers to these
+ files via the special `!include` directive.
+
+#### `postcode`
+
+Describes the format of the postcode that is in use in the country.
+
+When a country has no official postcodes, set this to no. Example:
+
+```
+ae:
+ postcode: no
+```
+
+When a country has a postcode, you need to state the postcode pattern and
+the default output format. Example:
+
+```
+bm:
+    postcode:
+      pattern: "(ll)[ -]?(dd)"
+      output: \1 \2
+```
+
+The **pattern** is a regular expression that describes the possible formats
+accepted as a postcode. The pattern follows the standard syntax for
+[regular expressions in Python](https://docs.python.org/3/library/re.html#regular-expression-syntax)
+with two extra shortcuts: `d` is a shortcut for a single digit ([0-9])
+and `l` for a single ASCII letter ([A-Z]).
+
+Use match groups to indicate groups in the postcode that may optionally be
+separated with a space or a hyphen.
+
+For example, the postcode for Bermuda above always consists of two letters
+and two digits. They may optionally be separated by a space or hyphen. That
+means that Nominatim will consider `AB56`, `AB 56` and `AB-56` spelling variants
+for one and the same postcode.
+
+Never add the country code in front of the postcode pattern. Nominatim will
+automatically accept variants with a country code prefix for all postcodes.
+
+The **output** field is an optional field that describes what the canonical
+spelling of the postcode should be. The format is the
+[regular expression expand syntax](https://docs.python.org/3/library/re.html#re.Match.expand) referring back to the bracket groups in the pattern.
+
+Most simple postcodes only have one spelling variant. In that case, the
+**output** can be omitted. The postcode will simply be used as is.
+
+In the Bermuda example above, the canonical spelling would be to have a space
+between letters and digits.
+
+!!! Warning
+ When your postcode pattern covers multiple variants of the postcode, then
+ you must explicitly state the canonical output or Nominatim will not
+ handle the variations correctly.
+
+### Other country-specific configuration
+
+There are some other configuration files where you can set localized settings
+according to the assigned country. These are:
+
+ * [Place ranking configuration](Ranking.md)
+
+Please see the linked documentation sections for more information.
rendering:
heading_level: 6
+##### clean-postcodes
+
+::: nominatim.tokenizer.sanitizers.clean_postcodes
+ selection:
+ members: False
+ rendering:
+ heading_level: 6
+
#### Token Analysis
The token-analysis section contains the list of configured analyzers. Each
analyzer must have an `id` parameter that uniquely identifies the analyzer.
The only exception is the default analyzer that is used when no special
-analyzer was selected. There is one special id '@housenumber'. If an analyzer
-with that name is present, it is used for normalization of house numbers.
+analyzer was selected. There are analyzers with special ids:
+
+ * '@housenumber'. If an analyzer with that name is present, it is used
+ for normalization of house numbers.
+ * '@postcode'. If an analyzer with that name is present, it is used
+ for normalization of postcodes.
Different analyzer implementations may exist. To select the implementation,
the `analyzer` parameter must be set. The different implementations are
The analyzer cannot be customized.
+##### Postcode token analyzer
+
+The analyzer `postcodes` is purpose-made to analyze postcodes. It supports
+a 'lookup' variant of the token, which produces variants with optional
+spaces. Use together with the clean-postcodes sanitizer.
+
+The analyzer cannot be customized.
+
### Reconfiguration
Changing the configuration after the import is currently not possible, although
both the search token list and the match token list.
```sql
-FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
+FUNCTION token_get_postcode(info JSONB) RETURNS TEXT
```
-Return the normalized version of the given postcode. This function must return
-the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.
+Return the postcode for the object, if any exists. The postcode must be in
+the form that should also be presented to the end-user.
```sql
FUNCTION token_strip_info(info JSONB) RETURNS JSONB
- 'Overview': 'customize/Overview.md'
- 'Import Styles': 'customize/Import-Styles.md'
- 'Configuration Settings': 'customize/Settings.md'
+ - 'Per-Country Data': 'customize/Country-Settings.md'
- 'Place Ranking' : 'customize/Ranking.md'
- 'Tokenizers' : 'customize/Tokenizers.md'
- 'Special Phrases': 'customize/Special-Phrases.md'
public function __construct($iId, $sPostcode, $sCountryCode = '')
{
$this->iId = $iId;
- $this->sPostcode = $sPostcode;
+ // The postcode word may carry a lookup variant after an '@'
+ // ('<postcode>@<variant>'). Only the part before the first '@'
+ // is the postcode itself.
+ $iSplitPos = strpos($sPostcode, '@');
+ if ($iSplitPos === false) {
+ $this->sPostcode = $sPostcode;
+ } else {
+ $this->sPostcode = substr($sPostcode, 0, $iSplitPos);
+ }
$this->sCountryCode = empty($sCountryCode) ? '' : $sCountryCode;
}
if ($aWord['word'] !== null
&& pg_escape_string($aWord['word']) == $aWord['word']
) {
- $sNormPostcode = $this->normalizeString($aWord['word']);
- if (strpos($sNormQuery, $sNormPostcode) !== false) {
- $oValidTokens->addToken(
- $sTok,
- new Token\Postcode($iId, $aWord['word'], null)
- );
+ $iSplitPos = strpos($aWord['word'], '@');
+ if ($iSplitPos === false) {
+ $sPostcode = $aWord['word'];
+ } else {
+ $sPostcode = substr($aWord['word'], 0, $iSplitPos);
}
+
+ $oValidTokens->addToken(
+ $sTok,
+ new Token\Postcode($iId, $sPostcode, null)
+ );
}
break;
case 'S': // tokens for classification terms (special phrases)
location := ROW(null, null, null, hstore('ref', place.postcode), 'place',
'postcode', null, null, false, true, 5, 0)::addressline;
RETURN NEXT location;
+ ELSEIF place.address is not null and place.address ? 'postcode'
+ and not place.address->'postcode' SIMILAR TO '%(,|;)%' THEN
+ location := ROW(null, null, null, hstore('ref', place.address->'postcode'), 'place',
+ 'postcode', null, null, false, true, 5, 0)::addressline;
+ RETURN NEXT location;
END IF;
RETURN;
linegeo GEOMETRY;
splitline GEOMETRY;
sectiongeo GEOMETRY;
- interpol_postcode TEXT;
postcode TEXT;
stepmod SMALLINT;
BEGIN
ST_PointOnSurface(NEW.linegeo),
NEW.linegeo);
- interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
-
NEW.token_info := token_strip_info(NEW.token_info);
IF NEW.address ? '_inherited' THEN
NEW.address := hstore('interpolation', NEW.address->'interpolation');
FOR nextnode IN
SELECT DISTINCT ON (nodeidpos)
osm_id, address, geometry,
+ -- Take the postcode from the node only if it has a housenumber itself.
+ -- Note that there is a corner-case where the node has a wrongly
+ -- formatted postcode and therefore 'postcode' contains a derived
+ -- variant.
+ CASE WHEN address ? 'postcode' THEN placex.postcode ELSE NULL::text END as postcode,
substring(address->'housenumber','[0-9]+')::integer as hnr
FROM placex, generate_series(1, array_upper(waynodes, 1)) nodeidpos
WHERE osm_type = 'N' and osm_id = waynodes[nodeidpos]::BIGINT
endnumber := newend;
-- determine postcode
- postcode := coalesce(interpol_postcode,
- token_normalized_postcode(prevnode.address->'postcode'),
- token_normalized_postcode(nextnode.address->'postcode'),
- postcode);
- IF postcode is NULL THEN
- SELECT token_normalized_postcode(placex.postcode)
- FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+ postcode := coalesce(prevnode.postcode, nextnode.postcode, postcode);
+ IF postcode is NULL and NEW.parent_place_id > 0 THEN
+ SELECT placex.postcode FROM placex
+ WHERE place_id = NEW.parent_place_id INTO postcode;
END IF;
IF postcode is NULL THEN
postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);
{% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
-- determine postcode
- NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+ NEW.postcode := coalesce(token_get_postcode(NEW.token_info),
location.postcode,
get_nearest_postcode(NEW.country_code, NEW.centroid));
{% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
- NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
- NEW.postcode);
+ NEW.postcode := coalesce(token_get_postcode(NEW.token_info), NEW.postcode);
-- if we have a name add this to the name search table
IF NEW.name IS NOT NULL THEN
$$ LANGUAGE SQL IMMUTABLE STRICT;
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
RETURNS TEXT
AS $$
- SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
+ -- Read the 'postcode' field of the token info as text.
+ -- Yields NULL when the field is not set.
+ SELECT info->>'postcode';
$$ LANGUAGE SQL IMMUTABLE STRICT;
END;
$$
LANGUAGE plpgsql;
+
+-- Make sure that the given postcode has entries in the word table.
+-- Returns TRUE when the postcode was already present and FALSE when
+-- one entry per lookup term was newly inserted.
+CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
+  RETURNS BOOLEAN
+  AS $$
+BEGIN
+  -- Nothing to do when any entry for the postcode exists already.
+  IF EXISTS (SELECT 1 FROM word WHERE word = postcode and type = 'P') THEN
+    RETURN TRUE;
+  END IF;
+
+  -- postcodes don't need word ids
+  INSERT INTO word (word_token, type, word)
+    SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;
+
+  RETURN FALSE;
+END;
+$$
+LANGUAGE plpgsql;
+
$$ LANGUAGE SQL IMMUTABLE STRICT;
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
RETURNS TEXT
AS $$
- SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
+ -- Read the 'postcode' field of the token info as text.
+ -- Yields NULL when the field is not set.
+ SELECT info->>'postcode';
$$ LANGUAGE SQL IMMUTABLE STRICT;
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for formatting postcodes according to their country-specific
+format.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class CountryPostcodeMatcher:
+ """ Matches and formats a postcode according to a format definition
+ of the given country.
+ """
+ def __init__(self, country_code, config):
+ if 'pattern' not in config:
+ raise UsageError("Field 'pattern' required for 'postcode' "
+ f"for country '{country_code}'")
+
+ pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+ self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+ self.pattern = re.compile(pc_pattern)
+
+ self.output = config.get('output', r'\g<0>')
+
+
+ def match(self, postcode):
+ """ Match the given postcode against the postcode pattern for this
+ matcher. Returns a `re.Match` object if the match was successful
+ and None otherwise.
+ """
+ # Upper-case, strip spaces and leading country code.
+ normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+ if normalized:
+ return self.pattern.fullmatch(normalized.group(1))
+
+ return None
+
+
+ def normalize(self, match):
+ """ Return the default format of the postcode for the given match.
+ `match` must be a `re.Match` object previously returned by
+ `match()`
+ """
+ return match.expand(self.output)
+
+
+class PostcodeFormatter:
+ """ Container for different postcode formats of the world and
+ access functions.
+ """
+ def __init__(self):
+ # Objects without a country code can't have a postcode per definition.
+ self.country_without_postcode = {None}
+ self.country_matcher = {}
+ self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
+
+ for ccode, prop in country_info.iterate('postcode'):
+ if prop is False:
+ self.country_without_postcode.add(ccode)
+ elif isinstance(prop, dict):
+ self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
+ else:
+ raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+ def set_default_pattern(self, pattern):
+ """ Set the postcode match pattern to use, when a country does not
+ have a specific pattern or is marked as country without postcode.
+ """
+ self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
+
+
+ def get_matcher(self, country_code):
+ """ Return the CountryPostcodeMatcher for the given country.
+ Returns None if the country doesn't have a postcode and the
+ default matcher if there is no specific matcher configured for
+ the country.
+ """
+ if country_code in self.country_without_postcode:
+ return None
+
+ return self.country_matcher.get(country_code, self.default_matcher)
+
+
+ def match(self, country_code, postcode):
+ """ Match the given postcode against the postcode pattern for this
+ matcher. Returns a `re.Match` object if the country has a pattern
+ and the match was successful or None if the match failed.
+ """
+ if country_code in self.country_without_postcode:
+ return None
+
+ return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
+
+
+ def normalize(self, country_code, match):
+ """ Return the default format of the postcode for the given match.
+ `match` must be a `re.Match` object previously returned by
+ `match()`
+ """
+ return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
import itertools
import json
import logging
-import re
from textwrap import dedent
from nominatim.db.connection import connect
""" Update postcode tokens in the word table from the location_postcode
table.
"""
- to_delete = []
+ analyzer = self.token_analysis.analysis.get('@postcode')
+
with self.conn.cursor() as cur:
- # This finds us the rows in location_postcode and word that are
- # missing in the other table.
- cur.execute("""SELECT * FROM
- (SELECT pc, word FROM
- (SELECT distinct(postcode) as pc FROM location_postcode) p
- FULL JOIN
- (SELECT word FROM word WHERE type = 'P') w
- ON pc = word) x
- WHERE pc is null or word is null""")
-
- with CopyBuffer() as copystr:
- for postcode, word in cur:
- if postcode is None:
- to_delete.append(word)
- else:
- copystr.add(self._search_normalized(postcode),
- 'P', postcode)
-
- if to_delete:
- cur.execute("""DELETE FROM WORD
- WHERE type ='P' and word = any(%s)
- """, (to_delete, ))
-
- copystr.copy_out(cur, 'word',
- columns=['word_token', 'type', 'word'])
+ # First get all postcode names currently in the word table.
+ cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
+ word_entries = set((entry[0] for entry in cur))
+
+ # Then compute the required postcode names from the postcode table.
+ needed_entries = set()
+ cur.execute("SELECT country_code, postcode FROM location_postcode")
+ for cc, postcode in cur:
+ info = PlaceInfo({'country_code': cc,
+ 'class': 'place', 'type': 'postcode',
+ 'address': {'postcode': postcode}})
+ address = self.sanitizer.process_names(info)[1]
+ for place in address:
+ if place.kind == 'postcode':
+ if analyzer is None:
+ postcode_name = place.name.strip().upper()
+ variant_base = None
+ else:
+ postcode_name = analyzer.normalize(place.name)
+ variant_base = place.get_attr("variant")
+
+ if variant_base:
+ needed_entries.add(f'{postcode_name}@{variant_base}')
+ else:
+ needed_entries.add(postcode_name)
+ break
+
+ # Now update the word table.
+ self._delete_unused_postcode_words(word_entries - needed_entries)
+ self._add_missing_postcode_words(needed_entries - word_entries)
+
+ def _delete_unused_postcode_words(self, tokens):
+ if tokens:
+ with self.conn.cursor() as cur:
+ cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
+ (list(tokens), ))
+
+ def _add_missing_postcode_words(self, tokens):
+ if not tokens:
+ return
+
+ analyzer = self.token_analysis.analysis.get('@postcode')
+ terms = []
+
+ for postcode_name in tokens:
+ if '@' in postcode_name:
+ term, variant = postcode_name.split('@', 2)
+ term = self._search_normalized(term)
+ variants = {term}
+ if analyzer is not None:
+ variants.update(analyzer.get_variants_ascii(variant))
+ variants = list(variants)
+ else:
+ variants = [self._search_normalized(postcode_name)]
+ terms.append((postcode_name, variants))
+
+ if terms:
+ with self.conn.cursor() as cur:
+ cur.execute_values("""SELECT create_postcode_word(pc, var)
+ FROM (VALUES %s) AS v(pc, var)""",
+ terms)
+
+
def update_special_phrases(self, phrases, should_replace):
def _process_place_address(self, token_info, address):
for item in address:
if item.kind == 'postcode':
- self._add_postcode(item.name)
+ token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street':
return full_tokens, partial_tokens
- def _add_postcode(self, postcode):
+ def _add_postcode(self, item):
""" Make sure the normalized postcode is present in the word table.
"""
- if re.search(r'[:,;]', postcode) is None:
- postcode = self.normalize_postcode(postcode)
+ analyzer = self.token_analysis.analysis.get('@postcode')
- if postcode not in self._cache.postcodes:
- term = self._search_normalized(postcode)
- if not term:
- return
+ # Without a '@postcode' analyzer fall back to trimming and upper-casing.
+ if analyzer is None:
+ postcode_name = item.name.strip().upper()
+ variant_base = None
+ else:
+ postcode_name = analyzer.normalize(item.name)
+ variant_base = item.get_attr("variant")
- with self.conn.cursor() as cur:
- # no word_id needed for postcodes
- cur.execute("""INSERT INTO word (word_token, type, word)
- (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
- WHERE NOT EXISTS
- (SELECT * FROM word
- WHERE type = 'P' and word = pc))
- """, (term, postcode))
- self._cache.postcodes.add(postcode)
+ # The word table entry is named '<postcode>@<variant base>' when a
+ # variant base is available and just '<postcode>' otherwise.
+ if variant_base:
+ postcode = f'{postcode_name}@{variant_base}'
+ else:
+ postcode = postcode_name
+
+ if postcode not in self._cache.postcodes:
+ term = self._search_normalized(postcode_name)
+ if not term:
+ return None
+
+ variants = {term}
+ if analyzer is not None and variant_base:
+ variants.update(analyzer.get_variants_ascii(variant_base))
+
+ with self.conn.cursor() as cur:
+ cur.execute("SELECT create_postcode_word(%s, %s)",
+ (postcode, list(variants)))
+ self._cache.postcodes.add(postcode)
+
+ # Hand back the postcode without the variant suffix.
+ return postcode_name
class _TokenInfo:
self.street_tokens = set()
self.place_tokens = set()
self.address_tokens = {}
+ self.postcode = None
@staticmethod
if self.address_tokens:
out['addr'] = self.address_tokens
+ if self.postcode:
+ out['postcode'] = self.postcode
+
return out
if partials:
self.address_tokens[key] = self._mk_array(partials)
+ def set_postcode(self, postcode):
+ """ Set the postcode to the given one. Any previously set postcode
+ is replaced.
+ """
+ self.postcode = postcode
+
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
if key == 'postcode':
# Make sure the normalized postcode is present in the word table.
if re.search(r'[:,;]', value) is None:
- self._cache.add_postcode(self.conn,
- self.normalize_postcode(value))
+ norm_pc = self.normalize_postcode(value)
+ token_info.set_postcode(norm_pc)
+ self._cache.add_postcode(self.conn, norm_pc)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+ def set_postcode(self, postcode):
+ """ Set or replace the postcode token with the given value.
+ The value is stored under the key 'postcode' of the token data.
+ """
+ self.data['postcode'] = postcode
+
def add_street(self, conn, street):
""" Add addr:street match terms.
"""
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that filters postcodes by their officially allowed pattern.
+
+Arguments:
+ convert-to-address: If set to 'yes' (the default), then postcodes that do
+ not conform with their country-specific pattern are
+ converted to an address component. That means that
+ the postcode does not take part when computing the
+ postcode centroids of a country but is still searchable.
+ When set to 'no', non-conforming postcodes are not
+ searchable either.
+ default-pattern: Pattern to use, when there is none available for the
+ country in question. Warning: will not be used for
+ objects that have no country assigned. These are always
+ assumed to have no postcode.
+"""
+from nominatim.data.postcode_format import PostcodeFormatter
+
+class _PostcodeSanitizer:
+
+ def __init__(self, config):
+ self.convert_to_address = config.get_bool('convert-to-address', True)
+ self.matcher = PostcodeFormatter()
+
+ default_pattern = config.get('default-pattern')
+ if default_pattern is not None and isinstance(default_pattern, str):
+ self.matcher.set_default_pattern(default_pattern)
+
+
+ def __call__(self, obj):
+ if not obj.address:
+ return
+
+ postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
+
+ for pos, postcode in postcodes:
+ formatted = self.scan(postcode.name, obj.place.country_code)
+
+ if formatted is None:
+ if self.convert_to_address:
+ postcode.kind = 'unofficial_postcode'
+ else:
+ obj.address.pop(pos)
+ else:
+ postcode.name = formatted[0]
+ postcode.set_attr('variant', formatted[1])
+
+
+ def scan(self, postcode, country):
+ """ Check the postcode for correct formatting and return the
+ normalized version. Returns None if the postcode does not
+ correspond to the oficial format of the given country.
+ """
+ match = self.matcher.match(country, postcode)
+ if match is None:
+ return None
+
+ return self.matcher.normalize(country, match),\
+ ' '.join(filter(lambda p: p is not None, match.groups()))
+
+
+
+
+def create(config):
+ """ Create a function that filters postcodes by their officially
+ allowed pattern.
+ """
+
+ return _PostcodeSanitizer(config)
return values
+ def get_bool(self, param, default=None):
+ """ Extract a configuration parameter as a boolean.
+ The parameter must be one of the yaml boolean values or an
+ user error will be raised. If `default` is given, then the parameter
+ may also be missing or empty.
+ """
+ value = self.data.get(param, default)
+
+ if not isinstance(value, bool):
+ raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.")
+
+ return value
+
+
def get_delimiter(self, default=',;'):
""" Return the 'delimiter' parameter in the configuration as a
compiled regular expression that can be used to split the names on the
self.deflangs = {}
if use_defaults in ('mono', 'all'):
- for ccode, prop in country_info.iterate():
- clangs = prop['languages']
+ for ccode, clangs in country_info.iterate('languages'):
if len(clangs) == 1 or use_defaults == 'all':
if self.whitelist:
self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for postcodes. Supports a 'lookup' variant of the
+token, which produces variants with optional spaces.
+"""
+
+from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(rules, normalization_rules): # pylint: disable=W0613
+ """ Create the configuration for the postcode analyzer.
+
+ Both parameters are ignored and None is returned because
+ all behaviour is currently hard-coded.
+ """
+ return None
+
+### Analysis section
+
+def create(normalizer, transliterator, config): # pylint: disable=W0613
+ """ Create a new token analysis instance for this module.
+
+ The normalizer and transliterator are handed on to the
+ PostcodeTokenAnalysis object. `config` is not used.
+ """
+ return PostcodeTokenAnalysis(normalizer, transliterator)
+
+
+class PostcodeTokenAnalysis:
+ """ Special normalization and variant generation for postcodes.
+
+ This analyser must not be used with anything but postcodes as
+ it follows some special rules: `normalize` doesn't necessarily
+ need to return a standard form as per normalization rules. It
+ needs to return the canonical form of the postcode that is also
+ used for output. `get_variants_ascii` then needs to ensure that
+ the generated variants once more follow the standard normalization
+ and transliteration, so that postcodes are correctly recognised by
+ the search algorithm.
+ """
+ def __init__(self, norm, trans):
+ self.norm = norm
+ self.trans = trans
+
+ self.mutator = MutationVariantGenerator(' ', (' ', ''))
+
+
+ def normalize(self, name):
+ """ Return the standard form of the postcode.
+ """
+ return name.strip().upper()
+
+
+ def get_variants_ascii(self, norm_name):
+ """ Compute the spelling variants for the given normalized postcode.
+
+ Takes the canonical form of the postcode, normalizes it using the
+ standard rules and then creates variants of the result where
+ all spaces are optional.
+ """
+ # Postcodes follow their own transliteration rules.
+ # Make sure at this point, that the terms are normalized in a way
+ # that they are searchable with the standard transliteration rules.
+ return [self.trans.transliterate(term) for term in
+ self.mutator.generate([self.norm.transliterate(norm_name)]) if term]
_COUNTRY_INFO.load(config)
-def iterate():
+def iterate(prop=None):
""" Iterate over country code and properties.
+
+ When `prop` is None, all countries are returned with their complete
+ set of properties.
+
+ If `prop` is given, then only countries are returned where the
+ given property is set. The second item of the tuple contains only
+ the content of the given property.
"""
- return _COUNTRY_INFO.items()
+ if prop is None:
+ return _COUNTRY_INFO.items()
+
+ # Generator expression: evaluated lazily, countries lacking `prop`
+ # are skipped.
+ return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
Functions for importing, updating and otherwise maintaining the table
of artificial postcode centroids.
"""
+from collections import defaultdict
import csv
import gzip
import logging
from psycopg2 import sql as pysql
from nominatim.db.connection import connect
+from nominatim.utils.centroid import PointsCentroid
+from nominatim.data.postcode_format import PostcodeFormatter
LOG = logging.getLogger()
return num
-class _CountryPostcodesCollector:
+class _PostcodeCollector:
""" Collector for postcodes of a single country.
"""
- def __init__(self, country):
+ def __init__(self, country, matcher):
self.country = country
- self.collected = {}
+ self.matcher = matcher
+ self.collected = defaultdict(PointsCentroid)
+ self.normalization_cache = None
def add(self, postcode, x, y):
""" Add the given postcode to the collection cache. The postcode is
matched against the matcher of the country and normalized first.
Postcodes that do not match are not collected. The point is added
to the entry of the normalized postcode with `+=`, it does not
replace an existing entry.
"""
- self.collected[postcode] = (x, y)
+ if self.matcher is not None:
+ # Reuse the last result when the same postcode comes in again.
+ if self.normalization_cache and self.normalization_cache[0] == postcode:
+ normalized = self.normalization_cache[1]
+ else:
+ match = self.matcher.match(postcode)
+ normalized = self.matcher.normalize(match) if match else None
+ self.normalization_cache = (postcode, normalized)
+
+ if normalized:
+ self.collected[normalized] += (x, y)
def commit(self, conn, analyzer, project_dir):
WHERE country_code = %s""",
(self.country, ))
for postcode, x, y in cur:
- newx, newy = self.collected.pop(postcode, (None, None))
- if newx is not None:
- dist = (x - newx)**2 + (y - newy)**2
- if dist > 0.0000001:
+ pcobj = self.collected.pop(postcode, None)
+ if pcobj:
+ newx, newy = pcobj.centroid()
+ # Compare absolute offsets: without abs() a centroid that moved in
+ # the positive x/y direction (negative difference) was never updated.
+ if abs(x - newx) > 0.0000001 or abs(y - newy) > 0.0000001:
to_update.append((postcode, newx, newy))
else:
to_delete.append(postcode)
- to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
- self.collected = []
+ to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
+ self.collected = None
return to_add, to_delete, to_update
postcode = analyzer.normalize_postcode(row['postcode'])
if postcode not in self.collected:
try:
- self.collected[postcode] = (_to_float(row['lon'], 180),
- _to_float(row['lat'], 90))
+ # Do float conversion separately, it might throw
+ centroid = (_to_float(row['lon'], 180),
+ _to_float(row['lat'], 90))
+ self.collected[postcode] += centroid
except ValueError:
LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
row['lat'], row['lon'], self.country)
potentially enhances it with external data and then updates the
postcodes in the table 'location_postcode'.
"""
+ matcher = PostcodeFormatter()
with tokenizer.name_analyzer() as analyzer:
with connect(dsn) as conn:
# First get the list of countries that currently have postcodes.
# Recompute the list of valid postcodes from placex.
with conn.cursor(name="placex_postcodes") as cur:
cur.execute("""
- SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
+ SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
FROM (SELECT
COALESCE(plx.country_code,
get_country_code(ST_Centroid(pl.geometry))) as cc,
- token_normalized_postcode(pl.address->'postcode') as pc,
- ST_Centroid(ST_Collect(COALESCE(plx.centroid,
- ST_Centroid(pl.geometry)))) as centroid
+ pl.address->'postcode' as pc,
+ COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
FROM place AS pl LEFT OUTER JOIN placex AS plx
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
- WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
- GROUP BY cc, pc) xx
+ WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
WHERE pc IS NOT null AND cc IS NOT null
- ORDER BY country_code, pc""")
+ ORDER BY cc, pc""")
collector = None
if collector is None or country != collector.country:
if collector is not None:
collector.commit(conn, analyzer, project_dir)
- collector = _CountryPostcodesCollector(country)
+ collector = _PostcodeCollector(country, matcher.get_matcher(country))
todo_countries.discard(country)
collector.add(postcode, x, y)
# Now handle any countries that are only in the postcode table.
for country in todo_countries:
- _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
+ fmt = matcher.get_matcher(country)
+ _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
conn.commit()
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for computation of centroids.
+"""
+from collections.abc import Collection
+
+class PointsCentroid:
+    """ Centroid computation from single points using an online algorithm.
+        More points may be added at any time.
+
+        Coordinates are internally treated as a 7-digit fixed-point float
+        (i.e. in OSM style).
+    """
+
+    def __init__(self):
+        # Running coordinate sums, scaled by 10^7 and kept as integers.
+        self.sum_x = 0
+        self.sum_y = 0
+        # Number of points added so far.
+        self.count = 0
+
+    def centroid(self):
+        """ Return the centroid of all points collected so far.
+
+            Raises a ValueError when no point has been added yet.
+        """
+        if self.count == 0:
+            raise ValueError("No points available for centroid.")
+
+        return (float(self.sum_x/self.count)/10000000,
+                float(self.sum_y/self.count)/10000000)
+
+
+    def __len__(self):
+        """ Return the number of points collected so far.
+        """
+        return self.count
+
+
+    def __iadd__(self, other):
+        """ Add a point given as a 2-element collection of numbers (x, y).
+
+            Raises a ValueError for any other kind of operand.
+        """
+        if isinstance(other, Collection) and len(other) == 2:
+            if all(isinstance(p, (float, int)) for p in other):
+                x, y = other
+                # Round to the nearest fixed-point unit. Plain int() truncates
+                # towards zero and drops a unit whenever the floating-point
+                # product lands just below the exact value.
+                self.sum_x += round(x * 10000000)
+                self.sum_y += round(y * 10000000)
+                self.count += 1
+                return self
+
+        raise ValueError("Can only add 2-element tuples to centroid.")
partition: 35
languages: ca
names: !include country-names/ad.yaml
+ postcode:
+ pattern: "(ddd)"
+ output: AD\1
# United Arab Emirates (الإمارات العربية المتحدة)
partition: 83
languages: ar
names: !include country-names/ae.yaml
+ postcode: no
# Afghanistan (افغانستان)
partition: 30
languages: fa, ps
names: !include country-names/af.yaml
+ postcode:
+ pattern: "dddd"
# Antigua and Barbuda (Antigua and Barbuda)
partition: 205
languages: en
names: !include country-names/ag.yaml
+ postcode: no
# Anguilla (Anguilla)
partition: 175
languages: en
names: !include country-names/ai.yaml
+ postcode:
+ pattern: "2640"
+ output: AI-2640
# Albania (Shqipëria)
partition: 9
languages: sq
names: !include country-names/al.yaml
+ postcode:
+ pattern: "dddd"
# Armenia (Հայաստան)
partition: 33
languages: hy
names: !include country-names/am.yaml
+ postcode:
+ pattern: "dddd"
# Netherlands Antilles (De Nederlandse Antillen)
partition: 85
languages: pt
names: !include country-names/ao.yaml
+ postcode: no
# (Antarctica)
partition: 181
languages: en, es, fr, ru
names: !include country-names/aq.yaml
+ postcode: no
# Argentina (Argentina)
partition: 39
languages: es
names: !include country-names/ar.yaml
+ postcode:
+ pattern: "l?dddd(?:lll)?"
# (American Samoa)
partition: 245
languages: de
names: !include country-names/at.yaml
+ postcode:
+ pattern: "dddd"
# Australia (Australia)
partition: 139
languages: en
names: !include country-names/au.yaml
+ postcode:
+ pattern: "dddd"
# (Aruba)
partition: 183
languages: nl, pap
names: !include country-names/aw.yaml
+ postcode: no
# (Aland Islands)
partition: 119
languages: az
names: !include country-names/az.yaml
+ postcode:
+ pattern: "dddd"
# Bosnia and Herzegovina (Bosna i Hercegovina / Босна и Херцеговина)
partition: 6
languages: bs, hr, sr
names: !include country-names/ba.yaml
+ postcode:
+ pattern: "ddddd"
# Barbados (Barbados)
partition: 206
languages: en
names: !include country-names/bb.yaml
+ postcode:
+ pattern: "(ddddd)"
+ output: BB\1
# Bangladesh (Bangladesh)
partition: 158
languages: bn
names: !include country-names/bd.yaml
+ postcode:
+ pattern: "dddd"
# Belgium (België / Belgique / Belgien)
partition: 15
languages: nl, fr, de
names: !include country-names/be.yaml
+ postcode:
+ pattern: "dddd"
# Burkina Faso (Burkina Faso)
partition: 225
languages: fr
names: !include country-names/bf.yaml
+ postcode: no
# Bulgaria (Бългaрия)
partition: 140
languages: bg
names: !include country-names/bg.yaml
+ postcode:
+ pattern: "dddd"
# Bahrain (البحرين)
partition: 62
languages: ar
names: !include country-names/bh.yaml
+ postcode:
+ pattern: "d?ddd"
# Burundi (Burundi)
partition: 61
languages: fr
names: !include country-names/bi.yaml
+ postcode: no
# Benin (Bénin)
partition: 224
languages: fr
names: !include country-names/bj.yaml
+ postcode: no
# (Saint Barthélemy)
partition: 176
languages: en
names: !include country-names/bm.yaml
+ postcode:
+ pattern: "(ll)[ -]?(dd)"
+ output: \1 \2
# Brunei (Brunei)
partition: 86
languages: ms
names: !include country-names/bn.yaml
+ postcode:
+ pattern: "(ll) ?(dddd)"
+ output: \1\2
# Bolivia (Bolivia)
partition: 120
languages: es, qu, gn, ay
names: !include country-names/bo.yaml
+ postcode: no
# Caribbean Netherlands (Caribisch Nederland)
partition: 121
languages: pt
names: !include country-names/br.yaml
+ postcode:
+ pattern: "(ddddd)-?(ddd)"
+ output: \1-\2
# The Bahamas (The Bahamas)
partition: 207
languages: en
names: !include country-names/bs.yaml
+ postcode: no
# Bhutan (འབྲུག་ཡུལ་)
partition: 87
languages: dz
names: !include country-names/bt.yaml
+ postcode:
+ pattern: "ddddd"
# (Bouvet Island)
partition: 122
languages: en, tn
names: !include country-names/bw.yaml
+ postcode: no
# Belarus (Беларусь)
partition: 40
languages: be, ru
names: !include country-names/by.yaml
+ postcode:
+ pattern: "dddddd"
# Belize (Belize)
partition: 208
languages: en
names: !include country-names/bz.yaml
+ postcode: no
# Canada (Canada)
partition: 244
languages: en, fr
names: !include country-names/ca.yaml
+ postcode:
+ pattern: "(ldl) ?(dld)"
+ output: \1 \2
# Cocos (Keeling) Islands (Cocos (Keeling) Islands)
partition: 229
languages: fr
names: !include country-names/cd.yaml
+ postcode: no
# Central African Republic (Ködörösêse tî Bêafrîka - République Centrafricaine)
partition: 227
languages: fr, sg
names: !include country-names/cf.yaml
+ postcode: no
# Congo-Brazzaville (Congo)
partition: 230
languages: fr
names: !include country-names/cg.yaml
+ postcode: no
# Switzerland (Schweiz/Suisse/Svizzera/Svizra)
partition: 5
languages: de, fr, it, rm
names: !include country-names/ch.yaml
+ postcode:
+ pattern: "dddd"
# Côte d'Ivoire (Côte d’Ivoire)
partition: 228
languages: fr
names: !include country-names/ci.yaml
+ postcode: no
# Cook Islands (Kūki 'Āirani)
partition: 41
languages: en, rar
names: !include country-names/ck.yaml
+ postcode: no
# Chile (Chile)
partition: 88
languages: es
names: !include country-names/cl.yaml
+ postcode:
+ pattern: "ddddddd"
# Cameroon (Cameroun)
partition: 141
languages: fr, en
names: !include country-names/cm.yaml
+ postcode: no
# China (中国)
partition: 117
languages: zh
names: !include country-names/cn.yaml
+ postcode:
+ pattern: "dddddd"
# Colombia (Colombia)
partition: 133
languages: es
names: !include country-names/co.yaml
+ postcode:
+ pattern: "dddddd"
# Costa Rica (Costa Rica)
partition: 64
languages: es
names: !include country-names/cr.yaml
+ postcode:
+ pattern: "ddddd"
# Cuba (Cuba)
partition: 42
languages: es
names: !include country-names/cu.yaml
+ postcode:
+ pattern: "ddddd"
# Cape Verde (Cabo Verde)
partition: 89
languages: pt
names: !include country-names/cv.yaml
+ postcode:
+ pattern: "dddd"
# Curaçao (Curaçao)
partition: 114
languages: el, tr
names: !include country-names/cy.yaml
+ postcode:
+ pattern: "(?:99|d)ddd"
# Czechia (Česko)
partition: 124
languages: cs
names: !include country-names/cz.yaml
+ postcode:
+ pattern: "(ddd) ?(dd)"
+ output: \1 \2
# Germany (Deutschland)
partition: 3
languages: de
names: !include country-names/de.yaml
+ postcode:
+ pattern: "ddddd"
# Djibouti (Djibouti جيبوتي)
partition: 43
languages: fr, ar, so, aa
names: !include country-names/dj.yaml
+ postcode: no
# Denmark (Danmark)
partition: 160
languages: da
names: !include country-names/dk.yaml
+ postcode:
+ pattern: "dddd"
# Dominica (Dominica)
partition: 209
languages: en
names: !include country-names/dm.yaml
+ postcode: no
# Dominican Republic (República Dominicana)
partition: 37
languages: es
names: !include country-names/do.yaml
+ postcode:
+ pattern: "ddddd"
# Algeria (Algérie / ⵍⵣⵣⴰⵢⴻⵔ / الجزائر)
partition: 19
languages: ar, ber, fr
names: !include country-names/dz.yaml
+ postcode:
+ pattern: "ddddd"
# Ecuador (Ecuador)
partition: 78
languages: es
names: !include country-names/ec.yaml
+ postcode:
+ pattern: "dddddd"
# Estonia (Eesti)
partition: 125
languages: et
names: !include country-names/ee.yaml
+ postcode:
+ pattern: "ddddd"
# Egypt (مصر)
partition: 16
languages: ar
names: !include country-names/eg.yaml
+ postcode:
+ pattern: "ddddd"
# Sahrawi Arab Democratic Republic (الجمهورية العربية الصحراوية الديمقراطية)
partition: 142
languages: ti, ar, en
names: !include country-names/er.yaml
+ postcode: no
# Spain (España)
partition: 31
languages: es, ast, ca, eu, gl
names: !include country-names/es.yaml
+ postcode:
+ pattern: "ddddd"
# Ethiopia (ኢትዮጵያ)
partition: 90
languages: am, om
names: !include country-names/et.yaml
+ postcode:
+ pattern: "dddd"
# Finland (Suomi)
partition: 20
languages: fi, sv, se
names: !include country-names/fi.yaml
+ postcode:
+ pattern: "ddddd"
# Fiji (Viti)
partition: 210
languages: en
names: !include country-names/fj.yaml
+ postcode: no
# Falkland Islands (Falkland Islands)
partition: 91
languages: en
names: !include country-names/fk.yaml
+ postcode:
+ pattern: "FIQQ 1ZZ"
# Federated States of Micronesia (Micronesia)
partition: 217
languages: en
names: !include country-names/fm.yaml
+ postcode:
+ pattern: "ddddd"
# Faroe Islands (Føroyar)
partition: 10
languages: fo, da
names: !include country-names/fo.yaml
+ postcode:
+ pattern: "ddd"
# France (France)
partition: 4
languages: fr
names: !include country-names/fr.yaml
+ postcode:
+ pattern: "ddddd"
# Gabon (Gabon)
partition: 239
languages: fr
names: !include country-names/ga.yaml
+ postcode: no
# United Kingdom (United Kingdom)
partition: 1
languages: en
names: !include country-names/gb.yaml
+ postcode:
+ pattern: "(l?ld[A-Z0-9]?) ?(dll)"
+ output: \1 \2
# Grenada (Grenada)
partition: 143
languages: en
names: !include country-names/gd.yaml
+ postcode: no
# Georgia (საქართველო)
partition: 21
languages: ka
names: !include country-names/ge.yaml
+ postcode:
+ pattern: "dddd"
# French Guiana (Guyane Française)
partition: 77
languages: en
names: !include country-names/gg.yaml
+ postcode:
+ pattern: "(GYdd?) ?(dll)"
+ output: \1 \2
# Ghana (Ghana)
partition: 211
languages: en
names: !include country-names/gh.yaml
+ postcode:
+ pattern: "ll-d?ddd-dddd"
# Gibraltar (Gibraltar)
partition: 138
languages: en
names: !include country-names/gi.yaml
+ postcode:
+ pattern: "(GX11) ?(1AA)"
+ output: GX11 1AA
# Greenland (Kalaallit Nunaat)
partition: 111
languages: kl, da
names: !include country-names/gl.yaml
+ postcode:
+ pattern: "dddd"
# The Gambia (Gambia)
partition: 212
languages: en
names: !include country-names/gm.yaml
+ postcode: no
# Guinea (Guinée)
partition: 240
languages: fr
names: !include country-names/gn.yaml
+ postcode:
+ pattern: "ddd"
# Guadeloupe (Guadeloupe)
partition: 12
languages: es, fr, pt
names: !include country-names/gq.yaml
+ postcode: no
# Greece (Ελλάς)
partition: 22
languages: el
names: !include country-names/gr.yaml
+ postcode:
+ pattern: "(ddd) ?(dd)"
+ output: \1 \2
# South Georgia and the South Sandwich Islands (South Georgia and the South Sandwich Islands)
partition: 44
languages: en
names: !include country-names/gs.yaml
+ postcode:
+ pattern: "(SIQQ) ?(1ZZ)"
+ output: \1 \2
# Guatemala (Guatemala)
partition: 57
languages: es
names: !include country-names/gt.yaml
+ postcode:
+ pattern: "ddddd"
# Guam (Guam)
partition: 8
languages: pt
names: !include country-names/gw.yaml
+ postcode:
+ pattern: "dddd"
# Guyana (Guyana)
partition: 213
languages: en
names: !include country-names/gy.yaml
+ postcode: no
# (Hong Kong)
partition: 56
languages: es
names: !include country-names/hn.yaml
+ postcode:
+ pattern: "ddddd"
# Croatia (Hrvatska)
partition: 92
languages: hr
names: !include country-names/hr.yaml
+ postcode:
+ pattern: "ddddd"
# Haiti (Ayiti)
partition: 29
languages: fr, ht
names: !include country-names/ht.yaml
+ postcode:
+ pattern: "dddd"
# Hungary (Magyarország)
partition: 45
languages: hu
names: !include country-names/hu.yaml
+ postcode:
+ pattern: "dddd"
# Indonesia (Indonesia)
partition: 110
languages: id
names: !include country-names/id.yaml
+ postcode:
+ pattern: "ddddd"
# Ireland (Éire / Ireland)
partition: 46
languages: en, ga
names: !include country-names/ie.yaml
+ postcode:
+ pattern: "(ldd) ?([0123456789ACDEFHKNPRTVWXY]{4})"
+ output: \1 \2
# Israel (ישראל)
partition: 65
languages: he
names: !include country-names/il.yaml
+ postcode:
+ pattern: "ddddddd"
# Isle of Man (Isle of Man)
partition: 190
languages: en
names: !include country-names/im.yaml
+ postcode:
+ pattern: "(IMdd?) ?(dll)"
+ output: \1 \2
# India (India)
partition: 128
languages: hi, en
names: !include country-names/in.yaml
+ postcode:
+ pattern: "(ddd) ?(ddd)"
+ output: \1\2
# British Indian Ocean Territory (British Indian Ocean Territory)
partition: 13
languages: en
names: !include country-names/io.yaml
+ postcode:
+ pattern: "(BBND) ?(1ZZ)"
+ output: \1 \2
# Iraq (العراق)
partition: 144
languages: ar, ku
names: !include country-names/iq.yaml
+ postcode:
+ pattern: "ddddd"
# Iran (ایران)
partition: 80
languages: fa
names: !include country-names/ir.yaml
+ postcode:
+ pattern: "(ddddd)[-_ ]?(ddddd)"
+ output: \1-\2
# Iceland (Ísland)
partition: 134
languages: is
names: !include country-names/is.yaml
+ postcode:
+ pattern: "ddd"
# Italy (Italia)
partition: 28
languages: it, de, fr
names: !include country-names/it.yaml
+ postcode:
+ pattern: "ddddd"
# Jersey (Jersey)
partition: 123
languages: en
names: !include country-names/je.yaml
+ postcode:
+ pattern: "(JEdd?) ?(dll)"
+ output: \1 \2
# Jamaica (Jamaica)
partition: 214
languages: en
names: !include country-names/jm.yaml
+ postcode: no
# Jordan (الأردن)
partition: 17
languages: ar
names: !include country-names/jo.yaml
+ postcode:
+ pattern: "ddddd"
# Japan (日本)
partition: 11
languages: ja
names: !include country-names/jp.yaml
+ postcode:
+ pattern: "(ddd)-?(dddd)"
+ output: \1-\2
# Kenya (Kenya)
partition: 126
languages: sw, en
names: !include country-names/ke.yaml
+ postcode:
+ pattern: "ddddd"
# Kyrgyzstan (Кыргызстан)
partition: 93
languages: ky, ru
names: !include country-names/kg.yaml
+ postcode:
+ pattern: "dddddd"
# Cambodia (ព្រះរាជាណាចក្រកម្ពុជា)
partition: 159
languages: km
names: !include country-names/kh.yaml
+ postcode:
+ pattern: "dddddd"
# Kiribati (Kiribati)
partition: 215
languages: en
names: !include country-names/ki.yaml
+ postcode: no
# Comoros (Comores Komori جزر القمر)
partition: 47
languages: ar, fr, sw
names: !include country-names/km.yaml
+ postcode: no
# Saint Kitts and Nevis (Saint Kitts and Nevis)
partition: 84
languages: en
names: !include country-names/kn.yaml
+ postcode:
+ pattern: "dddd"
# North Korea (조선민주주의인민공화국)
partition: 48
languages: ko
names: !include country-names/kp.yaml
+ postcode: no
# South Korea (대한민국)
partition: 49
languages: ko, en
names: !include country-names/kr.yaml
+ postcode:
+ pattern: "ddddd"
# Kuwait (الكويت)
partition: 127
languages: ar
names: !include country-names/kw.yaml
+ postcode:
+ pattern: "ddddd"
# Cayman Islands (Cayman Islands)
partition: 38
languages: en
names: !include country-names/ky.yaml
+ postcode:
+ pattern: "(d)-(dddd)"
+ output: KY\1-\2
# Kazakhstan (Қазақстан)
partition: 94
languages: kk, ru
names: !include country-names/kz.yaml
+ postcode:
+ pattern: "(?:lddldld|dddddd)"
# Laos (ປະເທດລາວ)
partition: 145
languages: lo
names: !include country-names/la.yaml
+ postcode:
+ pattern: "ddddd"
# Lebanon (لبنان)
partition: 66
languages: ar, fr
names: !include country-names/lb.yaml
+ postcode:
+ pattern: "(dddd)(?: ?dddd)?"
# Saint Lucia (Saint Lucia)
partition: 146
languages: en
names: !include country-names/lc.yaml
+ postcode:
+ pattern: "(dd) ?(ddd)"
+ output: LC\1 \2
# Liechtenstein (Liechtenstein)
partition: 246
languages: de
names: !include country-names/li.yaml
+ postcode:
+ pattern: "dddd"
# Sri Lanka (ශ්රී ලංකාව இலங்கை)
partition: 95
languages: si, ta
names: !include country-names/lk.yaml
+ postcode:
+ pattern: "ddddd"
# Liberia (Liberia)
partition: 216
languages: en
names: !include country-names/lr.yaml
+ postcode:
+ pattern: "dddd"
# Lesotho (Lesotho)
partition: 136
languages: en, st
names: !include country-names/ls.yaml
+ postcode:
+ pattern: "ddd"
# Lithuania (Lietuva)
partition: 67
languages: lt
names: !include country-names/lt.yaml
+ postcode:
+ pattern: "ddddd"
# Luxembourg (Lëtzebuerg)
partition: 74
languages: lb, fr, de
names: !include country-names/lu.yaml
+ postcode:
+ pattern: "dddd"
# Latvia (Latvija)
partition: 162
languages: lv
names: !include country-names/lv.yaml
+ postcode:
+ pattern: "(dddd)"
+ output: LV-\1
# Libya (ليبيا)
partition: 163
languages: ar
names: !include country-names/ly.yaml
+ postcode: no
# Morocco (Maroc ⵍⵎⵖⵔⵉⴱ المغرب)
partition: 23
languages: fr, zgh, ar
names: !include country-names/ma.yaml
+ postcode:
+ pattern: "ddddd"
# Monaco (Monaco)
partition: 242
languages: fr
names: !include country-names/mc.yaml
+ postcode:
+ pattern: "980dd"
# Moldova (Moldova)
partition: 147
languages: ro, ru, uk
names: !include country-names/md.yaml
+ postcode:
+ pattern: "(dddd)"
+ output: MD-\1
# Montenegro (Crna Gora / Црна Гора)
partition: 180
languages: srp, sr, hr, bs, sq
names: !include country-names/me.yaml
+ postcode:
+ pattern: "ddddd"
# Saint Martin (Saint Martin)
partition: 164
languages: mg, fr
names: !include country-names/mg.yaml
+ postcode:
+ pattern: "ddd"
# Marshall Islands (Ṃajeḷ)
partition: 105
languages: en, mh
names: !include country-names/mh.yaml
+ postcode:
+ pattern: "ddddd"
# North Macedonia (Северна Македонија)
partition: 69
languages: mk
names: !include country-names/mk.yaml
+ postcode:
+ pattern: "dddd"
# Mali (Mali)
partition: 241
languages: fr
names: !include country-names/ml.yaml
+ postcode: no
# Myanmar (မြန်မာ)
partition: 148
languages: my
names: !include country-names/mm.yaml
+ postcode:
+ pattern: "ddddd"
# Mongolia (Монгол улс ᠮᠤᠩᠭᠤᠯ ᠤᠯᠤᠰ)
partition: 167
languages: mn
names: !include country-names/mn.yaml
+ postcode:
+ pattern: "ddddd"
# Macao (Macao)
partition: 191
languages: zh-hant, pt
names: !include country-names/mo.yaml
+ postcode: no
# Northern Mariana Islands (Northern Mariana Islands)
partition: 149
languages: ar, fr
names: !include country-names/mr.yaml
+ postcode: no
# Montserrat (Montserrat)
partition: 165
languages: mt, en
names: !include country-names/mt.yaml
+ postcode:
+ pattern: "(lll) ?(dddd)"
+ output: \1 \2
# Mauritius (Mauritius)
partition: 150
languages: mfe, fr, en
names: !include country-names/mu.yaml
+ postcode:
+ pattern: "ddddd"
# Maldives (ދިވެހިރާއްޖެ)
partition: 96
languages: dv
names: !include country-names/mv.yaml
+ postcode:
+ pattern: "ddddd"
# Malawi (Malawi)
partition: 97
languages: en, ny
names: !include country-names/mw.yaml
+ postcode: no
# Mexico (México)
partition: 166
languages: es
names: !include country-names/mx.yaml
+ postcode:
+ pattern: "ddddd"
# Malaysia (Malaysia)
partition: 7
languages: ms
names: !include country-names/my.yaml
+ postcode:
+ pattern: "ddddd"
# Mozambique (Moçambique)
partition: 98
languages: pt
names: !include country-names/mz.yaml
+ postcode:
+ pattern: "(dddd)(?:-dd)?"
# Namibia (Namibia)
partition: 99
languages: en, sf, de
names: !include country-names/na.yaml
+ postcode:
+ pattern: "ddddd"
# New Caledonia (Nouvelle-Calédonie)
partition: 226
languages: fr
names: !include country-names/ne.yaml
+ postcode:
+ pattern: "dddd"
# Norfolk Island (Norfolk Island)
partition: 218
languages: en
names: !include country-names/ng.yaml
+ postcode:
+ pattern: "dddddd"
# Nicaragua (Nicaragua)
partition: 151
languages: es
names: !include country-names/ni.yaml
+ postcode:
+ pattern: "ddddd"
# Netherlands (Nederland)
partition: 63
languages: nl
names: !include country-names/nl.yaml
+ postcode:
+ pattern: "(dddd) ?(ll)"
+ output: \1 \2
# Norway (Norge)
partition: 60
languages: nb, nn, no, se
names: !include country-names/no.yaml
+ postcode:
+ pattern: "dddd"
# Nepal (Nepal)
partition: 50
languages: ne
names: !include country-names/np.yaml
+ postcode:
+ pattern: "ddddd"
# Nauru (Naoero)
partition: 70
languages: na, en
names: !include country-names/nr.yaml
+ postcode: no
# Niue (Niuē)
partition: 178
languages: niu, en
names: !include country-names/nu.yaml
+ postcode: no
# New Zealand (New Zealand / Aotearoa)
partition: 27
languages: mi, en
names: !include country-names/nz.yaml
+ postcode:
+ pattern: "dddd"
# Oman (عمان)
partition: 137
languages: ar
names: !include country-names/om.yaml
+ postcode:
+ pattern: "ddd"
# Panama (Panamá)
partition: 152
languages: es
names: !include country-names/pa.yaml
+ postcode:
+ pattern: "dddd"
# Peru (Perú)
partition: 51
languages: es
names: !include country-names/pe.yaml
+ postcode:
+ pattern: "ddddd"
# French Polynesia (Polynésie française)
partition: 71
languages: en, tpi, ho
names: !include country-names/pg.yaml
+ postcode:
+ pattern: "ddd"
# Philippines (Philippines)
partition: 26
languages: en, tl
names: !include country-names/ph.yaml
+ postcode:
+ pattern: "dddd"
# Pakistan (پاکستان)
partition: 14
languages: en, ur, pnb, sd, ps, bal
names: !include country-names/pk.yaml
+ postcode:
+ pattern: "ddddd"
# Poland (Polska)
partition: 168
languages: pl
names: !include country-names/pl.yaml
+ postcode:
+ pattern: "(dd)[ -]?(ddd)"
+ output: \1-\2
# Saint Pierre and Miquelon (Saint-Pierre-et-Miquelon)
partition: 113
languages: en, pih
names: !include country-names/pn.yaml
+ postcode:
+ pattern: "(PCRN) ?(1ZZ)"
+ output: \1 \2
# Puerto Rico (Puerto Rico)
partition: 194
languages: ar, he
names: !include country-names/ps.yaml
+ postcode:
+ pattern: "ddd"
# Portugal (Portugal)
partition: 34
languages: pt
names: !include country-names/pt.yaml
+ postcode:
+ pattern: "dddd(?:-ddd)?"
# Palau (Belau)
partition: 195
languages: en, pau, ja, sov, tox
names: !include country-names/pw.yaml
+ postcode:
+ pattern: "969(39|40)"
# Paraguay (Paraguay)
partition: 101
languages: es, gn
names: !include country-names/py.yaml
+ postcode:
+ pattern: "dddddd"
# Qatar (قطر)
partition: 169
languages: ar
names: !include country-names/qa.yaml
+ postcode: no
# (Réunion)
partition: 170
languages: ro
names: !include country-names/ro.yaml
+ postcode:
+ pattern: "dddddd"
# Serbia (Србија)
partition: 59
languages: sr
names: !include country-names/rs.yaml
+ postcode:
+ pattern: "ddddd"
# Russia (Россия)
partition: 135
languages: ru
names: !include country-names/ru.yaml
+ postcode:
+ pattern: "dddddd"
# Rwanda (Rwanda)
partition: 102
languages: rw, fr, en
names: !include country-names/rw.yaml
+ postcode: no
# Saudi Arabia (السعودية)
partition: 52
languages: ar
names: !include country-names/sa.yaml
+ postcode:
+ pattern: "ddddd(?:-dddd)?"
# Solomon Islands (Solomon Islands)
partition: 201
languages: en
names: !include country-names/sb.yaml
+ postcode: no
# Seychelles (Sesel)
partition: 79
languages: fr, en, crs
names: !include country-names/sc.yaml
+ postcode: no
# Sudan (السودان)
partition: 72
languages: ar, en
names: !include country-names/sd.yaml
+ postcode:
+ pattern: "ddddd"
# Sweden (Sverige)
partition: 112
languages: sv
names: !include country-names/se.yaml
+ postcode:
+ pattern: "(ddd) ?(dd)"
+ output: \1 \2
# Singapore (Singapore)
partition: 115
languages: zh-hans, en, ms, ta
names: !include country-names/sg.yaml
+ postcode:
+ pattern: "dddddd"
# Saint Helena, Ascension and Tristan da Cunha (Saint Helena, Ascension and Tristan da Cunha)
partition: 196
languages: en
names: !include country-names/sh.yaml
+ postcode:
+ pattern: "(ASCN|STHL|TDCU) ?(1ZZ)"
+ output: \1 \2
# Slovenia (Slovenija)
partition: 36
languages: sl
names: !include country-names/si.yaml
+ postcode:
+ pattern: "dddd"
# (Svalbard and Jan Mayen)
partition: 172
languages: sk
names: !include country-names/sk.yaml
+ postcode:
+ pattern: "(ddd) ?(dd)"
+ output: \1 \2
# Sierra Leone (Sierra Leone)
partition: 219
languages: en
names: !include country-names/sl.yaml
+ postcode: no
# San Marino (San Marino)
partition: 153
languages: it
names: !include country-names/sm.yaml
+ postcode:
+ pattern: "4789d"
# Senegal (Sénégal)
partition: 237
languages: fr
names: !include country-names/sn.yaml
+ postcode:
+ pattern: "ddddd"
# Somalia (Soomaaliya الصومال)
partition: 154
languages: so, ar
names: !include country-names/so.yaml
+ postcode:
+ pattern: "(ll) ?(ddddd)"
+ output: \1 \2
# Suriname (Suriname)
partition: 24
languages: nl
names: !include country-names/sr.yaml
+ postcode: no
# South Sudan (South Sudan)
partition: 247
languages: en
names: !include country-names/ss.yaml
+ postcode: no
# São Tomé and Príncipe (São Tomé e Príncipe)
partition: 53
languages: pt
names: !include country-names/st.yaml
+ postcode: no
# El Salvador (El Salvador)
partition: 103
languages: es
names: !include country-names/sv.yaml
+ postcode:
+ pattern: "dddd"
# (Sint Maarten)
partition: 104
languages: ar
names: !include country-names/sy.yaml
+ postcode: no
# Eswatini (eSwatini)
partition: 82
languages: en, ss
names: !include country-names/sz.yaml
+ postcode:
+ pattern: "lddd"
# Turks and Caicos Islands (Turks and Caicos Islands)
partition: 106
languages: en
names: !include country-names/tc.yaml
+ postcode:
+ pattern: "(TKCA) ?(1ZZ)"
+ output: \1 \2
# Chad (Tchad تشاد)
partition: 68
languages: fr, ar
names: !include country-names/td.yaml
+ postcode: no
# French Southern Lands (Terres australes et antarctiques françaises)
partition: 243
languages: fr
names: !include country-names/tg.yaml
+ postcode: no
# Thailand (ประเทศไทย)
partition: 32
languages: th
names: !include country-names/th.yaml
+ postcode:
+ pattern: "ddddd"
# Tajikistan (Тоҷикистон)
partition: 129
languages: tg, ru
names: !include country-names/tj.yaml
+ postcode:
+ pattern: "dddddd"
# Tokelau (Tokelau)
partition: 179
languages: tkl, en, sm
names: !include country-names/tk.yaml
+ postcode: no
# East Timor (Timór Lorosa'e)
partition: 161
languages: pt, tet
names: !include country-names/tl.yaml
+ postcode: no
# Turkmenistan (Türkmenistan)
partition: 54
languages: tk
names: !include country-names/tm.yaml
+ postcode:
+ pattern: "dddddd"
# Tunisia (تونس)
partition: 18
languages: ar, fr
names: !include country-names/tn.yaml
+ postcode:
+ pattern: "dddd"
# Tonga (Tonga)
partition: 220
languages: en
names: !include country-names/to.yaml
+ postcode: no
# Turkey (Türkiye)
partition: 81
languages: tr
names: !include country-names/tr.yaml
+ postcode:
+ pattern: "ddddd"
# Trinidad and Tobago (Trinidad and Tobago)
partition: 221
languages: en
names: !include country-names/tt.yaml
+ postcode:
+ pattern: "dddddd"
# Tuvalu (Tuvalu)
partition: 156
languages: en
names: !include country-names/tv.yaml
+ postcode: no
# Taiwan (臺灣)
partition: 25
languages: zh-hant
names: !include country-names/tw.yaml
+ postcode:
+ pattern: "ddd(?:ddd?)?"
# Tanzania (Tanzania)
partition: 130
languages: sw, en
names: !include country-names/tz.yaml
+ postcode:
+ pattern: "ddddd"
# Ukraine (Україна)
partition: 173
languages: uk
names: !include country-names/ua.yaml
+ postcode:
+ pattern: "d?ddddd"
# Uganda (Uganda)
partition: 155
languages: en, sw
names: !include country-names/ug.yaml
+ postcode: no
# (United States Minor Outlying Islands)
partition: 198
languages: en
names: !include country-names/um.yaml
+ postcode:
+ pattern: "96898"
# United States (United States)
partition: 2
languages: en
names: !include country-names/us.yaml
+ postcode:
+ pattern: "ddddd"
# Uruguay (Uruguay)
partition: 174
languages: es
names: !include country-names/uy.yaml
+ postcode:
+ pattern: "ddddd"
# Uzbekistan (Oʻzbekiston)
partition: 157
languages: uz, kaa
names: !include country-names/uz.yaml
+ postcode:
+ pattern: "dddddd"
# Vatican City (Civitas Vaticana)
partition: 107
languages: it
names: !include country-names/va.yaml
+ postcode:
+ pattern: "00120"
# Saint Vincent and the Grenadines (Saint Vincent and the Grenadines)
partition: 171
languages: en
names: !include country-names/vc.yaml
+ postcode:
+ pattern: "(dddd)"
+ output: VC\1
# Venezuela (Venezuela)
partition: 108
languages: es
names: !include country-names/ve.yaml
+ postcode:
+ pattern: "dddd"
# British Virgin Islands (British Virgin Islands)
partition: 109
languages: en
names: !include country-names/vg.yaml
+ postcode:
+ pattern: "(dddd)"
+ output: VG\1
# (United States Virgin Islands)
partition: 75
languages: vi
names: !include country-names/vn.yaml
+ postcode:
+ pattern: "ddddd"
# Vanuatu (Vanuatu)
partition: 116
languages: bi, en, fr
names: !include country-names/vu.yaml
+ postcode: no
# Wallis and Futuna Islands (Wallis-et-Futuna)
partition: 59
languages: sq, sr
names: !include country-names/xk.yaml
+ postcode:
+ pattern: "ddddd"
# Yemen (اليمن)
partition: 55
languages: ar
names: !include country-names/ye.yaml
+ postcode: no
# Mayotte (Mayotte)
partition: 76
languages: en, af, st, tn, xh, zu
names: !include country-names/za.yaml
+ postcode:
+ pattern: "dddd"
# Zambia (Zambia)
partition: 222
languages: en
names: !include country-names/zm.yaml
+ postcode:
+ pattern: "dddd"
# Zimbabwe (Zimbabwe)
partition: 223
languages: en, sn, nd
names: !include country-names/zw.yaml
-
+ postcode: no
- streetnumber
convert-to-name:
- (\A|.*,)[^\d,]{3,}(,.*|\Z)
+ - step: clean-postcodes
+ convert-to-address: yes
+ default-pattern: "[A-Z0-9- ]{3,12}"
- step: split-name-list
- step: strip-brace-terms
- step: tag-analyzer-by-language
- analyzer: generic
- id: "@housenumber"
analyzer: housenumbers
+ - id: "@postcode"
+ analyzer: postcodes
- id: bg
analyzer: generic
mode: variant-only
| de | 01982 | country:de |
And there are word tokens for postcodes 01982
- Scenario: Different postcodes with the same normalization can both be found
- Given the places
- | osm | class | type | addr+postcode | addr+housenumber | geometry |
- | N34 | place | house | EH4 7EA | 111 | country:gb |
- | N35 | place | house | E4 7EA | 111 | country:gb |
- When importing
- Then location_postcode contains exactly
- | country | postcode | geometry |
- | gb | EH4 7EA | country:gb |
- | gb | E4 7EA | country:gb |
- When sending search query "EH4 7EA"
- Then results contain
- | type | display_name |
- | postcode | EH4 7EA |
- When sending search query "E4 7EA"
- Then results contain
- | type | display_name |
- | postcode | E4 7EA |
+ @Fail
Scenario: search and address ranks for GB post codes correctly assigned
Given the places
| osm | class | type | postcode | geometry |
| E45 2 | gb | 23 | 5 |
| Y45 | gb | 21 | 5 |
- Scenario: wrongly formatted GB postcodes are down-ranked
+ @fail-legacy
+ Scenario: Postcodes outside all countries are not added to the postcode and word table
Given the places
- | osm | class | type | postcode | geometry |
- | N1 | place | postcode | EA452CD | country:gb |
- | N2 | place | postcode | E45 23 | country:gb |
- When importing
- Then location_postcode contains exactly
- | postcode | country | rank_search | rank_address |
- | EA452CD | gb | 30 | 30 |
- | E45 23 | gb | 30 | 30 |
-
- Scenario: search and address rank for DE postcodes correctly assigned
- Given the places
- | osm | class | type | postcode | geometry |
- | N1 | place | postcode | 56427 | country:de |
- | N2 | place | postcode | 5642 | country:de |
- | N3 | place | postcode | 5642A | country:de |
- | N4 | place | postcode | 564276 | country:de |
- When importing
- Then location_postcode contains exactly
- | postcode | country | rank_search | rank_address |
- | 56427 | de | 21 | 11 |
- | 5642 | de | 30 | 30 |
- | 5642A | de | 30 | 30 |
- | 564276 | de | 30 | 30 |
-
- Scenario: search and address rank for other postcodes are correctly assigned
- Given the places
- | osm | class | type | postcode | geometry |
- | N1 | place | postcode | 1 | country:ca |
- | N2 | place | postcode | X3 | country:ca |
- | N3 | place | postcode | 543 | country:ca |
- | N4 | place | postcode | 54dc | country:ca |
- | N5 | place | postcode | 12345 | country:ca |
- | N6 | place | postcode | 55TT667 | country:ca |
- | N7 | place | postcode | 123-65 | country:ca |
- | N8 | place | postcode | 12 445 4 | country:ca |
- | N9 | place | postcode | A1:bc10 | country:ca |
+ | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry |
+ | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 |
+ And the places
+ | osm | class | type | name | geometry |
+ | N1 | place | hamlet | Null Island | 0 0 |
When importing
Then location_postcode contains exactly
- | postcode | country | rank_search | rank_address |
- | 1 | ca | 21 | 11 |
- | X3 | ca | 21 | 11 |
- | 543 | ca | 21 | 11 |
- | 54DC | ca | 21 | 11 |
- | 12345 | ca | 21 | 11 |
- | 55TT667 | ca | 21 | 11 |
- | 123-65 | ca | 25 | 11 |
- | 12 445 4 | ca | 25 | 11 |
- | A1:BC10 | ca | 25 | 11 |
-
-
+ | country | postcode | geometry |
+ And there are no word tokens for postcodes 01982
+ When sending search query "111, 01982 Null Island"
+ Then results contain
+ | osm | display_name |
+ | N34 | 111, Null Island, 01982 |
| ID | osm |
| 0 | R1 |
- Scenario: Unprintable characters in postcodes are ignored
- Given the named places
- | osm | class | type | address | geometry |
- | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de |
- When importing
- And sending search query "1234"
- Then result 0 has not attributes osm_type
-
Scenario Outline: Housenumbers with special characters are found
Given the grid
| 1 | | | | 2 |
--- /dev/null
+@DB
+Feature: Querying for postcode variants
+
+ Scenario: Postcodes in Singapore (6-digit postcode)
+ Given the grid with origin SG
+ | 10 | | | | 11 |
+ And the places
+ | osm | class | type | name | addr+postcode | geometry |
+ | W1 | highway | path | Lorang | 399174 | 10,11 |
+ When importing
+ When sending search query "399174"
+ Then results contain
+ | ID | type | display_name |
+ | 0 | postcode | 399174 |
+
+
+    # Each example row is imported as the object's addr:postcode; all
+    # spellings are expected to be found under the same postcode '3993 DX'.
+    @fail-legacy
+    Scenario Outline: Postcodes in the Netherlands (mixed postcode with spaces)
+        Given the grid with origin NL
+            | 10 |   |   |   | 11 |
+        And the places
+            | osm | class   | type | name     | addr+postcode | geometry |
+            | W1  | highway | path | De Weide | <postcode>    | 10,11    |
+        When importing
+        When sending search query "3993 DX"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 3993 DX      |
+        When sending search query "3993dx"
+        Then results contain
+            | ID | type     | display_name |
+            | 0  | postcode | 3993 DX      |
+
+        Examples:
+            | postcode |
+            | 3993 DX  |
+            | 3993DX   |
+            | 3993 dx  |
+
+
+ @fail-legacy
+ Scenario: Postcodes in Singapore (6-digit postcode)
+ Given the grid with origin SG
+ | 10 | | | | 11 |
+ And the places
+ | osm | class | type | name | addr+postcode | geometry |
+ | W1 | highway | path | Lorang | 399174 | 10,11 |
+ When importing
+ When sending search query "399174"
+ Then results contain
+ | ID | type | display_name |
+ | 0 | postcode | 399174 |
+
+
+ @fail-legacy
+ Scenario Outline: Postcodes in Andorra (with country code)
+ Given the grid with origin AD
+ | 10 | | | | 11 |
+ And the places
+ | osm | class | type | name | addr+postcode | geometry |
+ | W1 | highway | path | Lorang | <postcode> | 10,11 |
+ When importing
+ When sending search query "675"
+ Then results contain
+ | ID | type | display_name |
+ | 0 | postcode | AD675 |
+ When sending search query "AD675"
+ Then results contain
+ | ID | type | display_name |
+ | 0 | postcode | AD675 |
+
+ Examples:
+ | postcode |
+ | 675 |
+ | AD 675 |
+ | AD675 |
+
+
+ Scenario: Different postcodes with the same normalization can both be found
+ Given the places
+ | osm | class | type | addr+postcode | addr+housenumber | geometry |
+ | N34 | place | house | EH4 7EA | 111 | country:gb |
+ | N35 | place | house | E4 7EA | 111 | country:gb |
+ When importing
+ Then location_postcode contains exactly
+ | country | postcode | geometry |
+ | gb | EH4 7EA | country:gb |
+ | gb | E4 7EA | country:gb |
+ When sending search query "EH4 7EA"
+ Then results contain
+ | type | display_name |
+ | postcode | EH4 7EA |
+ When sending search query "E4 7EA"
+ Then results contain
+ | type | display_name |
+ | postcode | E4 7EA |
+
def check_database_integrity(context):
""" Check some generic constraints on the tables.
"""
- # place_addressline should not have duplicate (place_id, address_place_id)
- cur = context.db.cursor()
- cur.execute("""SELECT count(*) FROM
- (SELECT place_id, address_place_id, count(*) as c
- FROM place_addressline GROUP BY place_id, address_place_id) x
- WHERE c > 1""")
- assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
+ with context.db.cursor() as cur:
+ # place_addressline should not have duplicate (place_id, address_place_id)
+ cur.execute("""SELECT count(*) FROM
+ (SELECT place_id, address_place_id, count(*) as c
+ FROM place_addressline GROUP BY place_id, address_place_id) x
+ WHERE c > 1""")
+ assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
+
+ # word table must not have empty word_tokens
+ if context.nominatim.tokenizer != 'legacy':
+ cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
+ assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
+
################################ GIVEN ##################################
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that normalizes postcodes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.tools import country_info
+
+@pytest.fixture
+def sanitize(def_config, request):
+ country_info.setup_country_config(def_config)
+ sanitizer_args = {'step': 'clean-postcodes'}
+ for mark in request.node.iter_markers(name="sanitizer_params"):
+ sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+
+ def _run(country=None, **kwargs):
+ pi = {'address': kwargs}
+ if country is not None:
+ pi['country_code'] = country
+
+ _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi))
+
+ return sorted([(p.kind, p.name) for p in address])
+
+ return _run
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+def test_postcode_no_country(sanitize, country):
+ assert sanitize(country=country, postcode='23231') == [('unofficial_postcode', '23231')]
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_no_country_drop(sanitize, country):
+ assert sanitize(country=country, postcode='23231') == []
+
+
+@pytest.mark.parametrize("postcode", ('12345', ' 12345 ', 'de 12345',
+ 'DE12345', 'DE 12345', 'DE-12345'))
+def test_postcode_pass_good_format(sanitize, postcode):
+ assert sanitize(country='de', postcode=postcode) == [('postcode', '12345')]
+
+
+@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....',
+ 'DE 12345', 'DEF12345', 'CH 12345'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_drop_bad_format(sanitize, postcode):
+ assert sanitize(country='de', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('1234', '9435', '99000'))
+def test_postcode_cyprus_pass(sanitize, postcode):
+ assert sanitize(country='cy', postcode=postcode) == [('postcode', postcode)]
+
+
+@pytest.mark.parametrize("postcode", ('91234', '99a45', '567'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_cyprus_fail(sanitize, postcode):
+ assert sanitize(country='cy', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('123456', 'A33F2G7'))
+def test_postcode_kazakhstan_pass(sanitize, postcode):
+ assert sanitize(country='kz', postcode=postcode) == [('postcode', postcode)]
+
+
+@pytest.mark.parametrize("postcode", ('V34T6Y923456', '99345'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_kazakhstan_fail(sanitize, postcode):
+ assert sanitize(country='kz', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534'))
+def test_postcode_sweden_pass(sanitize, postcode):
+ assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')]
+
+
+@pytest.mark.parametrize("postcode", ('67 345', '671123'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_sweden_fail(sanitize, postcode):
+ assert sanitize(country='se', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44'))
+@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_pass(sanitize, postcode):
+ assert sanitize(country='an', postcode=postcode) == [('postcode', postcode.upper())]
+
+
+@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224'))
+@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
+def test_postcode_default_pattern_fail(sanitize, postcode):
+ assert sanitize(country='an', postcode=postcode) == []
+
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
variants=('~gasse -> gasse', 'street => st', ),
- sanitizers=[], with_housenumber=False):
+ sanitizers=[], with_housenumber=False,
+ with_postcode=False):
cfgstr = {'normalization': list(norm),
'sanitizers': sanitizers,
'transliteration': list(trans),
if with_housenumber:
cfgstr['token-analysis'].append({'id': '@housenumber',
'analyzer': 'housenumbers'})
+ if with_postcode:
+ cfgstr['token-analysis'].append({'id': '@postcode',
+ 'analyzer': 'postcodes'})
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
anl.normalize_postcode('38 Б') == '38 Б'
-def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table):
- table_factory('location_postcode', 'postcode TEXT',
- content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
+class TestPostcodes:
- with analyzer() as anl:
- anl.update_postcodes_from_db()
+ @pytest.fixture(autouse=True)
+ def setup(self, analyzer, sql_functions):
+ sanitizers = [{'step': 'clean-postcodes'}]
+ with analyzer(sanitizers=sanitizers, with_postcode=True) as anl:
+ self.analyzer = anl
+ yield anl
- assert word_table.count() == 3
- assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'}
+ def process_postcode(self, cc, postcode):
+ return self.analyzer.process_place(PlaceInfo({'country_code': cc,
+ 'address': {'postcode': postcode}}))
-def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table):
- table_factory('location_postcode', 'postcode TEXT',
- content=(('1234',), ('45BC', ), ('XX45', )))
- word_table.add_postcode(' 1234', '1234')
- word_table.add_postcode(' 5678', '5678')
- with analyzer() as anl:
- anl.update_postcodes_from_db()
+ def test_update_postcodes_from_db_empty(self, table_factory, word_table):
+ table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+ content=(('de', '12345'), ('se', '132 34'),
+ ('bm', 'AB23'), ('fr', '12345')))
+
+ self.analyzer.update_postcodes_from_db()
+
+ assert word_table.count() == 5
+ assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
+
+
+ def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
+ table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+ content=(('in', '123456'), ('sg', '123456')))
+
+ self.analyzer.update_postcodes_from_db()
+
+ assert word_table.count() == 3
+ assert word_table.get_postcodes() == {'123456', '123456@123 456'}
+
+
+ def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
+ table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
+ content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
+ word_table.add_postcode(' 1234', '1234')
+ word_table.add_postcode(' 5678', '5678')
+
+ self.analyzer.update_postcodes_from_db()
+
+ assert word_table.count() == 5
+ assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
+
+
+ def test_process_place_postcode_simple(self, word_table):
+ info = self.process_postcode('de', '12345')
+
+ assert info['postcode'] == '12345'
+
+ assert word_table.get_postcodes() == {'12345', }
+
+
+ def test_process_place_postcode_with_space(self, word_table):
+ info = self.process_postcode('in', '123 567')
+
+ assert info['postcode'] == '123567'
+
+ assert word_table.get_postcodes() == {'123567@123 567', }
- assert word_table.count() == 3
- assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'}
def test_update_special_phrase_empty_table(analyzer, word_table):
assert word_table.get_postcodes() == {pcode, }
- @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
- def test_process_place_bad_postcode(self, word_table, pcode):
- self.process_address(postcode=pcode)
-
- assert not word_table.get_postcodes()
-
-
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
info = self.process_address(housenumber=hnr)
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for special postcode analysis and variant generation.
+"""
+import pytest
+
+from icu import Transliterator
+
+import nominatim.tokenizer.token_analysis.postcodes as module
+from nominatim.errors import UsageError
+
+# ICU rule set handed to the analyzer configuration and used to build the
+# normalization transliterator in the tests of this module.
+DEFAULT_NORMALIZATION = """ :: NFD ();
+ '🜳' > ' ';
+ [[:Nonspacing Mark:] [:Cf:]] >;
+ :: lower ();
+ [[:Punctuation:][:Space:]]+ > ' ';
+ :: NFC ();
+ """
+
+# ICU rule set used to build the transliterator handed to the analyzer
+# in the tests of this module.
+DEFAULT_TRANSLITERATION = """ :: Latin ();
+ '🜵' > ' ';
+ """
+
+@pytest.fixture
+def analyser():
+ rules = { 'analyzer': 'postcodes'}
+ config = module.configure(rules, DEFAULT_NORMALIZATION)
+
+ trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+ norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+
+ return module.create(norm, trans, config)
+
+
+def get_normalized_variants(proc, name):
+ norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+ return proc.get_variants_ascii(norm.transliterate(name).strip())
+
+
+@pytest.mark.parametrize('name,norm', [('12', '12'),
+ ('A 34 ', 'A 34'),
+ ('34-av', '34-AV')])
+def test_normalize(analyser, name, norm):
+ assert analyser.normalize(name) == norm
+
+
+@pytest.mark.parametrize('postcode,variants', [('12345', {'12345'}),
+ ('AB-998', {'ab 998', 'ab998'}),
+ ('23 FGH D3', {'23 fgh d3', '23fgh d3',
+ '23 fghd3', '23fghd3'})])
+def test_get_variants_ascii(analyser, postcode, variants):
+ out = analyser.get_variants_ascii(postcode)
+
+ assert len(out) == len(set(out))
+ assert set(out) == variants
import pytest
-from nominatim.tools import postcodes
+from nominatim.tools import postcodes, country_info
import dummy_tokenizer
class MockPostcodeTable:
def tokenizer():
return dummy_tokenizer.DummyTokenizer(None, None)
+
@pytest.fixture
-def postcode_table(temp_db_conn, placex_table):
+def postcode_table(def_config, temp_db_conn, placex_table):
+ country_info.setup_country_config(def_config)
return MockPostcodeTable(temp_db_conn)
+@pytest.fixture
+def insert_implicit_postcode(placex_table, place_row):
+ """
+ Inserts data into the placex and place table
+ which can then be used to compute one postcode.
+ """
+ def _insert_implicit_postcode(osm_id, country, geometry, address):
+ placex_table.add(osm_id=osm_id, country=country, geom=geometry)
+ place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address)
+
+ return _insert_implicit_postcode
+
+
def test_postcodes_empty(dsn, postcode_table, place_table,
tmp_path, tokenizer):
postcodes.update_postcodes(dsn, tmp_path, tokenizer)
table_factory('place')
assert postcodes.can_compute(dsn)
+
def test_no_placex_entry(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer):
#Rewrite the get_country_code function to verify its execution.
temp_db_cursor.execute("""
CREATE OR REPLACE FUNCTION get_country_code(place geometry)
RETURNS TEXT AS $$ BEGIN
- RETURN 'fr';
+ RETURN 'yy';
END; $$ LANGUAGE plpgsql;
""")
place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511'))
postcodes.update_postcodes(dsn, tmp_path, tokenizer)
- assert postcode_table.row_set == {('fr', 'AB 4511', 10, 12)}
+ assert postcode_table.row_set == {('yy', 'AB 4511', 10, 12)}
-@pytest.fixture
-def insert_implicit_postcode(placex_table, place_row):
- """
- Inserts data into the placex and place table
- which can then be used to compute one postcode.
- """
- def _insert_implicit_postcode(osm_id, country, geometry, address):
- placex_table.add(osm_id=osm_id, country=country, geom=geometry)
- place_row(osm_id=osm_id, geom='SRID=4326;'+geometry, address=address)
- return _insert_implicit_postcode
+def test_discard_badly_formatted_postcodes(dsn, tmp_path, temp_db_cursor, place_row, postcode_table, tokenizer):
+ """ A place with postcode 'AB 4511' whose country is reported as 'fr'
+ must not produce any row in the postcode table.
+ """
+ # Replace get_country_code() with a stub that always returns 'fr'.
+ temp_db_cursor.execute("""
+ CREATE OR REPLACE FUNCTION get_country_code(place geometry)
+ RETURNS TEXT AS $$ BEGIN
+ RETURN 'fr';
+ END; $$ LANGUAGE plpgsql;
+ """)
+ place_row(geom='SRID=4326;POINT(10 12)', address=dict(postcode='AB 4511'))
+ postcodes.update_postcodes(dsn, tmp_path, tokenizer)
+
+ assert not postcode_table.row_set
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for centroid computation.
+"""
+import pytest
+
+from nominatim.utils.centroid import PointsCentroid
+
+def test_empty_set():
+ c = PointsCentroid()
+
+ with pytest.raises(ValueError, match='No points'):
+ c.centroid()
+
+
+@pytest.mark.parametrize("centroid", [(0,0), (-1, 3), [0.0000032, 88.4938]])
+def test_one_point_centroid(centroid):
+ c = PointsCentroid()
+
+ c += centroid
+
+ assert len(c.centroid()) == 2
+ assert c.centroid() == (pytest.approx(centroid[0]), pytest.approx(centroid[1]))
+
+
+def test_multipoint_centroid():
+ c = PointsCentroid()
+
+ c += (20.0, -10.0)
+ assert c.centroid() == (pytest.approx(20.0), pytest.approx(-10.0))
+ c += (20.2, -9.0)
+ assert c.centroid() == (pytest.approx(20.1), pytest.approx(-9.5))
+ c += (20.2, -9.0)
+ assert c.centroid() == (pytest.approx(20.13333), pytest.approx(-9.333333))
+
+
+def test_manypoint_centroid():
+ c = PointsCentroid()
+
+ for _ in range(10000):
+ c += (4.564732, -0.000034)
+
+ assert c.centroid() == (pytest.approx(4.564732), pytest.approx(-0.000034))
+
+
+@pytest.mark.parametrize("param", ["aa", None, 5, [1, 2, 3], (3, None), ("a", 3.9)])
+def test_add_non_tuple(param):
+ c = PointsCentroid()
+
+ with pytest.raises(ValueError, match='2-element tuples'):
+ c += param