The new format combines compound splitting and abbreviation.
It also allows rules to be restricted to additional conditions
(like language or region). This latter ability is not used
yet.
settings/import-address.style
settings/import-full.style
settings/import-extratags.style
- settings/legacy_icu_tokenizer.json
- settings/icu_transliteration.rules
+ settings/legacy_icu_tokenizer.yaml
+ settings/icu-rules/extended-unicode-to-asccii.yaml
DESTINATION ${NOMINATIM_CONFIGDIR})
Processor for names that are imported into the database based on the
ICU library.
"""
-import json
+from collections import defaultdict
import itertools
from icu import Transliterator
import datrie
from nominatim.db.properties import set_property, get_property
+from nominatim.tokenizer import icu_variants as variants
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
elif conn is not None:
self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
- self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
+ self.replacements = \
+ variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
else:
assert False, "Parameter loader or conn required."
- # Compute the set of characters used in the replacement list.
- # We need this later when computing the tree.
- chars = set()
- for full, repl in self.replacements:
- chars.update(full)
- for word in repl:
- chars.update(word)
- self.replacement_charset = ''.join(chars)
-
def save_rules(self, conn):
""" Save the rules in the property table of the given database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
- set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements))
+ set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
+ variants.pickle_variant_set(self.replacements))
set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
self.search = Transliterator.createFromRules("icu_search",
rules.search_rules)
- self.replacements = datrie.Trie(rules.replacement_charset)
- for full, repl in rules.replacements:
- self.replacements[full] = repl
+ # Intermediate reorder by source. Also compute required character set.
+ immediate = defaultdict(list)
+ chars = set()
+ for variant in rules.replacements:
+ immediate[variant.source].append(variant)
+ chars.update(variant.source)
+ # Then copy to datrie
+ self.replacements = datrie.Trie(''.join(chars))
+ for src, repllist in immediate.items():
+ self.replacements[src] = repllist
def get_normalized(self, name):
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
- baseform = ' ' + norm_name + ' '
- variants = ['']
+ baseform = '^ ' + norm_name + ' ^'
+ partials = ['']
startpos = 0
pos = 0
(None, None))
if full is not None:
done = baseform[startpos:pos]
- variants = [v + done + r for v, r in itertools.product(variants, repl)]
+ partials = [v + done + r.replacement
+ for v, r in itertools.product(partials, repl)]
startpos = pos + len(full)
pos = startpos
else:
if trans_name:
results.append(trans_name)
else:
- for variant in variants:
- trans_name = self.to_ascii.transliterate(variant + baseform[startpos:pos]).strip()
+ for variant in partials:
+ name = variant[1:] + baseform[startpos:-1]
+ trans_name = self.to_ascii.transliterate(name).strip()
if trans_name:
results.append(trans_name)
"""
import io
import logging
-from collections import defaultdict
import itertools
from pathlib import Path
+import re
import yaml
from icu import Transliterator
from nominatim.errors import UsageError
+import nominatim.tokenizer.icu_variants as variants
LOG = logging.getLogger()
return output
+class VariantRule:
+ """ Saves a single variant expansion.
+
+ An expansion consists of the normalized replacement term and
+ a dictionary of properties that describe when the expansion applies.
+ """
+
+ def __init__(self, replacement, properties):
+ self.replacement = replacement
+ self.properties = properties or {}
+
+
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, configfile):
self.configfile = configfile
- self.compound_suffixes = set()
- self.abbreviations = defaultdict()
+ self.variants = set()
if configfile.suffix == '.yaml':
self._load_from_yaml()
The result is a list of pairs: the first item is the sequence to
replace, the second is a list of replacements.
"""
- synonyms = defaultdict(set)
-
- # First add entries for compound decomposition.
- for suffix in self.compound_suffixes:
- variants = (suffix + ' ', ' ' + suffix + ' ')
- for key in variants:
- synonyms[key].update(variants)
-
- for full, abbr in self.abbreviations.items():
- key = ' ' + full + ' '
- # Entries in the abbreviation list always apply to full words:
- synonyms[key].update((' ' + a + ' ' for a in abbr))
- # Replacements are optional, so add a noop
- synonyms[key].add(key)
-
- if full in self.compound_suffixes:
- # Full word abbreviating to compunded version.
- synonyms[key].update((a + ' ' for a in abbr))
-
- key = full + ' '
- # Uncompunded suffix abbrevitating to decompounded version.
- synonyms[key].update((' ' + a + ' ' for a in abbr))
- # Uncompunded suffix abbrevitating to compunded version.
- synonyms[key].update((a + ' ' for a in abbr))
-
- # sort the resulting list by descending length (longer matches are prefered).
- sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
-
- return [(k, list(synonyms[k])) for k in sorted_keys]
+ return self.variants
def _yaml_include_representer(self, loader, node):
value = loader.construct_scalar(node)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
- self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
- self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
+ self._parse_variant_list(self._get_section(rules, 'variants'))
def _get_section(self, rules, section):
return ';'.join(_flatten_yaml_list(content)) + ';'
+ def _parse_variant_list(self, rules):
+ self.variants.clear()
- def _parse_compound_suffix_list(self, rules):
if not rules:
- self.compound_suffixes = set()
return
- norm = Transliterator.createFromRules("rule_loader_normalization",
- self.normalization_rules)
+ rules = _flatten_yaml_list(rules)
- # Make sure all suffixes are in their normalised form.
- self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+ vmaker = _VariantMaker(self.normalization_rules)
+ properties = []
+ for section in rules:
+ # Create the property field and deduplicate against existing
+ # instances.
+ props = variants.ICUVariantProperties.from_rules(section)
+ for existing in properties:
+ if existing == props:
+ props = existing
+ break
+ else:
+ properties.append(props)
- def _parse_abbreviation_list(self, rules):
- self.abbreviations = defaultdict(list)
+ for rule in (section.get('words') or []):
+ self.variants.update(vmaker.compute(rule, props))
- if not rules:
- return
- norm = Transliterator.createFromRules("rule_loader_normalization",
- self.normalization_rules)
+class _VariantMaker:
+ """ Generator for all necessary ICUVariants from a single variant rule.
+
+ All text in rules is normalized to make sure the variants match later.
+ """
- for rule in rules:
- parts = rule.split('=>')
- if len(parts) != 2:
- LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
- raise UsageError("Syntax error in tokenizer configuration file.")
+ def __init__(self, norm_rules):
+ self.norm = Transliterator.createFromRules("rule_loader_normalization",
+ norm_rules)
- # Make sure all terms match the normalised version.
- fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
- abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
- for full, abbr in itertools.product(fullterms, abbrterms):
- if full and abbr:
- self.abbreviations[full].append(abbr)
+ def compute(self, rule, props):
+ """ Generator for all ICUVariant tuples from a single variant rule.
+ """
+ parts = re.split(r'(\|)?([=-])>', rule)
+ if len(parts) != 4:
+ raise UsageError("Syntax error in variant rule: " + rule)
+
+ decompose = parts[1] is None
+ src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
+ repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
+
+ # If the source should be kept, add a 1:1 replacement
+ if parts[2] == '-':
+ for src in src_terms:
+ if src:
+ for froms, tos in _create_variants(*src, src[0], decompose):
+ yield variants.ICUVariant(froms, tos, props)
+
+ for src, repl in itertools.product(src_terms, repl_terms):
+ if src and repl:
+ for froms, tos in _create_variants(*src, repl, decompose):
+ yield variants.ICUVariant(froms, tos, props)
+
+
+ def _parse_variant_word(self, name):
+ name = name.strip()
+ match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
+ if match is None or (match.group(1) == '~' and match.group(3) == '~'):
+ raise UsageError("Invalid variant word descriptor '{}'".format(name))
+ norm_name = self.norm.transliterate(match.group(2))
+ if not norm_name:
+ return None
+
+ return norm_name, match.group(1), match.group(3)
+
+
+_FLAG_MATCH = {'^': '^ ',
+ '$': ' ^',
+ '': ' '}
+
+
+def _create_variants(src, preflag, postflag, repl, decompose):
+ if preflag == '~':
+ postfix = _FLAG_MATCH[postflag]
+ # suffix decomposition
+ src = src + postfix
+ repl = repl + postfix
+
+ yield src, repl
+ yield ' ' + src, ' ' + repl
+
+ if decompose:
+ yield src, ' ' + repl
+ yield ' ' + src, repl
+ elif postflag == '~':
+ # prefix decomposition
+ prefix = _FLAG_MATCH[preflag]
+ src = prefix + src
+ repl = prefix + repl
+
+ yield src, repl
+ yield src + ' ', repl + ' '
+
+ if decompose:
+ yield src, repl + ' '
+ yield src + ' ', repl
+ else:
+ prefix = _FLAG_MATCH[preflag]
+ postfix = _FLAG_MATCH[postflag]
+
+ yield prefix + src + postfix, prefix + repl + postfix
--- /dev/null
+"""
+Data structures for saving variant expansions for ICU tokenizer.
+"""
+from collections import namedtuple
+import json
+
+from nominatim.errors import UsageError
+
+_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
+
+def _get_strtuple_prop(rules, field):
+ """ Return the given field of the rules dictionary as a tuple of strings.
+
+ If the field is not defined or empty, returns None. If the field is
+ a single string, it is converted into a tuple with a single element.
+ If the field is a list of strings, it is returned as a tuple of strings.
+ Raise a usage error in all other cases.
+ """
+ value = rules.get(field)
+
+ if not value:
+ return None
+
+ if isinstance(value, str):
+ return (value,)
+
+ if not isinstance(value, list) or any(not isinstance(x, str) for x in value):
+ raise UsageError("YAML variant property '{}' should be a list.".format(field))
+
+ return tuple(value)
+
+
+class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS,
+ defaults=(None, )*len(_ICU_VARIANT_PORPERTY_FIELDS))):
+ """ Data container for saving properties that describe when a variant
+ should be applied.
+
+ Property instances are hashable.
+ """
+ @classmethod
+ def from_rules(cls, rules):
+ """ Create a new property type from a generic dictionary.
+
+ The function only takes into account the properties that are
+ understood presently and ignores all others.
+ """
+ return cls(lang=_get_strtuple_prop(rules, 'lang'))
+
+
+ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
+
+def pickle_variant_set(variants):
+ """ Serializes an iterable of variant rules to a string.
+ """
+ # Create a list of property sets, so that they don't need to be duplicated.
+ properties = {}
+ pid = 1
+ for variant in variants:
+ if variant.properties not in properties:
+ properties[variant.properties] = pid
+ pid += 1
+
+ # Convert the variants into a simple list.
+ variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
+
+ # Convert everything to json.
+ return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
+ 'variants': variants})
+
+
+def unpickle_variant_set(variant_string):
+ """ Deserializes a variant string that was previously created with
+ pickle_variant_set() into a set of ICUVariants.
+ """
+ data = json.loads(variant_string)
+
+ properties = {int(k): ICUVariantProperties(**v) for k, v in data['properties'].items()}
+ print(properties)
+
+ return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
- ":: lower ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
-compound_suffixes:
- # Danish
- - hal
- - hallen
- - hallerne
- # German
- - berg
- - brücke
- - fabrik
- - gasse
- - graben
- - haus
- - höhle
- - hütte
- - kapelle
- - kogel
- - markt
- - pfad
- - platz
- - quelle
- - spitze
- - stiege
- - strasse
- - teich
- - universität
- - wald
- - weg
- - wiese
- # Dutch
- - gracht
- - laan
- - markt
- - plein
- - straat
- - vliet
- - weg
- # Norwegian
- - vei
- - veien
- - veg
- - vegen
- - gate
- - gaten
- - gata
- - plass
- - plassen
- - sving
- - svingen
- # Finnish
- - alue
- - asema
- - aukio
- - kaari
- - katu
- - kuja
- - kylä
- - penger
- - polku
- - puistikko
- - puisto
- - raitti
- - ranta
- - rinne
- - taival
- - tie
- - tori
- - väylä
- # Swedish
- - väg
- - vägen
- - gatan
- - gata
- - gränd
- - gränden
- - stig
- - stigen
- - plats
- - platsen
-abbreviations:
- - acceso => acces
- - access => accs
- - acequia => aceq
- - air force base => afb
- - air national guard base => angb
- - alameda => alam
- - ale => al
- - alea => al
- - aleea => al
- - aleja => al
- - alejach => al
- - aleje => al
- - aleji => al
- - all => al
- - allee => al
- - alley => al
- - alleyway => alwy
- - ally => al
- - alqueria => alque
- - alue => al
- - aly => al
- - am => a
- - amble => ambl
- - an der => a d
- - andador => andad
- - angosta => angta
- - apartamentos => aptos
- - apartments => apts
- - apch => app
- - apeadero => apdro
- - approach => app
- - arboleda => arb
- - arcade => arc
- - arrabal => arral
- - arroyo => arry
- - arterial => artl
- - artery => arty
- - auf der => a d
- - aukio => auk
- - autopista => auto
- - autovia => autov
- - avd => av
- - avda => av
- - ave => av
- - avenida => av
- - avenue => av
- - avinguda => av
- - b dul => bd
- - back => bk
- - bad => b
- - bahnhof => bf
- - bajada => bjada
- - balneario => balnr
- - banan => ba
- - banda => b
- - barranco => branc
- - barranquil => bqllo
- - barriada => barda
- - barrio => bo
- - barro => bo
- - basin => basn
- - bda => b
- - bdge => bri
- - bdul => bd
- - bdwy => bway
- - beach => bch
- - berg => bg
- - bgm => bgm
- - bhf => bf
- - bldngs => bldgs
- - block => blk
- - blok => bl
- - bloque => blque
- - blv => bd
- - blvd => bd
- - boardwalk => bwlk
- - boulevard => bd
- - boundary => bdy
- - brace => br
- - brazal => brzal
- - brdg => bri
- - break => brk
- - bridge => bri
- - broadway => bway
- - broeder => br
- - brücke => br
- - buildings => bldgs
- - bul => bd
- - bulev => bd
- - bulevar => bd
- - bulevard => bd
- - bulevardu => bd
- - bulevardul => bd
- - bulievard => bd
- - bulvar => bl
- - bulvari => bl
- - burg => bg
- - burgemeester => bg
- - burgermeister => bgm
- - business => bus
- - buu dien => bd
- - bvd => bd
- - bypa => byp
- - bypass => byp
- - byway => bywy
- - c le => c
- - cadde => cd
- - caddesi => cd
- - calle => c
- - callej => cjon
- - calleja => cllja
- - callejon => cjon
- - callejuela => cjla
- - callizo => cllzo
- - calzada => czada
- - camino => cno
- - camino hondo => c h
- - camino nuevo => c n
- - camino viejo => c v
- - camping => campg
- - campo => c po
- - can cu khong quan => cckq
- - cantera => cantr
- - cantina => canti
- - canton => cant
- - cao dang => cd
- - caravan => cvn
- - carrer => c
- - carrera => cra
- - carrero => cro
- - carretera => ctra
- - carreterin => ctrin
- - carretil => crtil
- - carril => crril
- - caserio => csrio
- - cau ldhc bo => clb
- - caus => cway
- - causeway => cway
- - ce => cv
- - cen => ctr
- - center => ctr
- - centre => ctr
- - centreway => cnwy
- - cesta => c
- - chalet => chlet
- - chase => ch
- - che => ch
- - chemin => ch
- - cinturon => cint
- - circle => cir
- - circuit => cct
- - circunvalacion => ccvcn
- - circus => crcs
- - city => cty
- - cl => c
- - cllon => cjon
- - close => c
- - cmno => cno
- - cobertizo => cbtiz
- - colonia => col
- - commandant => cmdt
- - common => comm
- - community => comm
- - complejo => compj
- - concourse => con
- - cong truong => ct
- - cong ty => cty
- - cong ty co phyn => ctcp
- - cong vien => cv
- - cong vien van hoa => cvvh
- - conjunto => cjto
- - convento => cnvto
- - cooperativa => coop
- - copse => cps
- - corner => cnr
- - corral => crral
- - corralillo => crrlo
- - corredor => crrdo
- - corso => c so
- - corte => c te
- - cortijo => crtjo
- - costanilla => cstan
- - costera => coste
- - cottages => cotts
- - county => co
- - county route => cr
- - cours => crs
- - court => ct
- - courtyard => ctyd
- - cove => cv
- - creek => cr
- - cres => cr
- - crescent => cr
- - crest => crst
- - crk => cr
- - croft => cft
- - cross => crss
- - crossing => xing
- - crossroad => crd
- - crossway => cowy
- - crsg => xing
- - crt => ct
- - csac => cds
- - cswy => cway
- - cty cp => ctcp
- - cuadra => cuadr
- - cuesta => custa
- - cul de sac => cds
- - cutting => cutt
- - ddhi hoc => dh
- - ddhi lo => dl
- - dehesa => dhsa
- - demarcacion => demar
- - deviation => devn
- - diagonal => diag
- - diseminado => disem
- - distributor => dstr
- - doctor => dr
- - dokter => dr
- - doktor => d r
- - dolna => dln
- - dolne => dln
- - dolny => dln
- - dominee => ds
- - dorf => df
- - dotsient => dots
- - drive => dr
- - driveway => dvwy
- - druga => 2
- - drugi => 2
- - drugie => 2
- - drv => dr
- - drwy => dvwy
- - duong => d
- - duong sat => ds
- - duza => dz
- - duze => dz
- - duzy => dz
- - east => e
- - edificio => edifc
- - elbow => elb
- - empresa => empr
- - entrada => entd
- - entrance => ent
- - errepidea => err
- - escalera => esca
- - escalinata => escal
- - espalda => eslda
- - esplanade => esp
- - estacion => estcn
- - estate => est
- - estrada => estda
- - etorbidea => etorb
- - explanada => expla
- - expressway => exp
- - expy => exp
- - extension => ex
- - extramuros => extrm
- - extrarradio => extrr
- - fabrica => fca
- - fairway => fawy
- - faubourg => fg
- - fbrca => fca
- - ferry => fy
- - fire track => ftrk
- - firetrail => fit
- - follow => folw
- - fondamenta => f ta
- - footway => ftwy
- - foreshore => fshr
- - formation => form
- - fort => ft
- - freeway => fwy
- - front => frnt
- - frontage => frtg
- - frwy => fwy
- - fundacul => fdc
- - fundatura => fnd
- - ga => g
- - galeria => gale
- - gamla => gla
- - garden => gdn
- - gardens => gdn
- - gata => g
- - gatan => g
- - gate => g
- - gaten => g
- - gdns => gdn
- - gebroeders => gebr
- - generaal => gen
- - gienieral => ghien
- - glade => gl
- - gld => gl
- - glde => gl
- - glorieta => gta
- - gorna => gn
- - gorne => gn
- - gorny => gn
- - gracht => gr
- - grad => ghr
- - gran via => g v
- - grand => gr
- - granden => gr
- - grange => gra
- - granja => granj
- - green => gn
- - grn => gn
- - gro => gr
- - grosse => gr
- - grosser => gr
- - grosses => gr
- - ground => grnd
- - grove => gr
- - gt => g
- - gte => g
- - gully => gly
- - hauptbahnhof => hbf
- - heights => hts
- - heiligen => hl
- - hgts => hts
- - high school => hs
- - highroad => hrd
- - highway => hwy
- - hipodromo => hipod
- - hird => hrd
- - hospital => hosp
- - house => ho
- - hse => ho
- - i => 1
- - ii => 2
- - iii => 3
- - im => 1
- - impasse => imp
- - in => 1
- - in der => i d
- - industrial => ind
- - ingenieur => ir
- - interchange => intg
- - international => intl
- - intr => int
- - intrarea => int
- - island => is
- - jardin => jdin
- - jonkheer => jhr
- - junction => jnc
- - k s => ks
- - kaari => kri
- - kalea => k
- - kanunnik => kan
- - kapitan => kap
- - kardinaal => kard
- - katu => k
- - khach sdhn => ks
- - khu cong nghiep => kcn
- - khu du lich => kdl
- - khu nghi mat => knm
- - kleine => kl
- - kleiner => kl
- - kleines => kl
- - kolo => k
- - kolonel => kol
- - kolonia => kol
- - koning => kon
- - koningin => kon
- - kort e => k
- - kri => kri
- - kte => k
- - kuja => kj
- - kvartal => kv
- - kyla => kl
- - laan => ln
- - ladera => ldera
- - landing => ldg
- - lane => ln
- - laneway => lnwy
- - lange => l
- - largo => l go
- - lille => ll
- - line => ln
- - link => lk
- - lit => lt
- - little => lt
- - llanura => llnra
- - lookout => lkt
- - loop => lp
- - low => lr
- - lower => lr
- - luitenant => luit
- - lwr => lr
- - m te => mt
- - m tele => mt
- - maantee => mt
- - mala => ml
- - male => ml
- - malecon => malec
- - maly => ml
- - manor => mnr
- - mansions => mans
- - market => mkt
- - markt => mkt
- - mazowiecka => maz
- - mazowiecki => maz
- - mazowieckie => maz
- - meadows => mdws
- - meander => mr
- - medical => med
- - meester => mr
- - mercado => merc
- - mevrouw => mevr
- - mews => m
- - miasto => m
- - middle => mid
- - middle school => ms
- - mile => mi
- - military => mil
- - mirador => mrdor
- - mitropolit => mit
- - mkt => mkt
- - mndr => mr
- - mnt => mt
- - monasterio => mtrio
- - monseigneur => mgr
- - mont => mt
- - motorway => mwy
- - mount => mt
- - mountain => mtn
- - mtwy => mwy
- - muelle => muell
- - municipal => mun
- - muntele => mt
- - museum => mus
- - mw => m
- - na => n
- - namesti => nam
- - namestie => nam
- - national park => np
- - national recreation area => nra
- - national wildlife refuge area => nwra
- - nha hat => nh
- - nha thi dzu => ntd
- - nha tho => nt
- - nordre => ndr
- - norra => n
- - north => n
- - north east => ne
- - north west => n
- - northeast => ne
- - northwest => n
- - nowa => n
- - nowe => n
- - nowy => n
- - nucleo => ncleo
- - nw => n
- - oa => o
- - ob => o
- - obere => o
- - oberer => o
- - oberes => o
- - olv => olv
- - onze lieve vrouw e => olv
- - osiedle => os
- - osiedlu => os
- - ostra => o
- - outlook => otlk
- - p k => pk
- - p ta => pta
- - p zza => p za
- - palacio => palac
- - pantano => pant
- - parade => pde
- - paraje => praje
- - park => pk
- - parklands => pkld
- - parkway => pwy
- - parque => pque
- - particular => parti
- - partida => ptda
- - pas => ps
- - pasadizo => pzo
- - pasaje => psaje
- - paseo => po
- - paseo maritimo => psmar
- - pasillo => psllo
- - pass => ps
- - passage => ps
- - passatge => ptge
- - passeig => pg
- - pastoor => past
- - pathway => pway
- - penger => pgr
- - pfad => p
- - pgr => pgr
- - ph => p
- - phi truong => pt
- - phuong => p
- - phwy => pway
- - piata => pta
- - piaz => p za
- - piazza => p za
- - piazzale => p le
- - piazzetta => pta
- - pierwsza => 1
- - pierwsze => 1
- - pierwszy => 1
- - pike => pk
- - pko => pko
- - pkwy => pwy
- - pky => pwy
- - plac => pl
- - placa => pl
- - place => pl
- - placem => pl
- - placu => pl
- - plass => pl
- - plassen => pl
- - plateau => plat
- - plats => pl
- - platsen => pl
- - platz => pl
- - plaza => pl
- - plazoleta => pzta
- - plazuela => plzla
- - plein => pln
- - pln => pln
- - ploshchad => pl
- - plz => pl
- - plza => pl
- - pnt => pt
- - poblado => pbdo
- - pocket => pkt
- - point => pt
- - poligono => polig
- - poligono industrial => pgind
- - polku => p
- - ponte => p te
- - porta => pta
- - portal => prtal
- - portico => prtco
- - portillo => ptilo
- - prazuela => przla
- - precinct => pct
- - pres => pres
- - president => pres
- - prins => pr
- - prinses => pr
- - professor => prof
- - profiesor => prof
- - prolongacion => prol
- - promenade => prom
- - psge => ps
- - pueblo => pblo
- - puente => pnte
- - puerta => pta
- - puerto => pto
- - puistikko => pko
- - puisto => ps
- - punto kilometrico => pk
- - pza => pl
- - quadrangle => qdgl
- - quadrant => qdrt
- - quai => qu
- - quan => q
- - quay => qy
- - quays => qys
- - qucyng truong => qt
- - quelle => qu
- - quoc lo => ql
- - raitti => r
- - rambla => rbla
- - ramble => rmbl
- - rampla => rampa
- - range => rnge
- - ranta => rt
- - rdhp hat => rh
- - reach => rch
- - reservation => res
- - reserve => res
- - reservoir => res
- - residencial => resid
- - retreat => rtt
- - rhein => rh
- - ribera => rbra
- - ridge => rdge
- - ridgeway => rgwy
- - right of way => rowy
- - rincon => rcon
- - rinconada => rcda
- - rinne => rn
- - rise => ri
- - riv => r
- - river => r
- - riverway => rvwy
- - riviera => rvra
- - road => rd
- - roads => rds
- - roadside => rdsd
- - roadway => rdwy
- - ronde => rnde
- - rosebowl => rsbl
- - rotary => rty
- - rotonda => rtda
- - round => rnd
- - route => rt
- - rte => rt
- - rue => r
- - rvr => r
- - sa => s
- - saint => st
- - sainte => ste
- - salizada => s da
- - san => s
- - san bay => sb
- - san bay quoc te => sbqt
- - san van dong => svd
- - sanatorio => sanat
- - sankt => st
- - santa => sta
- - santuario => santu
- - sector => sect
- - sendera => sedra
- - sendero => send
- - serviceway => swy
- - shunt => shun
- - shwy => sh
- - siding => sdng
- - sielo => s
- - sint => st
- - slope => slpe
- - sodra => s
- - sok => sk
- - sokagi => sk
- - sokak => sk
- - sondre => sdr
- - soseaua => sos
- - sound => snd
- - south => s
- - south east => se
- - south west => sw
- - south-east => se
- - south-west => sw
- - southeast => se
- - southwest => sw
- - spl => sp
- - splaiul => sp
- - spodnja => sp
- - spodnje => sp
- - spodnji => sp
- - square => sq
- - srednja => sr
- - srednje => sr
- - srednji => sr
- - stara => st
- - stare => st
- - stary => st
- - state highway => sh
- - state route => sr
- - station => stn
- - stazione => staz
- - ste => ste
- - steenweg => stwg
- - sth => s
- - stig => st
- - stigen => st
- - store => st
- - str la => sdla
- - stra => st
- - straat => st
- - strada => st
- - strada comunale => sc
- - strada provinciale => sp
- - strada regionale => sr
- - strada statale => ss
- - stradela => sdla
- - strand => st
- - strasse => str
- - street => st
- - strip => strp
- - stwg => stwg
- - subida => sbida
- - subway => sbwy
- - sveta => sv
- - sveti => sv
- - svieti => sv
- - sving => sv
- - svingen => sv
- - svwy => swy
- - taival => tvl
- - tanav => tn
- - tce => ter
- - tcty => tct
- - terr => ter
- - terrace => ter
- - thanh pho => tp
- - thfr => thor
- - thi trzn => tt
- - thi xa => tx
- - thoroughfare => thor
- - tie => t
- - tieu hoc => th
- - tinh lo => tl
- - tollway => tlwy
- - tong cong ty => tct
- - tori => tr
- - torrente => trrnt
- - towers => twrs
- - township => twp
- - tpke => tpk
- - track => trk
- - trail => trl
- - trailer => trlr
- - transito => trans
- - transversal => trval
- - trasera => tras
- - travesia => trva
- - triangle => tri
- - trung hoc co so => thcs
- - trung hoc pho thong => thpt
- - trung tam => tt
- - trung tam thuong mdhi => tttm
- - trunkway => tkwy
- - trzeci => 3
- - trzecia => 3
- - trzecie => 3
- - tunnel => tun
- - turn => tn
- - turnpike => tpk
- - tvl => tvl
- - ulica => ul
- - ulice => ul
- - ulicy => ul
- - ulitsa => ul
- - underpass => upas
- - university => univ
- - untere => u
- - unterer => u
- - unteres => u
- - upper => up
- - upr => up
- - urbanizacion => urb
- - utca => u
- - v d => vd
- - va => v
- - vag => v
- - vagen => v
- - vale => v
- - van => v
- - van de => vd
- - varf => vf
- - varful => vf
- - vastra => v
- - vayla => vla
- - vdct => via
- - vecindario => vecin
- - vei => v
- - veien => v
- - velika => v
- - velike => v
- - veliki => v
- - veliko => v
- - vereda => vreda
- - via => v
- - viad => via
- - viaduct => via
- - viaducto => vcto
- - viale => v le
- - vicolo => v lo
- - vien bcyo tang => vbt
- - view => vw
- - villas => vlls
- - virf => vf
- - virful => vf
- - vista => vsta
- - viviendas => vvdas
- - vkhod => vkh
- - vla => vla
- - vliet => vlt
- - vlt => vlt
- - vn => v
- - vuon quoc gia => vqg
- - walk => wlk
- - walkway => wkwy
- - way => wy
- - west => w
- - wharf => whrf
- - wielka => wlk
- - wielki => wlk
- - wielkie => wlk
- - wielkopolska => wlkp
- - wielkopolski => wlkp
- - wielkopolskie => wlkp
- - wojewodztwie => woj
- - wojewodztwo => woj
- - yard => yd
- - zgornja => zg
- - zgornje => zg
- - zgornji => zg
- - zhilishchien komplieks => zh k
- - zum => z
+variants:
+ - words:
+ - ~hal => hal
+ - ~hallen => hallen
+ - ~hallerne => hallerne
+ - ~fabrik => fabrik
+ - ~gasse => gasse
+ - ~graben => graben
+ - ~haus => haus
+ - ~höhle => höhle
+ - ~hütte => hütte
+ - ~kapelle => kapelle
+ - ~kogel => kogel
+ - ~spitze => spitze
+ - ~stiege => stiege
+ - ~teich => teich
+ - ~universität => universität
+ - ~wald => wald
+ - ~weg => weg
+ - ~wiese => wiese
+ - ~veg => veg
+ - ~vegen => vegen
+ - ~asema => asema
+ - ~väylä => väylä
+ - acceso -> acces
+ - access -> accs
+ - acequia -> aceq
+ - air force base -> afb
+ - air national guard base -> angb
+ - alameda -> alam
+ - ale -> al
+ - alea -> al
+ - aleea -> al
+ - aleja -> al
+ - alejach -> al
+ - aleje -> al
+ - aleji -> al
+ - all -> al
+ - allee -> al
+ - alley -> al
+ - alleyway -> alwy
+ - ally -> al
+ - alqueria -> alque
+ - ~alue -> al
+ - aly -> al
+ - am -> a
+ - amble -> ambl
+ - an der -> a d
+ - andador -> andad
+ - angosta -> angta
+ - apartamentos -> aptos
+ - apartments -> apts
+ - apch -> app
+ - apeadero -> apdro
+ - approach -> app
+ - arboleda -> arb
+ - arcade -> arc
+ - arrabal -> arral
+ - arroyo -> arry
+ - arterial -> artl
+ - artery -> arty
+ - auf der -> a d
+ - ~aukio -> auk
+ - autopista -> auto
+ - autovia -> autov
+ - avd -> av
+ - avda -> av
+ - ave -> av
+ - avenida -> av
+ - avenue -> av
+ - avinguda -> av
+ - b dul -> bd
+ - back -> bk
+ - bad -> b
+ - bahnhof -> bf
+ - bajada -> bjada
+ - balneario -> balnr
+ - banan -> ba
+ - banda -> b
+ - barranco -> branc
+ - barranquil -> bqllo
+ - barriada -> barda
+ - barrio -> bo
+ - barro -> bo
+ - basin -> basn
+ - bda -> b
+ - bdge -> bri
+ - bdul -> bd
+ - bdwy -> bway
+ - beach -> bch
+ - ~berg -> bg
+ - bgm -> bgm
+ - bhf -> bf
+ - bldngs -> bldgs
+ - block -> blk
+ - blok -> bl
+ - bloque -> blque
+ - blv -> bd
+ - blvd -> bd
+ - boardwalk -> bwlk
+ - boulevard -> bd
+ - boundary -> bdy
+ - brace -> br
+ - brazal -> brzal
+ - brdg -> bri
+ - break -> brk
+ - bridge -> bri
+ - broadway -> bway
+ - broeder -> br
+ - ~brücke -> br
+ - buildings -> bldgs
+ - bul -> bd
+ - bulev -> bd
+ - bulevar -> bd
+ - bulevard -> bd
+ - bulevardu -> bd
+ - bulevardul -> bd
+ - bulievard -> bd
+ - bulvar -> bl
+ - bulvari -> bl
+ - burg -> bg
+ - burgemeester -> bg
+ - burgermeister -> bgm
+ - business -> bus
+ - buu dien -> bd
+ - bvd -> bd
+ - bypa -> byp
+ - bypass -> byp
+ - byway -> bywy
+ - c le -> c
+ - cadde -> cd
+ - caddesi -> cd
+ - calle -> c
+ - callej -> cjon
+ - calleja -> cllja
+ - callejon -> cjon
+ - callejuela -> cjla
+ - callizo -> cllzo
+ - calzada -> czada
+ - camino -> cno
+ - camino hondo -> c h
+ - camino nuevo -> c n
+ - camino viejo -> c v
+ - camping -> campg
+ - campo -> c po
+ - can cu khong quan -> cckq
+ - cantera -> cantr
+ - cantina -> canti
+ - canton -> cant
+ - cao dang -> cd
+ - caravan -> cvn
+ - carrer -> c
+ - carrera -> cra
+ - carrero -> cro
+ - carretera -> ctra
+ - carreterin -> ctrin
+ - carretil -> crtil
+ - carril -> crril
+ - caserio -> csrio
+ - cau ldhc bo -> clb
+ - caus -> cway
+ - causeway -> cway
+ - ce -> cv
+ - cen -> ctr
+ - center -> ctr
+ - centre -> ctr
+ - centreway -> cnwy
+ - cesta -> c
+ - chalet -> chlet
+ - chase -> ch
+ - che -> ch
+ - chemin -> ch
+ - cinturon -> cint
+ - circle -> cir
+ - circuit -> cct
+ - circunvalacion -> ccvcn
+ - circus -> crcs
+ - city -> cty
+ - cl -> c
+ - cllon -> cjon
+ - close -> c
+ - cmno -> cno
+ - cobertizo -> cbtiz
+ - colonia -> col
+ - commandant -> cmdt
+ - common -> comm
+ - community -> comm
+ - complejo -> compj
+ - concourse -> con
+ - cong truong -> ct
+ - cong ty -> cty
+ - cong ty co phyn -> ctcp
+ - cong vien -> cv
+ - cong vien van hoa -> cvvh
+ - conjunto -> cjto
+ - convento -> cnvto
+ - cooperativa -> coop
+ - copse -> cps
+ - corner -> cnr
+ - corral -> crral
+ - corralillo -> crrlo
+ - corredor -> crrdo
+ - corso -> c so
+ - corte -> c te
+ - cortijo -> crtjo
+ - costanilla -> cstan
+ - costera -> coste
+ - cottages -> cotts
+ - county -> co
+ - county route -> cr
+ - cours -> crs
+ - court -> ct
+ - courtyard -> ctyd
+ - cove -> cv
+ - creek -> cr
+ - cres -> cr
+ - crescent -> cr
+ - crest -> crst
+ - crk -> cr
+ - croft -> cft
+ - cross -> crss
+ - crossing -> xing
+ - crossroad -> crd
+ - crossway -> cowy
+ - crsg -> xing
+ - crt -> ct
+ - csac -> cds
+ - cswy -> cway
+ - cty cp -> ctcp
+ - cuadra -> cuadr
+ - cuesta -> custa
+ - cul de sac -> cds
+ - cutting -> cutt
+ - ddhi hoc -> dh
+ - ddhi lo -> dl
+ - dehesa -> dhsa
+ - demarcacion -> demar
+ - deviation -> devn
+ - diagonal -> diag
+ - diseminado -> disem
+ - distributor -> dstr
+ - doctor -> dr
+ - dokter -> dr
+ - doktor -> d r
+ - dolna -> dln
+ - dolne -> dln
+ - dolny -> dln
+ - dominee -> ds
+ - dorf -> df
+ - dotsient -> dots
+ - drive -> dr
+ - driveway -> dvwy
+ - druga -> 2
+ - drugi -> 2
+ - drugie -> 2
+ - drv -> dr
+ - drwy -> dvwy
+ - duong -> d
+ - duong sat -> ds
+ - duza -> dz
+ - duze -> dz
+ - duzy -> dz
+ - east -> e
+ - edificio -> edifc
+ - elbow -> elb
+ - empresa -> empr
+ - entrada -> entd
+ - entrance -> ent
+ - errepidea -> err
+ - escalera -> esca
+ - escalinata -> escal
+ - espalda -> eslda
+ - esplanade -> esp
+ - estacion -> estcn
+ - estate -> est
+ - estrada -> estda
+ - etorbidea -> etorb
+ - explanada -> expla
+ - expressway -> exp
+ - expy -> exp
+ - extension -> ex
+ - extramuros -> extrm
+ - extrarradio -> extrr
+ - fabrica -> fca
+ - fairway -> fawy
+ - faubourg -> fg
+ - fbrca -> fca
+ - ferry -> fy
+ - fire track -> ftrk
+ - firetrail -> fit
+ - follow -> folw
+ - fondamenta -> f ta
+ - footway -> ftwy
+ - foreshore -> fshr
+ - formation -> form
+ - fort -> ft
+ - freeway -> fwy
+ - front -> frnt
+ - frontage -> frtg
+ - frwy -> fwy
+ - fundacul -> fdc
+ - fundatura -> fnd
+ - ga -> g
+ - galeria -> gale
+ - gamla -> gla
+ - garden -> gdn
+ - gardens -> gdn
+ - ~gata -> g
+ - ~gatan -> g
+ - ~gate -> g
+ - ~gaten -> g
+ - gdns -> gdn
+ - gebroeders -> gebr
+ - generaal -> gen
+ - gienieral -> ghien
+ - glade -> gl
+ - gld -> gl
+ - glde -> gl
+ - glorieta -> gta
+ - gorna -> gn
+ - gorne -> gn
+ - gorny -> gn
+ - ~gracht -> gr
+ - grad -> ghr
+ - gran via -> g v
+ - ~gränd -> gr
+ - ~gränden -> gr
+ - grange -> gra
+ - granja -> granj
+ - green -> gn
+ - grn -> gn
+ - gro -> gr
+ - grosse -> gr
+ - grosser -> gr
+ - grosses -> gr
+ - ground -> grnd
+ - grove -> gr
+ - gt -> g
+ - gte -> g
+ - gully -> gly
+ - hauptbahnhof -> hbf
+ - heights -> hts
+ - heiligen -> hl
+ - hgts -> hts
+ - high school -> hs
+ - highroad -> hrd
+ - highway -> hwy
+ - hipodromo -> hipod
+ - hird -> hrd
+ - hospital -> hosp
+ - house -> ho
+ - hse -> ho
+ - i -> 1
+ - ii -> 2
+ - iii -> 3
+ - im -> i
+ - impasse -> imp
+ - in -> i
+ - in der -> i d
+ - industrial -> ind
+ - ingenieur -> ir
+ - interchange -> intg
+ - international -> intl
+ - intr -> int
+ - intrarea -> int
+ - island -> is
+ - jardin -> jdin
+ - jonkheer -> jhr
+ - junction -> jnc
+ - k s -> ks
+ - ~kaari -> kri
+ - kalea -> k
+ - kanunnik -> kan
+ - kapitan -> kap
+ - kardinaal -> kard
+ - ~katu -> k
+ - khach sdhn -> ks
+ - khu cong nghiep -> kcn
+ - khu du lich -> kdl
+ - khu nghi mat -> knm
+ - kleine -> kl
+ - kleiner -> kl
+ - kleines -> kl
+ - kolo -> k
+ - kolonel -> kol
+ - kolonia -> kol
+ - koning -> kon
+ - koningin -> kon
+ - kort e -> k
+ - kri -> kri
+ - kte -> k
+ - ~kuja -> kj
+ - kvartal -> kv
+ - ~kylä -> kl
+ - ~laan -> ln
+ - ladera -> ldera
+ - landing -> ldg
+ - lane -> ln
+ - laneway -> lnwy
+ - lange -> l
+ - largo -> l go
+ - lille -> ll
+ - line -> ln
+ - link -> lk
+ - lit -> lt
+ - little -> lt
+ - llanura -> llnra
+ - lookout -> lkt
+ - loop -> lp
+ - low -> lr
+ - lower -> lr
+ - luitenant -> luit
+ - lwr -> lr
+ - m te -> mt
+ - m tele -> mt
+ - maantee -> mt
+ - mala -> ml
+ - male -> ml
+ - malecon -> malec
+ - maly -> ml
+ - manor -> mnr
+ - mansions -> mans
+ - market -> mkt
+ - ~markt -> mkt
+ - mazowiecka -> maz
+ - mazowiecki -> maz
+ - mazowieckie -> maz
+ - meadows -> mdws
+ - meander -> mr
+ - medical -> med
+ - meester -> mr
+ - mercado -> merc
+ - mevrouw -> mevr
+ - mews -> m
+ - miasto -> m
+ - middle -> mid
+ - middle school -> ms
+ - mile -> mi
+ - military -> mil
+ - mirador -> mrdor
+ - mitropolit -> mit
+ - mkt -> mkt
+ - mndr -> mr
+ - mnt -> mt
+ - monasterio -> mtrio
+ - monseigneur -> mgr
+ - mont -> mt
+ - motorway -> mwy
+ - mount -> mt
+ - mountain -> mtn
+ - mtwy -> mwy
+ - muelle -> muell
+ - municipal -> mun
+ - muntele -> mt
+ - museum -> mus
+ - mw -> m
+ - na -> n
+ - namesti -> nam
+ - namestie -> nam
+ - national park -> np
+ - national recreation area -> nra
+ - national wildlife refuge area -> nwra
+ - nha hat -> nh
+ - nha thi dzu -> ntd
+ - nha tho -> nt
+ - nordre -> ndr
+ - norra -> n
+ - north -> n
+ - north east -> ne
+ - north west -> n
+ - northeast -> ne
+ - northwest -> n
+ - nowa -> n
+ - nowe -> n
+ - nowy -> n
+ - nucleo -> ncleo
+ - nw -> n
+ - oa -> o
+ - ob -> o
+ - obere -> o
+ - oberer -> o
+ - oberes -> o
+ - olv -> olv
+ - onze lieve vrouw e -> olv
+ - osiedle -> os
+ - osiedlu -> os
+ - ostra -> o
+ - outlook -> otlk
+ - p k -> pk
+ - p ta -> pta
+ - p zza -> p za
+ - palacio -> palac
+ - pantano -> pant
+ - parade -> pde
+ - paraje -> praje
+ - park -> pk
+ - parklands -> pkld
+ - parkway -> pwy
+ - parque -> pque
+ - particular -> parti
+ - partida -> ptda
+ - pas -> ps
+ - pasadizo -> pzo
+ - pasaje -> psaje
+ - paseo -> po
+ - paseo maritimo -> psmar
+ - pasillo -> psllo
+ - pass -> ps
+ - passage -> ps
+ - passatge -> ptge
+ - passeig -> pg
+ - pastoor -> past
+ - pathway -> pway
+ - ~penger -> pgr
+ - ~pfad -> p
+ - pgr -> pgr
+ - ph -> p
+ - phi truong -> pt
+ - phuong -> p
+ - phwy -> pway
+ - piata -> pta
+ - piaz -> p za
+ - piazza -> p za
+ - piazzale -> p le
+ - piazzetta -> pta
+ - pierwsza -> 1
+ - pierwsze -> 1
+ - pierwszy -> 1
+ - pike -> pk
+ - pko -> pko
+ - pkwy -> pwy
+ - pky -> pwy
+ - plac -> pl
+ - placa -> pl
+ - place -> pl
+ - placem -> pl
+ - placu -> pl
+ - ~plass -> pl
+ - ~plassen -> pl
+ - plateau -> plat
+ - ~plats -> pl
+ - ~platsen -> pl
+ - ~platz -> pl
+ - plaza -> pl
+ - plazoleta -> pzta
+ - plazuela -> plzla
+ - ~plein -> pln
+ - pln -> pln
+ - ploshchad -> pl
+ - plz -> pl
+ - plza -> pl
+ - pnt -> pt
+ - poblado -> pbdo
+ - pocket -> pkt
+ - point -> pt
+ - poligono -> polig
+ - poligono industrial -> pgind
+ - ~polku -> p
+ - ponte -> p te
+ - porta -> pta
+ - portal -> prtal
+ - portico -> prtco
+ - portillo -> ptilo
+ - prazuela -> przla
+ - precinct -> pct
+ - pres -> pres
+ - president -> pres
+ - prins -> pr
+ - prinses -> pr
+ - professor -> prof
+ - profiesor -> prof
+ - prolongacion -> prol
+ - promenade -> prom
+ - psge -> ps
+ - pueblo -> pblo
+ - puente -> pnte
+ - puerta -> pta
+ - puerto -> pto
+ - ~puistikko -> pko
+ - ~puisto -> ps
+ - punto kilometrico -> pk
+ - pza -> pl
+ - quadrangle -> qdgl
+ - quadrant -> qdrt
+ - quai -> qu
+ - quan -> q
+ - quay -> qy
+ - quays -> qys
+ - qucyng truong -> qt
+ - ~quelle -> qu
+ - quoc lo -> ql
+ - ~raitti -> r
+ - rambla -> rbla
+ - ramble -> rmbl
+ - rampla -> rampa
+ - range -> rnge
+ - ~ranta -> rt
+ - rdhp hat -> rh
+ - reach -> rch
+ - reservation -> res
+ - reserve -> res
+ - reservoir -> res
+ - residencial -> resid
+ - retreat -> rtt
+ - rhein -> rh
+ - ribera -> rbra
+ - ridge -> rdge
+ - ridgeway -> rgwy
+ - right of way -> rowy
+ - rincon -> rcon
+ - rinconada -> rcda
+ - ~rinne -> rn
+ - rise -> ri
+ - riv -> r
+ - river -> r
+ - riverway -> rvwy
+ - riviera -> rvra
+ - road -> rd
+ - roads -> rds
+ - roadside -> rdsd
+ - roadway -> rdwy
+ - ronde -> rnde
+ - rosebowl -> rsbl
+ - rotary -> rty
+ - rotonda -> rtda
+ - round -> rnd
+ - route -> rt
+ - rte -> rt
+ - rue -> r
+ - rvr -> r
+ - sa -> s
+ - saint -> st
+ - sainte -> ste
+ - salizada -> s da
+ - san -> s
+ - san bay -> sb
+ - san bay quoc te -> sbqt
+ - san van dong -> svd
+ - sanatorio -> sanat
+ - sankt -> st
+ - santa -> sta
+ - santuario -> santu
+ - sector -> sect
+ - sendera -> sedra
+ - sendero -> send
+ - serviceway -> swy
+ - shunt -> shun
+ - shwy -> sh
+ - siding -> sdng
+ - sielo -> s
+ - sint -> st
+ - slope -> slpe
+ - sodra -> s
+ - sok -> sk
+ - sokagi -> sk
+ - sokak -> sk
+ - sondre -> sdr
+ - soseaua -> sos
+ - sound -> snd
+ - south -> s
+ - south east -> se
+ - south west -> sw
+ - south-east -> se
+ - south-west -> sw
+ - southeast -> se
+ - southwest -> sw
+ - spl -> sp
+ - splaiul -> sp
+ - spodnja -> sp
+ - spodnje -> sp
+ - spodnji -> sp
+ - square -> sq
+ - srednja -> sr
+ - srednje -> sr
+ - srednji -> sr
+ - stara -> st
+ - stare -> st
+ - stary -> st
+ - state highway -> sh
+ - state route -> sr
+ - station -> stn
+ - stazione -> staz
+ - ste -> ste
+ - steenweg -> stwg
+ - sth -> s
+ - ~stig -> st
+ - ~stigen -> st
+ - store -> st
+ - str la -> sdla
+ - stra -> st
+ - ~straat -> st
+ - strada -> st
+ - strada comunale -> sc
+ - strada provinciale -> sp
+ - strada regionale -> sr
+ - strada statale -> ss
+ - stradela -> sdla
+ - strand -> st
+ - ~strasse -> str
+ - street -> st
+ - strip -> strp
+ - stwg -> stwg
+ - subida -> sbida
+ - subway -> sbwy
+ - sveta -> sv
+ - sveti -> sv
+ - svieti -> sv
+ - ~sving -> sv
+ - ~svingen -> sv
+ - svwy -> swy
+ - ~taival -> tvl
+ - tanav -> tn
+ - tce -> ter
+ - tcty -> tct
+ - terr -> ter
+ - terrace -> ter
+ - thanh pho -> tp
+ - thfr -> thor
+ - thi trzn -> tt
+ - thi xa -> tx
+ - thoroughfare -> thor
+ - ~tie -> t
+ - tieu hoc -> th
+ - tinh lo -> tl
+ - tollway -> tlwy
+ - tong cong ty -> tct
+ - ~tori -> tr
+ - torrente -> trrnt
+ - towers -> twrs
+ - township -> twp
+ - tpke -> tpk
+ - track -> trk
+ - trail -> trl
+ - trailer -> trlr
+ - transito -> trans
+ - transversal -> trval
+ - trasera -> tras
+ - travesia -> trva
+ - triangle -> tri
+ - trung hoc co so -> thcs
+ - trung hoc pho thong -> thpt
+ - trung tam -> tt
+ - trung tam thuong mdhi -> tttm
+ - trunkway -> tkwy
+ - trzeci -> 3
+ - trzecia -> 3
+ - trzecie -> 3
+ - tunnel -> tun
+ - turn -> tn
+ - turnpike -> tpk
+ - tvl -> tvl
+ - ulica -> ul
+ - ulice -> ul
+ - ulicy -> ul
+ - ulitsa -> ul
+ - underpass -> upas
+ - university -> univ
+ - untere -> u
+ - unterer -> u
+ - unteres -> u
+ - upper -> up
+ - upr -> up
+ - urbanizacion -> urb
+ - utca -> u
+ - v d -> vd
+ - va -> v
+ - ~väg -> v
+ - ~vägen -> v
+ - vale -> v
+ - van -> v
+ - van de -> vd
+ - varf -> vf
+ - varful -> vf
+ - vastra -> v
+ - vayla -> vla
+ - vdct -> via
+ - vecindario -> vecin
+ - ~vei -> v
+ - ~veien -> v
+ - velika -> v
+ - velike -> v
+ - veliki -> v
+ - veliko -> v
+ - vereda -> vreda
+ - via -> v
+ - viad -> via
+ - viaduct -> via
+ - viaducto -> vcto
+ - viale -> v le
+ - vicolo -> v lo
+ - vien bcyo tang -> vbt
+ - view -> vw
+ - villas -> vlls
+ - virf -> vf
+ - virful -> vf
+ - vista -> vsta
+ - viviendas -> vvdas
+ - vkhod -> vkh
+ - vla -> vla
+ - ~vliet -> vlt
+ - vlt -> vlt
+ - vn -> v
+ - vuon quoc gia -> vqg
+ - walk -> wlk
+ - walkway -> wkwy
+ - way -> wy
+ - west -> w
+ - wharf -> whrf
+ - wielka -> wlk
+ - wielki -> wlk
+ - wielkie -> wlk
+ - wielkopolska -> wlkp
+ - wielkopolski -> wlkp
+ - wielkopolskie -> wlkp
+ - wojewodztwie -> woj
+ - wojewodztwo -> woj
+ - yard -> yd
+ - zgornja -> zg
+ - zgornje -> zg
+ - zgornji -> zg
+ - zhilishchien komplieks -> zh k
+ - zum -> z
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
- def _create_config(suffixes, abbr):
+ def _create_config(*variants, **kwargs):
+ # Write a rule file with one 'variants' entry: each positional argument
+ # becomes an item of its 'words' list, each keyword argument an extra
+ # 'key: value' line of that entry. Returns the path of the written file.
content = dedent("""\
normalization:
- ":: NFD ()"
- ":: Latin ()"
- "'🜵' > ' '"
""")
- content += "compound_suffixes:\n"
- content += '\n'.join((" - " + s for s in suffixes)) + '\n'
- content += "abbreviations:\n"
- content += '\n'.join((" - " + s for s in abbr)) + '\n'
+ content += "variants:\n - words:\n"
+ content += '\n'.join((" - " + s for s in variants)) + '\n'
+ # Use items(): iterating the dict directly yields only the keys, so
+ # unpacking into (k, v) would raise or split a two-character key.
+ for k, v in kwargs.items():
+ content += " {}: {}\n".format(k, v)
fpath = tmp_path / ('test_config' + suffix)
fpath.write_text(dedent(content))
return fpath
return proc.get_variants_ascii(proc.get_normalized(name))
def test_simple_variants(cfgfile):
- fpath = cfgfile(['strasse', 'straße', 'weg'],
- ['strasse,straße => str',
- 'prospekt => pr'])
+ fpath = cfgfile('~strasse,~straße -> str',
+ '~weg => weg',
+ 'prospekt -> pr')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
def test_variants_empty(cfgfile):
- fpath = cfgfile([], ['saint => 🜵', 'street => st'])
+ fpath = cfgfile('saint -> 🜵', 'street -> st')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
def test_multiple_replacements(cfgfile):
- fpath = cfgfile([], ['saint => s,st', 'street => st'])
+ fpath = cfgfile('saint -> s,st', 'street -> st')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
def test_search_normalized(cfgfile):
- fpath = cfgfile(['street'], ['street => s,st', 'master => mstr'])
+ fpath = cfgfile('~street => s,st', 'master => mstr')
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
- def _create_config(suffixes, abbr):
+ def _create_config(*variants, **kwargs):
+ # Write a rule file with one 'variants' entry: each positional argument
+ # becomes an item of its 'words' list, each keyword argument an extra
+ # 'key: value' line of that entry. Returns the path of the written file.
content = dedent("""\
normalization:
- ":: NFD ()"
- ":: Latin ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
""")
- content += "compound_suffixes:\n"
- content += '\n'.join((" - " + s for s in suffixes)) + '\n'
- content += "abbreviations:\n"
- content += '\n'.join((" - " + s for s in abbr)) + '\n'
+ content += "variants:\n - words:\n"
+ content += '\n'.join((" - " + s for s in variants)) + '\n'
+ # Use items(): iterating the dict directly yields only the keys, so
+ # unpacking into (k, v) would raise or split a two-character key.
+ for k, v in kwargs.items():
+ content += " {}: {}\n".format(k, v)
fpath = tmp_path / ('test_config' + suffix)
fpath.write_text(dedent(content))
return fpath
fpath.write_text(dedent("""\
normalization:
transliteration:
- compound_suffixes:
- abbreviations:
+ variants:
"""))
rules = ICURuleLoader(fpath)
assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == ''
- assert rules.get_replacement_pairs() == []
+ assert list(rules.get_replacement_pairs()) == []
-CONFIG_SECTIONS = ('normalization', 'transliteration',
- 'compound_suffixes', 'abbreviations')
+CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
def test_missing_normalization(tmp_path, section):
with pytest.raises(UsageError):
ICURuleLoader(fpath)
-@pytest.mark.parametrize("abbr", ["simple",
- "double => arrow => bad",
- "bad = > arrow"])
-def test_bad_abbreviation_syntax(tmp_path, abbr):
- fpath = tmp_path / ('test_config.yaml')
- fpath.write_text(dedent("""\
- normalization:
- transliteration:
- compound_suffixes:
- abbreviations:
- - {}
- """.format(abbr)))
-
- with pytest.raises(UsageError):
- rules = ICURuleLoader(fpath)
-
def test_get_search_rules(cfgfile):
- fpath = cfgfile(['strasse', 'straße', 'weg'],
- ['strasse,straße => str',
- 'prospekt => pr'])
-
- loader = ICURuleLoader(fpath)
+ loader = ICURuleLoader(cfgfile())
rules = loader.get_search_rules()
trans = Transliterator.createFromRules("test", rules)
def test_get_normalization_rules(cfgfile):
- fpath = cfgfile(['strasse', 'straße', 'weg'],
- ['strasse,straße => str'])
-
- loader = ICURuleLoader(fpath)
+ loader = ICURuleLoader(cfgfile())
rules = loader.get_normalization_rules()
trans = Transliterator.createFromRules("test", rules)
def test_get_transliteration_rules(cfgfile):
- fpath = cfgfile(['strasse', 'straße', 'weg'],
- ['strasse,straße => str'])
-
- loader = ICURuleLoader(fpath)
+ loader = ICURuleLoader(cfgfile())
rules = loader.get_transliteration_rules()
trans = Transliterator.createFromRules("test", rules)
transliteration:
- "'ax' > 'b'"
- !include transliteration.yaml
- compound_suffixes:
- abbreviations:
+ variants:
"""))
transpath = tmp_path / ('transliteration.yaml')
transpath.write_text('- "x > y"')
assert trans.transliterate(" axxt ") == " byt "
-def test_get_replacement_pairs_multi_to(cfgfile):
- fpath = cfgfile(['Pfad', 'Strasse'],
- ['Strasse => str,st'])
+class TestGetReplacements:
- repl = ICURuleLoader(fpath).get_replacement_pairs()
+ @pytest.fixture(autouse=True)
+ def setup_cfg(self, cfgfile):
+ self.cfgfile = cfgfile
- assert [(a, sorted(b)) for a, b in repl] == \
- [(' strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
- ('strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
- (' pfad ', [' pfad ', 'pfad ']),
- ('pfad ', [' pfad ', 'pfad '])]
+ def get_replacements(self, *variants):
+ loader = ICURuleLoader(self.cfgfile(*variants))
+ rules = loader.get_replacement_pairs()
+ return set((v.source, v.replacement) for v in rules)
-def test_get_replacement_pairs_multi_from(cfgfile):
- fpath = cfgfile([], ['saint,Sainte => st'])
- repl = ICURuleLoader(fpath).get_replacement_pairs()
+ @pytest.mark.parametrize("variant", ['foo > bar', 'foo -> bar -> bar',
+ '~foo~ -> bar', 'fo~ o -> bar'])
+ def test_invalid_variant_description(self, variant):
+ with pytest.raises(UsageError):
+ ICURuleLoader(self.cfgfile(variant))
- assert [(a, sorted(b)) for a, b in repl] == \
- [(' sainte ', [' sainte ', ' st ']),
- (' saint ', [' saint ', ' st '])]
+ def test_add_full(self):
+ repl = self.get_replacements("foo -> bar")
+ assert repl == {(' foo ', ' bar '), (' foo ', ' foo ')}
-def test_get_replacement_pairs_cross_abbreviations(cfgfile):
- fpath = cfgfile([], ['saint,Sainte => st',
- 'sainte => ste'])
- repl = ICURuleLoader(fpath).get_replacement_pairs()
+ def test_replace_full(self):
+ repl = self.get_replacements("foo => bar")
- assert [(a, sorted(b)) for a, b in repl] == \
- [(' sainte ', [' sainte ', ' st ', ' ste ']),
- (' saint ', [' saint ', ' st '])]
+ assert repl == {(' foo ', ' bar ')}
-@pytest.mark.parametrize("abbr", ["missing to =>",
- " => missing from",
- "=>"])
-def test_bad_abbreviation_syntax(tmp_path, abbr):
- fpath = tmp_path / ('test_config.yaml')
- fpath.write_text(dedent("""\
- normalization:
- transliteration:
- compound_suffixes:
- abbreviations:
- - {}
- """.format(abbr)))
+ def test_add_suffix_no_decompose(self):
+ repl = self.get_replacements("~berg |-> bg")
+
+ assert repl == {('berg ', 'berg '), ('berg ', 'bg '),
+ (' berg ', ' berg '), (' berg ', ' bg ')}
+
+
+ def test_replace_suffix_no_decompose(self):
+ repl = self.get_replacements("~berg |=> bg")
+
+ assert repl == {('berg ', 'bg '), (' berg ', ' bg ')}
+
+
+ def test_add_suffix_decompose(self):
+ repl = self.get_replacements("~berg -> bg")
+
+ assert repl == {('berg ', 'berg '), ('berg ', ' berg '),
+ (' berg ', ' berg '), (' berg ', 'berg '),
+ ('berg ', 'bg '), ('berg ', ' bg '),
+ (' berg ', 'bg '), (' berg ', ' bg ')}
+
+
+ def test_replace_suffix_decompose(self):
+ repl = self.get_replacements("~berg => bg")
+
+ assert repl == {('berg ', 'bg '), ('berg ', ' bg '),
+ (' berg ', 'bg '), (' berg ', ' bg ')}
+
+
+ def test_add_prefix_no_compose(self):
+ repl = self.get_replacements("hinter~ |-> hnt")
+
+ assert repl == {(' hinter', ' hinter'), (' hinter ', ' hinter '),
+ (' hinter', ' hnt'), (' hinter ', ' hnt ')}
+
+
+ def test_replace_prefix_no_compose(self):
+ repl = self.get_replacements("hinter~ |=> hnt")
+
+ assert repl == {(' hinter', ' hnt'), (' hinter ', ' hnt ')}
+
+
+ def test_add_prefix_compose(self):
+ repl = self.get_replacements("hinter~-> h")
+
+ assert repl == {(' hinter', ' hinter'), (' hinter', ' hinter '),
+ (' hinter', ' h'), (' hinter', ' h '),
+ (' hinter ', ' hinter '), (' hinter ', ' hinter'),
+ (' hinter ', ' h '), (' hinter ', ' h')}
+
+
+ def test_replace_prefix_compose(self):
+ repl = self.get_replacements("hinter~=> h")
+
+ assert repl == {(' hinter', ' h'), (' hinter', ' h '),
+ (' hinter ', ' h '), (' hinter ', ' h')}
+
+
+ def test_add_beginning_only(self):
+ repl = self.get_replacements("^Premier -> Pr")
+
+ assert repl == {('^ premier ', '^ premier '), ('^ premier ', '^ pr ')}
+
+
+ def test_replace_beginning_only(self):
+ repl = self.get_replacements("^Premier => Pr")
+
+ assert repl == {('^ premier ', '^ pr ')}
+
+
+ def test_add_final_only(self):
+ repl = self.get_replacements("road$ -> rd")
+
+ assert repl == {(' road ^', ' road ^'), (' road ^', ' rd ^')}
+
+
+ def test_replace_final_only(self):
+ repl = self.get_replacements("road$ => rd")
+
+ assert repl == {(' road ^', ' rd ^')}
+
+
+ def test_decompose_only(self):
+ repl = self.get_replacements("~foo -> foo")
+
+ assert repl == {('foo ', 'foo '), ('foo ', ' foo '),
+ (' foo ', 'foo '), (' foo ', ' foo ')}
+
+
+ def test_add_suffix_decompose_end_only(self):
+ repl = self.get_replacements("~berg |-> bg", "~berg$ -> bg")
+
+ assert repl == {('berg ', 'berg '), ('berg ', 'bg '),
+ (' berg ', ' berg '), (' berg ', ' bg '),
+ ('berg ^', 'berg ^'), ('berg ^', ' berg ^'),
+ ('berg ^', 'bg ^'), ('berg ^', ' bg ^'),
+ (' berg ^', 'berg ^'), (' berg ^', 'bg ^'),
+ (' berg ^', ' berg ^'), (' berg ^', ' bg ^')}
+
+
+ def test_replace_suffix_decompose_end_only(self):
+ repl = self.get_replacements("~berg |=> bg", "~berg$ => bg")
+
+ assert repl == {('berg ', 'bg '), (' berg ', ' bg '),
+ ('berg ^', 'bg ^'), ('berg ^', ' bg ^'),
+ (' berg ^', 'bg ^'), (' berg ^', ' bg ^')}
+
- repl = ICURuleLoader(fpath).get_replacement_pairs()
+ def test_add_multiple_suffix(self):
+ repl = self.get_replacements("~berg,~burg -> bg")
- assert repl == []
+ assert repl == {('berg ', 'berg '), ('berg ', ' berg '),
+ (' berg ', ' berg '), (' berg ', 'berg '),
+ ('berg ', 'bg '), ('berg ', ' bg '),
+ (' berg ', 'bg '), (' berg ', ' bg '),
+ ('burg ', 'burg '), ('burg ', ' burg '),
+ (' burg ', ' burg '), (' burg ', 'burg '),
+ ('burg ', 'bg '), ('burg ', ' bg '),
+ (' burg ', 'bg '), (' burg ', ' bg ')}
monkeypatch.undo()
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
- suffixes=('gasse', ), abbr=('street => st', )):
+ variants=('~gasse -> gasse', 'street => st', )):
cfgfile = tmp_path / 'analyser_test_config.yaml'
with cfgfile.open('w') as stream:
cfgstr = {'normalization' : list(norm),
'transliteration' : list(trans),
- 'compound_suffixes' : list(suffixes),
- 'abbreviations' : list(abbr)}
+ 'variants' : [ {'words': list(variants)}]}
yaml.dump(cfgstr, stream)
tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))