#!/bin/bash
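+ # psqlcmd pipes both stdout and stderr (|&) through grep, dropping the
+ # expected notices from "DROP TABLE IF EXISTS" and the error messages for
+ # rows rejected by check constraints, so that only real problems show up.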
psqlcmd() {
- psql --quiet wikiprocessingdb
+ psql --quiet wikiprocessingdb |& \
+ grep -v 'does not exist, skipping' |& \
+ grep -v 'violates check constraint' |& \
+ grep -vi 'Failing row contains'
}
mysql2pgsqlcmd() {
- ./mysql2pgsql.perl /dev/stdin /dev/stdout
+ ./mysql2pgsql.perl --nodrop /dev/stdin /dev/stdout
}
download() {
echo "Downloading $1"
- wget --quiet --no-clobber --tries 3
+ wget --quiet --no-clobber --tries=3 "$1"
}
for i in "${LANGUAGES[@]}"
do
echo "Language: $i"
+ # We pre-create the table schema. This allows us to
+ # 1. Skip index creation. Most queries we do are full table scans.
+ # 2. Add a constraint to only import namespace=0 (wikipedia articles).
+ # Both cut down data size considerably (50%+).
+
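+ # For example (an illustrative sketch, not necessarily how the full script
+ # declares it), such a constraint could live directly in the column definition:
+ #     pl_namespace int NOT NULL DEFAULT '0' CHECK (pl_namespace = 0),
+ # Rows outside namespace 0 then fail on import, and their error messages are
+ # swallowed by the 'violates check constraint' filter in psqlcmd above.
+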
echo "Importing ${i}wiki-latest-pagelinks"
- gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd
+
+ echo "DROP TABLE IF EXISTS ${i}pagelinks;" | psqlcmd
+ echo "CREATE TABLE ${i}pagelinks (
+ pl_from int NOT NULL DEFAULT '0',
+ pl_namespace int NOT NULL DEFAULT '0',
+ pl_title text NOT NULL DEFAULT '',
+ pl_from_namespace int NOT NULL DEFAULT '0'
+ );" | psqlcmd
+
+ time \
+ gzip -dc ${i}wiki-latest-pagelinks.sql.gz | \
+ sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | \
+ mysql2pgsqlcmd | \
+ grep -v '^CREATE INDEX ' | \
+ psqlcmd
+
+
+
echo "Importing ${i}wiki-latest-page"
- gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
+
+ # page_id is declared as a plain 4-byte int here; the MySQL dump's
+ # autoincrement column would otherwise be converted to serial8.
+ echo "DROP TABLE IF EXISTS ${i}page;" | psqlcmd
+ echo "CREATE TABLE ${i}page (
+ page_id int NOT NULL,
+ page_namespace int NOT NULL DEFAULT '0',
+ page_title text NOT NULL DEFAULT '',
+ page_restrictions text NOT NULL,
+ page_is_redirect smallint NOT NULL DEFAULT '0',
+ page_is_new smallint NOT NULL DEFAULT '0',
+ page_random double precision NOT NULL DEFAULT '0',
+ page_touched text NOT NULL DEFAULT '',
+ page_links_updated text DEFAULT NULL,
+ page_latest int NOT NULL DEFAULT '0',
+ page_len int NOT NULL DEFAULT '0',
+ page_content_model text DEFAULT NULL,
+ page_lang text DEFAULT NULL
+ );" | psqlcmd
+
+ time \
+ gzip -dc ${i}wiki-latest-page.sql.gz | \
+ sed "s/\`page\`/\`${i}page\`/g" | \
+ mysql2pgsqlcmd | \
+ grep -v '^CREATE INDEX ' | \
+ psqlcmd
+
+
+
echo "Importing ${i}wiki-latest-langlinks"
- gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd
+
+ echo "DROP TABLE IF EXISTS ${i}langlinks;" | psqlcmd
+ echo "CREATE TABLE ${i}langlinks (
+ ll_from int NOT NULL DEFAULT '0',
+ ll_lang text NOT NULL DEFAULT '',
+ ll_title text NOT NULL DEFAULT ''
+ );" | psqlcmd
+
+ time \
+ gzip -dc ${i}wiki-latest-langlinks.sql.gz | \
+ sed "s/\`langlinks\`/\`${i}langlinks\`/g" | \
+ mysql2pgsqlcmd | \
+ grep -v '^CREATE INDEX ' | \
+ psqlcmd
+
+
+
+
echo "Importing ${i}wiki-latest-redirect"
- gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
+
+ echo "DROP TABLE IF EXISTS ${i}redirect;" | psqlcmd
+ echo "CREATE TABLE ${i}redirect (
+ rd_from int NOT NULL DEFAULT '0',
+ rd_namespace int NOT NULL DEFAULT '0',
+ rd_title text NOT NULL DEFAULT '',
+ rd_interwiki text DEFAULT NULL,
+ rd_fragment text DEFAULT NULL
+ );" | psqlcmd
+
+ time \
+ gzip -dc ${i}wiki-latest-redirect.sql.gz | \
+ sed "s/\`redirect\`/\`${i}redirect\`/g" | \
+ mysql2pgsqlcmd | \
+ grep -v '^CREATE INDEX ' | \
+ psqlcmd
done
echo "CREATE TABLE ${i}pagelinkcount
AS
SELECT pl_title AS title,
- COUNT(*) AS count
+ COUNT(*) AS count,
+ 0::bigint as othercount
FROM ${i}pagelinks
WHERE pl_namespace = 0
GROUP BY pl_title
;" | psqlcmd
- echo "ALTER TABLE ${i}pagelinkcount
- ADD COLUMN othercount integer
- ;" | psqlcmd
+done
- echo "UPDATE ${i}pagelinkcount
- SET othercount = 0
- ;" | psqlcmd
+for i in "${LANGUAGES[@]}"
+do
for j in "${LANGUAGES[@]}"
do
echo "UPDATE ${i}pagelinkcount
echo "DROP TABLE ${i}redirect;" | psqlcmd
echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd
done
+
+echo "all done."
log = logging.getLogger()
def make_connection(options, asynchronous=False):
- return psycopg2.connect(dbname=options.dbname, user=options.user,
- password=options.password, host=options.host,
- port=options.port, async_=asynchronous)
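+ # Passing the parameters as a dict avoids writing 'async' as a literal
+ # keyword argument: 'async' is a reserved word from Python 3.7 on, and the
+ # 'async_' alias only exists in newer psycopg2 releases.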
+ params = {'dbname' : options.dbname,
+ 'user' : options.user,
+ 'password' : options.password,
+ 'host' : options.host,
+ 'port' : options.port,
+ 'async' : asynchronous}
+
+ return psycopg2.connect(**params)
class RankRunner(object):
"""
def __init__(self, options):
+ self.current_query = None
+ self.current_params = None
+
+ self.conn = None
+ self.connect()
+
+ def connect(self):
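+ """ (Re)create the database connection and cursor, closing any
+ previously open connection first.
+ """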
+ if self.conn is not None:
+ self.cursor.close()
+ self.conn.close()
+
self.conn = make_connection(options, asynchronous=True)
self.wait()
self.cursor = self.conn.cursor()
- self.current_query = None
- self.current_params = None
-
def wait(self):
""" Block until any pending operation is done.
"""
- wait_select(self.conn)
- self.current_query = None
+ while True:
+ try:
+ wait_select(self.conn)
+ self.current_query = None
+ return
+ except psycopg2.extensions.TransactionRollbackError as e:
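+ # PostgreSQL reports deadlocks with error code 40P01; the server has
+ # already aborted our statement, so simply re-running it is the
+ # intended recovery here.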
+ if e.pgcode == '40P01':
+ log.info("Deadlock detected (params = {}), retry."
+ .format(self.current_params))
+ self.cursor.execute(self.current_query, self.current_params)
+ else:
+ raise
+ except psycopg2.errors.DeadlockDetected:
+ self.cursor.execute(self.current_query, self.current_params)
def perform(self, sql, args=None):
""" Send SQL query to the server. Returns immediately without
self.cursor.execute(self.current_query, self.current_params)
else:
raise
+ except psycopg2.errors.DeadlockDetected:
+ self.cursor.execute(self.current_query, self.current_params)
return False
sending a query.
"""
ready = self.threads
+ command_stat = 0
while True:
for thread in ready:
if thread.is_done():
+ command_stat += 1
yield thread
- ready, _, _ = select.select(self.threads, [], [])
+ # Refresh the connections occasionally to avoid potential
+ # memory leaks in PostgreSQL.
+ if command_stat > 100000:
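+ # Wait for every thread to finish its in-flight query before
+ # closing and reopening its connection.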
+ for t in self.threads:
+ while not t.is_done():
+ wait_select(t.conn)
+ t.connect()
+ command_stat = 0
+ ready = self.threads
+ else:
+ ready, _, _ = select.select(self.threads, [], [])
assert False, "Unreachable code"
planet extract. A precompiled PBF with the necessary data can be downloaded from
https://www.nominatim.org/data/test/nominatim-api-testdata.pbf
+You need at least 2GB of RAM and 10GB of disk space.
+
The polygons defining the extract can be found in the test/testdb
directory. There is also a reduced set of wikipedia data for this extract,
which you need to import as well (a hypothetical sketch of that step follows
the Tiger commands below). For the Tiger tests, the data for South Dakota is
required. Get the Tiger files `46*`:
-The official test dataset is derived from the 160725 planet. Newer
+ cd Nominatim/data
+ wget https://nominatim.org/data/tiger2018-nominatim-preprocessed.tar.gz
+ tar xvf tiger2018-nominatim-preprocessed.tar.gz --wildcards --no-anchored '46*'
+ rm tiger2018-nominatim-preprocessed.tar.gz
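+
+The reduced wikipedia data mentioned above is imported the same way; a
+hypothetical sketch (file name, URL and target database are placeholders,
+not real locations):
+
+ cd Nominatim/data
+ wget https://nominatim.org/data/<wikipedia-testdb-dump>.sql.gz
+ gzip -dc <wikipedia-testdb-dump>.sql.gz | psql <your-test-database>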
+
+The official test dataset is derived from the 180924 planet. Newer
planets are likely to work as well but you may see isolated test
failures where the data has changed. To recreate the input data
for the test database run: