summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDennis Nienhüser <nienhueser@kde.org>2016-11-13 08:57:39 (GMT)
committerDennis Nienhüser <nienhueser@kde.org>2016-11-13 09:07:43 (GMT)
commit850e124bf85f9c568df6d5cca5bb6f27c4fd7371 (patch)
tree08adc555dbbd0807892a493fab250ac5897dd4fa
parent9e11b578f18ea3d905e26d7c2d2ec68563f69e44 (diff)
Correct nat earth city names at level 7/9 with geonames database.
The Natural Earth data set has a lot of spelling and encoding errors for cities in the detailed data set. During tile creation, the largest cities database from geonames.org is now downloaded and used to correct city names. Each city name is corrected to that of the city whose name has the smallest Levenshtein distance and which is not more than 5 km away. The distance check needs to allow for noise as well, since there is no agreed city center point between Natural Earth and geonames. Internally, the cities database is stored in a tile hash to provide a fast lookup. Candidates are retrieved from the matching tile and (due to spatial noise in the city center coordinates) its neighbors. The correction runs very fast, but might still need some minor parameter tweaking to avoid false negatives as well as false positives. Checking alternative city names from geonames.org might be helpful as well to better deal with translations.
-rw-r--r--tools/natural-earth-vector-tiling/ne_tilegenerator.py44
-rw-r--r--tools/vectorosm-tilecreator/CMakeLists.txt1
-rw-r--r--tools/vectorosm-tilecreator/SpellChecker.cpp189
-rw-r--r--tools/vectorosm-tilecreator/SpellChecker.h42
-rw-r--r--tools/vectorosm-tilecreator/main.cpp8
5 files changed, 266 insertions, 18 deletions
diff --git a/tools/natural-earth-vector-tiling/ne_tilegenerator.py b/tools/natural-earth-vector-tiling/ne_tilegenerator.py
index c36aa45..8639d9a 100644
--- a/tools/natural-earth-vector-tiling/ne_tilegenerator.py
+++ b/tools/natural-earth-vector-tiling/ne_tilegenerator.py
@@ -27,6 +27,8 @@ from subprocess import call
def unzip_file(filename, in_dir):
print(in_dir)
path_zip = os.path.join(in_dir, filename + '.zip')
+ if not os.path.exists(path_zip):
+ path_zip = os.path.splitext(path_zip)[0] + '.zip'
with zipfile.ZipFile(path_zip ,"r") as zip_ref:
path_dir = os.path.join(in_dir, filename)
os.mkdir(path_dir)
@@ -34,18 +36,20 @@ def unzip_file(filename, in_dir):
os.remove(path_zip)
def generate_url(filename):  # Build the download URL for the data set called `filename`.
- url = 'www.naturalearthdata.com/http//www.naturalearthdata.com/download/'
- cultural_tokens = ['admin', 'populated', 'roads', 'railroads', 'airports', 'ports', 'urban', 'parks', 'time', 'cultural']
- file_tokens = filename.split('_')
- url += file_tokens[1] + '/'
- data_type = 'physical'
- for token in file_tokens:
- if token in cultural_tokens:
- data_type = 'cultural'
- break
- url += data_type + '/'
- url += filename + '.zip'
- return url
+ if filename == 'cities15000.txt':  # the geonames.org cities dump has its own fixed URL
+ return 'http://download.geonames.org/export/dump/cities15000.zip'
+ url = 'www.naturalearthdata.com/http//www.naturalearthdata.com/download/'  # NOTE(review): no URL scheme here, unlike the geonames URL above; presumably the caller copes with that - confirm
+ cultural_tokens = ['admin', 'populated', 'roads', 'railroads', 'airports', 'ports', 'urban', 'parks', 'time', 'cultural']  # name tokens that select the 'cultural' directory
+ file_tokens = filename.split('_')
+ url += file_tokens[1] + '/'  # second token of the name becomes a directory, e.g. '50m' for 'ne_50m_...'
+ data_type = 'physical'  # default directory unless a cultural token is found below
+ for token in file_tokens:
+ if token in cultural_tokens:
+ data_type = 'cultural'
+ break
+ url += data_type + '/'
+ url += filename + '.zip'  # the archive is named after the data set
+ return url
def download(filename, in_dir):
url = generate_url(filename)
@@ -108,6 +112,7 @@ if __name__ == "__main__":
args = parser.parse_args()
exception_names = ['ne_50m_admin_1_states_provinces_lines']
+ check_existence('cities15000.txt', args.in_dir)
level_info = parse_file(args.file, args.in_dir)
for level in level_info:
@@ -123,9 +128,12 @@ if __name__ == "__main__":
path = os.path.join(args.in_dir, filename) + '/' + filename + '_shp.shp'
abs_file_paths.append(path)
print('Level has following SHP datasets: ', abs_file_paths)
- polyshp2osm.run(abs_file_paths, 1, 5000000, 'tiny_planet_{}'.format(level))
- print('Tiny planetosm for Level = {} complete.'.format(level))
- f = open('bound_info_{}'.format(level), "w")
- print('tiny_planet_{}.1.osm;Level;-180.0;-86.0;180.0;86.0'.format(level), file=f)
- f.close()
- call(["marble-vectorosm-tilecreator", "-e", "o5m", "-z", str(level), "-o", args.out_dir, 'tiny_planet_{}.1.osm'.format(level)])
+ target = 'tiny_planet_{}.1.osm'.format(level)
+ if args.overwrite or not os.path.exists(target):
+ polyshp2osm.run(abs_file_paths, 1, 5000000, 'tiny_planet_{}'.format(level))
+ print('Tiny planetosm for Level = {} complete.'.format(level))
+ f = open('bound_info_{}'.format(level), "w")
+ print('tiny_planet_{}.1.osm;Level;-180.0;-86.0;180.0;86.0'.format(level), file=f)
+ f.close()
+ spellcheck = [] if level < 6 else ['-s', os.path.join(args.in_dir, 'cities15000.txt', 'cities15000.txt')]
+ call(["marble-vectorosm-tilecreator", "-e", "o5m", "-z", str(level)] + spellcheck + ["-o", args.out_dir, target])
diff --git a/tools/vectorosm-tilecreator/CMakeLists.txt b/tools/vectorosm-tilecreator/CMakeLists.txt
index c61bc92..2c10b73 100644
--- a/tools/vectorosm-tilecreator/CMakeLists.txt
+++ b/tools/vectorosm-tilecreator/CMakeLists.txt
@@ -25,6 +25,7 @@ BaseClipper.cpp
BaseFilter.cpp
NodeReducer.cpp
PeakAnalyzer.cpp
+SpellChecker.cpp
TagsFilter.cpp
TileIterator.cpp
TileDirectory.cpp
diff --git a/tools/vectorosm-tilecreator/SpellChecker.cpp b/tools/vectorosm-tilecreator/SpellChecker.cpp
new file mode 100644
index 0000000..6cf58ef
--- /dev/null
+++ b/tools/vectorosm-tilecreator/SpellChecker.cpp
@@ -0,0 +1,189 @@
+//
+// This file is part of the Marble Virtual Globe.
+//
+// This program is free software licensed under the GNU LGPL. You can
+// find a copy of this license in LICENSE.txt in the top directory of
+// the source code.
+//
+// Copyright 2016 Dennis Nienhüser <nienhueser@kde.org>
+//
+
+#include "SpellChecker.h"
+#include "GeoDataPlacemark.h"
+#include "MarbleMath.h"
+#include "OsmPlacemarkData.h"
+#include "MarbleDirs.h"
+#include "MarbleModel.h"
+#include "ParsingRunnerManager.h"
+#include "GeoSceneMercatorTileProjection.h"
+#include "TileId.h"
+
+#include <QSet>
+#include <QDebug>
+#include <QFile>
+
+namespace Marble {
+
+/**
+ * Reads the cities file and indexes its entries by tile.
+ *
+ * parseCities() reads m_tileLevel, so that member has to be declared
+ * (and therefore initialized) ahead of m_tileHash.
+ */
+SpellChecker::SpellChecker(const QString &citiesFile)
+    : m_tileLevel(10)
+    , m_tileHash(parseCities(citiesFile))
+    , m_verbose(false)
+{
+}
+
+/**
+ * Compares the name of every city-like placemark with the reference cities
+ * around it and renames the placemark when a close enough match is found.
+ *
+ * Only the candidate with the smallest Levenshtein distance is considered.
+ * It replaces the current name when it is less than 5 km away and its
+ * Levenshtein distance to the current name is below 6. Both the placemark
+ * name and its OSM "name" tag are updated.
+ */
+void SpellChecker::correctPlaceLabels(const QVector<GeoDataPlacemark*> &placemarks)
+{
+    // distanceSphere() is scaled by EARTH_RADIUS to get meters, so scale the limit down instead
+    double const maxDistance = 5000.0 / EARTH_RADIUS;
+    int corrected = 0;
+    int validated = 0;
+    int missed = 0;
+
+    auto const cities = cityPlaces(placemarks);
+    for (auto city: cities) {
+        auto const candidates = candidatesFor(city);
+        QString const originalName = city->name();
+
+        if (candidates.isEmpty()) {
+            if (m_verbose) {
+                qDebug() << "No match for " << originalName << " at " << city->coordinate().toString(GeoDataCoordinates::Decimal) << " and no candidates for replacement";
+            }
+            ++missed;
+            continue;
+        }
+
+        // Candidates are sorted by Levenshtein distance, the first one is the best guess
+        auto const best = candidates.first();
+        if (best->name() == originalName) {
+            ++validated;
+            continue;
+        }
+
+        bool const isNearby = distanceSphere(best->coordinate(), city->coordinate()) < maxDistance;
+        if (isNearby && levenshteinDistance(best->name(), originalName) < 6) {
+            if (m_verbose) {
+                qDebug() << "Correcting" << originalName << "to" << best->name();
+            }
+            city->setName(best->name());
+            city->osmData().removeTag("name");
+            city->osmData().addTag("name", best->name());
+            ++corrected;
+            continue;
+        }
+
+        if (m_verbose) {
+            qDebug() << "No match for " << originalName << ", candidates: ";
+            for (auto candidate: candidates) {
+                qDebug() << distanceSphere(candidate->coordinate(), city->coordinate()) * EARTH_RADIUS << " m, "
+                         << "levenshtein distance " << levenshteinDistance(originalName, candidate->name()) << ":" << candidate->name();
+            }
+        }
+        ++missed;
+    }
+
+    if (m_verbose) {
+        qDebug() << "In total there are " << corrected << " corrections, " << validated << " validations and " << missed << " misses";
+    }
+}
+
+void SpellChecker::setVerbose(bool verbose) // Switches the qDebug() reporting of correctPlaceLabels() on or off.
+{
+ m_verbose = verbose; // read by correctPlaceLabels() before every qDebug() message
+}
+
+/**
+ * Returns those entries of @p placemarks whose visual category is one of
+ * the place categories listed below, in their original order.
+ */
+QVector<GeoDataPlacemark *> SpellChecker::cityPlaces(const QVector<GeoDataPlacemark *> &placemarks) const
+{
+    QSet<GeoDataPlacemark::GeoDataVisualCategory> const placeCategories = {
+        GeoDataPlacemark::PlaceCity,
+        GeoDataPlacemark::PlaceCityCapital,
+        GeoDataPlacemark::PlaceSuburb,
+        GeoDataPlacemark::PlaceHamlet,
+        GeoDataPlacemark::PlaceLocality,
+        GeoDataPlacemark::PlaceTown,
+        GeoDataPlacemark::PlaceTownCapital,
+        GeoDataPlacemark::PlaceVillage,
+        GeoDataPlacemark::PlaceVillageCapital
+    };
+
+    QVector<GeoDataPlacemark*> result;
+    for (auto placemark: placemarks) {
+        if (placeCategories.contains(placemark->visualCategory())) {
+            result << placemark;
+        }
+    }
+    return result;
+}
+
+/**
+ * Parses a tab separated geonames.org cities file and groups the cities by
+ * the tile (at m_tileLevel) that contains them.
+ *
+ * Columns used: 1 = name, 4 = latitude, 5 = longitude, 15 = elevation.
+ * Lines with fewer than 16 columns or with an unparsable longitude or
+ * latitude are skipped.
+ *
+ * @param filename Path of the cities file
+ * @return Heap allocated placemarks (name and coordinate set), keyed by
+ *         tile. An empty hash is returned when the file cannot be opened.
+ */
+QHash<TileId, QVector<GeoDataPlacemark *> > SpellChecker::parseCities(const QString &filename) const
+{
+    QHash<TileId, QVector<GeoDataPlacemark*> > placeLabels;
+    QFile file(filename);
+    if (!file.open(QIODevice::ReadOnly)) {
+        qDebug() << "Cannot open " << filename << ":" << file.errorString();
+        return placeLabels;
+    }
+
+    while (!file.atEnd()) {
+        QByteArray const line = file.readLine();
+        auto const values = line.split('\t');
+        if (values.size() <= 15) {
+            continue;
+        }
+
+        // Validate the coordinates before allocating the placemark, so a
+        // rejected line does not leak it
+        bool ok = false;
+        double const lon = values[5].toDouble(&ok);
+        if (!ok) {
+            qDebug() << values[5] << " is no longitude";
+            continue;
+        }
+        double const lat = values[4].toDouble(&ok);
+        if (!ok) {
+            qDebug() << values[4] << " is no latitude";
+            continue;
+        }
+        // An unparsable elevation deliberately falls back to 0
+        double const ele = values[15].toDouble();
+        GeoDataCoordinates const coordinate(lon, lat, ele, GeoDataCoordinates::Degree);
+
+        GeoDataPlacemark* city = new GeoDataPlacemark;
+        city->setName(QString::fromUtf8(values[1]));
+        city->setCoordinate(coordinate);
+
+        placeLabels[TileId::fromCoordinates(coordinate, m_tileLevel)] << city;
+    }
+    return placeLabels;
+}
+
+/**
+ * Returns the Levenshtein (edit) distance between @p a and @p b: the minimum
+ * number of single character insertions, deletions and substitutions that
+ * turn one string into the other.
+ */
+int SpellChecker::levenshteinDistance(const QString &a, const QString &b)
+{
+    // From https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
+    unsigned int const rows = a.size();
+    unsigned int const columns = b.size();
+    // table[r][c] is the distance between the first r characters of a and the first c characters of b
+    std::vector<std::vector<unsigned int>> table(rows + 1, std::vector<unsigned int>(columns + 1));
+
+    for (unsigned int r = 0; r <= rows; ++r) {
+        table[r][0] = r;
+    }
+    for (unsigned int c = 0; c <= columns; ++c) {
+        table[0][c] = c;
+    }
+
+    for (unsigned int r = 1; r <= rows; ++r) {
+        for (unsigned int c = 1; c <= columns; ++c) {
+            unsigned int const deletion = table[r - 1][c] + 1;
+            unsigned int const insertion = table[r][c - 1] + 1;
+            unsigned int const substitution = table[r - 1][c - 1] + (a[r - 1] == b[c - 1] ? 0 : 1);
+            table[r][c] = std::min(std::min(deletion, insertion), substitution);
+        }
+    }
+    return table[rows][columns];
+}
+
+/**
+ * Collects the reference cities stored for the tile containing
+ * @p placemark and for its up to eight neighbor tiles, sorted by ascending
+ * Levenshtein distance between their name and the placemark's name.
+ *
+ * Neighbor tiles are included because the city center coordinates of the
+ * two data sets differ, so a city may end up in an adjacent tile.
+ */
+QVector<GeoDataPlacemark *> SpellChecker::candidatesFor(GeoDataPlacemark *placemark) const
+{
+    // Largest valid tile index at m_tileLevel (2^level tiles per axis)
+    int const maxIndex = (1 << m_tileLevel) - 1;
+    auto const tile = TileId::fromCoordinates(placemark->coordinate(), m_tileLevel);
+
+    // The bounds are inclusive. Stopping before the upper bound would skip
+    // the right and lower neighbors, and on the last row or column even the
+    // placemark's own tile.
+    int const xMin = qMax(0, tile.x() - 1);
+    int const xMax = qMin(maxIndex, tile.x() + 1);
+    int const yMin = qMax(0, tile.y() - 1);
+    int const yMax = qMin(maxIndex, tile.y() + 1);
+
+    QVector<GeoDataPlacemark *> places;
+    for (int x = xMin; x <= xMax; ++x) {
+        for (int y = yMin; y <= yMax; ++y) {
+            places << m_tileHash.value(TileId(0, m_tileLevel, x, y));
+        }
+    }
+
+    QString const placeName = placemark->name();
+    std::sort(places.begin(), places.end(),
+              [placeName] (GeoDataPlacemark* a, GeoDataPlacemark* b) {
+        return levenshteinDistance(a->name(), placeName) < levenshteinDistance(b->name(), placeName);
+    });
+    return places;
+}
+
+}
diff --git a/tools/vectorosm-tilecreator/SpellChecker.h b/tools/vectorosm-tilecreator/SpellChecker.h
new file mode 100644
index 0000000..2be1e5c
--- /dev/null
+++ b/tools/vectorosm-tilecreator/SpellChecker.h
@@ -0,0 +1,42 @@
+//
+// This file is part of the Marble Virtual Globe.
+//
+// This program is free software licensed under the GNU LGPL. You can
+// find a copy of this license in LICENSE.txt in the top directory of
+// the source code.
+//
+// Copyright 2016 Dennis Nienhüser <nienhueser@kde.org>
+//
+
+#include <TileId.h>
+#include "GeoDataPlacemark.h"
+
+#include <QHash>
+#include <QString>
+#include <QVector>
+#include <QtAlgorithms>
+
+namespace Marble {
+
+class GeoDataPlacemark;
+
+/**
+ * Corrects the names of city placemarks using a geonames.org cities file.
+ *
+ * The reference cities are parsed once in the constructor and kept in a
+ * hash keyed by tile, so candidates for a placemark can be looked up from
+ * its own tile and the neighboring ones. The reference placemarks are
+ * owned by this class and released in its destructor.
+ */
+class SpellChecker
+{
+public:
+    /** @param citiesFile Path of a tab separated geonames.org cities file */
+    explicit SpellChecker(const QString &citiesFile);
+
+    /** Frees the reference placemarks allocated while parsing the cities file. */
+    ~SpellChecker()
+    {
+        for (auto iter = m_tileHash.constBegin(), end = m_tileHash.constEnd(); iter != end; ++iter) {
+            qDeleteAll(iter.value());
+        }
+    }
+
+    /** Enables or disables qDebug() output describing each decision. */
+    void setVerbose(bool verbose);
+
+    /** Renames the city placemarks in @p placemarks that match a nearby reference city. */
+    void correctPlaceLabels(const QVector<GeoDataPlacemark*> &placemarks);
+
+private:
+    // The hash holds owning raw pointers, a copy would delete them twice
+    Q_DISABLE_COPY(SpellChecker)
+
+    typedef QHash<TileId, QVector<GeoDataPlacemark*> > TileHash;
+
+    static int levenshteinDistance(const QString &a, const QString &b);
+
+    QVector<GeoDataPlacemark*> cityPlaces(const QVector<GeoDataPlacemark*> &placemarks) const;
+    TileHash parseCities(const QString &filename) const;
+    QVector<GeoDataPlacemark*> candidatesFor(GeoDataPlacemark* placemark) const;
+
+    // Declared before m_tileHash on purpose: parseCities() reads it during construction
+    int const m_tileLevel;
+    TileHash m_tileHash;
+    bool m_verbose;
+};
+
+}
diff --git a/tools/vectorosm-tilecreator/main.cpp b/tools/vectorosm-tilecreator/main.cpp
index fee501d..9fc0a33 100644
--- a/tools/vectorosm-tilecreator/main.cpp
+++ b/tools/vectorosm-tilecreator/main.cpp
@@ -39,6 +39,7 @@
#include "TileIterator.h"
#include "TileDirectory.h"
#include "MbTileWriter.h"
+#include "SpellChecker.h"
#include <iostream>
@@ -136,6 +137,8 @@ int main(int argc, char *argv[])
{"conflict-resolution", "How to deal with existing tiles: overwrite, skip or merge", "mode", "overwrite"},
{{"c", "cache-directory"}, "Directory for temporary data.", "cache", "cache"},
{{"m", "mbtile"}, "Store tiles at level 15 onwards in a mbtile database.", "mbtile"},
+ {{"s", "spellcheck"}, "Use this geonames.org cities file for spell-checking city names", "spellcheck"},
+ {"verbose", "Increase amount of shell output information"},
{{"z", "zoom-level"}, "Zoom level according to which OSM information has to be processed.", "levels", "11,13,15,17"},
{{"o", "output"}, "Output file or directory", "output", QString("%1/maps/earth/vectorosm").arg(MarbleDirs::localPath())},
{{"e", "extension"}, "Output file type: o5m (default), osm or kml", "file extension", "o5m"}
@@ -189,6 +192,11 @@ int main(int argc, char *argv[])
auto map = TileDirectory::open(inputFileName, manager);
VectorClipper processor(map.data(), maxZoomLevel);
GeoDataLatLonBox world(85.0, -85.0, 180.0, -180.0, GeoDataCoordinates::Degree);
+ if (parser.isSet("spellcheck")) {
+ SpellChecker spellChecker(parser.value("spellcheck"));
+ spellChecker.setVerbose(parser.isSet("verbose"));
+ spellChecker.correctPlaceLabels(map.data()->placemarkList());
+ }
foreach(auto zoomLevel, zoomLevels) {
TileIterator iter(world, zoomLevel);
qint64 count = 0;