summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastian Trueg <trueg@kde.org>2012-03-09 16:17:48 (GMT)
committerSebastian Trueg <trueg@kde.org>2012-03-13 08:46:12 (GMT)
commit754275eda610dce1160286a76339353097d8764c (patch)
tree4210f95780135309580554cf51b06b00877bee90
parentfbc021940b5938203d5299ba0f35cc2bb02df688 (diff)
Backport from nepomuk-core: improved performance on res identification.
BUG: 289932 FIXED-IN: 4.8.2
-rw-r--r--nepomuk/services/backupsync/lib/resourceidentifier.cpp75
1 files changed, 52 insertions, 23 deletions
diff --git a/nepomuk/services/backupsync/lib/resourceidentifier.cpp b/nepomuk/services/backupsync/lib/resourceidentifier.cpp
index c1a9919..894372c 100644
--- a/nepomuk/services/backupsync/lib/resourceidentifier.cpp
+++ b/nepomuk/services/backupsync/lib/resourceidentifier.cpp
@@ -31,6 +31,7 @@
#include <Soprano/Statement>
#include <Soprano/Graph>
#include <Soprano/Node>
+#include <Soprano/BindingSet>
#include <Soprano/StatementIterator>
#include <Soprano/QueryResultIterator>
#include <Soprano/Model>
@@ -176,19 +177,18 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
return false;
}
- QString query;
-
QStringList identifyingProperties;
QHash<KUrl, Soprano::Node> identifyingPropertiesHash;
QHash< KUrl, Soprano::Node >::const_iterator it = res.constBegin();
QHash< KUrl, Soprano::Node >::const_iterator constEnd = res.constEnd();
+ QList<Soprano::Node> requiredTypes;
for( ; it != constEnd; it++ ) {
const QUrl & prop = it.key();
// Special handling for rdf:type
if( prop == RDF::type() ) {
- query += QString::fromLatin1(" ?r a %1 . ").arg( it.value().toN3() );
+ requiredTypes << it.value().uri();
continue;
}
@@ -219,6 +219,10 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
return false;
}
+
+ // construct the identification query
+ QString query = QLatin1String("select distinct ?r where { ");
+
//
// Optimization:
// If there is only one identifying property using all that optional and filter stuff
@@ -235,7 +239,7 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
QString::number( numIdentifyingProperties++ ) );
}
- // Make sure atleast one of the identification properties has been matched
+ // Make sure at least one of the identification properties has been matched
// by adding filter( bound(?o1) || bound(?o2) ... )
query += QString::fromLatin1("filter( ");
for( int i=0; i<numIdentifyingProperties-1; i++ ) {
@@ -247,43 +251,68 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
query += QString::fromLatin1("?r %1 %2 . ").arg(Soprano::Node::resourceToN3(identifyingPropertiesHash.constBegin().key()),
identifyingPropertiesHash.constBegin().value().toN3());
}
- query += QLatin1String("}");
- // Construct the entire query
- QString queryBegin = QString::fromLatin1("select distinct ?r count(?p) as ?cnt "
- "where { ?r ?p ?o. filter( ?p in (%1) ).")
- .arg( identifyingProperties.join(",") );
-
- query = queryBegin + query + QString::fromLatin1(" order by desc(?cnt)");
+ //
+ // For performance reasons we add a limit even though this could mean that we
+ // miss a resource to identify since we check the types below.
+ //
+ query += QLatin1String("} LIMIT 100");
- kDebug() << query;
//
- // Only store the results which have the maximum score
+ // Fetch a score for each result.
+ // We do this in a separate query for performance reasons.
//
- QSet<KUrl> results;
- int score = -1;
+ QMultiHash<int, KUrl> resultsScoreHash;
+ int maxScore = -1;
Soprano::QueryResultIterator qit = d->m_model->executeQuery( query, Soprano::Query::QueryLanguageSparql );
while( qit.next() ) {
- //kDebug() << "RESULT: " << qit["r"] << " " << qit["cnt"];
+ const Soprano::Node r(qit["r"]);
+
+ //
+ // Check the type requirements. Experiments have shown this to mean a substantial
+ // performance boost as compared to doing it in the main query.
+ //
+ if(!requiredTypes.isEmpty() ) {
+ query = QLatin1String("ask where { ");
+ foreach(const Soprano::Node& type, requiredTypes) {
+ query += QString::fromLatin1("%1 a %2 . ").arg(r.toN3(), type.toN3());
+ }
+ query += QLatin1String("}");
+ if(!d->m_model->executeQuery(query, Soprano::Query::QueryLanguageSparql).boolValue()) {
+ continue;
+ }
+ }
+
+
+ const int score = d->m_model->executeQuery(QString::fromLatin1("select count(?p) as ?cnt where { "
+ "%1 ?p ?o. filter( ?p in (%2) ) . }")
+ .arg( r.toN3(),
+ identifyingProperties.join(",") ),
+ Soprano::Query::QueryLanguageSparql)
+ .allBindings().first()["cnt"].literal().toInt();
- int count = qit["cnt"].literal().toInt();
- if( score == -1 ) {
- score = count;
+ if( maxScore < score ) {
+ maxScore = score;
}
- else if( count < score )
- break;
- results << qit["r"].uri();
+ resultsScoreHash.insert(score, r.uri());
}
+ //
+ // Only get the results which have the maximum score
+ //
+ QSet<KUrl> results = QSet<KUrl>::fromList(resultsScoreHash.values(maxScore));
+
+
//kDebug() << "Got " << results.size() << " results";
if( results.empty() )
return false;
KUrl newUri;
- if( results.size() == 1 )
+ if( results.size() == 1 ) {
newUri = *results.begin();
+ }
else {
kDebug() << "DUPLICATE RESULTS!";
newUri = duplicateMatch( res.uri(), results );