bibupload: use CFG_OAI_ID_FIELD for deduping
* FIX When replicating datasets between instances by means of OAI
  harvesting, it is plausible to check for external OAI-IDs not only in the
  field specified by CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG but also in
  CFG_OAI_ID_FIELD.
  (closes inveniosoftware#2812) (PR inveniosoftware#2816)

* Note: find_record_from_oaiid() always returned the first record with a
  given OAI-ID, assuming that it is unique. However, due to manual
  intervention one may have accidentally produced dupes here. Thus, check
  whether more than one record with a given OAI-ID exists. If so, check
  whether all records except one are deleted; in that case return the
  surviving record, as cataloguers have resolved the dupes manually. If
  dupes still persist, resort to the old behaviour for the sake of
  compatibility, but at least emit a warning (see the sketch below). This
  is somewhat of a TODO: probably one should refuse the merge here and
  wait for manual curation (to be discussed).
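
For illustration, the resolution strategy described above boils down to the
following minimal sketch. The names resolve_duplicate_oaiids and is_deleted
are hypothetical stand-ins for this note; the actual patch below uses
search_pattern() queries and the 980:"DELETED" flag over intbitset results.

    def resolve_duplicate_oaiids(recids, is_deleted):
        # recids: all records sharing one OAI-ID; is_deleted: hypothetical
        # predicate standing in for the 980:"DELETED" check in the patch.
        survivors = [recid for recid in recids if not is_deleted(recid)]
        if len(survivors) == 1:
            # Cataloguers resolved the dupes manually: one record survived.
            return survivors[0]
        # Dupes persist: pathological case; warn and keep the old behaviour
        # of returning the first match, for the sake of compatibility.
        return recids[0]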
aw-bib authored and tiborsimko committed Nov 18, 2016
1 parent 12cd78c commit 6cfa771
Showing 1 changed file with 37 additions and 2 deletions.
modules/bibupload/lib/bibupload.py
@@ -650,6 +650,13 @@ def find_records_from_extoaiid(extoaiid, extoaisrc=None):
                 if extoaisrc is None:
                     write_message('WARNING: Found recid %s for extoaiid="%s" that specify a provenance (%s), while input record does not have a provenance.' % (id_bibrec, extoaiid, this_extoaisrc), stream=sys.stderr)
 
+        if len(ret) == 0:
+            # No OAI-ID in CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG. Check if we have
+            # one in the locally used CFG_OAI_ID_FIELD that matches. This can
+            # happen if OAI harvesting is used to duplicate records.
+            recid = find_record_from_oaiid(extoaiid)
+            if recid is not None:
+                ret.add(recid)
         return ret
     except Error, error:
         write_message(" Error during find_records_from_extoaiid(): %s "
@@ -674,7 +681,35 @@ def find_record_from_oaiid(oaiid):
         write_message(" Error during find_record_from_oaiid(): %s " % error,
                       verbose=1, stream=sys.stderr)
     if res:
-        return res[0][0]
+        if len(res) == 1:
+            return res[0][0]
+        else:
+            # In principle we have dupes here.
+            # If the dupes have been resolved, only one record should have
+            # survived and all others are deleted. => get non-deleted record(s)
+            lst = []
+            for r in res:
+                lst.append('recid:"%s"' % str(r[0]))
+            candidates = search_pattern(p=' or '.join(lst))
+            deleted = search_pattern(p='980:"DELETED"')
+            recs = candidates.difference(deleted)
+            if len(recs) == 1:
+                # All dupes are resolved; use the non-deleted record downstream.
+                return recs[0]
+            else:
+                # Dupes persist. This should be a pathological case.
+                # Emit an error message.
+                error = "Supposed unique id not unique: %s in %s" % \
+                        (oaiid, str(recs.tolist()))
+                write_message(" Error during find_record_from_oaiid(): %s " % \
+                              error, verbose=1, stream=sys.stderr)
+
+                # TODO: Is returning None correct here? Is returning one id
+                # better? Should we really fail, given that we just detected
+                # a multiplet? Note, however, that this function originally
+                # returned a scalar, so all calling functions expect either
+                # one value or None.
+                return recs[0]
     else:
         return None
 
@@ -2268,7 +2303,7 @@ def task_run_core():
     return not stat['nb_errors'] >= 1
 
 def log_record_uploading(oai_rec_id, task_id, bibrec_id, insertion_db, pretend=False):
-    if oai_rec_id != "" and oai_rec_id != None:
+    if oai_rec_id != "" and oai_rec_id is not None:
         query = """UPDATE oaiHARVESTLOG SET date_inserted=NOW(), inserted_to_db=%s, id_bibrec=%s WHERE oai_id = %s AND bibupload_task_id = %s ORDER BY date_harvested LIMIT 1"""
         try:
             if not pretend:
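The survivor check in find_record_from_oaiid() relies on intbitset set
arithmetic over two search results. A minimal standalone illustration,
assuming the intbitset package and made-up record ids:

    from intbitset import intbitset

    candidates = intbitset([11, 12, 13])       # records sharing one OAI-ID
    deleted = intbitset([11, 13])              # records flagged 980:"DELETED"
    survivors = candidates.difference(deleted)
    print(survivors.tolist())                  # [12] -> dupes were resolved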
