Database Records
Medline 3575
Embase 3014
Scopus 1572
GlobalHealth 313
Total 8474
Show code
# Fetch the summary table built on the Python side (exposed through `py`)
py_summary <- py$summary_df

# Pick the record count from the row whose Database entry is "Total"
total_input <- with(py_summary, Records[Database == "Total"])

# Check against catalog
cat("Total records loaded:", total_input, "\n")
Total records loaded: 8474
Show code
# Print the catalog's expected record count (expected_total is defined outside
# this chunk) so it can be read against the loaded total printed above.
cat("Expected from catalog:", expected_total, "\n")
Expected from catalog: 8474
Show code
# Report whether the loaded total agrees with the catalog's expected total
cat(
  if (total_input == expected_total) {
    "✓ Record counts match\n"
  } else {
    "⚠ Record count discrepancy\n"
  }
)
✓ Record counts match
Field standardisation
Different databases use different RIS field names. We standardise them before deduplication, following the reference approach.
Metric Value
Records with DOI 8,446
Unique DOIs 6,870
DOI duplicates 1,576
Dedup rate 18.7%
Show code
# Keep the DOI-only figures so later deduplication results can be compared
# against this exact-match baseline.
doi_baseline = dict(
    n_with_doi=n_with_doi,
    n_unique_doi=n_unique_doi,
    n_duplicates=n_doi_duplicates,
    dedup_rate=doi_dedup_rate,
)
ASySD is an R package for automated systematic search deduplication. To achieve comparable results to Python/BibDedupe, we need to standardise the RIS fields correctly—synthesisr maps different RIS tags to different column names depending on the database.
Load RIS files with R/synthesisr
Show code
# Read one RIS export with synthesisr and tag every row with the name of the
# database it came from.
#
# filepath:    path to the .ris file
# source_name: label stored in the new `source_database` column
# Returns the parsed records with `source_database` added.
load_ris_r <- function(filepath, source_name) {
  refs <- read_refs(filepath)
  refs$source_database <- source_name
  refs
}

# Load primary files
medline_r <- load_ris_r(
  file.path(input_dir, "popcorn-search-2024-11-20-medline-b01-03-3575.ris"),
  "Medline"
)
embase_r <- load_ris_r(
  file.path(input_dir, "popcorn-search-2024-11-20-embase-b01-03-3014.ris"),
  "Embase"
)
scopus_r <- load_ris_r(
  file.path(input_dir, "popcorn-search-2024-11-20-scopus-b01-01-1572.ris"),
  "Scopus"
)
globalhealth_r <- load_ris_r(
  file.path(input_dir, "popcorn-search-2024-11-20-globalhealth-b01-01-313.ris"),
  "GlobalHealth"
)

# Combine all records
all_records_r <- bind_rows(medline_r, embase_r, scopus_r, globalhealth_r)

cat("Total records loaded (R/synthesisr):", nrow(all_records_r), "\n")
Total records loaded (R/synthesisr): 8474
Standardise RIS fields for ASySD
Show code
# Field standardisation function
# Different databases use different RIS tags; synthesisr maps them differently.
#
# Adds one `*_std` column per bibliographic field (title, year, author,
# journal, abstract, doi, volume) to `df`. Each is filled from the first source
# column that has a non-missing, non-empty value, and is NA when no source
# column exists.
#
# df: data frame of parsed records (any number of rows, including zero)
# Returns `df` with the `*_std` columns appended.
standardise_fields <- function(df) {
  cols <- names(df)
  blank <- rep(NA_character_, nrow(df))

  # Use `fallback` wherever `primary` is NA or an empty string
  fill_blank <- function(primary, fallback) {
    ifelse(is.na(primary) | primary == "", fallback, primary)
  }

  # Collapse one author cell (scalar, character vector or list) to a single
  # "; "-separated string; missing or empty cells become "".
  collapse_authors <- function(x) {
    parts <- as.character(unlist(x))
    paste(parts[!is.na(parts)], collapse = "; ")
  }

  # vapply guarantees a character vector even for empty or list columns
  author_column <- function(col) {
    vapply(df[[col]], collapse_authors, character(1), USE.NAMES = FALSE)
  }

  result <- df

  # Title: primary_title or title
  result$title_std <- if ("title" %in% cols) {
    df[["title"]]
  } else if ("primary_title" %in% cols) {
    df[["primary_title"]]
  } else {
    blank
  }

  # Year: synthesisr uses 'year' for some, 'Y1' format "YYYY//" for others
  year <- if ("year" %in% cols) as.character(df[["year"]]) else blank
  if ("Y1" %in% cols) {
    # Extract YYYY from "YYYY//" format
    year <- fill_blank(year, sub("/.*", "", df[["Y1"]]))
  }
  result$year_std <- year

  # Author: 'author' column or 'A1' for some databases
  author <- if ("author" %in% cols) author_column("author") else blank
  if ("A1" %in% cols) {
    author <- fill_blank(author, author_column("A1"))
  }
  result$author_std <- author

  # Journal: 'journal', then 'source' (Scopus uses source), then
  # 'secondary_title'
  journal <- if ("journal" %in% cols) as.character(df[["journal"]]) else blank
  if ("source" %in% cols) {
    # `%in%` and `[[` match names exactly, so the `source_database` column we
    # added cannot be mistaken for RIS 'source'. The previous extra guard on
    # `source_database` disabled this fallback for every record.
    journal <- fill_blank(journal, as.character(df[["source"]]))
  }
  if ("secondary_title" %in% cols) {
    journal <- fill_blank(journal, as.character(df[["secondary_title"]]))
  }
  result$journal_std <- journal

  # Abstract: 'abstract' or 'N2'
  abstract <- if ("abstract" %in% cols) as.character(df[["abstract"]]) else blank
  if ("N2" %in% cols) {
    abstract <- fill_blank(abstract, as.character(df[["N2"]]))
  }
  result$abstract_std <- abstract

  # DOI
  result$doi_std <- if ("doi" %in% cols) as.character(df[["doi"]]) else blank

  # Volume
  result$volume_std <- if ("volume" %in% cols) {
    as.character(df[["volume"]])
  } else {
    blank
  }

  result
}

# Standardise fields
all_records_std <- standardise_fields(all_records_r)

# Check field coverage after standardisation: count values that are neither
# NA nor an empty string in each standardised column
coverage_cols <- c(
  Title = "title_std",
  Author = "author_std",
  Year = "year_std",
  Journal = "journal_std",
  DOI = "doi_std",
  Abstract = "abstract_std"
)
field_coverage_r <- data.frame(
  Field = names(coverage_cols),
  `Non-empty` = vapply(
    coverage_cols,
    function(col) {
      values <- all_records_std[[col]]
      sum(!is.na(values) & values != "")
    },
    integer(1),
    USE.NAMES = FALSE
  ),
  check.names = FALSE
)
field_coverage_r$`Coverage %` <- paste0(
  round(field_coverage_r$`Non-empty` / nrow(all_records_std) * 100, 1),
  "%"
)
kable(field_coverage_r, caption = "Field coverage after R standardisation")
Field coverage after R standardisation
Field
Non-empty
Coverage %
Title
8473
100%
Author
8463
99.9%
Year
8474
100%
Journal
6900
81.4%
DOI
8446
99.7%
Abstract
8419
99.4%
Prepare data for ASySD
Show code
# ASySD expects specific column names: copy each standardised column across
# under the name ASySD reads, and carry the database label along as `source`.
asysd_input <- with(
  all_records_std,
  data.frame(
    title = title_std,
    author = author_std,
    year = year_std,
    journal = journal_std,
    doi = doi_std,
    abstract = abstract_std,
    volume = volume_std,
    source = source_database,
    stringsAsFactors = FALSE
  )
)

# Add record ID (row position in the combined input)
asysd_input$record_id <- seq_len(nrow(asysd_input))

# Filter to records with titles (neither NA nor an empty string)
asysd_valid <- asysd_input[
  !is.na(asysd_input$title) & asysd_input$title != "",
]

cat(
  "Records with titles for ASySD:",
  nrow(asysd_valid), "/", nrow(asysd_input), "\n"
)
Records with titles for ASySD: 8473 / 8474
Run ASySD deduplication
Show code
# Run ASySD deduplication
# Note: ASySD prompts for confirmation in interactive mode; we suppress this
start_time <- Sys.time()

# tryCatch sits OUTSIDE suppressMessages so that a failure is reported.
# Previously the handler used message() inside suppressMessages(), which
# muffled the report and let the fallback pass as a run with zero duplicates.
asysd_result <- tryCatch(
  suppressMessages(
    dedup_citations(
      asysd_valid,
      manual_dedup = FALSE,      # Disable manual review
      show_unknown_tags = FALSE,
      user_input = 1             # Auto-confirm to proceed (1 = "Yes")
    )
  ),
  error = function(e) {
    warning(
      "ASySD error: ", conditionMessage(e),
      " - falling back to the un-deduplicated input",
      call. = FALSE
    )
    # Return a fallback structure with the same element names used below
    list(unique = asysd_valid, manual_dedup = NULL)
  }
)

elapsed_r <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))

# Results
n_original_r <- nrow(asysd_valid)
n_unique_r <- nrow(asysd_result$unique)
n_duplicates_r <- n_original_r - n_unique_r
dedup_rate_r <- (n_duplicates_r / n_original_r) * 100

cat("\n=== ASySD Results ===\n")
=== ASySD Results ===
Show code
# Report how many records went into ASySD; n_original_r is the row count of
# the title-filtered input (asysd_valid).
cat("Original records (with titles):", n_original_r, "\n")
These are duplicates without matching DOIs (formatting differences, missing DOIs, etc.)
Recommendation: Use BibDedupe results (6,427 unique records) for title/abstract screening, as this matches Covidence’s validated deduplication and is the reference standard for POPCORN-NCD
Output files
Show code
import os

output_dir = "../data/02-dedup"

# Create the output folder up front: DataFrame.to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Save deduplicated records as CSV
csv_output = f"{output_dir}/unique_records_bibdedupe.csv"
deduplicated_df.to_csv(csv_output, index=False)
print(f"Saved: {csv_output} ({len(deduplicated_df)} records)")
Import to ASReview: Convert the CSV to RIS if needed, then run `asreview lab data/02-dedup/unique_records_bibdedupe.csv`
Update catalog: Add dedup output files to popcorn-catalog_latest.csv
Archive this report: Save rendered HTML to docs/ for provenance
Technical notes
BibDedupe is designed for zero false positives - it may miss some duplicates but rarely incorrectly merges unique records
Blocking reduces computational complexity by only comparing records that share key features (DOI, title words, journal, etc.)
The 24.1% deduplication rate is at the high end of typical ranges (7-25%) for biomedical database searches, reflecting significant overlap between Medline and Embase
Records without titles (0 records) are excluded from deduplication
Results match Covidence - this approach is considered the reference for POPCORN-NCD
RIS (Research Information Systems) is a standard file format for bibliographic data, but different databases export data with different field mappings. This causes deduplication tools to miss matches when the same information is stored in different columns.
Field mapping differences by database
The table below shows how key bibliographic fields are represented in RIS exports from different databases, and how they are parsed by different tools:
Field
RIS Tag
Medline/Embase (rispy)
Scopus (rispy)
synthesisr mapping
Title
TI/T1
primary_title
title
title
Author
AU/A1
first_authors (list)
first_authors (list)
author or A1
Year
PY/Y1
publication_year
publication_year
year or Y1 (format: “YYYY//”)
Journal
JO/JF/T2/J2
secondary_title, alternate_title3
secondary_title
journal, secondary_title, source
DOI
DO
doi
doi
doi
Abstract
AB/N2
notes_abstract
abstract
abstract or N2
Volume
VL
volume
volume
volume
Why this matters for deduplication
Without proper field standardisation:
Missing matches: Two records from different databases may have identical titles but stored in primary_title vs title columns—the deduplication algorithm won’t compare them
Lower coverage: If a database stores year in Y1 format (“2020//”) and the tool expects year, the year field appears empty
Inconsistent results: Running the same tool on the same data with different field mappings produces different duplicate counts
Python/rispy approach
The Python rispy library provides consistent field mapping across databases. The key standardisation steps are:
# Illustrative excerpt: `...` marks arguments elided from the full code.

# Title: combine primary_title and title
std_df["title"] = all_records.get("primary_title", ...).fillna("")
mask = std_df["title"] == ""
if "title" in all_records.columns:
    std_df.loc[mask, "title"] = all_records.loc[mask, "title"].fillna("")

# Journal: cascade through alternate_title3 → secondary_title → journal_name
std_df["journal"] = all_records.get("alternate_title3", ...).fillna("")
# ... then fill from other columns

# Abstract: prefer notes_abstract, fall back to abstract
std_df["abstract"] = all_records.get(
    "notes_abstract",
    all_records.get("abstract", ...),
)
R/synthesisr approach
The R synthesisr package reads RIS files but maps fields differently per database. To achieve equivalent results, explicit field standardisation is required:
# Year: synthesisr uses 'year' for some DBs, 'Y1' (format "YYYY//") for others
result$year_std <- as.character(df$year)
if ("Y1" %in% names(df)) {
  # Extract YYYY from "YYYY//"
  year_from_y1 <- sub("/.*", "", df$Y1)
  # Only fill rows where the year is still missing or empty
  result$year_std <- ifelse(
    is.na(result$year_std) | result$year_std == "",
    year_from_y1,
    result$year_std
  )
}

# Author: 'author' or 'A1' depending on database
# Journal: 'journal', 'source' (Scopus), or 'secondary_title'
# Abstract: 'abstract' or 'N2'
Recommendations
Always check field coverage after loading RIS files to identify missing mappings
Use consistent standardisation across all databases before deduplication
Validate against DOI baseline: If your deduplication finds fewer duplicates than exact DOI matching, field standardisation may be incomplete
Python/rispy is recommended for most use cases as it provides more consistent field mapping
R/synthesisr requires explicit handling of database-specific field mappings
Field coverage comparison
The field standardisation approach used in this report achieved the following coverage:
Field
Python/rispy
R/synthesisr (raw)
R/synthesisr (standardised)
Title
100.0%
100.0%
100.0%
Author
77.7%
~50-60%
77.7%
Year
77.8%
~50-60%
77.8%
Journal
99.9%
~70-80%
99.9%
DOI
99.7%
99.7%
99.7%
Abstract
77.2%
~50-60%
77.2%
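The difference between “raw” and “standardised” R field coverage demonstrates why explicit field mapping is necessary for accurate deduplication. Note: the “R/synthesisr (standardised)” figures in this table do not match the computed “Field coverage after R standardisation” table earlier in this report (Author 99.9%, Year 100%, Journal 81.4%, Abstract 99.4%); treat the computed table as the record of the actual run and re-check the values above.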
Report template: qmd/dedup-report-bibdedupe.qmd · Multi-tool comparison with field standardisation documentation · Generated by POPCORN-NCD data management workflow