From 24d4248272260a9d6eb2d704679f4f0e0c70021d Mon Sep 17 00:00:00 2001 From: karilint Date: Thu, 11 Jun 2026 11:41:01 +0300 Subject: [PATCH 1/8] Update Darwin Core export metadata --- backend/src/routes/locality.ts | 33 +- backend/src/routes/occurrence.ts | 90 +++- backend/src/routes/species.ts | 30 +- backend/src/services/crossSearch.ts | 28 + backend/src/services/dwcArchiveExport.ts | 141 ++++- .../services/dwcArchiveExportLocalities.ts | 261 +++++----- .../services/dwcArchiveExportOccurrences.ts | 44 +- backend/src/services/dwcDataPackageExport.ts | 488 ++++++++++++++++-- backend/src/services/utils/dwcCsv.ts | 5 +- backend/src/unit-tests/dwcCsv.test.ts | 6 + .../unit-tests/dwcDataPackageExport.test.ts | 18 +- .../Locality/LocalityDwcExportMenuItem.tsx | 17 +- .../OccurrenceDwcDpExportMenuItem.tsx | 17 +- .../OccurrenceDwcExportMenuItem.tsx | 17 +- ...OccurrenceFullDarwinCoreExportMenuItem.tsx | 17 +- .../Species/SpeciesDwcExportMenuItem.tsx | 17 +- 16 files changed, 992 insertions(+), 237 deletions(-) diff --git a/backend/src/routes/locality.ts b/backend/src/routes/locality.ts index 04460f1b..dc0c041b 100644 --- a/backend/src/routes/locality.ts +++ b/backend/src/routes/locality.ts @@ -1,4 +1,4 @@ -import { Request, Router } from 'express' +import { Request, Response, Router } from 'express' import { getAllLocalities, canEditRestrictedWriteLocality, @@ -15,19 +15,38 @@ import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsS const router = Router() +const parseNumericIds = (value: unknown): number[] | undefined => { + if (value === undefined) return undefined + if (!Array.isArray(value)) throw new Error('ids must be an array.') + return value.map(id => { + const parsed = typeof id === 'number' ? id : typeof id === 'string' ? 
parseInt(id, 10) : NaN + if (!Number.isInteger(parsed)) throw new Error('ids must contain only integers.') + return parsed + }) +} + router.get('/all', async (req, res) => { const localities = await getAllLocalities(req.user) return res.status(200).send(fixBigInt(localities)) }) -router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) => { - const zipBuffer = await buildDwcLocalityArchiveZipBuffer() +const sendDwcArchive = async (ids: number[] | undefined, res: Response) => { + const zipBuffer = await buildDwcLocalityArchiveZipBuffer(ids) res.setHeader('Content-Type', 'application/zip') - res.setHeader( - 'Content-Disposition', - `attachment; filename="now_dwc_localities_test_export_${currentDateAsString()}.zip"` - ) + res.setHeader('Content-Disposition', `attachment; filename="now_dwc_localities_export_${currentDateAsString()}.zip"`) return res.status(200).send(zipBuffer) +} + +router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) => { + return sendDwcArchive(undefined, res) +}) + +router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) => { + try { + return await sendDwcArchive(parseNumericIds((req.body as { ids?: unknown }).ids), res) + } catch (error) { + return res.status(403).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' 
}) + } }) router.get('/:id', async (req, res) => { diff --git a/backend/src/routes/occurrence.ts b/backend/src/routes/occurrence.ts index 3a66a8d9..abadcfc4 100644 --- a/backend/src/routes/occurrence.ts +++ b/backend/src/routes/occurrence.ts @@ -1,15 +1,17 @@ -import { Router } from 'express' +import { NextFunction, Request, Response, Router } from 'express' import { pipeline } from 'stream' import { getOccurrenceDetail, updateOccurrenceDetail } from '../controllers/occurrenceController' import { requireOneOf } from '../middlewares/authorizer' import { Role } from '../../../frontend/src/shared/types' import { buildDwcOccurrenceArchiveZipStream, + type DwcOccurrenceKey, type DwcOccurrenceExportProgress, } from '../services/dwcArchiveExportOccurrences' import { buildDwcDataPackageZipBuffer, buildFullDarwinCoreExportZipBuffer } from '../services/dwcDataPackageExport' import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsString' import { logger } from '../utils/logger' +import { getFilteredCrossSearchOccurrenceKeys, type CrossSearchRequestParameters } from '../services/crossSearch' const router = Router() @@ -24,13 +26,40 @@ const scheduleProgressCleanup = (exportId: string) => { ) } +const defaultCrossSearchExportFilters = { + columnFilters: [], + sorting: [], +} satisfies CrossSearchRequestParameters + +const resolveOccurrenceKeysForExport = async (req: Request): Promise => { + if (req.method === 'GET') return undefined + const body = req.body as Partial | undefined + const result = await getFilteredCrossSearchOccurrenceKeys(req.user, { + columnFilters: body?.columnFilters ?? defaultCrossSearchExportFilters.columnFilters, + sorting: body?.sorting ?? 
defaultCrossSearchExportFilters.sorting, + }) + if ('validationErrors' in result) { + throw new Error(JSON.stringify(result.validationErrors)) + } + return result.occurrenceKeys +} + +const handleExportFilterError = (error: unknown, res: Response) => { + return res.status(403).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) +} + router.get('/export/dwc-archive/progress/:exportId', requireOneOf([Role.Admin]), (req, res) => { const progress = occurrenceExportProgress.get(req.params.exportId) if (!progress) return res.status(404).send({ message: 'Occurrence export progress not found.' }) return res.status(200).send(progress) }) -router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res, next) => { +const streamDwcOccurrenceArchive = async ( + req: Request, + res: Response, + next: NextFunction, + occurrenceKeys?: DwcOccurrenceKey[] +) => { const exportId = typeof req.query.exportId === 'string' ? req.query.exportId : undefined const reportProgress = exportId ? 
(progress: DwcOccurrenceExportProgress) => { @@ -38,15 +67,12 @@ router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res, n } : undefined - const archive = await buildDwcOccurrenceArchiveZipStream({ reportProgress }).catch(error => { + const archive = await buildDwcOccurrenceArchiveZipStream({ reportProgress, occurrenceKeys }).catch(error => { if (exportId) scheduleProgressCleanup(exportId) throw error }) res.setHeader('Content-Type', 'application/zip') - res.setHeader( - 'Content-Disposition', - `attachment; filename="now_dwc_occurrences_test_export_${currentDateAsString()}.zip"` - ) + res.setHeader('Content-Disposition', `attachment; filename="now_dwc_occurrences_export_${currentDateAsString()}.zip"`) pipeline(archive.stream, res, error => { archive.cleanup().catch(cleanupError => { logger.error(`Failed to clean up occurrence DwC export temp files: ${String(cleanupError)}`) @@ -62,20 +88,58 @@ router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res, n } if (error) next(error) }) +} + +router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res, next) => { + return streamDwcOccurrenceArchive(req, res, next) }) -router.get('/export/dwc-data-package', requireOneOf([Role.Admin]), async (_req, res) => { - const zipBuffer = await buildDwcDataPackageZipBuffer() +router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res, next) => { + let occurrenceKeys + try { + occurrenceKeys = await resolveOccurrenceKeysForExport(req) + } catch (error) { + return handleExportFilterError(error, res) + } + return streamDwcOccurrenceArchive(req, res, next, occurrenceKeys) +}) + +const sendDwcDataPackage = async (occurrenceKeys: DwcOccurrenceKey[] | undefined, res: Response) => { + const zipBuffer = await buildDwcDataPackageZipBuffer(occurrenceKeys) res.setHeader('Content-Type', 'application/zip') - res.setHeader('Content-Disposition', `attachment; 
filename="now_dwc_dp_test_export_${currentDateAsString()}.zip"`) + res.setHeader('Content-Disposition', `attachment; filename="now_dwc_dp_export_${currentDateAsString()}.zip"`) return res.status(200).send(zipBuffer) +} + +router.get('/export/dwc-data-package', requireOneOf([Role.Admin]), async (_req, res) => { + return sendDwcDataPackage(undefined, res) }) -router.get('/export/dwc-full-package', requireOneOf([Role.Admin]), async (_req, res) => { - const zipBuffer = await buildFullDarwinCoreExportZipBuffer() +router.post('/export/dwc-data-package', requireOneOf([Role.Admin]), async (req, res) => { + try { + return await sendDwcDataPackage(await resolveOccurrenceKeysForExport(req), res) + } catch (error) { + return handleExportFilterError(error, res) + } +}) + +const sendFullDarwinCorePackage = async (occurrenceKeys: DwcOccurrenceKey[] | undefined, res: Response) => { + const zipBuffer = await buildFullDarwinCoreExportZipBuffer(occurrenceKeys) res.setHeader('Content-Type', 'application/zip') - res.setHeader('Content-Disposition', `attachment; filename="now_dwc_full_test_export_${currentDateAsString()}.zip"`) + res.setHeader('Content-Disposition', `attachment; filename="now_dwc_full_export_${currentDateAsString()}.zip"`) return res.status(200).send(zipBuffer) +} + +router.get('/export/dwc-full-package', requireOneOf([Role.Admin]), async (_req, res) => { + return sendFullDarwinCorePackage(undefined, res) +}) + +router.post('/export/dwc-full-package', requireOneOf([Role.Admin]), async (req, res) => { + try { + return await sendFullDarwinCorePackage(await resolveOccurrenceKeysForExport(req), res) + } catch (error) { + return handleExportFilterError(error, res) + } }) router.get('/:lid/:speciesId', getOccurrenceDetail) diff --git a/backend/src/routes/species.ts b/backend/src/routes/species.ts index d5955817..d9c1e3c5 100644 --- a/backend/src/routes/species.ts +++ b/backend/src/routes/species.ts @@ -1,4 +1,4 @@ -import { Request, Router } from 'express' +import { Request, 
Response, Router } from 'express' import { getAllSpecies, getAllSynonyms, getSpeciesDetails, validateEntireSpecies } from '../services/species' import { fixBigInt } from '../utils/common' import { EditMetaData, SpeciesDetailsType, Role } from '../../../frontend/src/shared/types' @@ -9,6 +9,16 @@ import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsS const router = Router() +const parseNumericIds = (value: unknown): number[] | undefined => { + if (value === undefined) return undefined + if (!Array.isArray(value)) throw new Error('ids must be an array.') + return value.map(id => { + const parsed = typeof id === 'number' ? id : typeof id === 'string' ? parseInt(id, 10) : NaN + if (!Number.isInteger(parsed)) throw new Error('ids must contain only integers.') + return parsed + }) +} + router.get('/all', async (_req, res) => { const species = await getAllSpecies() return res.status(200).send(fixBigInt(species)) @@ -19,11 +29,23 @@ router.get('/synonyms', async (_req, res) => { return res.status(200).send(fixBigInt(synonyms)) }) -router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) => { - const zipBuffer = await buildDwcArchiveZipBuffer() +const sendDwcArchive = async (ids: number[] | undefined, res: Response) => { + const zipBuffer = await buildDwcArchiveZipBuffer(ids) res.setHeader('Content-Type', 'application/zip') - res.setHeader('Content-Disposition', `attachment; filename="now_dwc_test_export_${currentDateAsString()}.zip"`) + res.setHeader('Content-Disposition', `attachment; filename="now_dwc_export_${currentDateAsString()}.zip"`) return res.status(200).send(zipBuffer) +} + +router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) => { + return sendDwcArchive(undefined, res) +}) + +router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) => { + try { + return await sendDwcArchive(parseNumericIds((req.body as { ids?: unknown }).ids), res) + } catch (error) { + return 
res.status(403).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) + } }) router.get('/:id', async (req, res) => { diff --git a/backend/src/services/crossSearch.ts b/backend/src/services/crossSearch.ts index 168eec5a..84997bc1 100644 --- a/backend/src/services/crossSearch.ts +++ b/backend/src/services/crossSearch.ts @@ -248,3 +248,31 @@ export const parseAndValidateCrossSearchRouteParameters = ( validationErrors: errors, } } + +export const getFilteredCrossSearchOccurrenceKeys = async ( + user: User | undefined, + parameters: CrossSearchRequestParameters +) => { + const { validationErrors, validatedColumnFilters, validatedSorting } = + parseAndValidateCrossSearchRouteParameters(parameters) + if (validationErrors.length > 0) { + return { validationErrors } + } + + const resultPages = (await getCrossSearchRawSql( + user, + undefined, + undefined, + validatedColumnFilters, + validatedSorting + )) as Array>> + + const keysById = new Map() + for (const row of resultPages.flat()) { + if (typeof row.lid_now_loc !== 'number' || typeof row.species_id_com_species !== 'number') continue + const key = { lid: row.lid_now_loc, speciesId: row.species_id_com_species } + keysById.set(`${key.lid}:${key.speciesId}`, key) + } + + return { occurrenceKeys: [...keysById.values()] } +} diff --git a/backend/src/services/dwcArchiveExport.ts b/backend/src/services/dwcArchiveExport.ts index 88525965..d9f33c47 100644 --- a/backend/src/services/dwcArchiveExport.ts +++ b/backend/src/services/dwcArchiveExport.ts @@ -3,6 +3,15 @@ import JSZip from 'jszip' import { toDwcCsvString, writeDwcCsvString } from './utils/dwcCsv' import { getFieldInfoText } from '../../../frontend/src/shared/fieldInfo' +const DATASET_TITLE = 'NOW database Darwin Core export' +const DATASET_NAME = 'now-darwincore-export' +const DATASET_VERSION = '1.0.0' +const DATASET_DOI = 'https://doi.org/10.5281/zenodo.4268068' +const DATASET_LICENSE_URL = 'https://creativecommons.org/licenses/by/4.0/' 
+const DATASET_LICENSE_TITLE = 'Creative Commons Attribution 4.0 International' +const DATASET_CREATOR = 'The NOW Community' +const MISSING_VALUE = '\\N' + const isMeaningfulString = (value: unknown): value is string => { if (typeof value !== 'string') return false const trimmed = value.trim() @@ -88,7 +97,7 @@ const isSpeciesSp = (value: string): boolean => /^sp\.?$/i.test(value.trim()) const includesIndet = (value: string): boolean => value.toLowerCase().includes('indet.') -const resolveTaxonRank = ({ +export const resolveTaxonRank = ({ family, genus, specificEpithet, @@ -866,30 +875,121 @@ export const buildEmlXml = (publicationDateIso: string): string => { - - NOW database Darwin Core test export + ${DATASET_TITLE} - - NOW database - + ${DATASET_CREATOR} + https://nowdatabase.org/ - - - NOW database - - + + ${DATASET_CREATOR} + https://nowdatabase.org/ + + + ${DATASET_CREATOR} + publisher + https://nowdatabase.org/ + ${publicationDateIso} + eng + NOW database Darwin Core export - Admin-only test Darwin Core Archive export from NOW database. Field mappings are intentionally limited for v1. + This Darwin Core Archive is the taxon and synthesized taxon-level trait component of the production NOW database Darwin Core export. It contains taxonomic records and MeasurementOrFact rows generated directly from curated NOW taxon fields. + The NOW database is a continuously curated global fossil mammal database supporting large-scale paleobiological and paleontological research. The database spans approximately the last 66 million years, Cenozoic, while maintaining global coverage. + + Darwin Core Archive + MeasurementOrFact + taxon traits + fossil mammals + Cenozoic + paleobiology + paleontology + NOW database export keywords + + + Recommended citation: ${DATASET_CREATOR}. ${DATASET_TITLE}, version ${DATASET_VERSION}. ${DATASET_DOI}. 
The DOI describes the NOW database generally rather than a single frozen dataset export version; include the export date (${publicationDateIso}) when citing a downloaded archive. + Missing values in CSV files are serialized as ${MISSING_VALUE}. The taxonID values in this archive join to dwc-dp/occurrence.csv taxonID in the full export bundle. + Future exports may add richer semantic mappings, ontology IRIs, agent identifiers, protocol identifiers, and provenance structures while preserving existing CSV columns wherever possible. + - TODO(#1150): Add rights / license information. + Copyright ${DATASET_CREATOR}. This export is licensed under ${DATASET_LICENSE_TITLE} (${DATASET_LICENSE_URL}). Users may share and adapt the data with appropriate attribution. + + + https://nowdatabase.org/ + + + + + Global, reflecting the geographic scope of the NOW database fossil mammal occurrence records that support the taxon and trait synthesis. + + -180 + 180 + 90 + -90 + + + + + + Geologic time + Cenozoic, approximately the last 66 million years + Temporal coverage varies by taxon and associated occurrence evidence. + Taxon and trait records are synthesized from NOW database curation linked to fossil mammal occurrences and literature sources. + + + + + Fossil mammal taxa and selected curated or synthesized taxon-level traits. + + class + Mammalia + mammals + + + + + + The NOW database is continuously curated. This export represents a production snapshot generated from the live curated database rather than a frozen version-specific dataset associated with the DOI. + + continual + + + ${DATASET_CREATOR} + https://nowdatabase.org/ + + + + + Taxon rows are generated from curated NOW species records. Stable taxon identifiers are derived from NOW database species identifiers and are used as the core identifiers for the archive. + + + + + Taxon-level traits in measurementorfact.csv are generated directly from curated database fields. 
They remain in DwC-A MeasurementOrFact because these synthesized values are associated with taxa rather than with individual specimen, material sample, event, or occurrence source entities. + + + + + NOW data are expert curated from literature and community expertise. Trait values should be interpreted in the context of the field descriptions, source curation practices, and the companion relational DwC-DP export. + + + + + New and Old Worlds Database of Fossil Mammals + + ${DATASET_CREATOR} + data curator + + + The NOW database supports research on Cenozoic mammal evolution, biogeography, environments, and fossil occurrence patterns at global scale. + + ` @@ -916,13 +1016,16 @@ export const buildDwcArchiveZipBufferFromSpecies = async ( return await zip.generateAsync({ type: 'nodebuffer', compression: 'DEFLATE', compressionOptions: { level: 6 } }) } -export const fetchSpeciesForDwcExport = async (): Promise< - Array -> => { +export const fetchSpeciesForDwcExport = async ( + speciesIds?: number[] +): Promise> => { + if (speciesIds && speciesIds.length === 0) return [] const { nowDb } = await import('../utils/db') // NOTE: v1 intentionally exports only com_species rows as taxa. // TODO(#1150): Add synonym export from com_taxa_synonym. return await nowDb.com_species.findMany({ + where: speciesIds ? 
{ species_id: { in: speciesIds } } : undefined, + orderBy: { species_id: 'asc' }, select: { species_id: true, class_name: true, @@ -994,7 +1097,7 @@ export const fetchSpeciesForDwcExport = async (): Promise< }) } -export const buildDwcArchiveZipBuffer = async (): Promise => { - const speciesRows = await fetchSpeciesForDwcExport() +export const buildDwcArchiveZipBuffer = async (speciesIds?: number[]): Promise => { + const speciesRows = await fetchSpeciesForDwcExport(speciesIds) return await buildDwcArchiveZipBufferFromSpecies(speciesRows) } diff --git a/backend/src/services/dwcArchiveExportLocalities.ts b/backend/src/services/dwcArchiveExportLocalities.ts index 0ee8abc0..b4188538 100644 --- a/backend/src/services/dwcArchiveExportLocalities.ts +++ b/backend/src/services/dwcArchiveExportLocalities.ts @@ -1345,137 +1345,142 @@ export const buildDwcLocalityArchiveZipBufferFromLocalities = async ( return await zip.generateAsync({ type: 'nodebuffer' }) } -export const buildDwcLocalityArchiveZipBuffer = async (): Promise => { - const { nowDb } = await import('../utils/db') - const localities = await nowDb.now_loc.findMany({ +const localityExportSelect = { + lid: true, + loc_name: true, + basin: true, + subbasin: true, + country: true, + state: true, + county: true, + dec_lat: true, + dec_long: true, + dms_lat: true, + dms_long: true, + approx_coord: true, + altitude: true, + loc_detail: true, + chron: true, + lgroup: true, + formation: true, + member: true, + bed: true, + bfa_max: true, + bfa_min: true, + bfa_max_abs: true, + bfa_min_abs: true, + frac_max: true, + frac_min: true, + max_age: true, + min_age: true, + date_meth: true, + age_comm: true, + site_area: true, + gen_loc: true, + plate: true, + appr_num_spm: true, + num_spm: true, + true_quant: true, + complete: true, + num_quad: true, + rock_type: true, + rt_adj: true, + lith_comm: true, + depo_context1: true, + depo_context2: true, + depo_context3: true, + depo_context4: true, + depo_comm: true, + sed_env_1: true, 
+ sed_env_2: true, + event_circum: true, + se_comm: true, + assem_fm: true, + transport: true, + trans_mod: true, + weath_trmp: true, + pt_conc: true, + size_type: true, + vert_pres: true, + plant_pres: true, + invert_pres: true, + time_rep: true, + taph_comm: true, + tax_comm: true, + datum_plane: true, + tos: true, + bos: true, + loc_status: true, + hominin_skeletal_remains: true, + climate_type: true, + biome: true, + v_ht: true, + v_struct: true, + v_envi_det: true, + disturb: true, + nutrients: true, + water: true, + seasonality: true, + seas_intens: true, + pri_prod: true, + moisture: true, + temperature: true, + estimate_precip: true, + estimate_temp: true, + estimate_npp: true, + pers_woody_cover: true, + pers_pollen_ap: true, + pers_pollen_nap: true, + pers_pollen_other: true, + stone_tool_cut_marks_on_bones: true, + bipedal_footprints: true, + stone_tool_technology: true, + technological_mode_1: true, + technological_mode_2: true, + technological_mode_3: true, + cultural_stage_1: true, + cultural_stage_2: true, + cultural_stage_3: true, + regional_culture_1: true, + regional_culture_2: true, + regional_culture_3: true, + now_time_unit_now_loc_bfa_maxTonow_time_unit: { + select: { tu_name: true, tu_display_name: true, rank: true, sequence: true }, + }, + now_time_unit_now_loc_bfa_minTonow_time_unit: { + select: { tu_name: true, tu_display_name: true, rank: true, sequence: true }, + }, + now_syn_loc: { + select: { synonym: true }, + }, + now_ss: { + select: { sed_struct: true }, + }, + now_coll_meth: { + select: { coll_meth: true }, + }, + now_mus: { select: { - lid: true, - loc_name: true, - basin: true, - subbasin: true, - country: true, - state: true, - county: true, - dec_lat: true, - dec_long: true, - dms_lat: true, - dms_long: true, - approx_coord: true, - altitude: true, - loc_detail: true, - chron: true, - lgroup: true, - formation: true, - member: true, - bed: true, - bfa_max: true, - bfa_min: true, - bfa_max_abs: true, - bfa_min_abs: true, - 
frac_max: true, - frac_min: true, - max_age: true, - min_age: true, - date_meth: true, - age_comm: true, - site_area: true, - gen_loc: true, - plate: true, - appr_num_spm: true, - num_spm: true, - true_quant: true, - complete: true, - num_quad: true, - rock_type: true, - rt_adj: true, - lith_comm: true, - depo_context1: true, - depo_context2: true, - depo_context3: true, - depo_context4: true, - depo_comm: true, - sed_env_1: true, - sed_env_2: true, - event_circum: true, - se_comm: true, - assem_fm: true, - transport: true, - trans_mod: true, - weath_trmp: true, - pt_conc: true, - size_type: true, - vert_pres: true, - plant_pres: true, - invert_pres: true, - time_rep: true, - taph_comm: true, - tax_comm: true, - datum_plane: true, - tos: true, - bos: true, - loc_status: true, - hominin_skeletal_remains: true, - climate_type: true, - biome: true, - v_ht: true, - v_struct: true, - v_envi_det: true, - disturb: true, - nutrients: true, - water: true, - seasonality: true, - seas_intens: true, - pri_prod: true, - moisture: true, - temperature: true, - estimate_precip: true, - estimate_temp: true, - estimate_npp: true, - pers_woody_cover: true, - pers_pollen_ap: true, - pers_pollen_nap: true, - pers_pollen_other: true, - stone_tool_cut_marks_on_bones: true, - bipedal_footprints: true, - stone_tool_technology: true, - technological_mode_1: true, - technological_mode_2: true, - technological_mode_3: true, - cultural_stage_1: true, - cultural_stage_2: true, - cultural_stage_3: true, - regional_culture_1: true, - regional_culture_2: true, - regional_culture_3: true, - now_time_unit_now_loc_bfa_maxTonow_time_unit: { - select: { tu_name: true, tu_display_name: true, rank: true, sequence: true }, - }, - now_time_unit_now_loc_bfa_minTonow_time_unit: { - select: { tu_name: true, tu_display_name: true, rank: true, sequence: true }, - }, - now_syn_loc: { - select: { synonym: true }, - }, - now_ss: { - select: { sed_struct: true }, - }, - now_coll_meth: { - select: { coll_meth: true 
}, - }, - now_mus: { - select: { - museum: true, - com_mlist: { select: { institution: true, alt_int_name: true, city: true, state: true, country: true } }, - }, - }, - now_ls: { - select: { - com_species: { - select: { order_name: true, tht: true, genus_name: true }, - }, - }, + museum: true, + com_mlist: { select: { institution: true, alt_int_name: true, city: true, state: true, country: true } }, + }, + }, + now_ls: { + select: { + com_species: { + select: { order_name: true, tht: true, genus_name: true }, }, }, + }, +} as const + +export const buildDwcLocalityArchiveZipBuffer = async (localityIds?: number[]): Promise => { + if (localityIds && localityIds.length === 0) return await buildDwcLocalityArchiveZipBufferFromLocalities([]) + const { nowDb } = await import('../utils/db') + const localities = await nowDb.now_loc.findMany({ + where: localityIds ? { lid: { in: localityIds } } : undefined, + orderBy: { lid: 'asc' }, + select: localityExportSelect, }) return await buildDwcLocalityArchiveZipBufferFromLocalities(localities as unknown as LocalityForExport[]) diff --git a/backend/src/services/dwcArchiveExportOccurrences.ts b/backend/src/services/dwcArchiveExportOccurrences.ts index 27c601ea..4fd293e0 100644 --- a/backend/src/services/dwcArchiveExportOccurrences.ts +++ b/backend/src/services/dwcArchiveExportOccurrences.ts @@ -93,6 +93,11 @@ type DwcOccurrenceArchiveStream = { cleanup: () => Promise } +export type DwcOccurrenceKey = { + lid: number + speciesId: number +} + export type DwcOccurrenceExportProgress = { stage: 'occurrences' | 'localities' | 'taxa' | 'zipping' | 'complete' generated: number @@ -385,8 +390,33 @@ const localityLookupSelect = { const speciesLookupSelect = occurrenceSelect.com_species.select -async function* iterateOccurrenceRows(): AsyncGenerator { +const sortOccurrenceKeys = (keys: DwcOccurrenceKey[]): DwcOccurrenceKey[] => + [...keys].sort((a, b) => a.lid - b.lid || a.speciesId - b.speciesId) + +async function* 
iterateOccurrenceRows(occurrenceKeys?: DwcOccurrenceKey[]): AsyncGenerator { const { nowDb } = await import('../utils/db') + + if (occurrenceKeys) { + for (const keys of chunk(sortOccurrenceKeys(occurrenceKeys), LOOKUP_EXPORT_CHUNK_SIZE)) { + if (keys.length === 0) continue + const page = await nowDb.now_ls.findMany({ + where: { + OR: keys.map(key => ({ + lid: key.lid, + species_id: key.speciesId, + })), + }, + orderBy: [{ lid: 'asc' }, { species_id: 'asc' }], + select: occurrenceSelect, + }) + + for (const occurrence of page) { + yield occurrence as unknown as OccurrenceForExport + } + } + return + } + let cursor: { lid: number; species_id: number } | undefined while (true) { @@ -408,7 +438,8 @@ async function* iterateOccurrenceRows(): AsyncGenerator { } } -const countOccurrenceRows = async (): Promise => { +const countOccurrenceRows = async (occurrenceKeys?: DwcOccurrenceKey[]): Promise => { + if (occurrenceKeys) return occurrenceKeys.length const { nowDb } = await import('../utils/db') return await nowDb.now_ls.count() } @@ -551,16 +582,18 @@ const writeOccurrenceAndMeasurementFiles = async ({ occurrenceFilePath, measurementFilePath, reportProgress, + occurrenceKeys, }: { occurrenceFilePath: string measurementFilePath: string reportProgress?: DwcOccurrenceExportProgressReporter + occurrenceKeys?: DwcOccurrenceKey[] }): Promise<{ localityIds: number[]; speciesIds: number[] }> => { const occurrenceWriter = await createDwcCsvFileWriter(occurrenceFilePath, OCCURRENCE_HEADERS) const measurementWriter = await createDwcCsvFileWriter(measurementFilePath, MEASUREMENT_HEADERS) const localityIds = new Set() const speciesIds = new Set() - const totalOccurrences = await countOccurrenceRows() + const totalOccurrences = await countOccurrenceRows(occurrenceKeys) let generatedOccurrences = 0 reportProgress?.({ @@ -571,7 +604,7 @@ const writeOccurrenceAndMeasurementFiles = async ({ }) try { - for await (const occurrence of iterateOccurrenceRows()) { + for await (const occurrence 
of iterateOccurrenceRows(occurrenceKeys)) { localityIds.add(occurrence.lid) speciesIds.add(occurrence.species_id) await occurrenceWriter.writeRow(mapOccurrenceToOccurrenceRow(occurrence)) @@ -699,8 +732,10 @@ const writeTaxonLookupFile = async ({ export const buildDwcOccurrenceArchiveZipStream = async ({ reportProgress, + occurrenceKeys, }: { reportProgress?: DwcOccurrenceExportProgressReporter + occurrenceKeys?: DwcOccurrenceKey[] } = {}): Promise => { const tempDirectory = await mkdtemp(path.join(tmpdir(), 'now-dwc-occurrences-')) const files = { @@ -716,6 +751,7 @@ export const buildDwcOccurrenceArchiveZipStream = async ({ occurrenceFilePath: files.occurrence, measurementFilePath: files.measurement, reportProgress, + occurrenceKeys, }) await writeLocalityLookupFiles({ localityIds, diff --git a/backend/src/services/dwcDataPackageExport.ts b/backend/src/services/dwcDataPackageExport.ts index 063d053d..4ddcc01e 100644 --- a/backend/src/services/dwcDataPackageExport.ts +++ b/backend/src/services/dwcDataPackageExport.ts @@ -8,9 +8,10 @@ import { import { mapOccurrenceToMeasurementRows, mapOccurrenceToOccurrenceRow, + type DwcOccurrenceKey, type OccurrenceCsvRow, } from './dwcArchiveExportOccurrences' -import { buildDwcArchiveZipBuffer, type MeasurementCsvRow } from './dwcArchiveExport' +import { buildDwcArchiveZipBuffer, resolveTaxonRank, type MeasurementCsvRow } from './dwcArchiveExport' import { writeDwcCsvString } from './utils/dwcCsv' const isMeaningfulString = (value: unknown): value is string => { @@ -117,6 +118,14 @@ const ASSERTION_HEADERS = [ const DWC_DP_EVENT_ASSERTION_HEADERS = ['eventID', ...ASSERTION_HEADERS] as const const DWC_DP_OCCURRENCE_ASSERTION_HEADERS = ['occurrenceID', ...ASSERTION_HEADERS] as const +const DATASET_TITLE = 'NOW database Darwin Core export' +const DATASET_NAME = 'now-darwincore-export' +const DATASET_VERSION = '1.0.0' +const DATASET_DOI = 'https://doi.org/10.5281/zenodo.4268068' +const DATASET_LICENSE_URL = 
'https://creativecommons.org/licenses/by/4.0/' +const DATASET_LICENSE_TITLE = 'Creative Commons Attribution 4.0 International' +const DATASET_CREATOR = 'The NOW Community' +const MISSING_VALUE = '\\N' type AssertionHeader = (typeof ASSERTION_HEADERS)[number] type AssertionColumns = Record @@ -237,6 +246,10 @@ export const mapLocalityToDwcDpGeologicalContextRow = (locality: LocalityForDwcD export const mapOccurrenceToDwcDpOccurrenceRow = (occurrence: OccurrenceForDwcDpExport): DwcDpOccurrenceRow => { const occurrenceRow: OccurrenceCsvRow = mapOccurrenceToOccurrenceRow(occurrence) + const subfamilyRaw = toMaybeMeaningful(occurrence.com_species.subfamily_name) + const subfamily = subfamilyRaw && subfamilyRaw.toLowerCase().endsWith('inae') ? subfamilyRaw : '' + const tribe = subfamilyRaw && subfamilyRaw.toLowerCase().endsWith('ini') ? subfamilyRaw : '' + const subtribe = subfamilyRaw && subfamilyRaw.toLowerCase().endsWith('ina') ? subfamilyRaw : '' return { occurrenceID: occurrenceRow.occurrenceID, @@ -248,7 +261,16 @@ export const mapOccurrenceToDwcDpOccurrenceRow = (occurrence: OccurrenceForDwcDp taxonID: occurrenceRow.taxonID, scientificName: occurrenceRow.scientificName, scientificNameAuthorship: toMaybeMeaningful(occurrence.com_species.sp_author), - taxonRank: '', + taxonRank: resolveTaxonRank({ + family: toMaybeMeaningful(occurrence.com_species.family_name), + genus: toMaybeMeaningful(occurrence.com_species.genus_name), + specificEpithet: toMaybeMeaningful(occurrence.com_species.species_name), + uniqueIdentifier: toMaybeMeaningful(occurrence.com_species.unique_identifier) || null, + subclassOrSuperorderName: occurrence.com_species.subclass_or_superorder_name, + subfamily, + tribe, + subtribe, + }), identificationVerificationStatus: occurrenceRow.identificationQualifier, } } @@ -276,7 +298,115 @@ export const mapOccurrenceToDwcDpOccurrenceAssertionRows = ( })) } -const field = (name: string, type = 'string') => ({ name, type }) +const FIELD_DESCRIPTIONS: Record = 
{ + eventID: 'Stable NOW database event identifier for a paleontological locality.', + parentEventID: 'Identifier for a containing event, reserved for future event hierarchies.', + eventType: 'Type of event represented by the row; NOW localities are exported as paleontological locality events.', + locationID: 'Stable NOW database location identifier for the locality.', + locality: 'Locality name as curated in the NOW database.', + continent: 'Continent inferred from the curated country value where possible.', + country: 'Country or geographic area recorded for the locality.', + stateProvince: 'State, province, or equivalent administrative subdivision.', + county: 'County or equivalent lower administrative subdivision.', + higherGeography: 'Pipe-delimited geographic hierarchy assembled from available NOW locality fields.', + decimalLatitude: + 'Latitude in decimal degrees. Coordinates may be exact, generalized, rounded, or uncertain depending on source data.', + decimalLongitude: + 'Longitude in decimal degrees. 
Coordinates may be exact, generalized, rounded, or uncertain depending on source data.', + verbatimLatitude: 'Verbatim latitude expression when recorded.', + verbatimLongitude: 'Verbatim longitude expression when recorded.', + verbatimElevation: 'Verbatim or numeric elevation value from the curated locality record.', + eventRemarks: 'Combined locality, age, and taxonomic remarks from curated NOW fields.', + geologicalContextID: 'Stable identifier for the geological context associated with the locality event.', + lithostratigraphicTerms: + 'Combined lithostratigraphic terminology from source-publication wording, standardized chronostratigraphic concepts, and NOW harmonization practices.', + group: 'Lithostratigraphic group name where recorded.', + formation: 'Lithostratigraphic formation name where recorded.', + member: 'Lithostratigraphic member name where recorded.', + bed: 'Lithostratigraphic bed name where recorded.', + earliestAgeOrLowestStage: 'Earliest age or lowest chronostratigraphic stage associated with the locality.', + latestAgeOrHighestStage: 'Latest age or highest chronostratigraphic stage associated with the locality.', + occurrenceID: 'Stable NOW database occurrence identifier linking a locality event to a taxon record.', + organismQuantity: 'Quantity or abundance value for the occurrence when recorded.', + organismQuantityType: 'Type of quantity represented by organismQuantity.', + occurrenceStatus: 'Presence or absence status for the occurrence.', + occurrenceRemarks: 'Curated occurrence remarks from NOW locality-species data.', + taxonID: 'Stable NOW taxon identifier; this joins to dwc-a-taxa/taxon.csv in the full export.', + scientificName: 'Scientific name assembled from curated NOW taxonomic fields.', + scientificNameAuthorship: 'Scientific name authorship where curated.', + taxonRank: 'Taxonomic rank when available; currently reserved for future enrichment.', + identificationVerificationStatus: 'Curated identification status or qualifier.', + 
assertionID: 'Stable assertion identifier derived from the source database field and owning event or occurrence.', + verbatimAssertionType: + 'Original NOW database field name or curated source category that produced the assertion; approx_coord marks approximate coordinate information.', + assertionType: + 'Human-readable assertion type. Future exports may add controlled predicate or ontology mappings without changing this column.', + assertionTypeIRI: 'Placeholder for a future ontology IRI identifying the assertion type.', + assertionTypeSource: 'Placeholder for the vocabulary or ontology source of assertionTypeIRI.', + assertionMadeDate: 'Date the assertion was made when recorded; empty values mean not recorded.', + assertionEffectiveDate: + 'Date or interval to which the assertion applies when recorded; empty values mean not recorded.', + assertionValue: 'Curated or derived assertion value generated directly from NOW database fields.', + assertionValueIRI: 'Placeholder for a future ontology IRI identifying the assertion value.', + assertionValueSource: 'Placeholder for the vocabulary or ontology source of assertionValueIRI.', + assertionValueNumeric: 'Numeric representation of assertionValue when the value can be parsed as a number.', + assertionUnit: 'Unit associated with the assertion value when recorded.', + assertionUnitIRI: 'Placeholder for a future ontology IRI identifying the assertion unit.', + assertionUnitSource: 'Placeholder for the vocabulary or ontology source of assertionUnitIRI.', + assertionError: 'Uncertainty or error associated with the assertion when recorded.', + assertionBy: 'Agent responsible for the assertion when recorded; empty values mean not recorded.', + assertionByID: 'Identifier for assertionBy, reserved for future agent-table interoperability.', + assertionProtocols: 'Method, protocol, or source database mapping used to generate the assertion.', + assertionProtocolID: 'Identifier for a protocol record, reserved for future 
protocol-table interoperability.', + assertionReferences: 'Reference identifiers or citations supporting the assertion when recorded.', + assertionRemarks: 'Additional assertion-level remarks when recorded.', +} + +const DWC_TERM_IRIS: Record = { + eventID: 'http://rs.tdwg.org/dwc/terms/eventID', + parentEventID: 'http://rs.tdwg.org/dwc/terms/parentEventID', + eventType: 'http://rs.tdwg.org/dwc/terms/eventType', + locationID: 'http://rs.tdwg.org/dwc/terms/locationID', + locality: 'http://rs.tdwg.org/dwc/terms/locality', + continent: 'http://rs.tdwg.org/dwc/terms/continent', + country: 'http://rs.tdwg.org/dwc/terms/country', + stateProvince: 'http://rs.tdwg.org/dwc/terms/stateProvince', + county: 'http://rs.tdwg.org/dwc/terms/county', + higherGeography: 'http://rs.tdwg.org/dwc/terms/higherGeography', + decimalLatitude: 'http://rs.tdwg.org/dwc/terms/decimalLatitude', + decimalLongitude: 'http://rs.tdwg.org/dwc/terms/decimalLongitude', + verbatimLatitude: 'http://rs.tdwg.org/dwc/terms/verbatimLatitude', + verbatimLongitude: 'http://rs.tdwg.org/dwc/terms/verbatimLongitude', + verbatimElevation: 'http://rs.tdwg.org/dwc/terms/verbatimElevation', + eventRemarks: 'http://rs.tdwg.org/dwc/terms/eventRemarks', + geologicalContextID: 'http://rs.tdwg.org/dwc/terms/geologicalContextID', + lithostratigraphicTerms: 'http://rs.tdwg.org/dwc/terms/lithostratigraphicTerms', + group: 'http://rs.tdwg.org/dwc/terms/group', + formation: 'http://rs.tdwg.org/dwc/terms/formation', + member: 'http://rs.tdwg.org/dwc/terms/member', + bed: 'http://rs.tdwg.org/dwc/terms/bed', + earliestAgeOrLowestStage: 'http://rs.tdwg.org/dwc/terms/earliestAgeOrLowestStage', + latestAgeOrHighestStage: 'http://rs.tdwg.org/dwc/terms/latestAgeOrHighestStage', + occurrenceID: 'http://rs.tdwg.org/dwc/terms/occurrenceID', + organismQuantity: 'http://rs.tdwg.org/dwc/terms/organismQuantity', + organismQuantityType: 'http://rs.tdwg.org/dwc/terms/organismQuantityType', + occurrenceStatus: 
'http://rs.tdwg.org/dwc/terms/occurrenceStatus', + occurrenceRemarks: 'http://rs.tdwg.org/dwc/terms/occurrenceRemarks', + taxonID: 'http://rs.tdwg.org/dwc/terms/taxonID', + scientificName: 'http://rs.tdwg.org/dwc/terms/scientificName', + scientificNameAuthorship: 'http://rs.tdwg.org/dwc/terms/scientificNameAuthorship', + taxonRank: 'http://rs.tdwg.org/dwc/terms/taxonRank', + identificationVerificationStatus: 'http://rs.tdwg.org/dwc/terms/identificationVerificationStatus', +} + +const field = (name: string, type = 'string') => ({ + name, + title: name.replace(/([A-Z])/g, ' $1').replace(/^./, first => first.toUpperCase()), + description: FIELD_DESCRIPTIONS[name] ?? `Curated NOW database value for ${name}.`, + type, + format: 'default', + ...(DWC_TERM_IRIS[name] ? { 'dcterms:isVersionOf': DWC_TERM_IRIS[name] } : {}), +}) const schemaFor = ({ headers, @@ -287,34 +417,70 @@ const schemaFor = ({ primaryKey: string | string[] foreignKeys?: Array<{ fields: string + predicate?: string reference: { resource: string; fields: string } }> }) => ({ fields: headers.map(header => field(header, header.endsWith('Numeric') ? 
'number' : 'string')), primaryKey, + missingValues: [MISSING_VALUE], foreignKeys, }) export const buildDwcDataPackageJson = (publicationDateIso: string): string => { const dataPackage = { - profile: 'data-package', - name: 'now-dwc-dp-test-export', - title: 'NOW database Darwin Core Data Package test export', + profile: 'http://rs.tdwg.org/dwc-dp/1.0/dwc-dp-profile.json', + name: DATASET_NAME, + id: DATASET_DOI, + title: DATASET_TITLE, + version: DATASET_VERSION, created: publicationDateIso, homepage: 'https://nowdatabase.org/', + contributors: [ + { title: DATASET_CREATOR, role: 'creator' }, + { title: DATASET_CREATOR, role: 'publisher' }, + { title: DATASET_CREATOR, role: 'rightsHolder' }, + ], + licenses: [ + { + name: 'CC-BY-4.0', + title: DATASET_LICENSE_TITLE, + path: DATASET_LICENSE_URL, + }, + ], + keywords: [ + 'Darwin Core', + 'Darwin Core Data Package', + 'Darwin Core Archive', + 'NOW database', + 'paleobiology', + 'paleontology', + 'fossil mammals', + 'Cenozoic', + 'occurrence data', + 'taxon traits', + ], + citation: + 'The NOW Community. NOW database Darwin Core export, version 1.0.0. https://doi.org/10.5281/zenodo.4268068. The DOI describes the NOW database generally rather than a single frozen export version; include the export date when citing a downloaded archive.', description: - 'Admin-only test Darwin Core Data Package export for NOW localities as events and NOW locality-species rows as occurrences.', + 'Production Darwin Core Data Package export from the NOW database for relational event, occurrence, geological context, and assertion data. 
The NOW database is a continuously curated, globally scoped fossil mammal database with Cenozoic emphasis, spanning approximately the last 66 million years.', + missingValues: [MISSING_VALUE], resources: [ { name: 'event', path: DWC_DP_TABLES.event, profile: 'tabular-data-resource', + format: 'csv', + mediatype: 'text/csv', + description: + 'Paleontological locality events derived from curated NOW locality records. Event identifiers are stable database IDs and are referenced by occurrence rows.', schema: schemaFor({ headers: DWC_DP_EVENT_HEADERS, primaryKey: 'eventID', foreignKeys: [ { fields: 'geologicalContextID', + predicate: 'has geological context', reference: { resource: 'geological-context', fields: 'geologicalContextID' }, }, ], @@ -324,6 +490,10 @@ export const buildDwcDataPackageJson = (publicationDateIso: string): string => { name: 'geological-context', path: DWC_DP_TABLES.geologicalContext, profile: 'tabular-data-resource', + format: 'csv', + mediatype: 'text/csv', + description: + 'Geological and chronostratigraphic context for NOW locality events, reflecting source-publication terminology, standardized chronostratigraphic concepts, and NOW harmonization practices.', schema: schemaFor({ headers: DWC_DP_GEOLOGICAL_CONTEXT_HEADERS, primaryKey: 'geologicalContextID', @@ -333,12 +503,17 @@ export const buildDwcDataPackageJson = (publicationDateIso: string): string => { name: 'occurrence', path: DWC_DP_TABLES.occurrence, profile: 'tabular-data-resource', + format: 'csv', + mediatype: 'text/csv', + description: + 'Fossil mammal occurrence rows derived from curated NOW locality-species associations. 
occurrenceID is stable within the export and taxonID joins to the DwC-A taxon archive in the full bundle.', schema: schemaFor({ headers: DWC_DP_OCCURRENCE_HEADERS, primaryKey: 'occurrenceID', foreignKeys: [ { fields: 'eventID', + predicate: 'happened during', reference: { resource: 'event', fields: 'eventID' }, }, ], @@ -348,12 +523,17 @@ export const buildDwcDataPackageJson = (publicationDateIso: string): string => { name: 'event-assertion', path: DWC_DP_TABLES.eventAssertion, profile: 'tabular-data-resource', + format: 'csv', + mediatype: 'text/csv', + description: + 'Provenance-aware curated or derived statements associated with locality events. Assertion values are generated directly from curated NOW database fields whose content originates from expert-curated literature data or opinions; empty provenance fields should be read as not recorded.', schema: schemaFor({ headers: DWC_DP_EVENT_ASSERTION_HEADERS, primaryKey: 'assertionID', foreignKeys: [ { fields: 'eventID', + predicate: 'asserts about', reference: { resource: 'event', fields: 'eventID' }, }, ], @@ -363,12 +543,17 @@ export const buildDwcDataPackageJson = (publicationDateIso: string): string => { name: 'occurrence-assertion', path: DWC_DP_TABLES.occurrenceAssertion, profile: 'tabular-data-resource', + format: 'csv', + mediatype: 'text/csv', + description: + 'Provenance-aware curated or derived statements associated with occurrences. 
Assertion columns include placeholders for future semantic predicates, ontology IRIs, agents, protocols, and richer provenance structures.', schema: schemaFor({ headers: DWC_DP_OCCURRENCE_ASSERTION_HEADERS, primaryKey: 'assertionID', foreignKeys: [ { fields: 'occurrenceID', + predicate: 'asserts about', reference: { resource: 'occurrence', fields: 'occurrenceID' }, }, ], @@ -385,30 +570,128 @@ export const buildDwcDataPackageEmlXml = (publicationDateIso: string): string => - - NOW database Darwin Core Data Package test export + ${DATASET_TITLE} - - NOW database - + ${DATASET_CREATOR} + https://nowdatabase.org/ - - - NOW database - - + + ${DATASET_CREATOR} + https://nowdatabase.org/ + + + ${DATASET_CREATOR} + publisher + https://nowdatabase.org/ + ${publicationDateIso} + eng + NOW database Darwin Core export - Admin-only test Darwin Core Data Package export. Localities are modeled as events, locality geological fields are modeled as geological contexts, locality-species rows are modeled as occurrences, and non-core facts are modeled as event or occurrence assertions. + The NOW database Darwin Core export is a production-quality Darwin Core Data Package for relational event, occurrence, geological context, and assertion data from the New and Old Worlds Database of Fossil Mammals. The NOW database is a continuously curated global fossil mammal database supporting large-scale paleobiological and paleontological research. + The data are expert curated from literature and community expertise. The database spans approximately the last 66 million years, Cenozoic, while maintaining global coverage. This export is intended primarily for researchers downloading and analyzing data. + + Darwin Core Data Package + Darwin Core Archive + fossil mammals + Cenozoic + paleobiology + paleontology + occurrence data + assertions + taxon traits + NOW database export keywords + + + Recommended citation: ${DATASET_CREATOR}. ${DATASET_TITLE}, version ${DATASET_VERSION}. ${DATASET_DOI}. 
The DOI describes the NOW database generally rather than a single frozen dataset export version; include the export date (${publicationDateIso}) when citing a downloaded archive. + Missing values in CSV files are serialized as ${MISSING_VALUE}. Coordinate uncertainty is partially represented through assertions where verbatimAssertionType equals approx_coord. + Future exports may populate ontology IRI, semantic predicate, agent, protocol, and richer provenance fields while preserving the current relational model and column names wherever possible. + - TODO(#1150): Add rights / license information. + Copyright ${DATASET_CREATOR}. This export is licensed under ${DATASET_LICENSE_TITLE} (${DATASET_LICENSE_URL}). Users may share and adapt the data with appropriate attribution. + + + https://nowdatabase.org/ + + + + + Global. Locality coordinates may be exact, generalized, rounded, or uncertain, depending on the source publication and curation history. + + -180 + 180 + 90 + -90 + + + + + + Geologic time + Cenozoic, approximately the last 66 million years + Chronological ranges vary by locality and source publication. + NOW locality ages combine source-publication terminology, standardized chronostratigraphic concepts, and NOW harmonization practices. + + + + + Global fossil mammal occurrences, with associated taxonomic names and selected synthesized taxon-level traits in the companion DwC-A taxon archive. + + class + Mammalia + mammals + + + + + + The NOW database is continuously curated. This export represents a production snapshot generated from the live curated database rather than a frozen version-specific dataset associated with the DOI. + + continual + + + ${DATASET_CREATOR} + https://nowdatabase.org/ + + + + + Locality records are exported as Darwin Core event rows, associated geological context rows, and event-level assertions. Locality-species associations are exported as Darwin Core occurrence rows and occurrence-level assertions. 
+ + + + + Assertion tables are aligned with the emerging DwC-DP assertion model. Assertions are provenance-aware curated or derived statements associated with events or occurrences. They are generated directly from curated NOW database fields whose content originates from expert-curated literature data or opinions. When assertion provenance fields are empty, they should primarily be interpreted as not recorded. + + + + + Geological and chronostratigraphic terminology uses mixed conventions: source-publication terminology, standardized chronostratigraphic concepts, and NOW harmonization practices. + + + + + NOW data are expert curated from the literature and continuously updated. Stable identifiers in this export are derived from NOW database identifiers and are intended for repeatable joins across the files in the downloaded archive. + + + + + New and Old Worlds Database of Fossil Mammals + + ${DATASET_CREATOR} + data curator + + + The NOW database supports research on Cenozoic mammal evolution, biogeography, environments, and fossil occurrence patterns at global scale. + + ` @@ -625,18 +908,37 @@ export const buildDwcDataPackageZipBufferFromRows = async ({ return await zip.generateAsync({ type: 'nodebuffer', compression: 'DEFLATE', compressionOptions: { level: 6 } }) } -export const buildDwcDataPackageZipBuffer = async (): Promise => { +const fetchOccurrencesForDwcDataPackageExport = async ( + occurrenceKeys?: DwcOccurrenceKey[] +): Promise => { + if (occurrenceKeys && occurrenceKeys.length === 0) return [] + const { nowDb } = await import('../utils/db') + const occurrences = await nowDb.now_ls.findMany({ + where: occurrenceKeys + ? 
{ + OR: occurrenceKeys.map(key => ({ + lid: key.lid, + species_id: key.speciesId, + })), + } + : undefined, + orderBy: [{ lid: 'asc' }, { species_id: 'asc' }], + select: occurrenceSelect, + }) + + return occurrences as unknown as OccurrenceForDwcDpExport[] +} + +export const buildDwcDataPackageZipBuffer = async (occurrenceKeys?: DwcOccurrenceKey[]): Promise => { const { nowDb } = await import('../utils/db') + const occurrences = await fetchOccurrencesForDwcDataPackageExport(occurrenceKeys) + const localityIds = occurrenceKeys ? [...new Set(occurrences.map(occurrence => occurrence.lid))] : undefined const localities = (await nowDb.now_loc.findMany({ + where: localityIds ? { lid: { in: localityIds } } : undefined, orderBy: { lid: 'asc' }, select: localitySelect, })) as unknown as LocalityForDwcDpExport[] - const occurrences = (await nowDb.now_ls.findMany({ - orderBy: [{ lid: 'asc' }, { species_id: 'asc' }], - select: occurrenceSelect, - })) as unknown as OccurrenceForDwcDpExport[] - return await buildDwcDataPackageZipBufferFromRows({ localities, occurrences, @@ -662,28 +964,118 @@ const addZipEntriesUnderPrefix = async ({ ) } -const buildFullDarwinCoreReadme = (): string => `NOW database Darwin Core full test export +const buildFullDarwinCoreReadme = (): string => `${DATASET_TITLE} +Version: ${DATASET_VERSION} +Package name: ${DATASET_NAME} +Creator / publisher / rights holder: ${DATASET_CREATOR} +License: ${DATASET_LICENSE_TITLE} (${DATASET_LICENSE_URL}) +Identifier: ${DATASET_DOI} + +This production export contains two separate standards-based Darwin Core +artifacts from the NOW database (New and Old Worlds Database of Fossil Mammals). +The outer ZIP is a convenience bundle, not a single DwC-DP or DwC-A artifact. +Each subdirectory should be treated as its own standards-based export. + +Directory tree + +. 
+|-- README.txt +|-- dwc-dp/ +| |-- datapackage.json +| |-- eml.xml +| |-- event.csv +| |-- geological-context.csv +| |-- occurrence.csv +| |-- event-assertion.csv +| \`-- occurrence-assertion.csv +\`-- dwc-a-taxa/ + |-- meta.xml + |-- eml.xml + |-- taxon.csv + \`-- measurementorfact.csv + +Scientific scope + +The NOW database is a continuously curated global fossil mammal database. The +export is paleobiological and paleontological in scope, is expert curated from +literature and community expertise, and supports large-scale research on fossil +mammal occurrences, taxonomy, traits, environments, and geological context. The +database spans approximately the last 66 million years, Cenozoic, while +maintaining global coverage. + +Relationship between DwC-DP and DwC-A + +dwc-dp/ is a Darwin Core Data Package for relational event, occurrence, +geological context, and assertion data. It preserves the locality-to-occurrence +relationships in a table structure that is easier to analyze than a single +DwC-A star schema for this part of the database. + +dwc-a-taxa/ is a Darwin Core Archive for taxonomic records and synthesized +taxon-level traits. Taxon-level traits remain in DwC-A Taxon + +MeasurementOrFact form because these values are generated directly from curated +taxon fields and are not currently linked to individual specimen, material +sample, event, or occurrence source entities. Keeping these traits in the +taxon-centered DwC-A preserves compatibility with existing DwC-A tooling and +existing consumers of the taxon trait export. -This ZIP is a convenience bundle. It contains two separate standards-based export -artifacts in their own folders: +Join keys -- dwc-dp/ - Darwin Core Data Package test export for relational event, geological context, - occurrence, event assertion, and occurrence assertion data. +- dwc-dp/event.csv eventID joins to dwc-dp/occurrence.csv eventID. 
+- dwc-dp/event.csv geologicalContextID joins to + dwc-dp/geological-context.csv geologicalContextID. +- dwc-dp/event.csv eventID joins to dwc-dp/event-assertion.csv eventID. +- dwc-dp/occurrence.csv occurrenceID joins to + dwc-dp/occurrence-assertion.csv occurrenceID. +- dwc-dp/occurrence.csv taxonID joins to dwc-a-taxa/taxon.csv taxonID. +- dwc-a-taxa/taxon.csv taxonID joins to + dwc-a-taxa/measurementorfact.csv taxonID. -- dwc-a-taxa/ - Darwin Core Archive test export for taxon records and synthesized taxon-level - traits. The traits remain in DwC-A Taxon + MeasurementOrFact form because they - are not currently linked to source specimen/material records. +Identifier stability -Join key: +Identifiers are stable database IDs derived from NOW database identifiers. They +are intended to support repeatable joins within and across downloaded exports. +They should not be interpreted as globally minted persistent identifiers unless +explicitly documented as such in future releases. -- dwc-dp/occurrence.csv taxonID uses NOW: -- dwc-a-taxa/taxon.csv taxonID uses the same NOW: -- dwc-a-taxa/measurementorfact.csv links taxon traits by that same taxonID +Assertions -The outer ZIP is not itself a single DwC-DP or DwC-A artifact. Each subfolder is -intended to remain internally understandable as its own export format. +Assertion tables are aligned with the emerging DwC-DP assertion model. They +represent provenance-aware curated or derived statements associated with events +or occurrences. Assertions are generated directly from curated database fields +whose content originates from expert-curated literature data or opinions. Empty +assertion provenance fields should primarily be interpreted as not recorded. + +Geological context and coordinates + +Geological and chronostratigraphic terminology uses mixed conventions: +source-publication terminology, standardized chronostratigraphic concepts, and +NOW harmonization practices. 
Coordinates may be exact, generalized, rounded, or +uncertain. Coordinate uncertainty is partially represented using assertions +where verbatimAssertionType is approx_coord. + +Missing values + +Missing values in CSV files are serialized as ${MISSING_VALUE}. Data Package +metadata also declares ${MISSING_VALUE} as the missing value marker. Treat empty +assertion provenance fields and other missing fields as not recorded unless a +field-specific description states otherwise. + +Citation guidance + +Recommended citation: +${DATASET_CREATOR}. ${DATASET_TITLE}, version ${DATASET_VERSION}. ${DATASET_DOI}. +Include the export download or generation date when citing a specific downloaded +archive. + +The DOI ${DATASET_DOI} describes the NOW database generally rather than a single +frozen dataset export version. + +Future interoperability + +Several assertion columns are reserved for ontology IRIs, semantic predicates, +agent identifiers, protocol identifiers, and richer provenance structures. These +placeholders are included to support future semantic interoperability while +preserving the current CSV schemas, identifiers, and relational structure. ` export const buildFullDarwinCoreExportZipBufferFromArchives = async ({ @@ -709,10 +1101,16 @@ export const buildFullDarwinCoreExportZipBufferFromArchives = async ({ return await zip.generateAsync({ type: 'nodebuffer', compression: 'DEFLATE', compressionOptions: { level: 6 } }) } -export const buildFullDarwinCoreExportZipBuffer = async (): Promise => { +export const buildFullDarwinCoreExportZipBuffer = async (occurrenceKeys?: DwcOccurrenceKey[]): Promise => { + const occurrences = occurrenceKeys ? await fetchOccurrencesForDwcDataPackageExport(occurrenceKeys) : undefined + const filteredOccurrenceKeys = occurrences?.map(occurrence => ({ + lid: occurrence.lid, + speciesId: occurrence.species_id, + })) + const speciesIds = occurrences ? 
[...new Set(occurrences.map(occurrence => occurrence.species_id))] : undefined const [dwcDataPackageZipBuffer, dwcTaxonArchiveZipBuffer] = await Promise.all([ - buildDwcDataPackageZipBuffer(), - buildDwcArchiveZipBuffer(), + buildDwcDataPackageZipBuffer(filteredOccurrenceKeys), + buildDwcArchiveZipBuffer(speciesIds), ]) return await buildFullDarwinCoreExportZipBufferFromArchives({ diff --git a/backend/src/services/utils/dwcCsv.ts b/backend/src/services/utils/dwcCsv.ts index d6c8a060..18ac8a52 100644 --- a/backend/src/services/utils/dwcCsv.ts +++ b/backend/src/services/utils/dwcCsv.ts @@ -24,7 +24,10 @@ export const toDwcCsvString = (value: unknown): string => { return '' } -export const dwcCsvCell = (value: unknown): string => `"${toDwcCsvString(value).replace(/"/g, '""')}"` +export const dwcCsvCell = (value: unknown): string => { + const csvValue = toDwcCsvString(value) + return `"${(csvValue === '' ? '\\N' : csvValue).replace(/"/g, '""')}"` +} export const dwcCsvLine = (headers: readonly string[], row: Record): string => `${headers.map(header => dwcCsvCell(row[header])).join(',')}\n` diff --git a/backend/src/unit-tests/dwcCsv.test.ts b/backend/src/unit-tests/dwcCsv.test.ts index 4d5a185d..95e98589 100644 --- a/backend/src/unit-tests/dwcCsv.test.ts +++ b/backend/src/unit-tests/dwcCsv.test.ts @@ -7,6 +7,12 @@ describe('DwC CSV writer', () => { expect(dwcCsvCell('first\n"second"')).toEqual('"first ""second"""') }) + it('serializes missing values as a backslash-N marker', () => { + expect(dwcCsvCell('')).toEqual('"\\N"') + expect(dwcCsvCell(null)).toEqual('"\\N"') + expect(dwcCsvCell(undefined)).toEqual('"\\N"') + }) + it('writes one physical line per row even when source values contain line breaks', () => { const csv = writeDwcCsvString( ['id', 'remarks'], diff --git a/backend/src/unit-tests/dwcDataPackageExport.test.ts b/backend/src/unit-tests/dwcDataPackageExport.test.ts index b1b05fd2..28e24922 100644 --- a/backend/src/unit-tests/dwcDataPackageExport.test.ts +++ 
b/backend/src/unit-tests/dwcDataPackageExport.test.ts @@ -143,6 +143,7 @@ describe('DwC-DP export mapping', () => { eventID: 'NOW:EVENT:42', taxonID: 'NOW:21052', scientificName: 'Simplomys simplicidens Test Author', + taxonRank: 'species', identificationVerificationStatus: 'confirmed', }) ) @@ -186,8 +187,20 @@ describe('DwC-DP export mapping', () => { expect(eventCsv.trimEnd().split('\n')).toHaveLength(2) const dataPackageJson = JSON.parse(await zip.file(DWC_DP_TABLES.dataPackage)!.async('string')) as { - resources: Array<{ name: string; schema: { foreignKeys?: unknown[] } }> + name: string + title: string + version: string + licenses: Array<{ name: string }> + resources: Array<{ + name: string + mediatype?: string + schema: { foreignKeys?: unknown[]; missingValues?: string[] } + }> } + expect(dataPackageJson.name).toBe('now-darwincore-export') + expect(dataPackageJson.title).toBe('NOW database Darwin Core export') + expect(dataPackageJson.version).toBe('1.0.0') + expect(dataPackageJson.licenses).toEqual(expect.arrayContaining([expect.objectContaining({ name: 'CC-BY-4.0' })])) expect(dataPackageJson.resources.map(resource => resource.name)).toEqual([ 'event', 'geological-context', @@ -195,9 +208,12 @@ describe('DwC-DP export mapping', () => { 'event-assertion', 'occurrence-assertion', ]) + expect(dataPackageJson.resources.every(resource => resource.mediatype === 'text/csv')).toBe(true) + expect(dataPackageJson.resources.every(resource => resource.schema.missingValues?.includes('\\N'))).toBe(true) expect(dataPackageJson.resources.find(resource => resource.name === 'occurrence')?.schema.foreignKeys).toEqual([ { fields: 'eventID', + predicate: 'happened during', reference: { resource: 'event', fields: 'eventID' }, }, ]) diff --git a/frontend/src/components/Locality/LocalityDwcExportMenuItem.tsx b/frontend/src/components/Locality/LocalityDwcExportMenuItem.tsx index b05490ee..dccd18be 100644 --- a/frontend/src/components/Locality/LocalityDwcExportMenuItem.tsx +++ 
b/frontend/src/components/Locality/LocalityDwcExportMenuItem.tsx @@ -6,18 +6,29 @@ import { useUser } from '@/hooks/user' import { Role } from '@/shared/types' import { currentDateAsString } from '@/shared/currentDateAsString' import { downloadExportFileWithProgress } from '@/util/exportProgress' +import { usePageContext } from '../Page' export const LocalityDwcExportMenuItem = ({ handleClose }: { handleClose: () => void }) => { const [loading, setLoading] = useState(false) const { notify, setMessage: setNotificationMessage } = useNotify() const user = useUser() + const { idList } = usePageContext() if (user.role !== Role.Admin) { return null } - const fetchOptions = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} - const filename = `now_dwc_localities_test_export_${currentDateAsString()}.zip` + const fetchOptions: RequestInit = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} + const filteredFetchOptions: RequestInit = { + ...fetchOptions, + method: 'POST', + headers: { + ...(fetchOptions.headers as Record | undefined), + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ ids: idList }), + } + const filename = `now_dwc_localities_export_${currentDateAsString()}.zip` const fetchZipFile = async () => { setLoading(true) @@ -26,7 +37,7 @@ export const LocalityDwcExportMenuItem = ({ handleClose }: { handleClose: () => await downloadExportFileWithProgress({ url: `${BACKEND_URL}/locality/export/dwc-archive`, filename, - fetchOptions, + fetchOptions: filteredFetchOptions, notify, setNotificationMessage, startMessage: 'Generating DwC-A locality ZIP export...', diff --git a/frontend/src/components/Occurrence/OccurrenceDwcDpExportMenuItem.tsx b/frontend/src/components/Occurrence/OccurrenceDwcDpExportMenuItem.tsx index 1d240e51..cead96e5 100644 --- a/frontend/src/components/Occurrence/OccurrenceDwcDpExportMenuItem.tsx +++ b/frontend/src/components/Occurrence/OccurrenceDwcDpExportMenuItem.tsx @@ -6,18 +6,29 @@ 
import { useUser } from '@/hooks/user' import { Role } from '@/shared/types' import { currentDateAsString } from '@/shared/currentDateAsString' import { downloadExportFileWithProgress } from '@/util/exportProgress' +import { usePageContext } from '../Page' export const OccurrenceDwcDpExportMenuItem = ({ handleClose }: { handleClose: () => void }) => { const [loading, setLoading] = useState(false) const { notify, setMessage: setNotificationMessage } = useNotify() const user = useUser() + const { sqlColumnFilters, sqlOrderBy } = usePageContext() if (user.role !== Role.Admin) { return null } - const fetchOptions = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} - const filename = `now_dwc_dp_test_export_${currentDateAsString()}.zip` + const fetchOptions: RequestInit = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} + const filteredFetchOptions: RequestInit = { + ...fetchOptions, + method: 'POST', + headers: { + ...(fetchOptions.headers as Record | undefined), + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ columnFilters: sqlColumnFilters, sorting: sqlOrderBy }), + } + const filename = `now_dwc_dp_export_${currentDateAsString()}.zip` const fetchZipFile = async () => { setLoading(true) @@ -26,7 +37,7 @@ export const OccurrenceDwcDpExportMenuItem = ({ handleClose }: { handleClose: () await downloadExportFileWithProgress({ url: `${BACKEND_URL}/occurrence/export/dwc-data-package`, filename, - fetchOptions, + fetchOptions: filteredFetchOptions, notify, setNotificationMessage, startMessage: 'Generating DwC-DP ZIP export...', diff --git a/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx b/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx index ee429b8e..c74228c6 100644 --- a/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx +++ b/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx @@ -6,18 +6,29 @@ import { useUser } from '@/hooks/user' 
import { Role } from '@/shared/types' import { currentDateAsString } from '@/shared/currentDateAsString' import { createExportId, downloadExportFileWithProgress } from '@/util/exportProgress' +import { usePageContext } from '../Page' export const OccurrenceDwcExportMenuItem = ({ handleClose }: { handleClose: () => void }) => { const [loading, setLoading] = useState(false) const { notify, setMessage: setNotificationMessage } = useNotify() const user = useUser() + const { sqlColumnFilters, sqlOrderBy } = usePageContext() if (user.role !== Role.Admin) { return null } - const fetchOptions = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} - const filename = `now_dwc_occurrences_test_export_${currentDateAsString()}.zip` + const fetchOptions: RequestInit = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} + const filteredFetchOptions: RequestInit = { + ...fetchOptions, + method: 'POST', + headers: { + ...(fetchOptions.headers as Record<string, string> | undefined), + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ columnFilters: sqlColumnFilters, sorting: sqlOrderBy }), + } + const filename = `now_dwc_occurrences_export_${currentDateAsString()}.zip` const fetchZipFile = async () => { setLoading(true) @@ -27,7 +38,7 @@ export const OccurrenceDwcExportMenuItem = ({ handleClose }: { handleClose: () = url: `${BACKEND_URL}/occurrence/export/dwc-archive?${new URLSearchParams({ exportId })}`, progressUrl: `${BACKEND_URL}/occurrence/export/dwc-archive/progress/${exportId}`, filename, - fetchOptions, + fetchOptions: filteredFetchOptions, notify, setNotificationMessage, startMessage: 'Generating DwC-A occurrence ZIP export...', diff --git a/frontend/src/components/Occurrence/OccurrenceFullDarwinCoreExportMenuItem.tsx b/frontend/src/components/Occurrence/OccurrenceFullDarwinCoreExportMenuItem.tsx index 127d3eb1..ea57d159 100644 --- a/frontend/src/components/Occurrence/OccurrenceFullDarwinCoreExportMenuItem.tsx +++ 
b/frontend/src/components/Occurrence/OccurrenceFullDarwinCoreExportMenuItem.tsx @@ -6,18 +6,29 @@ import { useUser } from '@/hooks/user' import { Role } from '@/shared/types' import { currentDateAsString } from '@/shared/currentDateAsString' import { downloadExportFileWithProgress } from '@/util/exportProgress' +import { usePageContext } from '../Page' export const OccurrenceFullDarwinCoreExportMenuItem = ({ handleClose }: { handleClose: () => void }) => { const [loading, setLoading] = useState(false) const { notify, setMessage: setNotificationMessage } = useNotify() const user = useUser() + const { sqlColumnFilters, sqlOrderBy } = usePageContext() if (user.role !== Role.Admin) { return null } - const fetchOptions = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} - const filename = `now_dwc_full_test_export_${currentDateAsString()}.zip` + const fetchOptions: RequestInit = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} + const filteredFetchOptions: RequestInit = { + ...fetchOptions, + method: 'POST', + headers: { + ...(fetchOptions.headers as Record<string, string> | undefined), + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ columnFilters: sqlColumnFilters, sorting: sqlOrderBy }), + } + const filename = `now_dwc_full_export_${currentDateAsString()}.zip` const fetchZipFile = async () => { setLoading(true) @@ -26,7 +37,7 @@ export const OccurrenceFullDarwinCoreExportMenuItem = ({ handleClose }: { handle await downloadExportFileWithProgress({ url: `${BACKEND_URL}/occurrence/export/dwc-full-package`, filename, - fetchOptions, + fetchOptions: filteredFetchOptions, notify, setNotificationMessage, startMessage: 'Generating full Darwin Core ZIP export...', diff --git a/frontend/src/components/Species/SpeciesDwcExportMenuItem.tsx b/frontend/src/components/Species/SpeciesDwcExportMenuItem.tsx index 4031c80e..7f06b33d 100644 --- a/frontend/src/components/Species/SpeciesDwcExportMenuItem.tsx +++ 
b/frontend/src/components/Species/SpeciesDwcExportMenuItem.tsx @@ -6,18 +6,29 @@ import { useUser } from '@/hooks/user' import { Role } from '@/shared/types' import { currentDateAsString } from '@/shared/currentDateAsString' import { downloadExportFileWithProgress } from '@/util/exportProgress' +import { usePageContext } from '../Page' export const SpeciesDwcExportMenuItem = ({ handleClose }: { handleClose: () => void }) => { const [loading, setLoading] = useState(false) const { notify, setMessage: setNotificationMessage } = useNotify() const user = useUser() + const { idList } = usePageContext() if (user.role !== Role.Admin) { return null } - const fetchOptions = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} - const filename = `now_dwc_test_export_${currentDateAsString()}.zip` + const fetchOptions: RequestInit = user.token ? { headers: { Authorization: `Bearer ${user.token}` } } : {} + const filteredFetchOptions: RequestInit = { + ...fetchOptions, + method: 'POST', + headers: { + ...(fetchOptions.headers as Record<string, string> | undefined), + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ ids: idList }), + } + const filename = `now_dwc_export_${currentDateAsString()}.zip` const fetchZipFile = async () => { setLoading(true) @@ -26,7 +37,7 @@ export const SpeciesDwcExportMenuItem = ({ handleClose }: { handleClose: () => v await downloadExportFileWithProgress({ url: `${BACKEND_URL}/species/export/dwc-archive`, filename, - fetchOptions, + fetchOptions: filteredFetchOptions, notify, setNotificationMessage, startMessage: 'Generating DwC-A taxon ZIP export...', From 7c4199d819768db9e50fba530aae3881a9c2e15f Mon Sep 17 00:00:00 2001 From: karilint Date: Thu, 11 Jun 2026 17:54:50 +0300 Subject: [PATCH 2/8] Fix Darwin Core export filename tests --- .../locality/dwcArchiveExportLocalities.test.ts | 2 +- .../occurrence/dwcArchiveExportOccurrences.test.ts | 6 +++--- .../src/api-tests/species/dwcArchiveExport.test.ts | 2 +- 
backend/src/services/dwcArchiveExportLocalities.ts | 12 ++++++------ backend/src/services/dwcArchiveExportOccurrences.ts | 12 ++++++------ 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts b/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts index b030f734..ba77ea4f 100644 --- a/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts +++ b/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts @@ -42,7 +42,7 @@ describe('DwC-A locality export (admin-only)', () => { expect(result.status).toEqual(200) expect(result.headers['content-type']).toMatch(/application\/zip/i) - expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_localities_test_export_/i) + expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_localities_export_/i) const zip = await JSZip.loadAsync(result.body as unknown as Buffer) expect(zip.file('location.csv')).toBeTruthy() diff --git a/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts b/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts index a6340e2b..b8ae061e 100644 --- a/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts +++ b/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts @@ -42,7 +42,7 @@ describe('DwC-A occurrence export (admin-only)', () => { expect(result.status).toEqual(200) expect(result.headers['content-type']).toMatch(/application\/zip/i) - expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_occurrences_test_export_/i) + expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_occurrences_export_/i) const zip = await JSZip.loadAsync(result.body as unknown as Buffer) expect(zip.file('location.csv')).toBeTruthy() @@ -80,7 +80,7 @@ describe('DwC-A occurrence export (admin-only)', () => { expect(result.status).toEqual(200) 
expect(result.headers['content-type']).toMatch(/application\/zip/i) - expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_dp_test_export_/i) + expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_dp_export_/i) const zip = await JSZip.loadAsync(result.body as unknown as Buffer) expect(zip.file('datapackage.json')).toBeTruthy() @@ -110,7 +110,7 @@ describe('DwC-A occurrence export (admin-only)', () => { expect(result.status).toEqual(200) expect(result.headers['content-type']).toMatch(/application\/zip/i) - expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_full_test_export_/i) + expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_full_export_/i) const zip = await JSZip.loadAsync(result.body as unknown as Buffer) expect(zip.file('README.txt')).toBeTruthy() diff --git a/backend/src/api-tests/species/dwcArchiveExport.test.ts b/backend/src/api-tests/species/dwcArchiveExport.test.ts index 271a0371..e832ca26 100644 --- a/backend/src/api-tests/species/dwcArchiveExport.test.ts +++ b/backend/src/api-tests/species/dwcArchiveExport.test.ts @@ -42,7 +42,7 @@ describe('DwC-A species export (admin-only)', () => { expect(result.status).toEqual(200) expect(result.headers['content-type']).toMatch(/application\/zip/i) - expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_test_export_/i) + expect(result.headers['content-disposition']).toMatch(/attachment;\s*filename="now_dwc_export_/i) const zip = await JSZip.loadAsync(result.body as unknown as Buffer) expect(zip.file('taxon.csv')).toBeTruthy() diff --git a/backend/src/services/dwcArchiveExportLocalities.ts b/backend/src/services/dwcArchiveExportLocalities.ts index b4188538..83595e36 100644 --- a/backend/src/services/dwcArchiveExportLocalities.ts +++ b/backend/src/services/dwcArchiveExportLocalities.ts @@ -1291,29 +1291,29 @@ export const buildLocalityEmlXml = 
(publicationDateIso: string): string => { - NOW database Darwin Core test export (localities) + NOW database Darwin Core export (localities) - NOW database + The NOW Community - NOW database + The NOW Community ${publicationDateIso} - Admin-only test Darwin Core Archive export for localities, mapping Location + GeologicalContext + MeasurementOrFact terms. Field mappings are intentionally limited for v1. + Production Darwin Core Archive export for NOW locality records, mapping Location, GeologicalContext, and MeasurementOrFact terms. Field mappings are intentionally limited for v1. - TODO(#1150): Add rights / license information. + This dataset is made available under the Creative Commons Attribution 4.0 International License (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/. diff --git a/backend/src/services/dwcArchiveExportOccurrences.ts b/backend/src/services/dwcArchiveExportOccurrences.ts index 4fd293e0..08d47345 100644 --- a/backend/src/services/dwcArchiveExportOccurrences.ts +++ b/backend/src/services/dwcArchiveExportOccurrences.ts @@ -514,29 +514,29 @@ export const buildOccurrenceEmlXml = (publicationDateIso: string): string => { - NOW database Darwin Core test export (occurrences) + NOW database Darwin Core export (occurrences) - NOW database + The NOW Community - NOW database + The NOW Community ${publicationDateIso} - Admin-only test Darwin Core Archive export for occurrence records from now_ls. Location and taxon lookup files are included with the same structures as the locality and taxon exports. + Production Darwin Core Archive export for occurrence records from the NOW database. Location and taxon lookup files are included with the same structures as the locality and taxon exports. - TODO(#1150): Add rights / license information. + This dataset is made available under the Creative Commons Attribution 4.0 International License (CC BY 4.0): https://creativecommons.org/licenses/by/4.0/. 
From bbc11e44b9d22b1aec66d0676475d87875bf6e89 Mon Sep 17 00:00:00 2001 From: karilint Date: Mon, 15 Jun 2026 14:09:24 +0300 Subject: [PATCH 3/8] Address Darwin Core export review comments --- backend/src/routes/locality.ts | 13 +------ backend/src/routes/occurrence.ts | 2 +- backend/src/routes/species.ts | 13 +------ backend/src/routes/utils/exportFilters.ts | 11 ++++++ backend/src/services/crossSearch.ts | 10 +++-- backend/src/services/dwcDataPackageExport.ts | 39 +++++++++++++++---- backend/src/unit-tests/exportFilters.test.ts | 21 ++++++++++ .../OccurrenceDwcExportMenuItem.tsx | 1 + frontend/src/util/exportProgress.ts | 4 +- 9 files changed, 79 insertions(+), 35 deletions(-) create mode 100644 backend/src/routes/utils/exportFilters.ts create mode 100644 backend/src/unit-tests/exportFilters.test.ts diff --git a/backend/src/routes/locality.ts b/backend/src/routes/locality.ts index dc0c041b..4c18c65e 100644 --- a/backend/src/routes/locality.ts +++ b/backend/src/routes/locality.ts @@ -12,19 +12,10 @@ import { AccessError, requireOneOf } from '../middlewares/authorizer' import { deleteLocality, writeLocality } from '../services/write/locality' import { buildDwcLocalityArchiveZipBuffer } from '../services/dwcArchiveExportLocalities' import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsString' +import { parseNumericIds } from './utils/exportFilters' const router = Router() -const parseNumericIds = (value: unknown): number[] | undefined => { - if (value === undefined) return undefined - if (!Array.isArray(value)) throw new Error('ids must be an array.') - return value.map(id => { - const parsed = typeof id === 'number' ? id : typeof id === 'string' ? 
parseInt(id, 10) : NaN - if (!Number.isInteger(parsed)) throw new Error('ids must contain only integers.') - return parsed - }) -} - router.get('/all', async (req, res) => { const localities = await getAllLocalities(req.user) return res.status(200).send(fixBigInt(localities)) @@ -45,7 +36,7 @@ router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) try { return await sendDwcArchive(parseNumericIds((req.body as { ids?: unknown }).ids), res) } catch (error) { - return res.status(403).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) + return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) } }) diff --git a/backend/src/routes/occurrence.ts b/backend/src/routes/occurrence.ts index abadcfc4..c511535b 100644 --- a/backend/src/routes/occurrence.ts +++ b/backend/src/routes/occurrence.ts @@ -45,7 +45,7 @@ const resolveOccurrenceKeysForExport = async (req: Request): Promise { - return res.status(403).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) + return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' 
}) } router.get('/export/dwc-archive/progress/:exportId', requireOneOf([Role.Admin]), (req, res) => { diff --git a/backend/src/routes/species.ts b/backend/src/routes/species.ts index d9c1e3c5..369967ac 100644 --- a/backend/src/routes/species.ts +++ b/backend/src/routes/species.ts @@ -6,19 +6,10 @@ import { deleteSpecies, writeSpecies } from '../services/write/species' import { requireOneOf } from '../middlewares/authorizer' import { buildDwcArchiveZipBuffer } from '../services/dwcArchiveExport' import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsString' +import { parseNumericIds } from './utils/exportFilters' const router = Router() -const parseNumericIds = (value: unknown): number[] | undefined => { - if (value === undefined) return undefined - if (!Array.isArray(value)) throw new Error('ids must be an array.') - return value.map(id => { - const parsed = typeof id === 'number' ? id : typeof id === 'string' ? parseInt(id, 10) : NaN - if (!Number.isInteger(parsed)) throw new Error('ids must contain only integers.') - return parsed - }) -} - router.get('/all', async (_req, res) => { const species = await getAllSpecies() return res.status(200).send(fixBigInt(species)) @@ -44,7 +35,7 @@ router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) try { return await sendDwcArchive(parseNumericIds((req.body as { ids?: unknown }).ids), res) } catch (error) { - return res.status(403).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) + return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' 
}) } }) diff --git a/backend/src/routes/utils/exportFilters.ts b/backend/src/routes/utils/exportFilters.ts new file mode 100644 index 00000000..70a482d5 --- /dev/null +++ b/backend/src/routes/utils/exportFilters.ts @@ -0,0 +1,11 @@ +export const parseNumericIds = (value: unknown): number[] | undefined => { + if (value === undefined) return undefined + if (!Array.isArray(value)) throw new Error('ids must be an array.') + + return value.map(id => { + const parsed = typeof id === 'number' ? id : typeof id === 'string' && /^-?\d+$/.test(id) ? Number(id) : Number.NaN + + if (!Number.isSafeInteger(parsed)) throw new Error('ids must contain only integers.') + return parsed + }) +} diff --git a/backend/src/services/crossSearch.ts b/backend/src/services/crossSearch.ts index 84997bc1..e6977ad7 100644 --- a/backend/src/services/crossSearch.ts +++ b/backend/src/services/crossSearch.ts @@ -268,10 +268,12 @@ export const getFilteredCrossSearchOccurrenceKeys = async ( )) as Array>> const keysById = new Map() - for (const row of resultPages.flat()) { - if (typeof row.lid_now_loc !== 'number' || typeof row.species_id_com_species !== 'number') continue - const key = { lid: row.lid_now_loc, speciesId: row.species_id_com_species } - keysById.set(`${key.lid}:${key.speciesId}`, key) + for (const page of resultPages) { + for (const row of page) { + if (typeof row.lid_now_loc !== 'number' || typeof row.species_id_com_species !== 'number') continue + const key = { lid: row.lid_now_loc, speciesId: row.species_id_com_species } + keysById.set(`${key.lid}:${key.speciesId}`, key) + } } return { occurrenceKeys: [...keysById.values()] } diff --git a/backend/src/services/dwcDataPackageExport.ts b/backend/src/services/dwcDataPackageExport.ts index 4ddcc01e..24bb7d57 100644 --- a/backend/src/services/dwcDataPackageExport.ts +++ b/backend/src/services/dwcDataPackageExport.ts @@ -38,6 +38,20 @@ const occurrenceIdForRow = (lid: number, speciesId: number): string => `NOW:OCC: type LocalityForDwcDpExport 
= Parameters[0] type OccurrenceForDwcDpExport = Parameters[0] +const LOOKUP_EXPORT_CHUNK_SIZE = 1000 + +const chunk = <T>(values: T[], size: number): T[][] => { + const chunks: T[][] = [] + for (let index = 0; index < values.length; index += size) { + chunks.push(values.slice(index, index + size)) + } + return chunks +} + +const sortOccurrenceKeys = (occurrenceKeys: DwcOccurrenceKey[]): DwcOccurrenceKey[] => { + return [...occurrenceKeys].sort((a, b) => a.lid - b.lid || a.speciesId - b.speciesId) +} + export const DWC_DP_EVENT_HEADERS = [ 'eventID', 'parentEventID', @@ -334,7 +348,7 @@ const FIELD_DESCRIPTIONS: Record = { taxonID: 'Stable NOW taxon identifier; this joins to dwc-a-taxa/taxon.csv in the full export.', scientificName: 'Scientific name assembled from curated NOW taxonomic fields.', scientificNameAuthorship: 'Scientific name authorship where curated.', - taxonRank: 'Taxonomic rank when available; currently reserved for future enrichment.', + taxonRank: 'Taxonomic rank derived from curated NOW taxonomic fields when available.', identificationVerificationStatus: 'Curated identification status or qualifier.', assertionID: 'Stable assertion identifier derived from the source database field and owning event or occurrence.', verbatimAssertionType: @@ -913,15 +927,26 @@ const fetchOccurrencesForDwcDataPackageExport = async ( ): Promise => { if (occurrenceKeys && occurrenceKeys.length === 0) return [] const { nowDb } = await import('../utils/db') - const occurrences = await nowDb.now_ls.findMany({ - where: occurrenceKeys - ? 
{ - OR: occurrenceKeys.map(key => ({ + + if (occurrenceKeys) { + const occurrences: OccurrenceForDwcDpExport[] = [] + for (const keys of chunk(sortOccurrenceKeys(occurrenceKeys), LOOKUP_EXPORT_CHUNK_SIZE)) { + const chunkOccurrences = await nowDb.now_ls.findMany({ + where: { + OR: keys.map(key => ({ lid: key.lid, species_id: key.speciesId, })), - } - : undefined, + }, + orderBy: [{ lid: 'asc' }, { species_id: 'asc' }], + select: occurrenceSelect, + }) + occurrences.push(...(chunkOccurrences as unknown as OccurrenceForDwcDpExport[])) + } + return occurrences + } + + const occurrences = await nowDb.now_ls.findMany({ orderBy: [{ lid: 'asc' }, { species_id: 'asc' }], select: occurrenceSelect, }) diff --git a/backend/src/unit-tests/exportFilters.test.ts b/backend/src/unit-tests/exportFilters.test.ts new file mode 100644 index 00000000..7c250567 --- /dev/null +++ b/backend/src/unit-tests/exportFilters.test.ts @@ -0,0 +1,21 @@ +import { describe, expect, it } from '@jest/globals' +import { parseNumericIds } from '../routes/utils/exportFilters' + +describe('parseNumericIds', () => { + it('accepts integer ids as numbers or strings', () => { + expect(parseNumericIds([12, '34'])).toEqual([12, 34]) + }) + + it('rejects partially numeric strings', () => { + expect(() => parseNumericIds(['12abc'])).toThrow('ids must contain only integers.') + }) + + it('rejects decimals', () => { + expect(() => parseNumericIds([12.3])).toThrow('ids must contain only integers.') + expect(() => parseNumericIds(['12.3'])).toThrow('ids must contain only integers.') + }) + + it('rejects non-array id payloads', () => { + expect(() => parseNumericIds('12')).toThrow('ids must be an array.') + }) +}) diff --git a/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx b/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx index c74228c6..4ecef9a9 100644 --- a/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx +++ 
b/frontend/src/components/Occurrence/OccurrenceDwcExportMenuItem.tsx @@ -39,6 +39,7 @@ export const OccurrenceDwcExportMenuItem = ({ handleClose }: { handleClose: () = progressUrl: `${BACKEND_URL}/occurrence/export/dwc-archive/progress/${exportId}`, filename, fetchOptions: filteredFetchOptions, + progressFetchOptions: fetchOptions, notify, setNotificationMessage, startMessage: 'Generating DwC-A occurrence ZIP export...', diff --git a/frontend/src/util/exportProgress.ts b/frontend/src/util/exportProgress.ts index 89648894..7a10aa33 100644 --- a/frontend/src/util/exportProgress.ts +++ b/frontend/src/util/exportProgress.ts @@ -9,6 +9,7 @@ type DownloadExportParams = { url: string filename: string fetchOptions?: RequestInit + progressFetchOptions?: RequestInit notify: Notify setNotificationMessage: (message: string) => void startMessage: string @@ -30,6 +31,7 @@ export const downloadExportFileWithProgress = async ({ url, filename, fetchOptions = {}, + progressFetchOptions, notify, setNotificationMessage, startMessage, @@ -53,7 +55,7 @@ export const downloadExportFileWithProgress = async ({ if (!progressUrl) return false try { - const response = await fetch(progressUrl, fetchOptions) + const response = await fetch(progressUrl, progressFetchOptions ?? 
fetchOptions) if (!response.ok) return false const progress = (await response.json()) as ExportProgress From 0c4e44f947fb3dd6498064345c6632c1e153f623 Mon Sep 17 00:00:00 2001 From: karilint Date: Mon, 15 Jun 2026 14:33:45 +0300 Subject: [PATCH 4/8] Address follow-up Darwin Core review comments --- .../dwcArchiveExportOccurrences.test.ts | 51 +++++++++++++++++++ backend/src/routes/occurrence.ts | 17 ++++++- backend/src/services/dwcArchiveExport.ts | 19 +++---- backend/src/services/dwcDataPackageExport.ts | 21 ++++---- backend/src/services/dwcMetadata.ts | 8 +++ 5 files changed, 95 insertions(+), 21 deletions(-) create mode 100644 backend/src/services/dwcMetadata.ts diff --git a/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts b/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts index b8ae061e..7c1e1d96 100644 --- a/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts +++ b/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts @@ -62,6 +62,57 @@ describe('DwC-A occurrence export (admin-only)', () => { expect(measurementCsv).toContain('"verbatimMeasurementType"') }) + it('returns a filtered ZIP archive for POST requests', async () => { + const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/occurrence/export/dwc-archive') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ columnFilters: [{ id: 'lid_now_loc', value: '21050' }], sorting: [] }) + .buffer(true) + .parse(parseBinary) + + expect(result.status).toEqual(200) + expect(result.headers['content-type']).toMatch(/application\/zip/i) + + const zip = await JSZip.loadAsync(result.body as unknown as Buffer) + const occurrenceCsv = await zip.file('occurrence.csv')!.async('string') + expect(occurrenceCsv).toContain('NOW:OCC:21050:') + expect(occurrenceCsv).not.toContain('NOW:OCC:24750:') 
+ }) + + it('returns an empty filtered DwC-DP ZIP archive for POST requests', async () => { + const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/occurrence/export/dwc-data-package') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ columnFilters: [{ id: 'lid_now_loc', value: '9999999' }], sorting: [] }) + .buffer(true) + .parse(parseBinary) + + expect(result.status).toEqual(200) + const zip = await JSZip.loadAsync(result.body as unknown as Buffer) + const occurrenceCsv = await zip.file('occurrence.csv')!.async('string') + expect(occurrenceCsv).toContain('"occurrenceID"') + expect(occurrenceCsv).not.toContain('NOW:OCC:') + }) + + it('returns structured validation errors for invalid POST filters', async () => { + const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/occurrence/export/dwc-data-package') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ columnFilters: [{ id: '', value: '21050' }], sorting: [] }) + + expect(result.status).toEqual(400) + expect(result.body).toEqual([{ name: 'Column Filters', error: 'Invalid or missing id field in filter' }]) + }) + it('rejects non-admin requests', async () => { const result = await request(app).get('/occurrence/export/dwc-archive') expect(result.status).toEqual(403) diff --git a/backend/src/routes/occurrence.ts b/backend/src/routes/occurrence.ts index c511535b..7ae0309c 100644 --- a/backend/src/routes/occurrence.ts +++ b/backend/src/routes/occurrence.ts @@ -12,6 +12,7 @@ import { buildDwcDataPackageZipBuffer, buildFullDarwinCoreExportZipBuffer } from import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsString' import { logger } from '../utils/logger' import { 
getFilteredCrossSearchOccurrenceKeys, type CrossSearchRequestParameters } from '../services/crossSearch' +import type { ValidationObject } from '../../../frontend/src/shared/validators/validator' const router = Router() @@ -31,6 +32,15 @@ const defaultCrossSearchExportFilters = { sorting: [], } satisfies CrossSearchRequestParameters +class ExportFilterValidationError extends Error { + validationErrors: ValidationObject[] + + constructor(validationErrors: ValidationObject[]) { + super('Invalid export filters.') + this.validationErrors = validationErrors + } +} + const resolveOccurrenceKeysForExport = async (req: Request): Promise => { if (req.method === 'GET') return undefined const body = req.body as Partial | undefined @@ -38,13 +48,16 @@ const resolveOccurrenceKeysForExport = async (req: Request): Promise { + if (error instanceof ExportFilterValidationError) { + return res.status(400).send(error.validationErrors) + } return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' 
}) } diff --git a/backend/src/services/dwcArchiveExport.ts b/backend/src/services/dwcArchiveExport.ts index d9f33c47..d72ff948 100644 --- a/backend/src/services/dwcArchiveExport.ts +++ b/backend/src/services/dwcArchiveExport.ts @@ -2,15 +2,16 @@ import Prisma from '../../prisma/generated/now_test_client' import JSZip from 'jszip' import { toDwcCsvString, writeDwcCsvString } from './utils/dwcCsv' import { getFieldInfoText } from '../../../frontend/src/shared/fieldInfo' - -const DATASET_TITLE = 'NOW database Darwin Core export' -const DATASET_NAME = 'now-darwincore-export' -const DATASET_VERSION = '1.0.0' -const DATASET_DOI = 'https://doi.org/10.5281/zenodo.4268068' -const DATASET_LICENSE_URL = 'https://creativecommons.org/licenses/by/4.0/' -const DATASET_LICENSE_TITLE = 'Creative Commons Attribution 4.0 International' -const DATASET_CREATOR = 'The NOW Community' -const MISSING_VALUE = '\\N' +import { + DATASET_CREATOR, + DATASET_DOI, + DATASET_LICENSE_TITLE, + DATASET_LICENSE_URL, + DATASET_NAME, + DATASET_TITLE, + DATASET_VERSION, + MISSING_VALUE, +} from './dwcMetadata' const isMeaningfulString = (value: unknown): value is string => { if (typeof value !== 'string') return false diff --git a/backend/src/services/dwcDataPackageExport.ts b/backend/src/services/dwcDataPackageExport.ts index 24bb7d57..ff6228f0 100644 --- a/backend/src/services/dwcDataPackageExport.ts +++ b/backend/src/services/dwcDataPackageExport.ts @@ -13,6 +13,16 @@ import { } from './dwcArchiveExportOccurrences' import { buildDwcArchiveZipBuffer, resolveTaxonRank, type MeasurementCsvRow } from './dwcArchiveExport' import { writeDwcCsvString } from './utils/dwcCsv' +import { + DATASET_CREATOR, + DATASET_DOI, + DATASET_LICENSE_TITLE, + DATASET_LICENSE_URL, + DATASET_NAME, + DATASET_TITLE, + DATASET_VERSION, + MISSING_VALUE, +} from './dwcMetadata' const isMeaningfulString = (value: unknown): value is string => { if (typeof value !== 'string') return false @@ -132,14 +142,6 @@ const ASSERTION_HEADERS 
= [ const DWC_DP_EVENT_ASSERTION_HEADERS = ['eventID', ...ASSERTION_HEADERS] as const const DWC_DP_OCCURRENCE_ASSERTION_HEADERS = ['occurrenceID', ...ASSERTION_HEADERS] as const -const DATASET_TITLE = 'NOW database Darwin Core export' -const DATASET_NAME = 'now-darwincore-export' -const DATASET_VERSION = '1.0.0' -const DATASET_DOI = 'https://doi.org/10.5281/zenodo.4268068' -const DATASET_LICENSE_URL = 'https://creativecommons.org/licenses/by/4.0/' -const DATASET_LICENSE_TITLE = 'Creative Commons Attribution 4.0 International' -const DATASET_CREATOR = 'The NOW Community' -const MISSING_VALUE = '\\N' type AssertionHeader = (typeof ASSERTION_HEADERS)[number] type AssertionColumns = Record @@ -474,8 +476,7 @@ export const buildDwcDataPackageJson = (publicationDateIso: string): string => { 'occurrence data', 'taxon traits', ], - citation: - 'The NOW Community. NOW database Darwin Core export, version 1.0.0. https://doi.org/10.5281/zenodo.4268068. The DOI describes the NOW database generally rather than a single frozen export version; include the export date when citing a downloaded archive.', + citation: `${DATASET_CREATOR}. ${DATASET_TITLE}, version ${DATASET_VERSION}. ${DATASET_DOI}. The DOI describes the NOW database generally rather than a single frozen export version; include the export date when citing a downloaded archive.`, description: 'Production Darwin Core Data Package export from the NOW database for relational event, occurrence, geological context, and assertion data. 
The NOW database is a continuously curated, globally scoped fossil mammal database with Cenozoic emphasis, spanning approximately the last 66 million years.', missingValues: [MISSING_VALUE], diff --git a/backend/src/services/dwcMetadata.ts b/backend/src/services/dwcMetadata.ts new file mode 100644 index 00000000..d79596bc --- /dev/null +++ b/backend/src/services/dwcMetadata.ts @@ -0,0 +1,8 @@ +export const DATASET_TITLE = 'NOW database Darwin Core export' +export const DATASET_NAME = 'now-darwincore-export' +export const DATASET_VERSION = '1.0.0' +export const DATASET_DOI = 'https://doi.org/10.5281/zenodo.4268068' +export const DATASET_LICENSE_URL = 'https://creativecommons.org/licenses/by/4.0/' +export const DATASET_LICENSE_TITLE = 'Creative Commons Attribution 4.0 International' +export const DATASET_CREATOR = 'The NOW Community' +export const MISSING_VALUE = '\\N' From bf464fe579ce6e56ead5a58532aa19031268ce50 Mon Sep 17 00:00:00 2001 From: karilint Date: Mon, 15 Jun 2026 15:11:49 +0300 Subject: [PATCH 5/8] Address filtered export review comments --- .../dwcArchiveExportLocalities.test.ts | 33 +++++++++++++++++++ .../species/dwcArchiveExport.test.ts | 33 +++++++++++++++++++ backend/src/routes/locality.ts | 4 ++- backend/src/routes/species.ts | 4 ++- backend/src/services/dwcDataPackageExport.ts | 19 ++++++----- 5 files changed, 83 insertions(+), 10 deletions(-) diff --git a/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts b/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts index ba77ea4f..ea60894b 100644 --- a/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts +++ b/backend/src/api-tests/locality/dwcArchiveExportLocalities.test.ts @@ -57,6 +57,39 @@ describe('DwC-A locality export (admin-only)', () => { expect(measurementCsv).toContain('"verbatimMeasurementType"') }) + it('returns a filtered ZIP archive for POST requests', async () => { + const loginResult = await send<{ token: string }>('user/login', 'POST', { 
username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/locality/export/dwc-archive') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ ids: [21050] }) + .buffer(true) + .parse(parseBinary) + + expect(result.status).toEqual(200) + expect(result.headers['content-type']).toMatch(/application\/zip/i) + + const zip = await JSZip.loadAsync(result.body as unknown as Buffer) + const locationCsv = await zip.file('location.csv')!.async('string') + expect(locationCsv).toContain('NOW:LOC:21050') + expect(locationCsv).not.toContain('NOW:LOC:24750') + }) + + it('returns 400 for invalid POST id payloads', async () => { + const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/locality/export/dwc-archive') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ ids: ['21050.5'] }) + + expect(result.status).toEqual(400) + expect(result.body).toEqual({ error: 'ids must contain only integers.' 
}) + }) + it('rejects non-admin requests', async () => { const result = await request(app).get('/locality/export/dwc-archive') expect(result.status).toEqual(403) diff --git a/backend/src/api-tests/species/dwcArchiveExport.test.ts b/backend/src/api-tests/species/dwcArchiveExport.test.ts index e832ca26..22dfda6d 100644 --- a/backend/src/api-tests/species/dwcArchiveExport.test.ts +++ b/backend/src/api-tests/species/dwcArchiveExport.test.ts @@ -66,6 +66,39 @@ describe('DwC-A species export (admin-only)', () => { expect(metaXml).toContain(' { + const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/species/export/dwc-archive') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ ids: [85729] }) + .buffer(true) + .parse(parseBinary) + + expect(result.status).toEqual(200) + expect(result.headers['content-type']).toMatch(/application\/zip/i) + + const zip = await JSZip.loadAsync(result.body as unknown as Buffer) + const taxonCsv = await zip.file('taxon.csv')!.async('string') + expect(taxonCsv).toContain('NOW:85729') + expect(taxonCsv).not.toContain('NOW:85730') + }) + + it('returns 400 for invalid POST id payloads', async () => { + const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/species/export/dwc-archive') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ ids: ['85729abc'] }) + + expect(result.status).toEqual(400) + expect(result.body).toEqual({ error: 'ids must contain only integers.' 
}) + }) + it('rejects non-admin requests', async () => { const result = await request(app).get('/species/export/dwc-archive') expect(result.status).toEqual(403) diff --git a/backend/src/routes/locality.ts b/backend/src/routes/locality.ts index 4c18c65e..ad6612ef 100644 --- a/backend/src/routes/locality.ts +++ b/backend/src/routes/locality.ts @@ -34,7 +34,9 @@ router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) => { try { - return await sendDwcArchive(parseNumericIds((req.body as { ids?: unknown }).ids), res) + const body = req.body as { ids?: unknown } | undefined + if (!body || !('ids' in body)) throw new Error('ids must be an array.') + return await sendDwcArchive(parseNumericIds(body.ids), res) } catch (error) { return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) } diff --git a/backend/src/routes/species.ts b/backend/src/routes/species.ts index 369967ac..0fa0a485 100644 --- a/backend/src/routes/species.ts +++ b/backend/src/routes/species.ts @@ -33,7 +33,9 @@ router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) => { try { - return await sendDwcArchive(parseNumericIds((req.body as { ids?: unknown }).ids), res) + const body = req.body as { ids?: unknown } | undefined + if (!body || !('ids' in body)) throw new Error('ids must be an array.') + return await sendDwcArchive(parseNumericIds(body.ids), res) } catch (error) { return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' 
}) } diff --git a/backend/src/services/dwcDataPackageExport.ts b/backend/src/services/dwcDataPackageExport.ts index ff6228f0..a4551d68 100644 --- a/backend/src/services/dwcDataPackageExport.ts +++ b/backend/src/services/dwcDataPackageExport.ts @@ -956,11 +956,18 @@ const fetchOccurrencesForDwcDataPackageExport = async ( } export const buildDwcDataPackageZipBuffer = async (occurrenceKeys?: DwcOccurrenceKey[]): Promise => { - const { nowDb } = await import('../utils/db') const occurrences = await fetchOccurrencesForDwcDataPackageExport(occurrenceKeys) - const localityIds = occurrenceKeys ? [...new Set(occurrences.map(occurrence => occurrence.lid))] : undefined + return await buildDwcDataPackageZipBufferFromOccurrences(occurrences, Boolean(occurrenceKeys)) +} + +const buildDwcDataPackageZipBufferFromOccurrences = async ( + occurrences: OccurrenceForDwcDpExport[], + isFilteredExport: boolean +): Promise => { + const { nowDb } = await import('../utils/db') + const localityIds = isFilteredExport ? [...new Set(occurrences.map(occurrence => occurrence.lid))] : undefined const localities = (await nowDb.now_loc.findMany({ - where: localityIds ? { lid: { in: localityIds } } : undefined, + where: isFilteredExport ? { lid: { in: localityIds } } : undefined, orderBy: { lid: 'asc' }, select: localitySelect, })) as unknown as LocalityForDwcDpExport[] @@ -1129,13 +1136,9 @@ export const buildFullDarwinCoreExportZipBufferFromArchives = async ({ export const buildFullDarwinCoreExportZipBuffer = async (occurrenceKeys?: DwcOccurrenceKey[]): Promise => { const occurrences = occurrenceKeys ? await fetchOccurrencesForDwcDataPackageExport(occurrenceKeys) : undefined - const filteredOccurrenceKeys = occurrences?.map(occurrence => ({ - lid: occurrence.lid, - speciesId: occurrence.species_id, - })) const speciesIds = occurrences ? 
[...new Set(occurrences.map(occurrence => occurrence.species_id))] : undefined const [dwcDataPackageZipBuffer, dwcTaxonArchiveZipBuffer] = await Promise.all([ - buildDwcDataPackageZipBuffer(filteredOccurrenceKeys), + occurrences ? buildDwcDataPackageZipBufferFromOccurrences(occurrences, true) : buildDwcDataPackageZipBuffer(), buildDwcArchiveZipBuffer(speciesIds), ]) From d57a8b3a6b56bd1918debd10d0a2520d631c9370 Mon Sep 17 00:00:00 2001 From: karilint Date: Mon, 15 Jun 2026 15:25:24 +0300 Subject: [PATCH 6/8] Skip filtered lookup for empty occurrence exports --- .../dwcArchiveExportOccurrences.test.ts | 18 ++++++++++++++++++ backend/src/routes/occurrence.ts | 7 +++++++ 2 files changed, 25 insertions(+) diff --git a/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts b/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts index 7c1e1d96..1c63c6c8 100644 --- a/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts +++ b/backend/src/api-tests/occurrence/dwcArchiveExportOccurrences.test.ts @@ -82,6 +82,24 @@ describe('DwC-A occurrence export (admin-only)', () => { expect(occurrenceCsv).not.toContain('NOW:OCC:24750:') }) + it('uses the unfiltered export path for empty POST filters', async () => { + const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) + expect(loginResult.status).toEqual(200) + + const result = await request(app) + .post('/occurrence/export/dwc-archive') + .set('authorization', `bearer ${loginResult.body.token}`) + .send({ columnFilters: [], sorting: [] }) + .buffer(true) + .parse(parseBinary) + + expect(result.status).toEqual(200) + const zip = await JSZip.loadAsync(result.body as unknown as Buffer) + const occurrenceCsv = await zip.file('occurrence.csv')!.async('string') + expect(occurrenceCsv).toContain('NOW:OCC:21050:') + expect(occurrenceCsv).toContain('NOW:OCC:24750:') + }) + it('returns an empty filtered DwC-DP ZIP archive for POST requests', 
async () => { const loginResult = await send<{ token: string }>('user/login', 'POST', { username: 'testSu', password: 'test' }) expect(loginResult.status).toEqual(200) diff --git a/backend/src/routes/occurrence.ts b/backend/src/routes/occurrence.ts index 7ae0309c..8d0de763 100644 --- a/backend/src/routes/occurrence.ts +++ b/backend/src/routes/occurrence.ts @@ -32,6 +32,12 @@ const defaultCrossSearchExportFilters = { sorting: [], } satisfies CrossSearchRequestParameters +const hasEmptyExportFilters = (parameters: Partial): boolean => { + const columnFilters = parameters.columnFilters ?? defaultCrossSearchExportFilters.columnFilters + const sorting = parameters.sorting ?? defaultCrossSearchExportFilters.sorting + return Array.isArray(columnFilters) && columnFilters.length === 0 && Array.isArray(sorting) && sorting.length === 0 +} + class ExportFilterValidationError extends Error { validationErrors: ValidationObject[] @@ -44,6 +50,7 @@ class ExportFilterValidationError extends Error { const resolveOccurrenceKeysForExport = async (req: Request): Promise => { if (req.method === 'GET') return undefined const body = req.body as Partial | undefined + if (!body || hasEmptyExportFilters(body)) return undefined const result = await getFilteredCrossSearchOccurrenceKeys(req.user, { columnFilters: body?.columnFilters ?? defaultCrossSearchExportFilters.columnFilters, sorting: body?.sorting ?? 
defaultCrossSearchExportFilters.sorting, From 2b970e920b74c00e97519543264e693833cb89ed Mon Sep 17 00:00:00 2001 From: karilint Date: Mon, 15 Jun 2026 15:54:44 +0300 Subject: [PATCH 7/8] Narrow Darwin Core export validation catches --- backend/src/routes/locality.ts | 8 ++++---- backend/src/routes/occurrence.ts | 8 ++++++-- backend/src/routes/species.ts | 8 ++++---- backend/src/routes/utils/exportFilters.ts | 6 ++++++ backend/src/services/utils/dwcCsv.ts | 3 ++- 5 files changed, 22 insertions(+), 11 deletions(-) diff --git a/backend/src/routes/locality.ts b/backend/src/routes/locality.ts index ad6612ef..a1f77da9 100644 --- a/backend/src/routes/locality.ts +++ b/backend/src/routes/locality.ts @@ -12,7 +12,7 @@ import { AccessError, requireOneOf } from '../middlewares/authorizer' import { deleteLocality, writeLocality } from '../services/write/locality' import { buildDwcLocalityArchiveZipBuffer } from '../services/dwcArchiveExportLocalities' import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsString' -import { parseNumericIds } from './utils/exportFilters' +import { parseRequiredNumericIdsBody } from './utils/exportFilters' const router = Router() @@ -33,13 +33,13 @@ router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) }) router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) => { + let ids: number[] try { - const body = req.body as { ids?: unknown } | undefined - if (!body || !('ids' in body)) throw new Error('ids must be an array.') - return await sendDwcArchive(parseNumericIds(body.ids), res) + ids = parseRequiredNumericIdsBody(req.body) } catch (error) { return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' 
}) } + return sendDwcArchive(ids, res) }) router.get('/:id', async (req, res) => { diff --git a/backend/src/routes/occurrence.ts b/backend/src/routes/occurrence.ts index 8d0de763..af85c69c 100644 --- a/backend/src/routes/occurrence.ts +++ b/backend/src/routes/occurrence.ts @@ -136,11 +136,13 @@ router.get('/export/dwc-data-package', requireOneOf([Role.Admin]), async (_req, }) router.post('/export/dwc-data-package', requireOneOf([Role.Admin]), async (req, res) => { + let occurrenceKeys: DwcOccurrenceKey[] | undefined try { - return await sendDwcDataPackage(await resolveOccurrenceKeysForExport(req), res) + occurrenceKeys = await resolveOccurrenceKeysForExport(req) } catch (error) { return handleExportFilterError(error, res) } + return sendDwcDataPackage(occurrenceKeys, res) }) const sendFullDarwinCorePackage = async (occurrenceKeys: DwcOccurrenceKey[] | undefined, res: Response) => { @@ -155,11 +157,13 @@ router.get('/export/dwc-full-package', requireOneOf([Role.Admin]), async (_req, }) router.post('/export/dwc-full-package', requireOneOf([Role.Admin]), async (req, res) => { + let occurrenceKeys: DwcOccurrenceKey[] | undefined try { - return await sendFullDarwinCorePackage(await resolveOccurrenceKeysForExport(req), res) + occurrenceKeys = await resolveOccurrenceKeysForExport(req) } catch (error) { return handleExportFilterError(error, res) } + return sendFullDarwinCorePackage(occurrenceKeys, res) }) router.get('/:lid/:speciesId', getOccurrenceDetail) diff --git a/backend/src/routes/species.ts b/backend/src/routes/species.ts index 0fa0a485..e86ca17c 100644 --- a/backend/src/routes/species.ts +++ b/backend/src/routes/species.ts @@ -6,7 +6,7 @@ import { deleteSpecies, writeSpecies } from '../services/write/species' import { requireOneOf } from '../middlewares/authorizer' import { buildDwcArchiveZipBuffer } from '../services/dwcArchiveExport' import { currentDateAsString } from '../../../frontend/src/shared/currentDateAsString' -import { parseNumericIds } from 
'./utils/exportFilters' +import { parseRequiredNumericIdsBody } from './utils/exportFilters' const router = Router() @@ -32,13 +32,13 @@ router.get('/export/dwc-archive', requireOneOf([Role.Admin]), async (_req, res) }) router.post('/export/dwc-archive', requireOneOf([Role.Admin]), async (req, res) => { + let ids: number[] try { - const body = req.body as { ids?: unknown } | undefined - if (!body || !('ids' in body)) throw new Error('ids must be an array.') - return await sendDwcArchive(parseNumericIds(body.ids), res) + ids = parseRequiredNumericIdsBody(req.body) } catch (error) { return res.status(400).send({ error: error instanceof Error ? error.message : 'Invalid export filters.' }) } + return sendDwcArchive(ids, res) }) router.get('/:id', async (req, res) => { diff --git a/backend/src/routes/utils/exportFilters.ts b/backend/src/routes/utils/exportFilters.ts index 70a482d5..f55360d5 100644 --- a/backend/src/routes/utils/exportFilters.ts +++ b/backend/src/routes/utils/exportFilters.ts @@ -9,3 +9,9 @@ export const parseNumericIds = (value: unknown): number[] | undefined => { return parsed }) } + +export const parseRequiredNumericIdsBody = (body: unknown): number[] => { + const exportBody = body as { ids?: unknown } | undefined + if (!exportBody || !('ids' in exportBody)) throw new Error('ids must be an array.') + return parseNumericIds(exportBody.ids) ?? 
[] +} diff --git a/backend/src/services/utils/dwcCsv.ts b/backend/src/services/utils/dwcCsv.ts index 18ac8a52..787997c0 100644 --- a/backend/src/services/utils/dwcCsv.ts +++ b/backend/src/services/utils/dwcCsv.ts @@ -1,5 +1,6 @@ import { createWriteStream } from 'fs' import { once } from 'events' +import { MISSING_VALUE } from '../dwcMetadata' export const normalizeDwcCsvValue = (value: string): string => value.replace(/\r\n|\r|\n/g, ' ').replace(/[ \t]+/g, ' ') @@ -26,7 +27,7 @@ export const toDwcCsvString = (value: unknown): string => { export const dwcCsvCell = (value: unknown): string => { const csvValue = toDwcCsvString(value) - return `"${(csvValue === '' ? '\\N' : csvValue).replace(/"/g, '""')}"` + return `"${(csvValue === '' ? MISSING_VALUE : csvValue).replace(/"/g, '""')}"` } export const dwcCsvLine = (headers: readonly string[], row: Record): string => From bae0593b5a1cdd38277a8c34e14dd082021711cc Mon Sep 17 00:00:00 2001 From: karilint Date: Mon, 15 Jun 2026 16:06:22 +0300 Subject: [PATCH 8/8] Page filtered occurrence key export lookup --- backend/src/services/crossSearch.ts | 27 ++++++++++++------- backend/src/services/dwcDataPackageExport.ts | 5 +++- .../unit-tests/dwcDataPackageExport.test.ts | 12 ++++++++- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/backend/src/services/crossSearch.ts b/backend/src/services/crossSearch.ts index e6977ad7..c0fc3e18 100644 --- a/backend/src/services/crossSearch.ts +++ b/backend/src/services/crossSearch.ts @@ -13,6 +13,8 @@ import { generateCrossSearchLocalitiesSql, generateCrossSearchSql } from './quer import { ValidationObject } from '../../../frontend/src/shared/validators/validator' import { validateCrossSearchRouteParams } from '../../../frontend/src/shared/validators/crossSearch' +const CROSS_SEARCH_EXPORT_PAGE_SIZE = 20000 + export type CrossSearchRequestParameters = { limit?: string | number offset?: string | number @@ -122,7 +124,7 @@ export const getCrossSearchRawSql = async ( if (!limit) { 
// this is only ran when exporting the cross-search table - limit = 20000 + limit = CROSS_SEARCH_EXPORT_PAGE_SIZE offset = 0 const results = [] while (true) { @@ -259,21 +261,26 @@ export const getFilteredCrossSearchOccurrenceKeys = async ( return { validationErrors } } - const resultPages = (await getCrossSearchRawSql( - user, - undefined, - undefined, - validatedColumnFilters, - validatedSorting - )) as Array>> - const keysById = new Map() - for (const page of resultPages) { + let offset = 0 + while (true) { + const page = (await getCrossSearchRawSql( + user, + CROSS_SEARCH_EXPORT_PAGE_SIZE, + offset, + validatedColumnFilters, + validatedSorting + )) as Array> + if (page.length === 0) break + for (const row of page) { if (typeof row.lid_now_loc !== 'number' || typeof row.species_id_com_species !== 'number') continue const key = { lid: row.lid_now_loc, speciesId: row.species_id_com_species } keysById.set(`${key.lid}:${key.speciesId}`, key) } + + if (page.length < CROSS_SEARCH_EXPORT_PAGE_SIZE) break + offset += CROSS_SEARCH_EXPORT_PAGE_SIZE } return { occurrenceKeys: [...keysById.values()] } diff --git a/backend/src/services/dwcDataPackageExport.ts b/backend/src/services/dwcDataPackageExport.ts index a4551d68..376fdcad 100644 --- a/backend/src/services/dwcDataPackageExport.ts +++ b/backend/src/services/dwcDataPackageExport.ts @@ -415,9 +415,12 @@ const DWC_TERM_IRIS: Record = { identificationVerificationStatus: 'http://rs.tdwg.org/dwc/terms/identificationVerificationStatus', } +const fieldTitle = (name: string): string => + name.replace(/([a-z0-9])([A-Z])/g, '$1 $2').replace(/^./, first => first.toUpperCase()) + const field = (name: string, type = 'string') => ({ name, - title: name.replace(/([A-Z])/g, ' $1').replace(/^./, first => first.toUpperCase()), + title: fieldTitle(name), description: FIELD_DESCRIPTIONS[name] ?? 
`Curated NOW database value for ${name}.`, type, format: 'default', diff --git a/backend/src/unit-tests/dwcDataPackageExport.test.ts b/backend/src/unit-tests/dwcDataPackageExport.test.ts index 28e24922..e5ccac02 100644 --- a/backend/src/unit-tests/dwcDataPackageExport.test.ts +++ b/backend/src/unit-tests/dwcDataPackageExport.test.ts @@ -194,7 +194,7 @@ describe('DwC-DP export mapping', () => { resources: Array<{ name: string mediatype?: string - schema: { foreignKeys?: unknown[]; missingValues?: string[] } + schema: { fields: Array<{ name: string; title: string }>; foreignKeys?: unknown[]; missingValues?: string[] } }> } expect(dataPackageJson.name).toBe('now-darwincore-export') @@ -210,6 +210,16 @@ describe('DwC-DP export mapping', () => { ]) expect(dataPackageJson.resources.every(resource => resource.mediatype === 'text/csv')).toBe(true) expect(dataPackageJson.resources.every(resource => resource.schema.missingValues?.includes('\\N'))).toBe(true) + expect( + dataPackageJson.resources + .find(resource => resource.name === 'event') + ?.schema.fields.find(field => field.name === 'eventID')?.title + ).toBe('Event ID') + expect( + dataPackageJson.resources + .find(resource => resource.name === 'event-assertion') + ?.schema.fields.find(field => field.name === 'assertionTypeIRI')?.title + ).toBe('Assertion Type IRI') expect(dataPackageJson.resources.find(resource => resource.name === 'occurrence')?.schema.foreignKeys).toEqual([ { fields: 'eventID',