fix(plugin-import-export): pre-scan columns before streaming CSV export (#13009)
### What?

Fixes an issue where only the fields from the first batch of documents were used to generate CSV column headers during streaming exports.

### Why?

Previously, columns were determined during the first streaming batch. If a field appeared only in later documents, it was omitted from the CSV entirely, leading to incomplete exports when fields were sparsely populated across the dataset.

### How?

- Adds a **pre-scan step** before streaming begins to collect all column keys across all pages
- Uses this superset of keys to define the final CSV header
- Ensures every row is padded to match the full column set

This matches the behavior of non-streamed exports and guarantees that the streamed CSV output includes all relevant fields, regardless of when they appear in pagination.
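Conceptually, the change turns the streamed export into a two-pass process. The sketch below illustrates that idea in isolation: it assumes a hypothetical `fetchPage(page)` pager returning `{ docs, hasNextPage }` and the synchronous `stringify` from `csv-stringify`; the function and parameter names are illustrative, not the plugin's actual code (see the diff below for that).

```ts
// Minimal sketch of the two-pass streamed export, independent of Payload internals.
// `fetchPage` is a hypothetical pager; `exportToCSV` is an illustrative name.
import { stringify } from 'csv-stringify/sync'

type Page = { docs: Record<string, unknown>[]; hasNextPage: boolean }

export async function exportToCSV(fetchPage: (page: number) => Promise<Page>): Promise<string> {
  // Pass 1: pre-scan every page to collect the full, ordered set of column keys.
  const columns: string[] = []
  const seen = new Set<string>()
  for (let page = 1, hasMore = true; hasMore; page += 1) {
    const result = await fetchPage(page)
    for (const doc of result.docs) {
      for (const key of Object.keys(doc)) {
        if (!seen.has(key)) {
          seen.add(key)
          columns.push(key)
        }
      }
    }
    hasMore = result.hasNextPage
  }

  // Pass 2: emit rows padded to the full column set, so sparse fields that
  // only appear in later pages still get a header and an (empty) cell.
  const chunks: string[] = []
  let isFirstBatch = true
  for (let page = 1, hasMore = true; hasMore; page += 1) {
    const result = await fetchPage(page)
    const paddedRows = result.docs.map((doc) => {
      const fullRow: Record<string, unknown> = {}
      for (const col of columns) {
        fullRow[col] = doc[col] ?? ''
      }
      return fullRow
    })
    chunks.push(stringify(paddedRows, { header: isFirstBatch, columns }))
    isFirstBatch = false
    hasMore = result.hasNextPage
  }

  return chunks.join('')
}
```

Note that the pre-scan pass fetches every page a second time; that is the cost of knowing the full header up front.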
```diff
@@ -111,23 +111,45 @@ export const createExport = async (args: CreateExportArgs) => {
 
   if (download) {
     if (debug) {
-      req.payload.logger.info('Starting download stream')
+      req.payload.logger.info('Pre-scanning all columns before streaming')
+    }
+
+    const allColumnsSet = new Set<string>()
+    const allColumns: string[] = []
+    let scanPage = 1
+    let hasMore = true
+
+    while (hasMore) {
+      const result = await payload.find({ ...findArgs, page: scanPage })
+
+      result.docs.forEach((doc) => {
+        const flat = flattenObject({ doc, fields, toCSVFunctions })
+        Object.keys(flat).forEach((key) => {
+          if (!allColumnsSet.has(key)) {
+            allColumnsSet.add(key)
+            allColumns.push(key)
+          }
+        })
+      })
+
+      hasMore = result.hasNextPage
+      scanPage += 1
+    }
+
+    if (debug) {
+      req.payload.logger.info(`Discovered ${allColumns.length} columns`)
     }
 
     const encoder = new TextEncoder()
     let isFirstBatch = true
-    let columns: string[] | undefined
-    let page = 1
+    let streamPage = 1
 
     const stream = new Readable({
       async read() {
-        const result = await payload.find({
-          ...findArgs,
-          page,
-        })
+        const result = await payload.find({ ...findArgs, page: streamPage })
 
         if (debug) {
-          req.payload.logger.info(`Processing batch ${page} with ${result.docs.length} documents`)
+          req.payload.logger.info(`Streaming batch ${streamPage} with ${result.docs.length} docs`)
         }
 
         if (result.docs.length === 0) {
@@ -135,19 +157,24 @@ export const createExport = async (args: CreateExportArgs) => {
           return
         }
 
-        const csvInput = result.docs.map((doc) => flattenObject({ doc, fields, toCSVFunctions }))
+        const batchRows = result.docs.map((doc) => flattenObject({ doc, fields, toCSVFunctions }))
 
-        if (isFirstBatch) {
-          columns = Object.keys(csvInput[0] ?? {})
-        }
+        const paddedRows = batchRows.map((row) => {
+          const fullRow: Record<string, unknown> = {}
+          for (const col of allColumns) {
+            fullRow[col] = row[col] ?? ''
+          }
+          return fullRow
+        })
 
-        const csvString = stringify(csvInput, {
+        const csvString = stringify(paddedRows, {
           header: isFirstBatch,
-          columns,
+          columns: allColumns,
         })
 
         this.push(encoder.encode(csvString))
         isFirstBatch = false
+        streamPage += 1
 
         if (!result.hasNextPage) {
           if (debug) {
@@ -155,8 +182,6 @@ export const createExport = async (args: CreateExportArgs) => {
           }
           this.push(null) // End the stream
         }
-
-        page += 1
       },
     })
 
@@ -168,11 +193,15 @@ export const createExport = async (args: CreateExportArgs) => {
     })
   }
 
+  // Non-download path (buffered export)
   if (debug) {
     req.payload.logger.info('Starting file generation')
   }
+
   const outputData: string[] = []
-  let isFirstBatch = true
+  const rows: Record<string, unknown>[] = []
+  const columnsSet = new Set<string>()
+  const columns: string[] = []
   let page = 1
   let hasNextPage = true
 
@@ -189,9 +218,19 @@ export const createExport = async (args: CreateExportArgs) => {
     }
 
     if (isCSV) {
-      const csvInput = result.docs.map((doc) => flattenObject({ doc, fields, toCSVFunctions }))
-      outputData.push(stringify(csvInput, { header: isFirstBatch }))
-      isFirstBatch = false
+      const batchRows = result.docs.map((doc) => flattenObject({ doc, fields, toCSVFunctions }))
+
+      // Track discovered column keys
+      batchRows.forEach((row) => {
+        Object.keys(row).forEach((key) => {
+          if (!columnsSet.has(key)) {
+            columnsSet.add(key)
+            columns.push(key)
+          }
+        })
+      })
+
+      rows.push(...batchRows)
     } else {
       const jsonInput = result.docs.map((doc) => JSON.stringify(doc))
      outputData.push(jsonInput.join(',\n'))
@@ -201,6 +240,23 @@ export const createExport = async (args: CreateExportArgs) => {
     page += 1
   }
 
+  if (isCSV) {
+    const paddedRows = rows.map((row) => {
+      const fullRow: Record<string, unknown> = {}
+      for (const col of columns) {
+        fullRow[col] = row[col] ?? ''
+      }
+      return fullRow
+    })
+
+    outputData.push(
+      stringify(paddedRows, {
+        header: true,
+        columns,
+      }),
+    )
+  }
+
   const buffer = Buffer.from(format === 'json' ? `[${outputData.join(',')}]` : outputData.join(''))
   if (debug) {
     req.payload.logger.info(`${format} file generation complete`)
```