fix(db-postgres): ensure countDistinct works correctly and achieve better performance when the query has table joins (#11208)

The fix, added in https://github.com/payloadcms/payload/pull/11096 wasn't sufficient enough. It did handle the case when the same query path / table was joined twice and caused incorrect `totalDocs`, but it didn't handle the case when `JOIN` returns more than 1 rows, which 2 added new assertions here check. Now, we use `COUNT(*)` only if we don't have any joined tables. If we do, instead of using `SELECT (COUNT DISTINCT id)` which as described in the previous PR is _very slow_ for large tables, we use the following query: ```sql SELECT COUNT(1) OVER() as count -- window function, executes for each row only once FROM users LEFT JOIN -- ... here additional rows are added WHERE -- ... GROUP BY users.id -- this ensures we're counting only users without additional rows from joins. LIMIT 1 -- Since COUNT(1) OVER() executes and resolves before doing LIMIT, we can safely apply LIMIT 1. ```
2025-02-16 14:08:08 +02:00
parent 2ae670e0e4
commit 513ba636af
5 changed files with 152 additions and 29 deletions
--- a/packages/db-sqlite/src/countDistinct.ts
+++ b/packages/db-sqlite/src/countDistinct.ts
@@ -9,32 +9,40 @@ export const countDistinct: CountDistinct = async function countDistinct(
  this: SQLiteAdapter,
  { db, joins, tableName, where },
 ) {
+  // When we don't have any joins - use a simple COUNT(*) query.
+  if (joins.length === 0) {
+    const countResult = await db
+      .select({
+        count: count(),
+      })
+      .from(this.tables[tableName])
+      .where(where)
+    return Number(countResult[0].count)
+  }
+
  const chainedMethods: ChainedMethods = []

-  // COUNT(DISTINCT id)  is slow on large tables, so we only use DISTINCT if we have to
-  const visitedPaths = new Set([])
-  let useDistinct = false
-  joins.forEach(({ condition, queryPath, table }) => {
-    if (!useDistinct && queryPath) {
-      if (visitedPaths.has(queryPath)) {
-        useDistinct = true
-      } else {
-        visitedPaths.add(queryPath)
-      }
-    }
+  joins.forEach(({ condition, table }) => {
    chainedMethods.push({
      args: [table, condition],
      method: 'leftJoin',
    })
  })
+
+  // When we have any joins, we need to count each individual ID only once.
+  // COUNT(*) doesn't work for this well in this case, as it also counts joined tables.
+  // SELECT (COUNT DISTINCT id) has a very slow performance on large tables.
+  // Instead, COUNT (GROUP BY id) can be used which is still slower than COUNT(*) but acceptable.
  const countResult = await chainMethods({
    methods: chainedMethods,
    query: db
      .select({
-        count: useDistinct ? sql`COUNT(DISTINCT ${this.tables[tableName].id})` : count(),
+        count: sql`COUNT(1) OVER()`,
      })
      .from(this.tables[tableName])
-      .where(where),
+      .where(where)
+      .groupBy(this.tables[tableName].id)
+      .limit(1),
  })

  return Number(countResult[0].count)
--- a/packages/drizzle/src/postgres/countDistinct.ts
+++ b/packages/drizzle/src/postgres/countDistinct.ts
@@ -1,6 +1,6 @@
 import { count, sql } from 'drizzle-orm'

-import type { ChainedMethods, TransactionPg } from '../types.js'
+import type { ChainedMethods } from '../types.js'
 import type { BasePostgresAdapter, CountDistinct } from './types.js'

 import { chainMethods } from '../find/chainMethods.js'
@@ -9,33 +9,40 @@ export const countDistinct: CountDistinct = async function countDistinct(
  this: BasePostgresAdapter,
  { db, joins, tableName, where },
 ) {
+  // When we don't have any joins - use a simple COUNT(*) query.
+  if (joins.length === 0) {
+    const countResult = await db
+      .select({
+        count: count(),
+      })
+      .from(this.tables[tableName])
+      .where(where)
+    return Number(countResult[0].count)
+  }
+
  const chainedMethods: ChainedMethods = []

-  // COUNT(DISTINCT id)  is slow on large tables, so we only use DISTINCT if we have to
-  const visitedPaths = new Set([])
-  let useDistinct = false
-  joins.forEach(({ condition, queryPath, table }) => {
-    if (!useDistinct && queryPath) {
-      if (visitedPaths.has(queryPath)) {
-        useDistinct = true
-      } else {
-        visitedPaths.add(queryPath)
-      }
-    }
+  joins.forEach(({ condition, table }) => {
    chainedMethods.push({
      args: [table, condition],
      method: 'leftJoin',
    })
  })

+  // When we have any joins, we need to count each individual ID only once.
+  // COUNT(*) doesn't work for this well in this case, as it also counts joined tables.
+  // SELECT (COUNT DISTINCT id) has a very slow performance on large tables.
+  // Instead, COUNT (GROUP BY id) can be used which is still slower than COUNT(*) but acceptable.
  const countResult = await chainMethods({
    methods: chainedMethods,
-    query: (db as TransactionPg)
+    query: db
      .select({
-        count: useDistinct ? sql`COUNT(DISTINCT ${this.tables[tableName].id})` : count(),
+        count: sql`COUNT(1) OVER()`,
      })
      .from(this.tables[tableName])
-      .where(where),
+      .where(where)
+      .groupBy(this.tables[tableName].id)
+      .limit(1),
  })

  return Number(countResult[0].count)