fix(db-postgres): ensure countDistinct works correctly and achieve better performance when the query has table joins (#11208)

The fix, added in https://github.com/payloadcms/payload/pull/11096
wasn't sufficient enough. It did handle the case when the same query
path / table was joined twice and caused incorrect `totalDocs`, but it
didn't handle the case when `JOIN` returns more than 1 rows, which 2
added new assertions here check.

Now, we use `COUNT(*)` only if we don't have any joined tables. If we
do, instead of using `SELECT (COUNT DISTINCT id)` which as described in
the previous PR is _very slow_ for large tables, we use the following
query:

```sql
SELECT COUNT(1) OVER() as count -- window function, executes for each row only once
FROM users
LEFT JOIN -- ... here additional rows are added
WHERE -- ...
GROUP BY users.id -- this ensures we're counting only users without additional rows from joins. 
LIMIT 1 -- Since COUNT(1) OVER() executes and resolves before doing LIMIT, we can safely apply LIMIT 1.
```
This commit is contained in:
Sasha
2025-02-16 14:08:08 +02:00
committed by GitHub
parent 2ae670e0e4
commit 513ba636af
5 changed files with 152 additions and 29 deletions

View File

@@ -9,32 +9,40 @@ export const countDistinct: CountDistinct = async function countDistinct(
this: SQLiteAdapter,
{ db, joins, tableName, where },
) {
// When we don't have any joins - use a simple COUNT(*) query.
if (joins.length === 0) {
const countResult = await db
.select({
count: count(),
})
.from(this.tables[tableName])
.where(where)
return Number(countResult[0].count)
}
const chainedMethods: ChainedMethods = []
// COUNT(DISTINCT id) is slow on large tables, so we only use DISTINCT if we have to
const visitedPaths = new Set([])
let useDistinct = false
joins.forEach(({ condition, queryPath, table }) => {
if (!useDistinct && queryPath) {
if (visitedPaths.has(queryPath)) {
useDistinct = true
} else {
visitedPaths.add(queryPath)
}
}
joins.forEach(({ condition, table }) => {
chainedMethods.push({
args: [table, condition],
method: 'leftJoin',
})
})
// When we have any joins, we need to count each individual ID only once.
// COUNT(*) doesn't work for this well in this case, as it also counts joined tables.
// SELECT (COUNT DISTINCT id) has a very slow performance on large tables.
// Instead, COUNT (GROUP BY id) can be used which is still slower than COUNT(*) but acceptable.
const countResult = await chainMethods({
methods: chainedMethods,
query: db
.select({
count: useDistinct ? sql`COUNT(DISTINCT ${this.tables[tableName].id})` : count(),
count: sql`COUNT(1) OVER()`,
})
.from(this.tables[tableName])
.where(where),
.where(where)
.groupBy(this.tables[tableName].id)
.limit(1),
})
return Number(countResult[0].count)

View File

@@ -1,6 +1,6 @@
import { count, sql } from 'drizzle-orm'
import type { ChainedMethods, TransactionPg } from '../types.js'
import type { ChainedMethods } from '../types.js'
import type { BasePostgresAdapter, CountDistinct } from './types.js'
import { chainMethods } from '../find/chainMethods.js'
@@ -9,33 +9,40 @@ export const countDistinct: CountDistinct = async function countDistinct(
this: BasePostgresAdapter,
{ db, joins, tableName, where },
) {
// When we don't have any joins - use a simple COUNT(*) query.
if (joins.length === 0) {
const countResult = await db
.select({
count: count(),
})
.from(this.tables[tableName])
.where(where)
return Number(countResult[0].count)
}
const chainedMethods: ChainedMethods = []
// COUNT(DISTINCT id) is slow on large tables, so we only use DISTINCT if we have to
const visitedPaths = new Set([])
let useDistinct = false
joins.forEach(({ condition, queryPath, table }) => {
if (!useDistinct && queryPath) {
if (visitedPaths.has(queryPath)) {
useDistinct = true
} else {
visitedPaths.add(queryPath)
}
}
joins.forEach(({ condition, table }) => {
chainedMethods.push({
args: [table, condition],
method: 'leftJoin',
})
})
// When we have any joins, we need to count each individual ID only once.
// COUNT(*) doesn't work for this well in this case, as it also counts joined tables.
// SELECT (COUNT DISTINCT id) has a very slow performance on large tables.
// Instead, COUNT (GROUP BY id) can be used which is still slower than COUNT(*) but acceptable.
const countResult = await chainMethods({
methods: chainedMethods,
query: (db as TransactionPg)
query: db
.select({
count: useDistinct ? sql`COUNT(DISTINCT ${this.tables[tableName].id})` : count(),
count: sql`COUNT(1) OVER()`,
})
.from(this.tables[tableName])
.where(where),
.where(where)
.groupBy(this.tables[tableName].id)
.limit(1),
})
return Number(countResult[0].count)