-
Notifications
You must be signed in to change notification settings - Fork 727
Fix merging of big orgs #2694
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix merging of big orgs #2694
Changes from all commits
3d9e69b
108fd2b
27f5575
a222203
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| ALTER TABLE "memberSegmentsAgg" ADD COLUMN IF NOT EXISTS "createdAt" TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,14 @@ import { IMemberAbsoluteAggregates, IMemberSegmentAggregates } from './types' | |
|
|
||
| const log = getServiceChildLogger('organizations/segments') | ||
|
|
||
| export async function findLastSyncDate(qx: QueryExecutor, memberId: string): Promise<Date | null> { | ||
| const result = await qx.selectOneOrNone( | ||
| `SELECT MAX("createdAt") AS "lastSyncDate" FROM "memberSegmentsAgg" WHERE "memberId" = $(memberId)`, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Codebase verification Add index for memberId and createdAt columns to optimize the MAX aggregation query The
To optimize the query performance, you should add a composite index on ( 🔗 Analysis chainVerify database schema and indexing. Let's verify the existence of the Let me generate another verification script to check for indexes specifically on the 🏁 Scripts executedThe following scripts were executed for the analysis: Script: #!/bin/bash
# Description: Check schema and indexes for memberSegmentsAgg table
# Check if createdAt column exists and its definition
echo "Checking createdAt column definition:"
rg -A 5 '"memberSegmentsAgg".*CREATE TABLE|ALTER TABLE.*"memberSegmentsAgg".*ADD.*"createdAt"'
# Check for indexes on memberId and createdAt
echo "Checking indexes:"
rg -A 5 'CREATE.*INDEX.*ON.*"memberSegmentsAgg".*\("memberId"|"createdAt"\)'
Length of output: 10157 Script: #!/bin/bash
# Search for memberSegmentsAgg table creation and its indexes
echo "Checking memberSegmentsAgg table creation and indexes:"
rg -A 10 'CREATE TABLE.*"memberSegmentsAgg"|CREATE.*INDEX.*"memberSegmentsAgg"'
# Search for any migration file containing memberSegmentsAgg
echo "Checking migrations containing memberSegmentsAgg:"
fd -e sql . | xargs rg "memberSegmentsAgg"
Length of output: 2540 |
||
| { memberId }, | ||
| ) | ||
| return result?.lastSyncDate ? new Date(result.lastSyncDate) : null | ||
| } | ||
|
|
||
| export async function cleanupMemberAggregates(qx: QueryExecutor, memberId: string) { | ||
| return qx.result( | ||
| ` | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,13 +40,16 @@ export class SearchSyncApiClient { | |
| public async triggerOrganizationMembersSync( | ||
| tenantId: string, | ||
| organizationId: string, | ||
| onboarding?: boolean, | ||
| syncFrom: Date | null = null, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Codebase verification Update needed: The verification revealed inconsistencies in the usage of
Please update all call sites to match the new signature with the optional 🔗 Analysis chainLGTM! Verify the parameter usage across the codebase. The addition of the optional Let's verify the parameter usage across the codebase: 🏁 Scripts executedThe following scripts were executed for the analysis: Script: #!/bin/bash
# Description: Verify all calls to triggerOrganizationMembersSync to ensure they're updated correctly
# Search for method calls
ast-grep --pattern 'triggerOrganizationMembersSync($$$)'
# Search for the route handler to verify parameter handling
rg -A 5 "'/sync/organization/members'"
Length of output: 1581 Script: #!/bin/bash
# Let's search for actual calls to the method in TypeScript/JavaScript files
rg -t ts -t js "triggerOrganizationMembersSync" -B 2 -A 2
# Also search for any test files that might be using this method
fd -e test.ts -e spec.ts -X rg "triggerOrganizationMembersSync"
Length of output: 2357 |
||
| ): Promise<void> { | ||
| if (!organizationId) { | ||
| throw new Error('organizationId is required!') | ||
| } | ||
|
|
||
| await this.searchSyncApi.post('/sync/organization/members', { | ||
| organizationId, | ||
| syncFrom, | ||
| }) | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,7 +48,7 @@ export class MemberRepository extends RepositoryBase<MemberRepository> { | |
| select m.id | ||
| from members m | ||
| left join indexed_entities ie on m.id = ie.entity_id and ie.type = $(type) | ||
| where m."tenantId" = $(tenantId) and | ||
| where m."tenantId" = $(tenantId) and | ||
| ie.entity_id is null | ||
| limit ${perPage};`, | ||
| { | ||
|
|
@@ -64,29 +64,37 @@ export class MemberRepository extends RepositoryBase<MemberRepository> { | |
| organizationId: string, | ||
| perPage: number, | ||
| lastId?: string, | ||
| ): Promise<IMemberIdData[]> { | ||
| return await this.db().any( | ||
| syncFrom?: Date, | ||
| ): Promise<string[]> { | ||
| const rows = await this.db().any( | ||
| ` | ||
| select distinct mo."memberId", m."manuallyCreated" | ||
| from "memberOrganizations" mo | ||
| inner join members m on mo."memberId" = m.id | ||
| where mo."organizationId" = $(organizationId) and | ||
| mo."deletedAt" is null and | ||
| ${lastId !== undefined ? 'mo."memberId" > $(lastId) and' : ''} | ||
| m."deletedAt" is null and | ||
| SELECT | ||
| DISTINCT mo."memberId" | ||
| FROM "memberOrganizations" mo | ||
| INNER JOIN members m ON mo."memberId" = m.id | ||
| ${syncFrom !== undefined ? 'LEFT JOIN "memberSegmentsAgg" msa ON m.id = msa."memberId"' : ''} | ||
| WHERE mo."organizationId" = $(organizationId) AND | ||
| mo."deletedAt" is null AND | ||
| ${syncFrom !== undefined ? '(msa."createdAt" < $(syncFrom) OR msa."createdAt" IS NULL) AND' : ''} | ||
| ${lastId !== undefined ? 'mo."memberId" > $(lastId) AND' : ''} | ||
| m."deletedAt" is null AND | ||
| exists (select 1 from "memberIdentities" where "memberId" = mo."memberId") | ||
| order by mo."memberId" | ||
| limit ${perPage};`, | ||
| ORDER BY mo."memberId" | ||
| LIMIT ${perPage}; | ||
| `, | ||
|
Comment on lines
+69
to
+84
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Avoid SQL Injection Risks with Safe Query Construction The SQL query is conditionally constructed using template literals and string interpolation, which can introduce SQL injection risks if not handled carefully. Even though parameters are used, it's safer to use query builders or parameterized query methods to construct SQL queries dynamically. Consider refactoring the query to build the conditional clauses securely. Here's an example using parameterized conditions: const conditions = [
'mo."organizationId" = $(organizationId)',
'mo."deletedAt" IS NULL',
'm."deletedAt" IS NULL',
'EXISTS (SELECT 1 FROM "memberIdentities" WHERE "memberId" = mo."memberId")',
];
if (syncFrom !== undefined) {
conditions.push('(msa."createdAt" < $(syncFrom) OR msa."createdAt" IS NULL)');
}
if (lastId !== undefined) {
conditions.push('mo."memberId" > $(lastId)');
}
const query = `
SELECT DISTINCT mo."memberId"
FROM "memberOrganizations" mo
INNER JOIN members m ON mo."memberId" = m.id
${syncFrom !== undefined ? 'LEFT JOIN "memberSegmentsAgg" msa ON m.id = msa."memberId"' : ''}
WHERE ${conditions.join(' AND ')}
ORDER BY mo."memberId"
LIMIT ${perPage};
`;
const rows = await this.db().any(query, {
organizationId,
syncFrom,
lastId,
});This approach helps prevent SQL injection and enhances the readability and maintainability of your code. |
||
| { | ||
| organizationId, | ||
| lastId, | ||
| syncFrom, | ||
| }, | ||
| ) | ||
|
|
||
| return rows.map((r) => r.memberId) | ||
| } | ||
|
|
||
| public async getMemberData(memberId: string): Promise<IDbMemberSyncData[]> { | ||
| const results = await this.db().oneOrNone( | ||
| ` | ||
| ` | ||
| with to_merge_data as ( | ||
| select mtm."memberId", | ||
| array_agg(distinct mtm."toMergeId"::text) as to_merge_ids | ||
|
|
@@ -208,7 +216,7 @@ export class MemberRepository extends RepositoryBase<MemberRepository> { | |
| where mtk."memberId" = $(memberId) | ||
| and tk."deletedAt" is null | ||
| group by mtk."memberId") | ||
| select | ||
| select | ||
| m.id, | ||
| m."tenantId", | ||
| m."displayName", | ||
|
|
@@ -254,7 +262,7 @@ export class MemberRepository extends RepositoryBase<MemberRepository> { | |
| ` | ||
| select m.id as "memberId", m."manuallyCreated" | ||
| from members m | ||
| where m."tenantId" = $(tenantId ) and | ||
| where m."tenantId" = $(tenantId) and | ||
| m.id in ($(memberIds:csv)) and | ||
| exists(select 1 from "memberIdentities" mi where mi."memberId" = m.id) | ||
| `, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ import { getMemberAggregates } from '@crowd/data-access-layer/src/activities' | |
| import { | ||
| cleanupMemberAggregates, | ||
| fetchAbsoluteMemberAggregates, | ||
| findLastSyncDate, | ||
| insertMemberSegments, | ||
| } from '@crowd/data-access-layer/src/members/segments' | ||
| import { IMemberSegmentAggregates } from '@crowd/data-access-layer/src/members/types' | ||
|
|
@@ -309,33 +310,36 @@ export class MemberSyncService { | |
| ) | ||
| } | ||
|
|
||
| public async syncOrganizationMembers(organizationId: string, batchSize = 200): Promise<void> { | ||
| public async syncOrganizationMembers( | ||
| organizationId: string, | ||
| opts: { syncFrom: Date | null } = { syncFrom: null }, | ||
| ): Promise<void> { | ||
| this.log.debug({ organizationId }, 'Syncing all organization members!') | ||
| const batchSize = 500 | ||
| let docCount = 0 | ||
| let memberCount = 0 | ||
|
|
||
| const now = new Date() | ||
|
|
||
| const loadNextPage = async (lastId?: string): Promise<string[]> => { | ||
| this.log.info('Loading next page of organization members!', { organizationId, lastId }) | ||
| const memberIdData = await logExecutionTimeV2( | ||
| () => this.memberRepo.getOrganizationMembersForSync(organizationId, batchSize, lastId), | ||
| const memberIds = await logExecutionTimeV2( | ||
| () => | ||
| this.memberRepo.getOrganizationMembersForSync( | ||
| organizationId, | ||
| batchSize, | ||
| lastId, | ||
| opts.syncFrom, | ||
| ), | ||
|
Comment on lines
+313
to
+333
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Codebase verification Based on the search results, I can now generate the final response: Update required for search_sync_worker's syncOrganizationMembers call The
Note: The 🔗 Analysis chainVerify the propagation of syncFrom parameter The syncFrom parameter is correctly propagated to downstream calls. Let's verify there are no other callers that need to be updated. 🏁 Scripts executedThe following scripts were executed for the analysis: Script: #!/bin/bash
# Search for direct calls to getOrganizationMembersForSync to ensure all callers pass the syncFrom parameter
rg "getOrganizationMembersForSync" -A 5 -B 5
# Search for calls to syncOrganizationMembers to verify they're updated to use the new parameter
rg "syncOrganizationMembers" -A 5 -B 5
Length of output: 11134 |
||
| this.log, | ||
| `getOrganizationMembersForSync`, | ||
| ) | ||
|
|
||
| if (memberIdData.length === 0) { | ||
| if (memberIds.length === 0) { | ||
| return [] | ||
| } | ||
|
|
||
| const membersWithActivities = await filterMembersWithActivities( | ||
| this.qdbStore.connection(), | ||
| memberIdData.map((m) => m.memberId), | ||
| ) | ||
|
|
||
| return memberIdData | ||
| .filter((m) => m.manuallyCreated || membersWithActivities.includes(m.memberId)) | ||
| .map((m) => m.memberId) | ||
| return memberIds | ||
| } | ||
|
|
||
| let memberIds: string[] = await loadNextPage() | ||
|
|
@@ -344,7 +348,7 @@ export class MemberSyncService { | |
| for (let i = 0; i < memberIds.length; i++) { | ||
| const memberId = memberIds[i] | ||
| const { membersSynced, documentsIndexed } = await logExecutionTimeV2( | ||
| () => this.syncMembers(memberId), | ||
| () => this.syncMembers(memberId, { withAggs: true, syncFrom: opts.syncFrom }), | ||
| this.log, | ||
| `syncMembers (${i}/${memberIds.length})`, | ||
| ) | ||
|
|
@@ -371,11 +375,22 @@ export class MemberSyncService { | |
|
|
||
| public async syncMembers( | ||
| memberId: string, | ||
| opts: { withAggs?: boolean } = { withAggs: true }, | ||
| opts: { withAggs?: boolean; syncFrom?: Date } = { withAggs: true }, | ||
| ): Promise<IMemberSyncResult> { | ||
| const qx = repoQx(this.memberRepo) | ||
|
|
||
| const syncMemberAggregates = async (memberId) => { | ||
| if (opts.syncFrom) { | ||
| const lastSyncDate = await findLastSyncDate(qx, memberId) | ||
| if (lastSyncDate && lastSyncDate.getTime() > opts.syncFrom.getTime()) { | ||
| this.log.info( | ||
| `Skipping sync of member aggregates as last sync date is greater than syncFrom!`, | ||
| { memberId, lastSyncDate, syncFrom: opts.syncFrom }, | ||
| ) | ||
| return | ||
| } | ||
| } | ||
|
|
||
| let documentsIndexed = 0 | ||
| let memberData: IMemberSegmentAggregates[] | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Evaluate shared syncStart timestamp for sequential syncs
Both organizations use the same
syncStarttimestamp, but the syncs are performed sequentially. This means the second sync (secondaryId) might miss updates that occurred during the first sync (primaryId).Consider this alternative implementation:
📝 Committable suggestion