Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
touchMemberEnrichmentCacheUpdatedAt,
updateMemberEnrichmentCache,
} from './activities/enrichment'
import { getMembers } from './activities/getMembers'
import { getEnrichableMembers } from './activities/getMembers'
import { refreshToken } from './activities/lf-auth0/authenticateLFAuth0'
import {
getIdentitiesExistInOtherMembers,
Expand All @@ -29,7 +29,7 @@ import {
} from './activities/syncEnrichedData'

export {
getMembers,
getEnrichableMembers,
getEnrichmentData,
normalizeEnrichmentData,
findMemberEnrichmentCache,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
import { fetchMembersForEnrichment } from '@crowd/data-access-layer/src/old/apps/premium/members_enrichment_worker'
import { IMember, IMemberEnrichmentSourceQueryInput, MemberEnrichmentSource } from '@crowd/types'
import {
IEnrichableMember,
IMemberEnrichmentSourceQueryInput,
MemberEnrichmentSource,
} from '@crowd/types'

import { EnrichmentSourceServiceFactory } from '../factory'
import { svc } from '../main'

export async function getMembers(
export async function getEnrichableMembers(
limit: number,
sources: MemberEnrichmentSource[],
afterId: string,
): Promise<IMember[]> {
let rows: IMember[] = []
): Promise<IEnrichableMember[]> {
let rows: IEnrichableMember[] = []
const sourceInputs: IMemberEnrichmentSourceQueryInput[] = sources.map((s) => {
const srv = EnrichmentSourceServiceFactory.getEnrichmentSourceService(s, svc.log)
return {
source: s,
cacheObsoleteAfterSeconds: srv.cacheObsoleteAfterSeconds,
enrichableBy: srv.enrichableBy,
enrichableBySql: srv.enrichableBySql,
}
})
const db = svc.postgres.reader
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { MemberEnrichmentSource } from '@crowd/types'

import EnrichmentServiceClearbit from './sources/clearbit/service'
import EnrichmentServiceProgAI from './sources/progai/service'
import EnrichmentServiceSerpApi from './sources/serp/service'
import { IEnrichmentService } from './types'
import { ALSO_USE_EMAIL_IDENTITIES_FOR_ENRICHMENT, ENRICH_EMAIL_IDENTITIES } from './utils/config'

Expand All @@ -21,6 +22,8 @@ export class EnrichmentSourceServiceFactory {
)
case MemberEnrichmentSource.CLEARBIT:
return new EnrichmentServiceClearbit(log)
case MemberEnrichmentSource.SERP:
return new EnrichmentServiceSerpApi(log)
default:
throw new Error(`Enrichment service for ${source} is not found!`)
}
Expand Down
2 changes: 2 additions & 0 deletions services/apps/premium/members_enrichment_worker/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ const config: Config = {
'CROWD_ENRICHMENT_PROGAI_API_KEY',
'CROWD_ENRICHMENT_CLEARBIT_URL',
'CROWD_ENRICHMENT_CLEARBIT_API_KEY',
'CROWD_ENRICHMENT_SERP_API_URL',
'CROWD_ENRICHMENT_SERP_API_KEY',
],
producer: {
enabled: false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import axios from 'axios'

import { Logger, LoggerBase } from '@crowd/logging'
import {
IMemberEnrichmentSourceEnrichableBy,
MemberAttributeName,
MemberEnrichmentSource,
MemberIdentityType,
Expand All @@ -28,11 +27,7 @@ import {
export default class EnrichmentServiceClearbit extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.CLEARBIT
public platform = `enrichment-${this.source}`
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
{
type: MemberIdentityType.EMAIL,
},
]
public enrichableBySql = `mi.type = 'email' and mi.verified`

// bust cache after 120 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 120
Expand Down Expand Up @@ -60,7 +55,7 @@ export default class EnrichmentServiceClearbit extends LoggerBase implements IEn
}

isEnrichableBySource(input: IEnrichmentSourceInput): boolean {
return !!input.email?.value
return !!input.email?.value && input.email?.verified
}

async getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentDataClearbit | null> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import lodash from 'lodash'

import { Logger, LoggerBase } from '@crowd/logging'
import {
IMemberEnrichmentSourceEnrichableBy,
MemberAttributeName,
MemberEnrichmentSource,
MemberIdentityType,
Expand Down Expand Up @@ -33,15 +32,8 @@ import {
export default class EnrichmentServiceProgAI extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.PROGAI
public platform = `enrichment-${this.source}`
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
{
type: MemberIdentityType.USERNAME,
platform: PlatformType.GITHUB,
},
{
type: MemberIdentityType.EMAIL,
},
]

enrichableBySql = `mi.verified and ((mi.type = 'username' AND mi.platform = 'github') OR (mi.type = 'email'))`

// bust cache after 90 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 90
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import axios from 'axios'

import { Logger, LoggerBase } from '@crowd/logging'
import { MemberEnrichmentSource, MemberIdentityType, PlatformType } from '@crowd/types'

import {
IEnrichmentService,
IEnrichmentSourceInput,
IMemberEnrichmentDataNormalized,
} from '../../types'

import { IMemberEnrichmentDataSerp, IMemberEnrichmentSerpApiResponse } from './types'

export default class EnrichmentServiceSerpApi extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.SERP
public platform = `enrichment-${this.source}`
public enrichMembersWithActivityMoreThan = 10

public enrichableBySql = `
("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
(members."displayName" like '% %') AND
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the idea? People with a space in their name?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, people with multiple-word names. Single-word names are not good enough when searching in google.

But now I'm thinking, since we're already using other stuff to distinguish (like website, location w/e), maybe we can remove this

Copy link
Collaborator Author

@epipav epipav Oct 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the test results with one-word display names aren't great.

  • With multiple-word names, the correctness using GitHub identities in queries was %92.4, when using single-word names it dropped to %65
  • With multiple word names, queries with the website had correctness of %100, when using single-word names it dropped to %75

I'm keeping this for now, since it's more important to have more correct answers than more enrichable members

(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)`
Comment on lines +19 to +26
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix undefined SQL alias 'mi'

The SQL condition uses the mi alias without defining it in a JOIN clause. While the multiple-word display name check is intentionally kept based on test results showing better accuracy, the undefined alias needs to be addressed.

Consider adding the necessary JOIN clause:

  public enrichableBySql = `
  ("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
  (members."displayName" like '% %') AND 
  (members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
  ((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR 
+  EXISTS (SELECT 1 FROM member_identities mi WHERE mi.member_id = members.id AND
   (mi.verified AND mi.type = 'username' and mi.platform = 'github') OR 
   (mi.verified AND mi.type = 'email')
+  )
  )`
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
public enrichableBySql = `
("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
(members."displayName" like '% %') AND
(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)`
public enrichableBySql = `
("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
(members."displayName" like '% %') AND
(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
EXISTS (SELECT 1 FROM member_identities mi WHERE mi.member_id = members.id AND
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)
)`


// bust cache after 120 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 120

constructor(public readonly log: Logger) {
super(log)
}

isEnrichableBySource(input: IEnrichmentSourceInput): boolean {
const displayNameSplit = input.displayName?.split(' ')
return (
displayNameSplit?.length > 1 &&
!!input.location &&
((!!input.email && input.email.verified) ||
(!!input.github && input.github.verified) ||
!!input.website)
)
}

async getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentDataSerp | null> {
let enriched: IMemberEnrichmentDataSerp = null

if (input.displayName && input.location && input.website) {
enriched = await this.querySerpApi(input.displayName, input.location, input.website)
}

if (!enriched && input.displayName && input.location && input.github && input.github.value) {
enriched = await this.querySerpApi(input.displayName, input.location, input.github.value)
}

if (!enriched && input.displayName && input.location && input.email && input.email.value) {
enriched = await this.querySerpApi(input.displayName, input.location, input.email.value)
}
return enriched
}

private async querySerpApi(
displayName: string,
location: string,
identifier: string,
): Promise<IMemberEnrichmentDataSerp> {
const config = {
method: 'get',
url: process.env['CROWD_ENRICHMENT_SERP_API_URL'],
params: {
api_key: process.env['CROWD_ENRICHMENT_SERP_API_KEY'],
q: `"${displayName}" ${location} "${identifier}" site:linkedin.com/in`,
num: 3,
engine: 'google',
},
}

const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add error handling for Axios request to prevent unhandled exceptions

The Axios request in querySerpApi lacks error handling. If the request fails due to network issues or API errors, it could result in unhandled exceptions. It's advisable to wrap the call in a try-catch block to handle errors gracefully and log meaningful messages.

Apply this diff to add error handling:

       try {
         const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

         if (response.search_information.total_results > 0) {
           // existing logic
         }

         return null
+      } catch (error) {
+        this.log.error('Error fetching data from SerpAPI', error)
+        return null
       }

Committable suggestion was skipped due to low confidence.


if (response.search_information.total_results > 0) {
if (
response.organic_results.length > 0 &&
response.organic_results[0].link &&
!response.search_information.spelling_fix &&
!response.search_information.spelling_fix_type
) {
return {
linkedinUrl: response.organic_results[0].link,
}
}
}

return null
}

normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: this.normalizeLinkedUrl(data.linkedinUrl),
},
],
}
return normalized
}
Comment on lines +97 to +109
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add error handling in normalize method

The normalize method doesn't handle potential errors from normalizeLinkedUrl. If URL normalization fails, it will throw an error and potentially break the enrichment process.

Add error handling:

  normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
+   let normalizedUrl: string;
+   try {
+     normalizedUrl = this.normalizeLinkedUrl(data.linkedinUrl);
+   } catch (error) {
+     this.log.warn('Failed to normalize LinkedIn URL, using original', { url: data.linkedinUrl });
+     normalizedUrl = data.linkedinUrl;
+   }
    const normalized: IMemberEnrichmentDataNormalized = {
      identities: [
        {
          platform: PlatformType.LINKEDIN,
          type: MemberIdentityType.USERNAME,
          verified: false,
-         value: this.normalizeLinkedUrl(data.linkedinUrl),
+         value: normalizedUrl,
        },
      ],
    }
    return normalized
  }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: this.normalizeLinkedUrl(data.linkedinUrl),
},
],
}
return normalized
}
normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
let normalizedUrl: string;
try {
normalizedUrl = this.normalizeLinkedUrl(data.linkedinUrl);
} catch (error) {
this.log.warn('Failed to normalize LinkedIn URL, using original', { url: data.linkedinUrl });
normalizedUrl = data.linkedinUrl;
}
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: normalizedUrl,
},
],
}
return normalized
}


private normalizeLinkedUrl(url: string): string {
try {
const parsedUrl = new URL(url)

if (parsedUrl.hostname.endsWith('linkedin.com')) {
parsedUrl.hostname = 'linkedin.com'
parsedUrl.search = ''

let path = parsedUrl.pathname
if (path.endsWith('/')) {
path = path.slice(0, -1)
}

return parsedUrl.origin + path
}

return url
} catch (error) {
this.log.error(`Error while normalizing linkedin url: ${url}`, error)
throw error
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import axios from 'axios'

// import { writeFileSync } from 'fs'
// import { Parser } from 'json2csv'
import { timeout } from '@crowd/common'

Comment on lines +1 to +7
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Consider removing eslint-disable directive.

The eslint-disable directive for @typescript-eslint/no-explicit-any suggests potential type safety issues. Instead of disabling the rule, consider defining proper interfaces for the member data structure.

-/* eslint-disable @typescript-eslint/no-explicit-any */
+interface Member {
+  displayName: string;
+  location: string;
+  website: string;
+  linkedinFromSerp?: string;
+}

Committable suggestion was skipped due to low confidence.

const testSerpApi = async () => {
const members = [] as any[]

for (const mem of members) {
Comment on lines +8 to +11
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Convert script to proper test suite.

This appears to be a manual testing script rather than an automated test suite. Consider restructuring using a testing framework like Jest:

-const testSerpApi = async () => {
+describe('SERP API LinkedIn Finder', () => {
+  it('should find LinkedIn profiles for members', async () => {
+    const members: Member[] = [];

Would you like me to help create a proper test suite with mocked API responses and test cases?

Committable suggestion was skipped due to low confidence.

const url = `https://serpapi.com/search.json`
const config = {
method: 'get',
url,
params: {
api_key: process.env['CROWD_SERP_API_KEY'],
q: `"${mem.displayName}" ${mem.location} "${mem.website}" site:linkedin.com/in`,
num: 3,
engine: 'google',
},
Comment on lines +17 to +21
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Secure API key handling and input validation needed.

Several security and robustness concerns:

  1. Direct environment variable access without validation
  2. Potential SQL injection via unescaped member data in query string
  3. Missing error handling for invalid/missing API key
-        api_key: process.env['CROWD_SERP_API_KEY'],
-        q: `"${mem.displayName}" ${mem.location} "${mem.website}" site:linkedin.com/in`,
+        api_key: getRequiredEnvVar('CROWD_SERP_API_KEY'),
+        q: `"${escapeSearchString(mem.displayName)}" ${escapeSearchString(mem.location)} "${escapeSearchString(mem.website)}" site:linkedin.com/in`,

Committable suggestion was skipped due to low confidence.

}

const response = (await axios(config)).data

if (response.search_information.total_results > 0) {
if (
response.organic_results.length > 0 &&
response.organic_results[0].link &&
!response.search_information.spelling_fix &&
!response.search_information.spelling_fix_type
) {
console.log(`Found LinkedIn for ${mem.displayName}: ${response.organic_results[0].link}`)
console.log(response.search_information)
mem.linkedinFromSerp = response.organic_results[0].link
} else {
console.log(`No LinkedIn found for ${mem.displayName}`)
}
} else {
console.log(`No LinkedIn found for ${mem.displayName}`)
}

await timeout(1000)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Consider implementing proper rate limiting.

Using a fixed timeout isn't the best approach for rate limiting. Consider implementing a proper rate limiter that respects API limits.

-    await timeout(1000)
+    await rateLimiter.wait()  // Implement a proper rate limiter class

Committable suggestion was skipped due to low confidence.

}
Comment on lines +24 to +44
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Improve error handling and response validation.

The response handling lacks proper error handling and validation:

  1. No try-catch around axios call
  2. No type checking for response structure
  3. Duplicate console.log for "No LinkedIn found"
-    const response = (await axios(config)).data
+    try {
+      const response = await axios(config)
+      const data = validateSerpResponse(response.data)
+      
+      if (!data.search_information?.total_results) {
+        logger.info(`No LinkedIn found for ${mem.displayName}`)
+        continue
+      }
+      
+      const linkedInProfile = extractLinkedInProfile(data.organic_results)
+      if (linkedInProfile) {
+        logger.info(`Found LinkedIn for ${mem.displayName}: ${linkedInProfile}`)
+        mem.linkedinFromSerp = linkedInProfile
+      } else {
+        logger.info(`No valid LinkedIn profile found for ${mem.displayName}`)
+      }
+    } catch (error) {
+      logger.error(`Error fetching data for ${mem.displayName}:`, error)
+    }

Committable suggestion was skipped due to low confidence.


try {
// const fields = [
// 'id',
// 'displayName',
// 'location',
// 'profileUrl',
// 'website',
// 'linkedinFromClearbit',
// 'linkedinFromProgai',
// 'linkedinFromSerp',
// ]
// const json2csvParser = new Parser({ fields })
// const csv = json2csvParser.parse(members)
// writeFileSync('output.csv', csv)
// console.log('CSV file has been successfully written.')
} catch (err) {
console.error('Error writing CSV file:', err)
}
Comment on lines +46 to +63
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Remove or implement CSV export functionality.

The commented-out CSV export code creates confusion. Either implement it properly or remove it. The try-catch block is currently unreachable as noted by static analysis.

If CSV export is needed, consider moving it to a separate utility function:

async function exportToCSV(members: Member[], outputPath: string): Promise<void> {
  const fields = ['id', 'displayName', /* ... */];
  const parser = new Parser({ fields });
  await writeFile(outputPath, parser.parse(members));
}
🧰 Tools
🪛 Biome

[error] 61-63: This code is unreachable

(lint/correctness/noUnreachable)

}

setImmediate(async () => {
await testSerpApi()
process.exit(0)
})
Comment on lines +66 to +69
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Avoid using setImmediate for test execution.

Using setImmediate and process.exit() is not a proper way to run tests. This should be restructured to use a proper test runner.

-setImmediate(async () => {
-  await testSerpApi()
-  process.exit(0)
-})
+if (require.main === module) {
+  describe('SERP API Tests', () => {
+    // ... test cases
+  });
+}

Committable suggestion was skipped due to low confidence.

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
export interface IMemberEnrichmentDataSerp {
linkedinUrl: string
}

export interface IMemberEnrichmentSerpApiResponse {
organic_results: IMemberEnrichmentSerpApiResponseOrganicResult[]
search_information: IMemberEnrichmentSerpApiResponseSearchInformation
}

export interface IMemberEnrichmentSerpApiResponseSearchInformation {
query_displayed: string
total_results: number
time_taken_displayed: number
organic_results_state: string
spelling_fix?: string
spelling_fix_type?: string
}

export interface IMemberEnrichmentSerpApiResponseOrganicResult {
position: number
title: string
link: string
redirect_link: string
displayed_link: string
favicon: string
snippet: string
snippet_highlighted_words: string[]
sitelinks: {
inline: IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline[]
}
source: string
}

export interface IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline {
title: string
link: string
}
Loading