Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
touchMemberEnrichmentCacheUpdatedAt,
updateMemberEnrichmentCache,
} from './activities/enrichment'
import { getMembers } from './activities/getMembers'
import { getEnrichableMembers } from './activities/getMembers'
import { refreshToken } from './activities/lf-auth0/authenticateLFAuth0'
import {
getIdentitiesExistInOtherMembers,
Expand All @@ -29,7 +29,7 @@ import {
} from './activities/syncEnrichedData'

export {
getMembers,
getEnrichableMembers,
getEnrichmentData,
normalizeEnrichmentData,
findMemberEnrichmentCache,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
import { fetchMembersForEnrichment } from '@crowd/data-access-layer/src/old/apps/premium/members_enrichment_worker'
import { IMember, IMemberEnrichmentSourceQueryInput, MemberEnrichmentSource } from '@crowd/types'
import {
IEnrichableMember,
IMemberEnrichmentSourceQueryInput,
MemberEnrichmentSource,
} from '@crowd/types'

import { EnrichmentSourceServiceFactory } from '../factory'
import { svc } from '../main'

export async function getMembers(
export async function getEnrichableMembers(
limit: number,
sources: MemberEnrichmentSource[],
afterId: string,
): Promise<IMember[]> {
let rows: IMember[] = []
): Promise<IEnrichableMember[]> {
let rows: IEnrichableMember[] = []
Comment thread
epipav marked this conversation as resolved.
const sourceInputs: IMemberEnrichmentSourceQueryInput[] = sources.map((s) => {
const srv = EnrichmentSourceServiceFactory.getEnrichmentSourceService(s, svc.log)
return {
source: s,
cacheObsoleteAfterSeconds: srv.cacheObsoleteAfterSeconds,
enrichableBy: srv.enrichableBy,
enrichableBySql: srv.enrichableBySql,
}
})
const db = svc.postgres.reader
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { MemberEnrichmentSource } from '@crowd/types'

import EnrichmentServiceClearbit from './sources/clearbit/service'
import EnrichmentServiceProgAI from './sources/progai/service'
import EnrichmentServiceSerpApi from './sources/serp/service'
import { IEnrichmentService } from './types'
import { ALSO_USE_EMAIL_IDENTITIES_FOR_ENRICHMENT, ENRICH_EMAIL_IDENTITIES } from './utils/config'

Expand All @@ -21,6 +22,8 @@ export class EnrichmentSourceServiceFactory {
)
case MemberEnrichmentSource.CLEARBIT:
return new EnrichmentServiceClearbit(log)
case MemberEnrichmentSource.SERP:
return new EnrichmentServiceSerpApi(log)
default:
throw new Error(`Enrichment service for ${source} is not found!`)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import axios from 'axios'

import { Logger, LoggerBase } from '@crowd/logging'
import {
IMemberEnrichmentSourceEnrichableBy,
MemberAttributeName,
MemberEnrichmentSource,
MemberIdentityType,
Expand All @@ -28,11 +27,7 @@ import {
export default class EnrichmentServiceClearbit extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.CLEARBIT
public platform = `enrichment-${this.source}`
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
{
type: MemberIdentityType.EMAIL,
},
]
public enrichableBySql = `mi.type = 'email' and mi.verified`
Comment thread
epipav marked this conversation as resolved.

// bust cache after 120 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 120
Expand Down Expand Up @@ -60,7 +55,7 @@ export default class EnrichmentServiceClearbit extends LoggerBase implements IEn
}

isEnrichableBySource(input: IEnrichmentSourceInput): boolean {
return !!input.email?.value
return !!input.email?.value && input.email?.verified
}

async getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentDataClearbit | null> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import lodash from 'lodash'

import { Logger, LoggerBase } from '@crowd/logging'
import {
IMemberEnrichmentSourceEnrichableBy,
MemberAttributeName,
MemberEnrichmentSource,
MemberIdentityType,
Expand Down Expand Up @@ -33,15 +32,8 @@ import {
export default class EnrichmentServiceProgAI extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.PROGAI
public platform = `enrichment-${this.source}`
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
{
type: MemberIdentityType.USERNAME,
platform: PlatformType.GITHUB,
},
{
type: MemberIdentityType.EMAIL,
},
]

enrichableBySql = `mi.verified and ((mi.type = 'username' AND mi.platform = 'github') OR (mi.type = 'email'))`

// bust cache after 90 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 90
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import axios from 'axios'

import { Logger, LoggerBase } from '@crowd/logging'
import { MemberEnrichmentSource, MemberIdentityType, PlatformType } from '@crowd/types'

import {
IEnrichmentService,
IEnrichmentSourceInput,
IMemberEnrichmentDataNormalized,
} from '../../types'

import { IMemberEnrichmentDataSerp, IMemberEnrichmentSerpApiResponse } from './types'

export default class EnrichmentServiceSerpApi extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.SERP
public platform = `enrichment-${this.source}`

public enrichableBySql = `
(members."displayName" like '% %') AND

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the idea? People with a space in their name?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, people with multiple-word names. Single-word names are not good enough when searching in google.

But now I'm thinking, since we're already using other stuff to distinguish (like website, location w/e), maybe we can remove this

@epipav epipav Oct 30, 2024

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the test results with one-word display names aren't great.

  • With multiple-word names, the correctness using GitHub identities in queries was %92.4, when using single-word names it dropped to %65
  • With multiple word names, queries with the website had correctness of %100, when using single-word names it dropped to %75

I'm keeping this for now, since it's more important to have more correct answers than more enrichable members

(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)`
Comment thread
epipav marked this conversation as resolved.
Comment on lines +19 to +26

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix undefined SQL alias 'mi'

The SQL condition uses the mi alias without defining it in a JOIN clause. While the multiple-word display name check is intentionally kept based on test results showing better accuracy, the undefined alias needs to be addressed.

Consider adding the necessary JOIN clause:

  public enrichableBySql = `
  ("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
  (members."displayName" like '% %') AND 
  (members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
  ((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR 
+  EXISTS (SELECT 1 FROM member_identities mi WHERE mi.member_id = members.id AND
   (mi.verified AND mi.type = 'username' and mi.platform = 'github') OR 
   (mi.verified AND mi.type = 'email')
+  )
  )`
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
public enrichableBySql = `
("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
(members."displayName" like '% %') AND
(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)`
public enrichableBySql = `
("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
(members."displayName" like '% %') AND
(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
EXISTS (SELECT 1 FROM member_identities mi WHERE mi.member_id = members.id AND
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)
)`


// bust cache after 60 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 60

constructor(public readonly log: Logger) {
super(log)
}

isEnrichableBySource(input: IEnrichmentSourceInput): boolean {
const displayNameSplit = input.displayName?.split(' ')
return (
displayNameSplit?.length > 1 &&
!!input.location &&
((!!input.email && input.email.verified) ||
(!!input.github && input.github.verified) ||
!!input.website)
)
}

async getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentDataSerp | null> {
let enriched: IMemberEnrichmentDataSerp = null

if (input.displayName && input.location && input.website) {
enriched = await this.querySerpApi(input.displayName, input.location, input.website)
}

if (!enriched && input.displayName && input.location && input.github && input.github.value) {
enriched = await this.querySerpApi(input.displayName, input.location, input.github.value)
}

if (!enriched && input.displayName && input.location && input.email && input.email.value) {
enriched = await this.querySerpApi(input.displayName, input.location, input.email.value)
}
return enriched
}

private async querySerpApi(
displayName: string,
location: string,
identifier: string,
): Promise<IMemberEnrichmentDataSerp> {
const url = `https://serpapi.com/search.json`

const config = {
method: 'get',
url,
params: {
api_key: process.env['CROWD_SERP_API_KEY'],
Comment thread
epipav marked this conversation as resolved.
Outdated
q: `"${displayName}" ${location} "${identifier}" site:linkedin.com/in`,
num: 3,
engine: 'google',
},
}

const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add error handling for Axios request to prevent unhandled exceptions

The Axios request in querySerpApi lacks error handling. If the request fails due to network issues or API errors, it could result in unhandled exceptions. It's advisable to wrap the call in a try-catch block to handle errors gracefully and log meaningful messages.

Apply this diff to add error handling:

       try {
         const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

         if (response.search_information.total_results > 0) {
           // existing logic
         }

         return null
+      } catch (error) {
+        this.log.error('Error fetching data from SerpAPI', error)
+        return null
       }

Committable suggestion was skipped due to low confidence.


if (response.search_information.total_results > 0) {
if (
response.organic_results.length > 0 &&
response.organic_results[0].link &&
!response.search_information.spelling_fix &&
!response.search_information.spelling_fix_type
) {
return {
linkedinUrl: response.organic_results[0].link,
}
}
}

return null
}

normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: this.normalizeLinkedUrl(data.linkedinUrl),
},
],
}
return normalized
}
Comment on lines +97 to +109

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add error handling in normalize method

The normalize method doesn't handle potential errors from normalizeLinkedUrl. If URL normalization fails, it will throw an error and potentially break the enrichment process.

Add error handling:

  normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
+   let normalizedUrl: string;
+   try {
+     normalizedUrl = this.normalizeLinkedUrl(data.linkedinUrl);
+   } catch (error) {
+     this.log.warn('Failed to normalize LinkedIn URL, using original', { url: data.linkedinUrl });
+     normalizedUrl = data.linkedinUrl;
+   }
    const normalized: IMemberEnrichmentDataNormalized = {
      identities: [
        {
          platform: PlatformType.LINKEDIN,
          type: MemberIdentityType.USERNAME,
          verified: false,
-         value: this.normalizeLinkedUrl(data.linkedinUrl),
+         value: normalizedUrl,
        },
      ],
    }
    return normalized
  }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: this.normalizeLinkedUrl(data.linkedinUrl),
},
],
}
return normalized
}
normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
let normalizedUrl: string;
try {
normalizedUrl = this.normalizeLinkedUrl(data.linkedinUrl);
} catch (error) {
this.log.warn('Failed to normalize LinkedIn URL, using original', { url: data.linkedinUrl });
normalizedUrl = data.linkedinUrl;
}
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: normalizedUrl,
},
],
}
return normalized
}


private normalizeLinkedUrl(url: string): string {
try {
const parsedUrl = new URL(url)

if (parsedUrl.hostname.endsWith('linkedin.com')) {
parsedUrl.hostname = 'linkedin.com'
parsedUrl.search = ''

let path = parsedUrl.pathname
if (path.endsWith('/')) {
path = path.slice(0, -1)
}

return parsedUrl.origin + path
}

return url
} catch (error) {
this.log.error(`Error while normalizing linkedin url: ${url}`, error)
Comment thread
epipav marked this conversation as resolved.
throw error
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import axios from 'axios'

// import { writeFileSync } from 'fs'
// import { Parser } from 'json2csv'
import { timeout } from '@crowd/common'

Comment on lines +1 to +7

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Consider removing eslint-disable directive.

The eslint-disable directive for @typescript-eslint/no-explicit-any suggests potential type safety issues. Instead of disabling the rule, consider defining proper interfaces for the member data structure.

-/* eslint-disable @typescript-eslint/no-explicit-any */
+interface Member {
+  displayName: string;
+  location: string;
+  website: string;
+  linkedinFromSerp?: string;
+}

Committable suggestion was skipped due to low confidence.

const testSerpApi = async () => {
const members = [] as any[]

for (const mem of members) {
Comment on lines +8 to +11

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Convert script to proper test suite.

This appears to be a manual testing script rather than an automated test suite. Consider restructuring using a testing framework like Jest:

-const testSerpApi = async () => {
+describe('SERP API LinkedIn Finder', () => {
+  it('should find LinkedIn profiles for members', async () => {
+    const members: Member[] = [];

Would you like me to help create a proper test suite with mocked API responses and test cases?

Committable suggestion was skipped due to low confidence.

const url = `https://serpapi.com/search.json`
const config = {
method: 'get',
url,
params: {
api_key: process.env['CROWD_SERP_API_KEY'],
q: `"${mem.displayName}" ${mem.location} "${mem.website}" site:linkedin.com/in`,
num: 3,
engine: 'google',
},
Comment on lines +17 to +21

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Secure API key handling and input validation needed.

Several security and robustness concerns:

  1. Direct environment variable access without validation
  2. Potential SQL injection via unescaped member data in query string
  3. Missing error handling for invalid/missing API key
-        api_key: process.env['CROWD_SERP_API_KEY'],
-        q: `"${mem.displayName}" ${mem.location} "${mem.website}" site:linkedin.com/in`,
+        api_key: getRequiredEnvVar('CROWD_SERP_API_KEY'),
+        q: `"${escapeSearchString(mem.displayName)}" ${escapeSearchString(mem.location)} "${escapeSearchString(mem.website)}" site:linkedin.com/in`,

Committable suggestion was skipped due to low confidence.

}

const response = (await axios(config)).data

if (response.search_information.total_results > 0) {
if (
response.organic_results.length > 0 &&
response.organic_results[0].link &&
!response.search_information.spelling_fix &&
!response.search_information.spelling_fix_type
) {
console.log(`Found LinkedIn for ${mem.displayName}: ${response.organic_results[0].link}`)
console.log(response.search_information)
mem.linkedinFromSerp = response.organic_results[0].link
} else {
console.log(`No LinkedIn found for ${mem.displayName}`)
}
} else {
console.log(`No LinkedIn found for ${mem.displayName}`)
}

await timeout(1000)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Consider implementing proper rate limiting.

Using a fixed timeout isn't the best approach for rate limiting. Consider implementing a proper rate limiter that respects API limits.

-    await timeout(1000)
+    await rateLimiter.wait()  // Implement a proper rate limiter class

Committable suggestion was skipped due to low confidence.

}
Comment on lines +24 to +44

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Improve error handling and response validation.

The response handling lacks proper error handling and validation:

  1. No try-catch around axios call
  2. No type checking for response structure
  3. Duplicate console.log for "No LinkedIn found"
-    const response = (await axios(config)).data
+    try {
+      const response = await axios(config)
+      const data = validateSerpResponse(response.data)
+      
+      if (!data.search_information?.total_results) {
+        logger.info(`No LinkedIn found for ${mem.displayName}`)
+        continue
+      }
+      
+      const linkedInProfile = extractLinkedInProfile(data.organic_results)
+      if (linkedInProfile) {
+        logger.info(`Found LinkedIn for ${mem.displayName}: ${linkedInProfile}`)
+        mem.linkedinFromSerp = linkedInProfile
+      } else {
+        logger.info(`No valid LinkedIn profile found for ${mem.displayName}`)
+      }
+    } catch (error) {
+      logger.error(`Error fetching data for ${mem.displayName}:`, error)
+    }

Committable suggestion was skipped due to low confidence.


try {
// const fields = [
// 'id',
// 'displayName',
// 'location',
// 'profileUrl',
// 'website',
// 'linkedinFromClearbit',
// 'linkedinFromProgai',
// 'linkedinFromSerp',
// ]
// const json2csvParser = new Parser({ fields })
// const csv = json2csvParser.parse(members)
// writeFileSync('output.csv', csv)
// console.log('CSV file has been successfully written.')
} catch (err) {
console.error('Error writing CSV file:', err)
}
Comment on lines +46 to +63

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Remove or implement CSV export functionality.

The commented-out CSV export code creates confusion. Either implement it properly or remove it. The try-catch block is currently unreachable as noted by static analysis.

If CSV export is needed, consider moving it to a separate utility function:

async function exportToCSV(members: Member[], outputPath: string): Promise<void> {
  const fields = ['id', 'displayName', /* ... */];
  const parser = new Parser({ fields });
  await writeFile(outputPath, parser.parse(members));
}
🧰 Tools
🪛 Biome

[error] 61-63: This code is unreachable

(lint/correctness/noUnreachable)

}

setImmediate(async () => {
await testSerpApi()
process.exit(0)
})
Comment on lines +66 to +69

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Avoid using setImmediate for test execution.

Using setImmediate and process.exit() is not a proper way to run tests. This should be restructured to use a proper test runner.

-setImmediate(async () => {
-  await testSerpApi()
-  process.exit(0)
-})
+if (require.main === module) {
+  describe('SERP API Tests', () => {
+    // ... test cases
+  });
+}

Committable suggestion was skipped due to low confidence.

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
export interface IMemberEnrichmentDataSerp {
linkedinUrl: string
}
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponse {
organic_results: IMemberEnrichmentSerpApiResponseOrganicResult[]
search_information: IMemberEnrichmentSerpApiResponseSearchInformation
}
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponseSearchInformation {
query_displayed: string
total_results: number
time_taken_displayed: number
organic_results_state: string
spelling_fix?: string
spelling_fix_type?: string
}
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponseOrganicResult {
position: number
title: string
link: string
redirect_link: string
displayed_link: string
favicon: string
snippet: string
snippet_highlighted_words: string[]
sitelinks: {
inline: IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline[]
}
source: string
}
Comment thread
epipav marked this conversation as resolved.
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline {
title: string
link: string
}
Comment thread
epipav marked this conversation as resolved.
Loading