Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import { fetchMembersForEnrichment } from '@crowd/data-access-layer/src/old/apps/premium/members_enrichment_worker'
import { IMember, IMemberEnrichmentSourceQueryInput, MemberEnrichmentSource } from '@crowd/types'
import {
IEnrichableMember,
IMemberEnrichmentSourceQueryInput,
MemberEnrichmentSource,
} from '@crowd/types'

import { EnrichmentSourceServiceFactory } from '../factory'
import { svc } from '../main'
Expand All @@ -8,14 +12,14 @@ export async function getMembers(
limit: number,
sources: MemberEnrichmentSource[],
afterId: string,
): Promise<IMember[]> {
let rows: IMember[] = []
): Promise<IEnrichableMember[]> {
let rows: IEnrichableMember[] = []
Comment thread
epipav marked this conversation as resolved.
const sourceInputs: IMemberEnrichmentSourceQueryInput[] = sources.map((s) => {
const srv = EnrichmentSourceServiceFactory.getEnrichmentSourceService(s, svc.log)
return {
source: s,
cacheObsoleteAfterSeconds: srv.cacheObsoleteAfterSeconds,
enrichableBy: srv.enrichableBy,
enrichableBySql: srv.enrichableBySql,
}
})
const db = svc.postgres.reader
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { MemberEnrichmentSource } from '@crowd/types'

import EnrichmentServiceClearbit from './sources/clearbit/service'
import EnrichmentServiceProgAI from './sources/progai/service'
import EnrichmentServiceSerpApi from './sources/serp/service'
import { IEnrichmentService } from './types'
import { ALSO_USE_EMAIL_IDENTITIES_FOR_ENRICHMENT, ENRICH_EMAIL_IDENTITIES } from './utils/config'

Expand All @@ -21,6 +22,8 @@ export class EnrichmentSourceServiceFactory {
)
case MemberEnrichmentSource.CLEARBIT:
return new EnrichmentServiceClearbit(log)
case MemberEnrichmentSource.SERP:
return new EnrichmentServiceSerpApi(log)
default:
throw new Error(`Enrichment service for ${source} is not found!`)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import axios from 'axios'

import { Logger, LoggerBase } from '@crowd/logging'
import {
IMemberEnrichmentSourceEnrichableBy,
MemberAttributeName,
MemberEnrichmentSource,
MemberIdentityType,
Expand All @@ -28,11 +27,7 @@ import {
export default class EnrichmentServiceClearbit extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.CLEARBIT
public platform = `enrichment-${this.source}`
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
{
type: MemberIdentityType.EMAIL,
},
]
public enrichableBySql = `mi.type = 'email' and mi.verified`
Comment thread
epipav marked this conversation as resolved.

// bust cache after 120 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 120
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import lodash from 'lodash'

import { Logger, LoggerBase } from '@crowd/logging'
import {
IMemberEnrichmentSourceEnrichableBy,
MemberAttributeName,
MemberEnrichmentSource,
MemberIdentityType,
Expand Down Expand Up @@ -33,15 +32,8 @@ import {
export default class EnrichmentServiceProgAI extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.PROGAI
public platform = `enrichment-${this.source}`
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
{
type: MemberIdentityType.USERNAME,
platform: PlatformType.GITHUB,
},
{
type: MemberIdentityType.EMAIL,
},
]

enrichableBySql = `mi.verified and ((mi.type = 'username' AND mi.platform = 'github') OR (mi.type = 'email'))`

// bust cache after 90 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 90
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import axios from 'axios'

import { Logger, LoggerBase } from '@crowd/logging'
import { MemberEnrichmentSource, MemberIdentityType, PlatformType } from '@crowd/types'

import {
IEnrichmentService,
IEnrichmentSourceInput,
IMemberEnrichmentDataNormalized,
} from '../../types'

import { IMemberEnrichmentDataSerp, IMemberEnrichmentSerpApiResponse } from './types'

export default class EnrichmentServiceSerpApi extends LoggerBase implements IEnrichmentService {
public source: MemberEnrichmentSource = MemberEnrichmentSource.SERP
public platform = `enrichment-${this.source}`

public enrichableBySql = `
(members."displayName" like '% %') AND

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the idea? People with a space in their name?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, people with multiple-word names. Single-word names are not good enough when searching in google.

But now I'm thinking, since we're already using other stuff to distinguish (like website, location w/e), maybe we can remove this

@epipav epipav Oct 30, 2024

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the test results with one-word display names aren't great.

  • With multiple-word names, the correctness using GitHub identities in queries was %92.4, when using single-word names it dropped to %65
  • With multiple word names, queries with the website had correctness of %100, when using single-word names it dropped to %75

I'm keeping this for now, since it's more important to have more correct answers than more enrichable members

(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)`
Comment thread
epipav marked this conversation as resolved.
Comment on lines +19 to +26

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix undefined SQL alias 'mi'

The SQL condition uses the mi alias without defining it in a JOIN clause. While the multiple-word display name check is intentionally kept based on test results showing better accuracy, the undefined alias needs to be addressed.

Consider adding the necessary JOIN clause:

  public enrichableBySql = `
  ("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
  (members."displayName" like '% %') AND 
  (members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
  ((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR 
+  EXISTS (SELECT 1 FROM member_identities mi WHERE mi.member_id = members.id AND
   (mi.verified AND mi.type = 'username' and mi.platform = 'github') OR 
   (mi.verified AND mi.type = 'email')
+  )
  )`
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
public enrichableBySql = `
("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
(members."displayName" like '% %') AND
(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)`
public enrichableBySql = `
("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
(members."displayName" like '% %') AND
(members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
EXISTS (SELECT 1 FROM member_identities mi WHERE mi.member_id = members.id AND
(mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
(mi.verified AND mi.type = 'email')
)
)`


// bust cache after 60 days
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 60

constructor(public readonly log: Logger) {
super(log)
}

isEnrichableBySource(input: IEnrichmentSourceInput): boolean {
const displayNameSplit = input.displayName?.split(' ')
return (
displayNameSplit?.length > 1 &&
!!input.location &&
((!!input.email && input.email.verified) ||
(!!input.github && input.github.verified) ||
!!input.website)
)
}

async getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentDataSerp | null> {
let enriched: IMemberEnrichmentDataSerp = null

if (input.displayName && input.location && input.website) {
enriched = await this.queryWithWebsite(input.displayName, input.location, input.website)
}

if (!enriched && input.displayName && input.location && input.github) {
enriched = await this.queryWithGithubIdentity(
input.displayName,
input.location,
input.github.value,
)
}
Comment thread
epipav marked this conversation as resolved.
Outdated

if (!enriched && input.displayName && input.location && input.email) {
enriched = await this.queryWithEmail(input.displayName, input.location, input.email.value)
}
Comment thread
epipav marked this conversation as resolved.
Outdated
return enriched
}

private async queryWithWebsite(
displayName: string,
location: string,
website: string,
): Promise<IMemberEnrichmentDataSerp> {
const url = `https://serpapi.com/search.json`

const config = {
method: 'get',
url,
params: {
api_key: process.env['CROWD_SERP_API_KEY'],
Comment thread
epipav marked this conversation as resolved.
Outdated
q: `"${displayName}" ${location} "${website}" site:linkedin.com/in`,
num: 3,
engine: 'google',
},
}

const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

Comment thread
epipav marked this conversation as resolved.
Outdated
if (response.search_information.total_results > 0) {
if (
response.organic_results.length > 0 &&
response.organic_results[0].link &&
!response.search_information.spelling_fix &&
!response.search_information.spelling_fix_type
) {
return {
linkedinUrl: response.organic_results[0].link,
}
}
}

return null
}
Comment thread
epipav marked this conversation as resolved.
Outdated
Comment thread
epipav marked this conversation as resolved.
Outdated
Comment thread
epipav marked this conversation as resolved.
Outdated

private async queryWithGithubIdentity(
displayName: string,
location: string,
githubIdentity: string,
): Promise<IMemberEnrichmentDataSerp> {
const url = `https://serpapi.com/search.json`

const config = {
method: 'get',
url,
params: {
api_key: process.env['CROWD_SERP_API_KEY'],
q: `"${displayName}" ${location} "${githubIdentity}" site:linkedin.com/in`,
num: 3,
engine: 'google',
},
}

const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

if (response.search_information.total_results > 0) {
if (
response.organic_results.length > 0 &&
response.organic_results[0].link &&
!response.search_information.spelling_fix &&
!response.search_information.spelling_fix_type
) {
return {
linkedinUrl: response.organic_results[0].link,
}
}
}

return null
}

private async queryWithEmail(
displayName: string,
location: string,
email: string,
): Promise<IMemberEnrichmentDataSerp> {
const url = `https://serpapi.com/search.json`

const config = {
method: 'get',
url,
params: {
api_key: process.env['CROWD_SERP_API_KEY'],
q: `"${displayName}" ${location} "${email}" site:linkedin.com/in`,
num: 3,
engine: 'google',
},
}

const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add error handling for Axios request to prevent unhandled exceptions

The Axios request in querySerpApi lacks error handling. If the request fails due to network issues or API errors, it could result in unhandled exceptions. It's advisable to wrap the call in a try-catch block to handle errors gracefully and log meaningful messages.

Apply this diff to add error handling:

       try {
         const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

         if (response.search_information.total_results > 0) {
           // existing logic
         }

         return null
+      } catch (error) {
+        this.log.error('Error fetching data from SerpAPI', error)
+        return null
       }

Committable suggestion was skipped due to low confidence.


if (response.search_information.total_results > 0) {
if (
response.organic_results.length > 0 &&
response.organic_results[0].link &&
!response.search_information.spelling_fix &&
!response.search_information.spelling_fix_type
) {
return {
linkedinUrl: response.organic_results[0].link,
}
}
}

return null
}

normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: this.normalizeLinkedUrl(data.linkedinUrl),
},
],
}
return normalized
}
Comment on lines +97 to +109

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add error handling in normalize method

The normalize method doesn't handle potential errors from normalizeLinkedUrl. If URL normalization fails, it will throw an error and potentially break the enrichment process.

Add error handling:

  normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
+   let normalizedUrl: string;
+   try {
+     normalizedUrl = this.normalizeLinkedUrl(data.linkedinUrl);
+   } catch (error) {
+     this.log.warn('Failed to normalize LinkedIn URL, using original', { url: data.linkedinUrl });
+     normalizedUrl = data.linkedinUrl;
+   }
    const normalized: IMemberEnrichmentDataNormalized = {
      identities: [
        {
          platform: PlatformType.LINKEDIN,
          type: MemberIdentityType.USERNAME,
          verified: false,
-         value: this.normalizeLinkedUrl(data.linkedinUrl),
+         value: normalizedUrl,
        },
      ],
    }
    return normalized
  }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: this.normalizeLinkedUrl(data.linkedinUrl),
},
],
}
return normalized
}
normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
let normalizedUrl: string;
try {
normalizedUrl = this.normalizeLinkedUrl(data.linkedinUrl);
} catch (error) {
this.log.warn('Failed to normalize LinkedIn URL, using original', { url: data.linkedinUrl });
normalizedUrl = data.linkedinUrl;
}
const normalized: IMemberEnrichmentDataNormalized = {
identities: [
{
platform: PlatformType.LINKEDIN,
type: MemberIdentityType.USERNAME,
verified: false,
value: normalizedUrl,
},
],
}
return normalized
}


private normalizeLinkedUrl(url: string): string {
try {
const parsedUrl = new URL(url)

if (parsedUrl.hostname.endsWith('linkedin.com')) {
parsedUrl.hostname = 'linkedin.com'
parsedUrl.search = ''

let path = parsedUrl.pathname
if (path.endsWith('/')) {
path = path.slice(0, -1)
}

return parsedUrl.origin + path
}

return url
} catch (error) {
this.log.error(`Error while normalizing linkedin url: ${url}`, error)
Comment thread
epipav marked this conversation as resolved.
throw error
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
export interface IMemberEnrichmentDataSerp {
linkedinUrl: string
}
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponse {
organic_results: IMemberEnrichmentSerpApiResponseOrganicResult[]
search_information: IMemberEnrichmentSerpApiResponseSearchInformation
}
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponseSearchInformation {
query_displayed: string
total_results: number
time_taken_displayed: number
organic_results_state: string
spelling_fix?: string
spelling_fix_type?: string
}
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponseOrganicResult {
position: number
title: string
link: string
redirect_link: string
displayed_link: string
favicon: string
snippet: string
snippet_highlighted_words: string[]
sitelinks: {
inline: IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline[]
}
source: string
}
Comment thread
epipav marked this conversation as resolved.
Comment thread
epipav marked this conversation as resolved.

export interface IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline {
title: string
link: string
}
Comment thread
epipav marked this conversation as resolved.
14 changes: 10 additions & 4 deletions services/apps/premium/members_enrichment_worker/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import {
IAttributes,
IMemberContribution,
IMemberEnrichmentSourceEnrichableBy,
IMemberIdentity,
IOrganizationIdentity,
MemberAttributeName,
Expand All @@ -12,14 +11,21 @@ import {

import { IMemberEnrichmentDataClearbit } from './sources/clearbit/types'
import { IMemberEnrichmentDataProgAI } from './sources/progai/types'
import { IMemberEnrichmentDataSerp } from './sources/serp/types'

export interface IEnrichmentSourceInput {
github?: IMemberIdentity
linkedin?: IMemberIdentity
email?: IMemberIdentity
website?: string
location?: string
displayName?: string
}

export type IMemberEnrichmentData = IMemberEnrichmentDataProgAI | IMemberEnrichmentDataClearbit
export type IMemberEnrichmentData =
| IMemberEnrichmentDataProgAI
| IMemberEnrichmentDataClearbit
| IMemberEnrichmentDataSerp

export interface IEnrichmentService {
source: MemberEnrichmentSource
Expand All @@ -30,8 +36,8 @@ export interface IEnrichmentService {
// can the source enrich using this input
isEnrichableBySource(input: IEnrichmentSourceInput): boolean

// what kind of identities can this source use as input
enrichableBy: IMemberEnrichmentSourceEnrichableBy[]
// what kind of custom sql should this source use as input
enrichableBySql: string

// should either return the data or null if it's a miss
getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentData | null>
Expand Down
Loading