From 03d38e5a7c2f0f28d767157d3d9315763ca7ef47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20Emre=20Kabakc=C4=B1?= Date: Fri, 29 May 2026 14:56:17 +0100 Subject: [PATCH 1/2] fix(linkedin): home_feed author body-fallback + drop promoted/suggested/noise rows --- .../connectors/src/__tests__/linkedin.test.ts | 168 +++++++++++++++++- packages/connectors/src/linkedin.ts | 56 +++++- 2 files changed, 212 insertions(+), 12 deletions(-) diff --git a/packages/connectors/src/__tests__/linkedin.test.ts b/packages/connectors/src/__tests__/linkedin.test.ts index e0d71029d..79f6cf0a5 100644 --- a/packages/connectors/src/__tests__/linkedin.test.ts +++ b/packages/connectors/src/__tests__/linkedin.test.ts @@ -9,25 +9,37 @@ mock.module('@lobu/connector-sdk', connectorSdkMock); let LinkedInConnector: any; // biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock let buildHomeFeedEvents: any; +// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock +let parseHomeFeedAuthor: any; +// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock +let isHomeFeedNoise: any; beforeAll(async () => { const mod = await import('../linkedin'); LinkedInConnector = mod.default; buildHomeFeedEvents = mod.buildHomeFeedEvents; + parseHomeFeedAuthor = mod.parseHomeFeedAuthor; + isHomeFeedNoise = mod.isHomeFeedNoise; }); describe('buildHomeFeedEvents', () => { test('maps a token-id row to li_home_ with /feed/ source_url', () => { const occurredAt = new Date('2026-05-29T12:00:00.000Z'); const events = buildHomeFeedEvents( - [{ id: 'aBc123_token', body: 'Hello from the home feed', author: 'Jane Doe' }], + [ + { + id: 'aBc123_token', + body: 'Hello from the home feed, this body is long enough', + author: 'Jane Doe', + }, + ], occurredAt ); expect(events).toHaveLength(1); const [ev] = events; expect(ev.origin_id).toBe('li_home_aBc123_token'); - expect(ev.payload_text).toBe('Hello from the home feed'); + expect(ev.payload_text).toBe('Hello from the home feed, this body is long enough'); expect(ev.author_name).toBe('Jane Doe'); expect(ev.origin_type).toBe('post'); // Token id is NOT numeric → no urn:li:activity permalink, link to /feed/. @@ -36,24 +48,154 @@ describe('buildHomeFeedEvents', () => { expect(ev.metadata).toEqual({ author: 'Jane Doe' }); }); - test('defaults author to empty string when missing', () => { - const [ev] = buildHomeFeedEvents([{ id: 'tok', body: 'body only' }], new Date()); + test('defaults author to empty string when no author and no parseable body', () => { + // Body long enough to survive the noise filter but with no " • " marker. + const [ev] = buildHomeFeedEvents( + [{ id: 'tok', body: 'a plain body with no author marker whatsoever here' }], + new Date() + ); expect(ev.author_name).toBe(''); expect(ev.metadata).toEqual({ author: '' }); }); + test('prefers row.author over body parse when the DOM selector won', () => { + const [ev] = buildHomeFeedEvents( + [ + { + id: 'tok', + body: 'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped', + author: 'DOM Author', + }, + ], + new Date() + ); + expect(ev.author_name).toBe('DOM Author'); + expect(ev.metadata).toEqual({ author: 'DOM Author' }); + }); + + test('falls back to body-parsed author when row.author is empty', () => { + const [ev] = buildHomeFeedEvents( + [ + { + id: 'tok', + body: 'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped', + author: ' ', + }, + ], + new Date() + ); + expect(ev.author_name).toBe('Hugo Lu'); + expect(ev.metadata).toEqual({ author: 'Hugo Lu' }); + }); + test('drops rows without id or body and dedupes by id', () => { + const longBody = 'this body is definitely longer than thirty characters for the test'; const events = buildHomeFeedEvents( [ - { id: 'a', body: 'first' }, - { id: '', body: 'no id' }, + { id: 'a', body: longBody }, + { id: '', body: 'no id but long enough body to pass the noise filter check' }, { id: 'b' }, // no body - { id: 'a', body: 'dup id' }, + { id: 'a', body: 'dup id with a sufficiently long body to pass the noise filter' }, ], new Date() ); expect(events.map((e: { origin_id: string }) => e.origin_id)).toEqual(['li_home_a']); }); + + test('drops promoted, suggested, and too-short noise rows end-to-end', () => { + const occurredAt = new Date('2026-05-29T12:00:00.000Z'); + const events = buildHomeFeedEvents( + [ + { + id: 'keep1', + body: 'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped', + }, + { + id: 'keep2', + body: 'Feed post Sabri Karagönen reposted this Hardal 17h • Follow Hardal is now integrated with Bruin', + }, + { + id: 'ad', + body: 'Feed post Attio 52,728 followers Promoted Introducing GTM Atlas the new way to map your market', + }, + { id: 'sug', body: 'Feed post Suggested Matt Graham • 2nd CEO @ RapidDev building fast' }, + { id: 'short', body: 'Load more comments' }, + ], + occurredAt + ); + expect(events.map((e: { origin_id: string }) => e.origin_id)).toEqual([ + 'li_home_keep1', + 'li_home_keep2', + ]); + expect(events.map((e: { author_name: string }) => e.author_name)).toEqual([ + 'Hugo Lu', + 'Hardal', + ]); + }); +}); + +describe('parseHomeFeedAuthor', () => { + test('extracts the leading name before the connection-degree marker', () => { + expect( + parseHomeFeedAuthor( + 'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped' + ) + ).toBe('Hugo Lu'); + }); + + test('handles an emoji-laden headline', () => { + expect( + parseHomeFeedAuthor( + 'Feed post Arpit Choudhury • 1st I am the calmest when the music is loud 🔊 1h • Today' + ) + ).toBe('Arpit Choudhury'); + }); + + test('takes the original poster after "reposted this"', () => { + expect( + parseHomeFeedAuthor( + 'Feed post Sabri Karagönen reposted this Hardal 17h • Follow Hardal is now integrated with Bruin' + ) + ).toBe('Hardal'); + }); + + test('returns empty string when no " • " marker is present', () => { + expect(parseHomeFeedAuthor('Feed post some text with no marker at all')).toBe(''); + }); + + test('returns empty string for empty input', () => { + expect(parseHomeFeedAuthor('')).toBe(''); + }); + + test('caps the result to 60 chars', () => { + const longName = 'A'.repeat(100); + expect(parseHomeFeedAuthor(`Feed post ${longName} • 1st headline`).length).toBe(60); + }); +}); + +describe('isHomeFeedNoise', () => { + test('drops empty or too-short bodies', () => { + expect(isHomeFeedNoise('')).toBe(true); + expect(isHomeFeedNoise('Load more comments')).toBe(true); + }); + + test('drops promoted ads', () => { + expect( + isHomeFeedNoise( + 'Feed post Attio 52,728 followers Promoted Introducing GTM Atlas the new way to map your market' + ) + ).toBe(true); + }); + + test('drops suggested rows', () => { + expect(isHomeFeedNoise('Feed post Suggested Matt Graham • 2nd CEO @ RapidDev')).toBe(true); + }); + + test('keeps a normal post', () => { + expect( + isHomeFeedNoise('Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped') + ).toBe(false); + }); }); describe('LinkedInConnector home_feed', () => { @@ -74,8 +216,16 @@ describe('LinkedInConnector home_feed', () => { result: { loggedIn: true, rows: [ - { id: 'tok1', body: 'post one', author: 'Alice' }, - { id: 'tok2', body: 'post two', author: 'Bob' }, + { + id: 'tok1', + body: 'post one with a body long enough to pass the noise filter', + author: 'Alice', + }, + { + id: 'tok2', + body: 'post two with a body long enough to pass the noise filter', + author: 'Bob', + }, ], }, }; diff --git a/packages/connectors/src/linkedin.ts b/packages/connectors/src/linkedin.ts index bcdcfa0f9..a60683314 100644 --- a/packages/connectors/src/linkedin.ts +++ b/packages/connectors/src/linkedin.ts @@ -93,13 +93,61 @@ const HOME_FEED_SCRAPE_CONFIG = { fields: { body: { take: 'text' }, author: { - selector: '.update-components-actor__title, .update-components-actor__name', + // LinkedIn obfuscates the actor classes, so the old + // .update-components-actor__* selectors no longer match. Best-effort: + // grab the visible name span inside the actor's profile/company link + // when present. When this misses, buildHomeFeedEvents falls back to + // parsing the author out of the row body text. + selector: + 'a[href*="/in/"] span[aria-hidden="true"], a[href*="/company/"] span[aria-hidden="true"]', take: 'text', firstLine: true, }, }, } as const; +/** + * Best-effort author extraction from a home-feed row's body text. The home + * feed DOM obfuscates the actor classes, so the selector often misses and the + * only reliable place the author name appears is the row's body text. This is + * inherently heuristic — the feed can't use network capture, so there is no + * structured author field to read. + */ +export function parseHomeFeedAuthor(body: string): string { + if (!body) return ''; + let text = body.replace(/^feed post\s+/i, '').trim(); + + // A repost surfaces the resharer first, then "reposted this", then the + // original poster whose content this actually is — take the original poster. + const repostIdx = text.toLowerCase().indexOf('reposted this'); + if (repostIdx !== -1) { + text = text.slice(repostIdx + 'reposted this'.length).trim(); + } + + // The author is the leading name before the " • " connection-degree marker. + const sepIdx = text.indexOf(' • '); + if (sepIdx === -1) return ''; + let name = text.slice(0, sepIdx).trim(); + // A repost segment puts a relative-time token (e.g. "17h") right after the + // original poster's name and before the marker — strip it so we keep just + // the name. + name = name.replace(/\s+\d+\s*[smhdwy]o?$/i, '').trim(); + return name.slice(0, 60); +} + +/** + * The home feed mixes in ads, suggestions, and non-post noise. These never + * become useful events, so drop them before emitting. Heuristic by necessity — + * the home feed has no structured "is this an ad" field over the content-script + * scrape. + */ +export function isHomeFeedNoise(body: string): boolean { + if (!body || body.trim().length < 30) return true; + if (/\bPromoted\b/i.test(body.slice(0, 130))) return true; + if (/\bSuggested\b/i.test(body.slice(0, 30))) return true; + return false; +} + /** * Map cs_scrape home-feed rows to event envelopes. The componentkey token is * not a numeric activity id, so there is no /feed/update permalink — source_url @@ -111,18 +159,20 @@ export function buildHomeFeedEvents(rows: HomeFeedRow[], occurredAt: Date): Even const events: EventEnvelope[] = []; for (const row of rows) { if (!row?.id || !row.body || seen.has(row.id)) continue; + if (isHomeFeedNoise(row.body)) continue; seen.add(row.id); + const author = (row.author ?? '').trim() || parseHomeFeedAuthor(row.body ?? ''); events.push({ origin_id: `li_home_${row.id}`, payload_text: row.body, - author_name: row.author || '', + author_name: author, // Feed posts expose no reliable timestamp; use the sync time. occurred_at: occurredAt, origin_type: 'post', // Token id is NOT a numeric activity id, so we cannot build a // urn:li:activity permalink — link to the feed itself. source_url: 'https://www.linkedin.com/feed/', - metadata: { author: row.author || '' }, + metadata: { author }, }); } return events; From f3d89bc64cab335cf05afca1b0fe6403c11afeaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20Emre=20Kabakc=C4=B1?= Date: Fri, 29 May 2026 14:59:45 +0100 Subject: [PATCH 2/2] fix(linkedin): strip connection-degree marker from DOM-selector home_feed author --- .../connectors/src/__tests__/linkedin.test.ts | 15 +++++++++++++++ packages/connectors/src/linkedin.ts | 7 ++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/packages/connectors/src/__tests__/linkedin.test.ts b/packages/connectors/src/__tests__/linkedin.test.ts index 79f6cf0a5..d12142079 100644 --- a/packages/connectors/src/__tests__/linkedin.test.ts +++ b/packages/connectors/src/__tests__/linkedin.test.ts @@ -73,6 +73,21 @@ describe('buildHomeFeedEvents', () => { expect(ev.metadata).toEqual({ author: 'DOM Author' }); }); + test('strips the connection-degree marker from a DOM-selector author', () => { + const [ev] = buildHomeFeedEvents( + [ + { + id: 'tok', + body: 'Julien Hurault 1st Julien Hurault • 1st Freelance Data Eng newsletter', + author: 'Julien Hurault • 1st', + }, + ], + new Date() + ); + expect(ev.author_name).toBe('Julien Hurault'); + expect(ev.metadata).toEqual({ author: 'Julien Hurault' }); + }); + test('falls back to body-parsed author when row.author is empty', () => { const [ev] = buildHomeFeedEvents( [ diff --git a/packages/connectors/src/linkedin.ts b/packages/connectors/src/linkedin.ts index a60683314..2d4cdda2b 100644 --- a/packages/connectors/src/linkedin.ts +++ b/packages/connectors/src/linkedin.ts @@ -161,7 +161,12 @@ export function buildHomeFeedEvents(rows: HomeFeedRow[], occurredAt: Date): Even if (!row?.id || !row.body || seen.has(row.id)) continue; if (isHomeFeedNoise(row.body)) continue; seen.add(row.id); - const author = (row.author ?? '').trim() || parseHomeFeedAuthor(row.body ?? ''); + // The DOM actor span often includes the connection-degree marker + // ("Julien Hurault • 1st"); strip it the same way body-parse does. Fall + // back to parsing the name out of the post body when the selector misses. + const author = + (row.author ?? '').trim().split(' • ')[0].trim() || + parseHomeFeedAuthor(row.body ?? ''); events.push({ origin_id: `li_home_${row.id}`, payload_text: row.body,