Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 174 additions & 9 deletions packages/connectors/src/__tests__/linkedin.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,37 @@ mock.module('@lobu/connector-sdk', connectorSdkMock);
let LinkedInConnector: any;
// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock
let buildHomeFeedEvents: any;
// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock
let parseHomeFeedAuthor: any;
// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock
let isHomeFeedNoise: any;

// Load the module under test dynamically (after the connector-sdk mock is
// registered) and publish its exports to the file-level bindings the suites use.
beforeAll(async () => {
	({
		default: LinkedInConnector,
		buildHomeFeedEvents,
		parseHomeFeedAuthor,
		isHomeFeedNoise,
	} = await import('../linkedin'));
});

describe('buildHomeFeedEvents', () => {
test('maps a token-id row to li_home_<token> with /feed/ source_url', () => {
const occurredAt = new Date('2026-05-29T12:00:00.000Z');
const events = buildHomeFeedEvents(
[{ id: 'aBc123_token', body: 'Hello from the home feed', author: 'Jane Doe' }],
[
{
id: 'aBc123_token',
body: 'Hello from the home feed, this body is long enough',
author: 'Jane Doe',
},
],
occurredAt
);

expect(events).toHaveLength(1);
const [ev] = events;
expect(ev.origin_id).toBe('li_home_aBc123_token');
expect(ev.payload_text).toBe('Hello from the home feed');
expect(ev.payload_text).toBe('Hello from the home feed, this body is long enough');
expect(ev.author_name).toBe('Jane Doe');
expect(ev.origin_type).toBe('post');
// Token id is NOT numeric → no urn:li:activity permalink, link to /feed/.
Expand All @@ -36,24 +48,169 @@ describe('buildHomeFeedEvents', () => {
expect(ev.metadata).toEqual({ author: 'Jane Doe' });
});

test('defaults author to empty string when missing', () => {
const [ev] = buildHomeFeedEvents([{ id: 'tok', body: 'body only' }], new Date());
test('defaults author to empty string when no author and no parseable body', () => {
// Body long enough to survive the noise filter but with no " • " marker.
const [ev] = buildHomeFeedEvents(
[{ id: 'tok', body: 'a plain body with no author marker whatsoever here' }],
new Date()
);
expect(ev.author_name).toBe('');
expect(ev.metadata).toEqual({ author: '' });
});

test('prefers row.author over body parse when the DOM selector won', () => {
const [ev] = buildHomeFeedEvents(
[
{
id: 'tok',
body: 'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped',
author: 'DOM Author',
},
],
new Date()
);
expect(ev.author_name).toBe('DOM Author');
expect(ev.metadata).toEqual({ author: 'DOM Author' });
});

test('strips the connection-degree marker from a DOM-selector author', () => {
const [ev] = buildHomeFeedEvents(
[
{
id: 'tok',
body: 'Julien Hurault 1st Julien Hurault • 1st Freelance Data Eng newsletter',
author: 'Julien Hurault • 1st',
},
],
new Date()
);
expect(ev.author_name).toBe('Julien Hurault');
expect(ev.metadata).toEqual({ author: 'Julien Hurault' });
});

test('falls back to body-parsed author when row.author is empty', () => {
const [ev] = buildHomeFeedEvents(
[
{
id: 'tok',
body: 'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped',
author: ' ',
},
],
new Date()
);
expect(ev.author_name).toBe('Hugo Lu');
expect(ev.metadata).toEqual({ author: 'Hugo Lu' });
});

test('drops rows without id or body and dedupes by id', () => {
const longBody = 'this body is definitely longer than thirty characters for the test';
const events = buildHomeFeedEvents(
[
{ id: 'a', body: 'first' },
{ id: '', body: 'no id' },
{ id: 'a', body: longBody },
{ id: '', body: 'no id but long enough body to pass the noise filter check' },
{ id: 'b' }, // no body
{ id: 'a', body: 'dup id' },
{ id: 'a', body: 'dup id with a sufficiently long body to pass the noise filter' },
],
new Date()
);
expect(events.map((e: { origin_id: string }) => e.origin_id)).toEqual(['li_home_a']);
});

test('drops promoted, suggested, and too-short noise rows end-to-end', () => {
const occurredAt = new Date('2026-05-29T12:00:00.000Z');
const events = buildHomeFeedEvents(
[
{
id: 'keep1',
body: 'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped',
},
{
id: 'keep2',
body: 'Feed post Sabri Karagönen reposted this Hardal 17h • Follow Hardal is now integrated with Bruin',
},
{
id: 'ad',
body: 'Feed post Attio 52,728 followers Promoted Introducing GTM Atlas the new way to map your market',
},
{ id: 'sug', body: 'Feed post Suggested Matt Graham • 2nd CEO @ RapidDev building fast' },
{ id: 'short', body: 'Load more comments' },
],
occurredAt
);
expect(events.map((e: { origin_id: string }) => e.origin_id)).toEqual([
'li_home_keep1',
'li_home_keep2',
]);
expect(events.map((e: { author_name: string }) => e.author_name)).toEqual([
'Hugo Lu',
'Hardal',
]);
});
});

describe('parseHomeFeedAuthor', () => {
	// Each entry: [test title, raw row body, expected author].
	const cases: Array<[string, string, string]> = [
		[
			'extracts the leading name before the connection-degree marker',
			'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped',
			'Hugo Lu',
		],
		[
			'handles an emoji-laden headline',
			'Feed post Arpit Choudhury • 1st I am the calmest when the music is loud 🔊 1h • Today',
			'Arpit Choudhury',
		],
		[
			'takes the original poster after "reposted this"',
			'Feed post Sabri Karagönen reposted this Hardal 17h • Follow Hardal is now integrated with Bruin',
			'Hardal',
		],
		[
			'returns empty string when no " • " marker is present',
			'Feed post some text with no marker at all',
			'',
		],
		['returns empty string for empty input', '', ''],
	];

	for (const [title, body, expected] of cases) {
		test(title, () => {
			expect(parseHomeFeedAuthor(body)).toBe(expected);
		});
	}

	test('caps the result to 60 chars', () => {
		// A 100-character name must come back truncated to exactly 60.
		const body = `Feed post ${'A'.repeat(100)} • 1st headline`;
		const author = parseHomeFeedAuthor(body);
		expect(author.length).toBe(60);
	});
});

describe('isHomeFeedNoise', () => {
	test('drops empty or too-short bodies', () => {
		// Both an empty body and a short UI label are below the length floor.
		for (const body of ['', 'Load more comments']) {
			expect(isHomeFeedNoise(body)).toBe(true);
		}
	});

	test('drops promoted ads', () => {
		const adBody =
			'Feed post Attio 52,728 followers Promoted Introducing GTM Atlas the new way to map your market';
		expect(isHomeFeedNoise(adBody)).toBe(true);
	});

	test('drops suggested rows', () => {
		const suggestedBody = 'Feed post Suggested Matt Graham • 2nd CEO @ RapidDev';
		expect(isHomeFeedNoise(suggestedBody)).toBe(true);
	});

	test('keeps a normal post', () => {
		const postBody =
			'Feed post Hugo Lu • 1st Founder at Orchestra 4h • Yesterday Snowflake popped';
		expect(isHomeFeedNoise(postBody)).toBe(false);
	});
});

describe('LinkedInConnector home_feed', () => {
Expand All @@ -74,8 +231,16 @@ describe('LinkedInConnector home_feed', () => {
result: {
loggedIn: true,
rows: [
{ id: 'tok1', body: 'post one', author: 'Alice' },
{ id: 'tok2', body: 'post two', author: 'Bob' },
{
id: 'tok1',
body: 'post one with a body long enough to pass the noise filter',
author: 'Alice',
},
{
id: 'tok2',
body: 'post two with a body long enough to pass the noise filter',
author: 'Bob',
},
],
},
};
Expand Down
61 changes: 58 additions & 3 deletions packages/connectors/src/linkedin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,61 @@ const HOME_FEED_SCRAPE_CONFIG = {
fields: {
body: { take: 'text' },
author: {
selector: '.update-components-actor__title, .update-components-actor__name',
// LinkedIn obfuscates the actor classes, so the old
// .update-components-actor__* selectors no longer match. Best-effort:
// grab the visible name span inside the actor's profile/company link
// when present. When this misses, buildHomeFeedEvents falls back to
// parsing the author out of the row body text.
selector:
'a[href*="/in/"] span[aria-hidden="true"], a[href*="/company/"] span[aria-hidden="true"]',
take: 'text',
firstLine: true,
},
},
} as const;

/**
 * Best-effort author extraction from a home-feed row's body text. The home
 * feed DOM obfuscates the actor classes, so the selector often misses and the
 * only reliable place the author name appears is the row's body text. This is
 * inherently heuristic — the feed can't use network capture, so there is no
 * structured author field to read.
 *
 * @param body - Raw text of one feed row, optionally prefixed with "Feed post".
 * @returns The name that precedes the first " • " marker (for a repost, the
 *   original poster's name), capped at 60 UTF-16 code units; `''` when `body`
 *   is empty or contains no " • " marker.
 */
export function parseHomeFeedAuthor(body: string): string {
	if (!body) return '';
	let text = body.replace(/^feed post\s+/i, '').trim();

	// A repost surfaces the resharer first, then "reposted this", then the
	// original poster whose content this actually is — take the original poster.
	// Match case-insensitively on the ORIGINAL string: lowercasing first can
	// change the string's length (e.g. "İ" becomes two code units), which would
	// shift the offset and cut into the original poster's name.
	const repost = /reposted this/i.exec(text);
	if (repost) {
		text = text.slice(repost.index + repost[0].length).trim();
	}

	// The author is the leading name before the " • " connection-degree marker.
	const sepIdx = text.indexOf(' • ');
	if (sepIdx === -1) return '';
	let name = text.slice(0, sepIdx).trim();
	// A repost segment puts a relative-time token (e.g. "17h", "2mo", "1yr")
	// right after the original poster's name and before the marker — strip it
	// so we keep just the name.
	name = name.replace(/\s+\d+\s*(?:yr|[smhdwy]o?)$/i, '').trim();

	if (name.length <= 60) return name;
	// Cap at 60 code units, but never cut a surrogate pair in half: if the last
	// kept unit is a high surrogate, drop it rather than emit a lone surrogate.
	const lastKept = name.charCodeAt(59);
	const end = lastKept >= 0xd800 && lastKept <= 0xdbff ? 59 : 60;
	return name.slice(0, end);
}

/**
 * The home feed mixes in ads, suggestions, and non-post noise. These never
 * become useful events, so drop them before emitting. Heuristic by necessity —
 * the home feed has no structured "is this an ad" field over the content-script
 * scrape.
 *
 * @param body - Raw text of one feed row.
 * @returns `true` when the row should be dropped: the trimmed body is shorter
 *   than 30 characters, the capitalized label "Promoted" appears within its
 *   first 130 characters, or the capitalized label "Suggested" appears within
 *   its first 30 characters.
 */
export function isHomeFeedNoise(body: string): boolean {
	if (!body) return true;
	// Measure and scan the same trimmed text so leading whitespace can neither
	// pad the length nor push a label outside its scan window.
	const text = body.trim();
	if (text.length < 30) return true;
	// Match the labels case-sensitively: a case-insensitive match would also
	// drop genuine posts that merely mention "promoted"/"suggested" early on
	// (e.g. "just got promoted"), and losing real posts is worse than letting
	// an occasional ad through.
	if (/\bPromoted\b/.test(text.slice(0, 130))) return true;
	if (/\bSuggested\b/.test(text.slice(0, 30))) return true;
	return false;
}
Comment on lines +153 to +158
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Use a case-sensitive match for the Promoted label to avoid dropping genuine posts.

The LinkedIn ad label is rendered exactly as Promoted (and Suggested). With /i over the first 130 chars, a legitimate post whose body mentions "promoted" early (e.g. a "just got promoted" update) is silently filtered out — and for a feed connector, dropping real posts is worse than letting an occasional ad through.

🛡️ Match the literal capitalized labels
-	if (/\bPromoted\b/i.test(body.slice(0, 130))) return true;
-	if (/\bSuggested\b/i.test(body.slice(0, 30))) return true;
+	if (/\bPromoted\b/.test(body.slice(0, 130))) return true;
+	if (/\bSuggested\b/.test(body.slice(0, 30))) return true;
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
export function isHomeFeedNoise(body: string): boolean {
if (!body || body.trim().length < 30) return true;
if (/\bPromoted\b/i.test(body.slice(0, 130))) return true;
if (/\bSuggested\b/i.test(body.slice(0, 30))) return true;
return false;
}
export function isHomeFeedNoise(body: string): boolean {
if (!body || body.trim().length < 30) return true;
if (/\bPromoted\b/.test(body.slice(0, 130))) return true;
if (/\bSuggested\b/.test(body.slice(0, 30))) return true;
return false;
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/connectors/src/linkedin.ts` around lines 153 - 158, the
isHomeFeedNoise filter is too aggressive: its regexes are case-insensitive, so
it also drops genuine posts that mention lowercase "promoted" or "suggested".
Update the two regexes in isHomeFeedNoise to match only the literal capitalized
labels by removing the /i flag (change /\bPromoted\b/i to /\bPromoted\b/ and
/\bSuggested\b/i to /\bSuggested\b/) while keeping the same slice ranges and
early-return logic.


/**
* Map cs_scrape home-feed rows to event envelopes. The componentkey token is
* not a numeric activity id, so there is no /feed/update permalink — source_url
Expand All @@ -111,18 +159,25 @@ export function buildHomeFeedEvents(rows: HomeFeedRow[], occurredAt: Date): Even
const events: EventEnvelope[] = [];
for (const row of rows) {
if (!row?.id || !row.body || seen.has(row.id)) continue;
if (isHomeFeedNoise(row.body)) continue;
seen.add(row.id);
// The DOM actor span often includes the connection-degree marker
// ("Julien Hurault • 1st"); strip it the same way body-parse does. Fall
// back to parsing the name out of the post body when the selector misses.
const author =
(row.author ?? '').trim().split(' • ')[0].trim() ||
parseHomeFeedAuthor(row.body ?? '');
events.push({
origin_id: `li_home_${row.id}`,
payload_text: row.body,
author_name: row.author || '',
author_name: author,
// Feed posts expose no reliable timestamp; use the sync time.
occurred_at: occurredAt,
origin_type: 'post',
// Token id is NOT a numeric activity id, so we cannot build a
// urn:li:activity permalink — link to the feed itself.
source_url: 'https://www.linkedin.com/feed/',
metadata: { author: row.author || '' },
metadata: { author },
});
}
return events;
Expand Down
Loading