Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ mock.module('@lobu/connector-sdk', () => ({
captureErrorArtifacts: () => {
throw new Error('not used in unit tests');
},
// Other connector tests (linkedin) load the same stubbed module in the same
// run; expose the symbols they need so the shared mock satisfies every
// importer regardless of file order.
ConnectorRuntime: class {},
calculateEngagementScore: () => 0,
extensionNetworkSync: () => {
throw new Error('not used in unit tests');
},
}));

const {
Expand Down
129 changes: 129 additions & 0 deletions packages/connectors/src/__tests__/linkedin.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import { beforeAll, describe, expect, mock, test } from 'bun:test';

// The connector under test imports ConnectorRuntime, calculateEngagementScore
// and extensionNetworkSync from @lobu/connector-sdk, and the real SDK drags in
// playwright. Replace the whole module with a stub so linkedin.ts can be
// imported and instantiated without a browser stack: ConnectorRuntime becomes
// an empty base class, and the home-feed path only relies on the dispatcher
// the tests hand in through sessionState.
// NOTE(review): the sibling connector test's stub of this module also exposes
// captureErrorArtifacts — confirm whether this stub needs the same symbol when
// both files run in one process.
mock.module('@lobu/connector-sdk', () => {
  const sdkStub = {
    ConnectorRuntime: class {},
    calculateEngagementScore: () => 0,
    // Fails loudly if a test unexpectedly reaches the network-capture path.
    extensionNetworkSync: () => {
      throw new Error('not used in home_feed unit tests');
    },
  };
  return sdkStub;
});

// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock
let LinkedInConnector: any;
// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock
let buildHomeFeedEvents: any;

// Load the connector dynamically (after the SDK stub above is registered) and
// bind its default export plus the named helper to the suite-level variables.
beforeAll(async () => {
  ({ default: LinkedInConnector, buildHomeFeedEvents } = await import('../linkedin'));
});

// Unit tests for the pure row → event mapping helper.
describe('buildHomeFeedEvents', () => {
  // A row carrying id, body and author yields exactly one fully populated event.
  test('maps a token-id row to li_home_<token> with /feed/ source_url', () => {
    const syncTime = new Date('2026-05-29T12:00:00.000Z');
    const row = { id: 'aBc123_token', body: 'Hello from the home feed', author: 'Jane Doe' };

    const mapped = buildHomeFeedEvents([row], syncTime);

    expect(mapped).toHaveLength(1);
    const event = mapped[0];
    expect(event.origin_id).toBe('li_home_aBc123_token');
    expect(event.payload_text).toBe('Hello from the home feed');
    expect(event.author_name).toBe('Jane Doe');
    expect(event.origin_type).toBe('post');
    // The id is an opaque token rather than a numeric activity id, so the
    // event links to the feed itself instead of a per-post permalink.
    expect(event.source_url).toBe('https://www.linkedin.com/feed/');
    // The very Date instance passed in is stamped onto the event.
    expect(event.occurred_at).toBe(syncTime);
    expect(event.metadata).toEqual({ author: 'Jane Doe' });
  });

  // A missing author falls back to '' in both author_name and metadata.
  test('defaults author to empty string when missing', () => {
    const mapped = buildHomeFeedEvents([{ id: 'tok', body: 'body only' }], new Date());
    const event = mapped[0];
    expect(event.author_name).toBe('');
    expect(event.metadata).toEqual({ author: '' });
  });

  // Only the first row is valid: the others lack an id, lack a body, or
  // repeat an id that was already emitted.
  test('drops rows without id or body and dedupes by id', () => {
    const rows = [
      { id: 'a', body: 'first' },
      { id: '', body: 'no id' },
      { id: 'b' }, // no body
      { id: 'a', body: 'dup id' },
    ];
    const mapped = buildHomeFeedEvents(rows, new Date());
    const originIds = mapped.map((e: { origin_id: string }) => e.origin_id);
    expect(originIds).toEqual(['li_home_a']);
  });
});

// Connector-level tests for the home_feed feed, driven through a fake
// chrome dispatcher injected via sessionState.
describe('LinkedInConnector home_feed', () => {
  // Builds the sync context for a home_feed run with the given dispatcher and
  // feed config; the checkpoint always starts empty.
  const homeFeedContext = (dispatcher: unknown, config: Record<string, unknown>) => ({
    feedKey: 'home_feed',
    config,
    checkpoint: {},
    sessionState: { chrome_dispatcher: dispatcher },
  });

  test('declares a home_feed feed with no required company_url', () => {
    const { feeds } = new LinkedInConnector().definition;
    expect(feeds.home_feed).toBeDefined();
    expect(feeds.home_feed.configSchema.required).toBeUndefined();
  });

  test('syncHomeFeed dispatches cs_scrape and maps rows to events', async () => {
    // Record every dispatch so the request shape can be asserted afterwards.
    const recorded: Array<{ action: string; input: Record<string, unknown> }> = [];
    const scrapedRows = [
      { id: 'tok1', body: 'post one', author: 'Alice' },
      { id: 'tok2', body: 'post two', author: 'Bob' },
    ];
    const fakeDispatcher = {
      dispatch: async (action: string, input: Record<string, unknown>) => {
        recorded.push({ action, input });
        return {
          tab_id: 1,
          cs_scrape: true,
          result: { loggedIn: true, rows: scrapedRows },
        };
      },
    };

    const outcome = await new LinkedInConnector().sync(
      homeFeedContext(fakeDispatcher, { max_scrolls: 4 })
    );

    // Exactly one cs_scrape navigate against /feed/, carrying the configured
    // scroll budget.
    expect(recorded).toHaveLength(1);
    const { action, input } = recorded[0];
    expect(action).toBe('navigate');
    expect(input.cs_scrape).toBe(true);
    expect(input.persistent).toBe(true);
    expect(input.url).toBe('https://www.linkedin.com/feed/');
    const scrapeConfig = input.scrape_config as { scroll: { max: number } };
    expect(scrapeConfig.scroll.max).toBe(4);

    // Both scraped rows come back as events, in order.
    expect(outcome.events).toHaveLength(2);
    expect(outcome.events[0].origin_id).toBe('li_home_tok1');
    expect(outcome.events[1].origin_id).toBe('li_home_tok2');
    expect(outcome.metadata.backend).toBe('extension-cs-scrape');
  });

  test('throws a clear error when not logged into LinkedIn', async () => {
    // The scrape reports loggedIn: false, which the connector must surface.
    const loggedOutDispatcher = {
      dispatch: async () => ({ result: { loggedIn: false, rows: [] } }),
    };
    const pending = new LinkedInConnector().sync(homeFeedContext(loggedOutDispatcher, {}));
    await expect(pending).rejects.toThrow(/Not logged into LinkedIn/);
  });
});
175 changes: 175 additions & 0 deletions packages/connectors/src/linkedin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,96 @@ function normalizeCheckpointPostId(postId?: string): string | undefined {
return postId.startsWith('li_post_') ? postId.slice('li_post_'.length) : postId;
}

// ── Home-feed content-script scrape contract ────────────────────
//
// The personalized home feed (linkedin.com/feed/) is the ONE feed that can't
// be read via network capture: attaching the CDP debugger stops the feed from
// rendering, so the Voyager responses never arrive. Instead we drive the
// extension's `cs_scrape` op (a content script, no debugger) with a declarative
// selector config defined here. The extension runs a site-agnostic scrape
// engine — the LinkedIn selectors live in this connector, not the extension.

/**
 * One scraped home-feed post, as returned by the extension's cs_scrape when it
 * is driven with HOME_FEED_SCRAPE_CONFIG. Every field is optional.
 */
interface HomeFeedRow {
  /** Post text. */
  body?: string;
  /** Author display text. */
  author?: string;
  /** The componentkey token (base64url-ish, NOT a numeric activity id). */
  id?: string;
}

/**
 * Shape of the `.result` field carried by a cs_scrape dispatch observation.
 * All fields are optional.
 */
interface HomeFeedScrapeResult {
  /** Scraped rows, one per feed post. */
  rows?: HomeFeedRow[];
  /** Login state reported by the scrape. */
  loggedIn?: boolean;
  // NOTE(review): the remaining fields are not read by the code visible here;
  // presumably diagnostic output from the scrape engine — confirm against the
  // extension.
  count?: number;
  host?: string;
  landedUrl?: string;
}

/**
 * Observation returned by a cs_scrape dispatch; the scrape output itself sits
 * under `result`. The `Record<string, unknown>` half of the intersection is
 * what keeps this type assignable to the `ChromeActionOutput`
 * (= Record<string, unknown>) constraint on ChromeActionDispatcher.dispatch.
 */
type CsScrapeObservation = {
  tab_id?: number;
  cs_scrape?: boolean;
  persistent_reused?: boolean;
  result?: HomeFeedScrapeResult;
} & Record<string, unknown>;

/**
 * Origins the cs_scrape window may touch: the apex LinkedIn domain and a
 * wildcard for its subdomains.
 */
const LINKEDIN_ALLOWED_ORIGINS: string[] = [
  'linkedin.com',
  '*.linkedin.com',
];

/**
 * Declarative scrape config for the virtualized linkedin.com/feed/ DOM.
 *
 * Home-feed posts render as componentkey divs that carry no activity urn, so
 * each row's id is the componentkey token (NOT a numeric id). The selectors
 * are owned by this connector rather than by the extension.
 */
const HOME_FEED_SCRAPE_CONFIG = {
  // Scroll budget for the scrape.
  // NOTE(review): `stall` / `waitMs` semantics are defined by the extension's
  // scrape engine — confirm there.
  scroll: {
    max: 8,
    stall: 3,
    waitMs: 1500,
  },
  // Path pattern covering login / authwall / checkpoint / signup pages.
  loggedOutWhen: {
    pathRegex: '/(login|authwall|uas/login|checkpoint|signup)\\b',
  },
  rowSelector: 'div[componentkey*="FeedType_MAIN_FEED_RELEVANCE"]',
  // Row id: capture group 1 of the regex applied to the componentkey attribute
  // — the text before "FeedType_", minus an optional "expanded" prefix.
  id: {
    source: 'attr',
    name: 'componentkey',
    regex: '^(?:expanded)?(.+?)FeedType_',
    group: 1,
  },
  requireFields: ['body'],
  fields: {
    body: { take: 'text' },
    author: {
      selector: '.update-components-actor__title, .update-components-actor__name',
      take: 'text',
      firstLine: true,
    },
  },
} as const;

/**
 * Convert cs_scrape home-feed rows into event envelopes.
 *
 * Rows that are nullish, have no id, have no body, or repeat an id already
 * emitted in this call are skipped; the first occurrence of an id wins and
 * input order is preserved.
 *
 * @param rows - Scraped rows to convert.
 * @param occurredAt - Timestamp stamped on every event. Home-feed posts expose
 *   no reliable timestamp, so the caller supplies the sync time.
 * @returns One event per accepted row, with origin_id `li_home_<row id>`.
 */
export function buildHomeFeedEvents(rows: HomeFeedRow[], occurredAt: Date): EventEnvelope[] {
  const emittedIds = new Set<string>();

  return rows.flatMap((entry): EventEnvelope[] => {
    const token = entry?.id;
    const text = entry?.body;
    if (!token || !text || emittedIds.has(token)) {
      return [];
    }
    emittedIds.add(token);

    const author = entry.author || '';
    return [
      {
        origin_id: `li_home_${token}`,
        payload_text: text,
        author_name: author,
        // No reliable per-post timestamp is available; use the sync time.
        occurred_at: occurredAt,
        origin_type: 'post',
        // The token is not a numeric activity id, so no urn:li:activity
        // permalink can be built — point at the feed itself.
        source_url: 'https://www.linkedin.com/feed/',
        metadata: { author },
      },
    ];
  });
}

/**
* Pull the chrome action dispatcher from sessionState. The connector-worker
* subprocess (child-runner.ts) splices a live `chrome_dispatcher` object
Expand Down Expand Up @@ -232,6 +322,19 @@ const companyUpdatesConfigSchema = {
},
};

/**
 * Config schema for the home_feed feed. It declares no `required` list; the
 * single optional knob is the scroll budget.
 */
const homeFeedConfigSchema = {
  type: 'object',
  properties: {
    // Integer in [1, 30]; defaults to 8 when omitted.
    max_scrolls: {
      type: 'integer',
      minimum: 1,
      maximum: 30,
      default: 8,
      description: 'Maximum scroll iterations for the home feed (default: 8)',
    },
  },
};

const jobsConfigSchema = {
type: 'object',
required: ['company_url'],
Expand Down Expand Up @@ -284,6 +387,23 @@ export default class LinkedInConnector extends ConnectorRuntime {
],
},
feeds: {
home_feed: {
key: 'home_feed',
name: 'Home Feed',
description: 'Your personalized LinkedIn home feed.',
configSchema: homeFeedConfigSchema,
eventKinds: {
post: {
description: 'A post from your personalized LinkedIn home feed',
metadataSchema: {
type: 'object',
properties: {
author: { type: 'string' },
},
},
},
},
},
company_updates: {
key: 'company_updates',
name: 'Company Updates',
Expand Down Expand Up @@ -330,6 +450,14 @@ export default class LinkedInConnector extends ConnectorRuntime {
const checkpoint = (ctx.checkpoint ?? {}) as LinkedInCheckpoint;
const feedKey = ctx.feedKey ?? 'company_updates';

// home_feed is the one feed that needs a content-script scrape (the CDP
// debugger stops the personalized feed from rendering) and takes no
// company_url — it always reads linkedin.com/feed/.
if (feedKey === 'home_feed') {
const homeScrolls = (config.max_scrolls as number) ?? 8;
return this.syncHomeFeed(homeScrolls, checkpoint, requireExtensionDispatcher(ctx));
}

const companyUrl = config.company_url as string;
if (!companyUrl) {
throw new Error('company_url is required');
Expand All @@ -346,6 +474,53 @@ export default class LinkedInConnector extends ConnectorRuntime {
return this.syncUpdates(baseUrl, maxScrolls, checkpoint, dispatcher);
}

/**
* Personalized home feed via the extension's content-script scrape. Network
* capture can't read it (the CDP debugger stops the feed rendering), so we
* dispatch a `cs_scrape` against linkedin.com/feed/ with the home-feed
* selectors. The persistent window is reused/focused so an auth wall can be
* cleared in place for the next run.
*/
private async syncHomeFeed(
maxScrolls: number,
checkpoint: LinkedInCheckpoint,
dispatcher: ChromeActionDispatcher
): Promise<SyncResult> {
const observation = await dispatcher.dispatch<CsScrapeObservation>('navigate', {
cs_scrape: true,
persistent: true,
focus: true,
url: 'https://www.linkedin.com/feed/',
scrape_config: {
...HOME_FEED_SCRAPE_CONFIG,
scroll: { ...HOME_FEED_SCRAPE_CONFIG.scroll, max: maxScrolls },
},
allowed_origins: LINKEDIN_ALLOWED_ORIGINS,
});

const result = observation?.result;
if (result?.loggedIn === false) {
throw new Error(
'Not logged into LinkedIn. The home feed could not be read — sign in to LinkedIn in the focused Owletto window, then re-run the sync.'
);
}

const rows = result?.rows ?? [];
const events = buildHomeFeedEvents(rows, new Date());

return {
events,
// The home feed exposes no stable per-post cursor (opaque token ids, no
// timestamps), so there is nothing new to checkpoint — pass it through.
checkpoint: checkpoint as unknown as Record<string, unknown>,
Comment on lines +511 to +515
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

Persist a home-feed cursor instead of passing the old checkpoint through.

Line 515 returns the incoming checkpoint unchanged even though buildHomeFeedEvents() only dedupes within one scrape and every emitted event gets a fresh sync-time occurred_at. If LinkedIn keeps the same posts visible across runs, this path will emit them again on every sync. Store a bounded set of seen home-feed ids (or another stable cursor) and filter before returning events.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/connectors/src/linkedin.ts` around lines 511 - 515, The current
return passes the incoming checkpoint through which causes repeated emission of
the same home-feed posts; modify the logic in the routine that calls
buildHomeFeedEvents() (the block returning { events, checkpoint }) to persist a
bounded set of seen home-feed ids in the checkpoint, filter out events whose
post id is already in that set before assigning occurred_at, and update the
checkpoint with the new bounded set (e.g., fixed-size queue or LRU) so
subsequent runs skip previously seen ids; reference the variables events,
checkpoint and the buildHomeFeedEvents() output when implementing the filtering
and checkpoint mutation.

metadata: {
items_found: events.length,
items_scraped: rows.length,
backend: 'extension-cs-scrape',
},
};
}

private async syncUpdates(
baseUrl: string,
maxScrolls: number,
Expand Down
2 changes: 1 addition & 1 deletion packages/owletto
Loading