diff --git a/packages/connectors/src/__tests__/browser-scraper-utils.test.ts b/packages/connectors/src/__tests__/browser-scraper-utils.test.ts index 9c1d19dc8..75f91b1c7 100644 --- a/packages/connectors/src/__tests__/browser-scraper-utils.test.ts +++ b/packages/connectors/src/__tests__/browser-scraper-utils.test.ts @@ -12,6 +12,14 @@ mock.module('@lobu/connector-sdk', () => ({ captureErrorArtifacts: () => { throw new Error('not used in unit tests'); }, + // Other connector tests (linkedin) load the same stubbed module in the same + // run; expose the symbols they need so the shared mock satisfies every + // importer regardless of file order. + ConnectorRuntime: class {}, + calculateEngagementScore: () => 0, + extensionNetworkSync: () => { + throw new Error('not used in unit tests'); + }, })); const { diff --git a/packages/connectors/src/__tests__/linkedin.test.ts b/packages/connectors/src/__tests__/linkedin.test.ts new file mode 100644 index 000000000..72fb6fa55 --- /dev/null +++ b/packages/connectors/src/__tests__/linkedin.test.ts @@ -0,0 +1,129 @@ +import { beforeAll, describe, expect, mock, test } from 'bun:test'; + +// linkedin.ts imports ConnectorRuntime / calculateEngagementScore / +// extensionNetworkSync from @lobu/connector-sdk, which pulls in playwright. +// Stub the SDK so the connector can be imported + instantiated without the +// real browser stack. ConnectorRuntime is a no-op base class here; the +// home-feed path only needs the dispatcher we pass in. +mock.module('@lobu/connector-sdk', () => ({ + ConnectorRuntime: class {}, + calculateEngagementScore: () => 0, + extensionNetworkSync: () => { + throw new Error('not used in home_feed unit tests'); + }, +})); + +// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock +let LinkedInConnector: any; +// biome-ignore lint/suspicious/noExplicitAny: dynamic import after mock +let buildHomeFeedEvents: any; + +beforeAll(async () => { + const mod = await import('../linkedin'); + LinkedInConnector = mod.default; + buildHomeFeedEvents = mod.buildHomeFeedEvents; +}); + +describe('buildHomeFeedEvents', () => { + test('maps a token-id row to li_home_ with /feed/ source_url', () => { + const occurredAt = new Date('2026-05-29T12:00:00.000Z'); + const events = buildHomeFeedEvents( + [{ id: 'aBc123_token', body: 'Hello from the home feed', author: 'Jane Doe' }], + occurredAt + ); + + expect(events).toHaveLength(1); + const [ev] = events; + expect(ev.origin_id).toBe('li_home_aBc123_token'); + expect(ev.payload_text).toBe('Hello from the home feed'); + expect(ev.author_name).toBe('Jane Doe'); + expect(ev.origin_type).toBe('post'); + // Token id is NOT numeric → no urn:li:activity permalink, link to /feed/. + expect(ev.source_url).toBe('https://www.linkedin.com/feed/'); + expect(ev.occurred_at).toBe(occurredAt); + expect(ev.metadata).toEqual({ author: 'Jane Doe' }); + }); + + test('defaults author to empty string when missing', () => { + const [ev] = buildHomeFeedEvents([{ id: 'tok', body: 'body only' }], new Date()); + expect(ev.author_name).toBe(''); + expect(ev.metadata).toEqual({ author: '' }); + }); + + test('drops rows without id or body and dedupes by id', () => { + const events = buildHomeFeedEvents( + [ + { id: 'a', body: 'first' }, + { id: '', body: 'no id' }, + { id: 'b' }, // no body + { id: 'a', body: 'dup id' }, + ], + new Date() + ); + expect(events.map((e: { origin_id: string }) => e.origin_id)).toEqual(['li_home_a']); + }); +}); + +describe('LinkedInConnector home_feed', () => { + test('declares a home_feed feed with no required company_url', () => { + const def = new LinkedInConnector().definition; + expect(def.feeds.home_feed).toBeDefined(); + expect(def.feeds.home_feed.configSchema.required).toBeUndefined(); + }); + + test('syncHomeFeed dispatches cs_scrape and maps rows to events', async () => { + const calls: Array<{ action: string; input: Record }> = []; + const dispatcher = { + dispatch: async (action: string, input: Record) => { + calls.push({ action, input }); + return { + tab_id: 1, + cs_scrape: true, + result: { + loggedIn: true, + rows: [ + { id: 'tok1', body: 'post one', author: 'Alice' }, + { id: 'tok2', body: 'post two', author: 'Bob' }, + ], + }, + }; + }, + }; + + const connector = new LinkedInConnector(); + const ctx = { + feedKey: 'home_feed', + config: { max_scrolls: 4 }, + checkpoint: {}, + sessionState: { chrome_dispatcher: dispatcher }, + }; + const res = await connector.sync(ctx); + + // Dispatched a cs_scrape navigate against /feed/ with the home-feed config. + expect(calls).toHaveLength(1); + expect(calls[0].action).toBe('navigate'); + expect(calls[0].input.cs_scrape).toBe(true); + expect(calls[0].input.persistent).toBe(true); + expect(calls[0].input.url).toBe('https://www.linkedin.com/feed/'); + expect((calls[0].input.scrape_config as { scroll: { max: number } }).scroll.max).toBe(4); + + expect(res.events).toHaveLength(2); + expect(res.events[0].origin_id).toBe('li_home_tok1'); + expect(res.events[1].origin_id).toBe('li_home_tok2'); + expect(res.metadata.backend).toBe('extension-cs-scrape'); + }); + + test('throws a clear error when not logged into LinkedIn', async () => { + const dispatcher = { + dispatch: async () => ({ result: { loggedIn: false, rows: [] } }), + }; + const connector = new LinkedInConnector(); + const ctx = { + feedKey: 'home_feed', + config: {}, + checkpoint: {}, + sessionState: { chrome_dispatcher: dispatcher }, + }; + await expect(connector.sync(ctx)).rejects.toThrow(/Not logged into LinkedIn/); + }); +}); diff --git a/packages/connectors/src/linkedin.ts b/packages/connectors/src/linkedin.ts index d9dd167ec..730d292ef 100644 --- a/packages/connectors/src/linkedin.ts +++ b/packages/connectors/src/linkedin.ts @@ -58,6 +58,96 @@ function normalizeCheckpointPostId(postId?: string): string | undefined { return postId.startsWith('li_post_') ? postId.slice('li_post_'.length) : postId; } +// ── Home-feed content-script scrape contract ──────────────────── +// +// The personalized home feed (linkedin.com/feed/) is the ONE feed that can't +// be read via network capture: attaching the CDP debugger stops the feed from +// rendering, so the Voyager responses never arrive. Instead we drive the +// extension's `cs_scrape` op (a content script, no debugger) with a declarative +// selector config defined here. The extension runs a site-agnostic scrape +// engine — the LinkedIn selectors live in this connector, not the extension. + +/** A row produced by the extension's cs_scrape from HOME_FEED_SCRAPE_CONFIG. */ +interface HomeFeedRow { + /** The componentkey token (base64url-ish, NOT a numeric activity id). */ + id?: string; + body?: string; + author?: string; +} + +/** The `.result` payload of a cs_scrape dispatch. */ +interface HomeFeedScrapeResult { + count?: number; + host?: string; + landedUrl?: string; + loggedIn?: boolean; + rows?: HomeFeedRow[]; +} + +/** + * The dispatch observation wrapping a cs_scrape result. The index signature + * keeps it assignable to ChromeActionDispatcher.dispatch's `ChromeActionOutput` + * (= Record) constraint. + */ +type CsScrapeObservation = Record & { + tab_id?: number; + cs_scrape?: boolean; + persistent_reused?: boolean; + result?: HomeFeedScrapeResult; +}; + +/** LinkedIn origins the cs_scrape window is allowed to touch. */ +const LINKEDIN_ALLOWED_ORIGINS = ['linkedin.com', '*.linkedin.com']; + +/** + * Selectors for the virtualized linkedin.com/feed/ DOM. Home-feed posts are + * componentkey divs with no activity urn, so the row id is the componentkey + * token (NOT numeric). These selectors live here, not in the extension. + */ +const HOME_FEED_SCRAPE_CONFIG = { + scroll: { max: 8, stall: 3, waitMs: 1500 }, + loggedOutWhen: { pathRegex: '/(login|authwall|uas/login|checkpoint|signup)\\b' }, + rowSelector: 'div[componentkey*="FeedType_MAIN_FEED_RELEVANCE"]', + id: { source: 'attr', name: 'componentkey', regex: '^(?:expanded)?(.+?)FeedType_', group: 1 }, + requireFields: ['body'], + fields: { + body: { take: 'text' }, + author: { + selector: '.update-components-actor__title, .update-components-actor__name', + take: 'text', + firstLine: true, + }, + }, +} as const; + +/** + * Map cs_scrape home-feed rows to event envelopes. The componentkey token is + * not a numeric activity id, so there is no /feed/update permalink — source_url + * stays at /feed/. Home-feed posts expose no reliable timestamp, so the caller + * stamps occurred_at with the sync time. + */ +export function buildHomeFeedEvents(rows: HomeFeedRow[], occurredAt: Date): EventEnvelope[] { + const seen = new Set(); + const events: EventEnvelope[] = []; + for (const row of rows) { + if (!row?.id || !row.body || seen.has(row.id)) continue; + seen.add(row.id); + events.push({ + origin_id: `li_home_${row.id}`, + payload_text: row.body, + author_name: row.author || '', + // Feed posts expose no reliable timestamp; use the sync time. + occurred_at: occurredAt, + origin_type: 'post', + // Token id is NOT a numeric activity id, so we cannot build a + // urn:li:activity permalink — link to the feed itself. + source_url: 'https://www.linkedin.com/feed/', + metadata: { author: row.author || '' }, + }); + } + return events; +} + /** * Pull the chrome action dispatcher from sessionState. The connector-worker * subprocess (child-runner.ts) splices a live `chrome_dispatcher` object @@ -232,6 +322,19 @@ const companyUpdatesConfigSchema = { }, }; +const homeFeedConfigSchema = { + type: 'object', + properties: { + max_scrolls: { + type: 'integer', + minimum: 1, + maximum: 30, + default: 8, + description: 'Maximum scroll iterations for the home feed (default: 8)', + }, + }, +}; + const jobsConfigSchema = { type: 'object', required: ['company_url'], @@ -284,6 +387,23 @@ export default class LinkedInConnector extends ConnectorRuntime { ], }, feeds: { + home_feed: { + key: 'home_feed', + name: 'Home Feed', + description: 'Your personalized LinkedIn home feed.', + configSchema: homeFeedConfigSchema, + eventKinds: { + post: { + description: 'A post from your personalized LinkedIn home feed', + metadataSchema: { + type: 'object', + properties: { + author: { type: 'string' }, + }, + }, + }, + }, + }, company_updates: { key: 'company_updates', name: 'Company Updates', @@ -330,6 +450,14 @@ export default class LinkedInConnector extends ConnectorRuntime { const checkpoint = (ctx.checkpoint ?? {}) as LinkedInCheckpoint; const feedKey = ctx.feedKey ?? 'company_updates'; + // home_feed is the one feed that needs a content-script scrape (the CDP + // debugger stops the personalized feed from rendering) and takes no + // company_url — it always reads linkedin.com/feed/. + if (feedKey === 'home_feed') { + const homeScrolls = (config.max_scrolls as number) ?? 8; + return this.syncHomeFeed(homeScrolls, checkpoint, requireExtensionDispatcher(ctx)); + } + const companyUrl = config.company_url as string; if (!companyUrl) { throw new Error('company_url is required'); @@ -346,6 +474,53 @@ export default class LinkedInConnector extends ConnectorRuntime { return this.syncUpdates(baseUrl, maxScrolls, checkpoint, dispatcher); } + /** + * Personalized home feed via the extension's content-script scrape. Network + * capture can't read it (the CDP debugger stops the feed rendering), so we + * dispatch a `cs_scrape` against linkedin.com/feed/ with the home-feed + * selectors. The persistent window is reused/focused so an auth wall can be + * cleared in place for the next run. + */ + private async syncHomeFeed( + maxScrolls: number, + checkpoint: LinkedInCheckpoint, + dispatcher: ChromeActionDispatcher + ): Promise { + const observation = await dispatcher.dispatch('navigate', { + cs_scrape: true, + persistent: true, + focus: true, + url: 'https://www.linkedin.com/feed/', + scrape_config: { + ...HOME_FEED_SCRAPE_CONFIG, + scroll: { ...HOME_FEED_SCRAPE_CONFIG.scroll, max: maxScrolls }, + }, + allowed_origins: LINKEDIN_ALLOWED_ORIGINS, + }); + + const result = observation?.result; + if (result?.loggedIn === false) { + throw new Error( + 'Not logged into LinkedIn. The home feed could not be read — sign in to LinkedIn in the focused Owletto window, then re-run the sync.' + ); + } + + const rows = result?.rows ?? []; + const events = buildHomeFeedEvents(rows, new Date()); + + return { + events, + // The home feed exposes no stable per-post cursor (opaque token ids, no + // timestamps), so there is nothing new to checkpoint — pass it through. + checkpoint: checkpoint as unknown as Record, + metadata: { + items_found: events.length, + items_scraped: rows.length, + backend: 'extension-cs-scrape', + }, + }; + } + private async syncUpdates( baseUrl: string, maxScrolls: number, diff --git a/packages/owletto b/packages/owletto index ab5064524..1f1c027da 160000 --- a/packages/owletto +++ b/packages/owletto @@ -1 +1 @@ -Subproject commit ab506452478a0220a9f45a833ff5b8ed62a25648 +Subproject commit 1f1c027da939e485b48a47f204de1b3f3fb20972