Skip to content

Commit

Permalink
Dealing with ndjson like fb payloads
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Nov 3, 2023
1 parent 31219d6 commit 8d40db8
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
9 changes: 6 additions & 3 deletions minet/facebook/emulated_scraper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List

import re
import json
from asyncio import gather
from playwright.async_api import BrowserContext, Response, TimeoutError, Locator
from playwright_stealth import stealth_async
Expand Down Expand Up @@ -93,11 +94,13 @@ async def expect_comments(action) -> List[FacebookComment]:
page, action, is_graphql_comments_response
)

payload = await response.json()
# NOTE: sometimes FB will pack multiple payloads in a single
# ndjson-like body, so only the first line is parsed as JSON
text = await response.text()
first_doc = text.splitlines()[0]
payload = json.loads(first_doc)

# with open("./dump.json", "w") as f:
# import json

# json.dump(payload, f, ensure_ascii=False, indent=2)

return FacebookComment.from_payload(payload)
Expand Down
8 changes: 6 additions & 2 deletions minet/facebook/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def from_graphql_node(cls, node) -> "FacebookCommentAuthor":
)


# NOTE: attachments could be collected in more detail (only their count is kept), and reaction details could be retrieved too (only the reactor count is kept)
@dataclass
class FacebookComment(TabularRecord):
id: str
Expand All @@ -32,13 +33,15 @@ class FacebookComment(TabularRecord):
parent_id: Optional[str]
author: FacebookCommentAuthor
created_time: int
text: str
text: Optional[str]
attachments: int
reactions: int
replies: int

@classmethod
def from_graphql_node(cls, node) -> "FacebookComment":
feedback = node["feedback"]
attachments = node.get("attachments", [])

return cls(
id=node["id"],
Expand All @@ -51,7 +54,8 @@ def from_graphql_node(cls, node) -> "FacebookComment":
),
author=FacebookCommentAuthor.from_graphql_node(node["author"]),
created_time=node["created_time"],
text=node["preferred_body"]["text"],
text=getpath(node, ("preferred_body", "text")),
attachments=len(attachments),
reactions=feedback["reactors"]["count"],
replies=feedback.get("total_comment_count", 0),
)
Expand Down

0 comments on commit 8d40db8

Please sign in to comment.