-
Notifications
You must be signed in to change notification settings - Fork 1.1k
feat: better parsing of tables #291
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
feb30b9
5ac88c6
b411faf
3176fa2
50bcd4b
641b655
991b554
25a0d1a
33de0f1
31d9730
fcc71e0
97efd15
fe38f5a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,6 +85,11 @@ export const getElementInformation = async ( | |
| let element = originalEl; | ||
|
|
||
| while (element.parentElement) { | ||
| if (element.tagName.toLowerCase() === 'body' || | ||
| element.tagName.toLowerCase() === 'html') { | ||
| break; | ||
| } | ||
|
|
||
| const parentRect = element.parentElement.getBoundingClientRect(); | ||
| const childRect = element.getBoundingClientRect(); | ||
|
|
||
|
|
@@ -99,7 +104,14 @@ export const getElementInformation = async ( | |
| (parentRect.width * parentRect.height) > 0.5; | ||
|
|
||
| if (fullyContained && significantOverlap) { | ||
| element = element.parentElement; | ||
| // Only traverse up if next parent isn't body or html | ||
| const nextParent = element.parentElement; | ||
| if (nextParent.tagName.toLowerCase() !== 'body' && | ||
| nextParent.tagName.toLowerCase() !== 'html') { | ||
| element = nextParent; | ||
| } else { | ||
| break; | ||
| } | ||
| } else { | ||
| break; | ||
| } | ||
|
|
@@ -201,6 +213,11 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector | |
| let element = originalEl; | ||
|
|
||
| while (element.parentElement) { | ||
| if (element.tagName.toLowerCase() === 'body' || | ||
| element.tagName.toLowerCase() === 'html') { | ||
| break; | ||
| } | ||
|
|
||
| const parentRect = element.parentElement.getBoundingClientRect(); | ||
| const childRect = element.getBoundingClientRect(); | ||
|
|
||
|
|
@@ -215,7 +232,14 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector | |
| (parentRect.width * parentRect.height) > 0.5; | ||
|
|
||
| if (fullyContained && significantOverlap) { | ||
| element = element.parentElement; | ||
| // Only traverse up if next parent isn't body or html | ||
| const nextParent = element.parentElement; | ||
| if (nextParent.tagName.toLowerCase() !== 'body' && | ||
| nextParent.tagName.toLowerCase() !== 'html') { | ||
| element = nextParent; | ||
| } else { | ||
| break; | ||
| } | ||
| } else { | ||
| break; | ||
| } | ||
|
|
@@ -869,6 +893,13 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates | |
| function getNonUniqueSelector(element: HTMLElement): string { | ||
| let selector = element.tagName.toLowerCase(); | ||
|
|
||
| if (selector === 'td' && element.parentElement) { | ||
| // Find position among td siblings | ||
| const siblings = Array.from(element.parentElement.children); | ||
| const position = siblings.indexOf(element) + 1; | ||
| return `${selector}:nth-child(${position})`; | ||
| } | ||
|
|
||
| if (element.className) { | ||
| const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); | ||
| if (classes.length > 0) { | ||
|
|
@@ -904,6 +935,11 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates | |
|
|
||
| // if (listSelector === '') { | ||
| while (element.parentElement) { | ||
| if (element.tagName.toLowerCase() === 'body' || | ||
| element.tagName.toLowerCase() === 'html') { | ||
| break; | ||
| } | ||
|
|
||
| const parentRect = element.parentElement.getBoundingClientRect(); | ||
| const childRect = element.getBoundingClientRect(); | ||
|
|
||
|
|
@@ -918,7 +954,14 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates | |
| (parentRect.width * parentRect.height) > 0.5; | ||
|
|
||
| if (fullyContained && significantOverlap) { | ||
| element = element.parentElement; | ||
| // Only traverse up if next parent isn't body or html | ||
| const nextParent = element.parentElement; | ||
| if (nextParent.tagName.toLowerCase() !== 'body' && | ||
| nextParent.tagName.toLowerCase() !== 'html') { | ||
| element = nextParent; | ||
| } else { | ||
| break; | ||
| } | ||
| } else { | ||
| break; | ||
| } | ||
|
|
@@ -937,6 +980,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates | |
| function getNonUniqueSelector(element: HTMLElement): string { | ||
| let selector = element.tagName.toLowerCase(); | ||
|
|
||
| if (selector === 'td' && element.parentElement) { | ||
| const siblings = Array.from(element.parentElement.children); | ||
| const position = siblings.indexOf(element) + 1; | ||
| return `${selector}:nth-child(${position})`; | ||
| } | ||
|
|
||
|
Comment on lines
+1007
to
+1012
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Consolidate duplicate table cell selector logic. The table cell selector generation logic is duplicated in three different places. This makes the code harder to maintain and increases the risk of inconsistencies. Extract this logic into a shared utility function: +const getTableCellSelector = (element: HTMLElement): string | null => {
+ if (element.tagName.toLowerCase() !== 'td' || !element.parentElement) {
+ return null;
+ }
+ if (element.parentElement.tagName !== 'TR') {
+ console.warn('Table cell found outside of a table row');
+ return null;
+ }
+ const siblings = Array.from(element.parentElement.children);
+ const position = siblings.indexOf(element) + 1;
+ return `td:nth-child(${position})`;
+};
// Replace all occurrences with:
-if (selector === 'td' && element.parentElement) {
- const siblings = Array.from(element.parentElement.children);
- const position = siblings.indexOf(element) + 1;
- return `${selector}:nth-child(${position})`;
-}
+const cellSelector = getTableCellSelector(element);
+if (cellSelector) {
+ return cellSelector;
+}Also applies to: 1067-1072 |
||
| if (element.className) { | ||
| const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); | ||
| if (classes.length > 0) { | ||
|
|
@@ -991,6 +1040,12 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro | |
| function getNonUniqueSelector(element: HTMLElement): string { | ||
| let selector = element.tagName.toLowerCase(); | ||
|
|
||
| if (selector === 'td' && element.parentElement) { | ||
| const siblings = Array.from(element.parentElement.children); | ||
| const position = siblings.indexOf(element) + 1; | ||
| return `${selector}:nth-child(${position})`; | ||
| } | ||
|
|
||
| const className = typeof element.className === 'string' ? element.className : ''; | ||
| if (className) { | ||
| const classes = className.split(/\s+/).filter((cls: string) => Boolean(cls)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Enhance non-table field queries
Note that using parent.querySelector may only fetch the first matching element. If multiple elements match the same selector, subsequent ones will be ignored. Consider using querySelectorAll or clarifying that only the first match is necessary.