Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Nested Lists support to ListProcessor #410

Merged
merged 8 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions marker/processors/blockquote.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
from marker.schema import BlockTypes
from marker.schema.document import Document


class BlockquoteProcessor(BaseProcessor):
"""
A processor for tagging blockquotes
"""
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
min_x_indent = 0.05 # % of block width
x_start_tolerance = 0.01 # % of block width
x_end_tolerance = 0.01 # % of block width
min_x_indent = 0.05 # % of block width
x_start_tolerance = 0.01 # % of block width
x_end_tolerance = 0.01 # % of block width

def __init__(self, config):
super().__init__(config)
Expand All @@ -19,7 +20,7 @@ def __call__(self, document: Document):
for block in page.contained_blocks(document, self.block_types):
if block.structure is None:
continue

if not len(block.structure) >= 2:
continue

Expand All @@ -38,11 +39,11 @@ def __call__(self, document: Document):
x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width)
y_indent = next_block.polygon.y_start > block.polygon.y_end

if block.block_type in self.block_types and block.blockquote:
if block.blockquote:
next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent)
next_block.blockquote_level = block.blockquote_level
if (x_indent and y_indent):
next_block.blockquote_level += 1
else:
next_block.blockquote = len(next_block.structure) >= 2 and (x_indent and y_indent)
elif len(next_block.structure) >= 2 and (x_indent and y_indent):
next_block.blockquote = True
next_block.blockquote_level = 1
55 changes: 52 additions & 3 deletions marker/processors/list.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from typing import List

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import ListItem
from marker.schema.document import Document


Expand All @@ -9,11 +12,16 @@ class ListProcessor(BaseProcessor):
"""
block_types = (BlockTypes.ListGroup,)
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
min_x_indent = 0.01 # % of page width

def __init__(self, config):
super().__init__(config)

def __call__(self, document: Document):
self.list_group_continuation(document)
self.list_group_indentation(document)

def list_group_continuation(self, document: Document):
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
next_block = document.get_next_block(block, self.ignored_block_types)
Expand All @@ -29,13 +37,54 @@ def __call__(self, document: Document):
column_break, page_break = False, False
next_block_in_first_quadrant = False

if next_block.page_id == block.page_id: # block on the same page
if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
column_break = next_block.polygon.y_start <= block.polygon.y_end
else:
page_break = True
next_page = document.get_page(next_block.page_id)
next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
(next_block.polygon.y_start < next_page.polygon.height // 2)
(next_block.polygon.y_start < next_page.polygon.height // 2)

block.has_continuation = column_break or (page_break and next_block_in_first_quadrant)

def list_group_indentation(self, document: Document):
    """
    Assign indent levels to the items of each list group and nest them.

    For every block of type `self.block_types` on every page that has a
    structure and is not ignored for output, two passes run over the
    group's list items:

    1. Geometry pass: each item's `list_indent_level` is derived from the
       closest earlier item that starts further left by more than
       `min_x_indent` (a fraction of the page width) and higher on the page.
    2. Nesting pass: each item with a shallower-level predecessor on the
       stack is added to that predecessor's structure, the predecessor's
       polygon is merged with the item's, and the item is removed from the
       list group's own structure.

    NOTE(review): the nesting of statements below was reconstructed from a
    view in which leading indentation had been lost - confirm against the
    repository before merging.

    Args:
        document: The document whose pages are scanned; modified in place.
    """
    for page in document.pages:
        for block in page.contained_blocks(document, self.block_types):
            if block.structure is None:
                continue
            if block.ignore_for_output:
                continue

            # Loop-invariant: minimum x offset that counts as "indented".
            indent_threshold = self.min_x_indent * page.polygon.width

            # Pass 1: derive indent levels from item geometry.
            stack: List[ListItem] = [block.get_next_block(page, None)]
            for list_item_id in block.structure:
                list_item_block: ListItem = page.get_block(list_item_id)

                # Drop candidates that are not strictly to the left of this item.
                while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + indent_threshold:
                    stack.pop()

                # Only a candidate that starts above this item can be its parent.
                if stack and list_item_block.polygon.y_start > stack[-1].polygon.y_start:
                    list_item_block.list_indent_level = stack[-1].list_indent_level
                    if list_item_block.polygon.x_start > stack[-1].polygon.x_start + indent_threshold:
                        list_item_block.list_indent_level += 1

                next_list_item_block = block.get_next_block(page, list_item_block)
                if next_list_item_block is not None and next_list_item_block.polygon.x_start > list_item_block.polygon.x_end:
                    stack = [next_list_item_block]  # reset stack on column breaks
                else:
                    stack.append(list_item_block)

            # Pass 2: move each nested item under its parent. Iterate over a
            # copy because the group's structure is mutated inside the loop.
            stack = [block.get_next_block(page, None)]
            for list_item_id in block.structure.copy():
                list_item_block = page.get_block(list_item_id)

                while stack and list_item_block.list_indent_level <= stack[-1].list_indent_level:
                    stack.pop()

                if stack:
                    current_parent = stack[-1]
                    current_parent.add_structure(list_item_block)
                    current_parent.polygon = current_parent.polygon.merge([list_item_block.polygon])

                    block.remove_structure_items([list_item_id])
                stack.append(list_item_block)
28 changes: 27 additions & 1 deletion marker/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class Block(BaseModel):
page_id: Optional[int] = None
text_extraction_method: Optional[Literal['pdftext', 'surya']] = None
structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order
ignore_for_output: bool = False # Whether this block should be ignored in output
ignore_for_output: bool = False # Whether this block should be ignored in output
source: Literal['layout', 'heuristics', 'processor'] = 'layout'

model_config = ConfigDict(arbitrary_types_allowed=True)
Expand All @@ -87,6 +87,32 @@ def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
return []
return [document_page.get_block(block_id) for block_id in self.structure]

def get_prev_block(self, document_page: Document | PageGroup, block: Block, ignored_block_types: Optional[List[BlockTypes]] = None):
    """
    Return the nearest block that precedes `block` in this block's structure.

    Args:
        document_page: Object used to resolve a block id into a block via
            its `get_block` method.
        block: The child block whose predecessor is wanted; located by `block.id`.
        ignored_block_types: Block types to skip while walking backwards.
            Defaults to skipping nothing.

    Returns:
        The closest preceding block whose type is not ignored, or None when
        this block has no structure, `block` is the first entry, or every
        preceding entry is of an ignored type.

    Raises:
        ValueError: If `block.id` is not present in the structure.
    """
    # `structure` is optional on the model; without children there is no predecessor.
    if self.structure is None:
        return None

    if ignored_block_types is None:
        ignored_block_types = []

    structure_idx = self.structure.index(block.id)

    # Walk backwards from the entry just before `block`.
    for prev_block_id in reversed(self.structure[:structure_idx]):
        if prev_block_id.block_type not in ignored_block_types:
            return document_page.get_block(prev_block_id)

    return None  # No valid previous block found

def get_next_block(self, document_page: Document | PageGroup, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
    """
    Return the nearest block that follows `block` in this block's structure.

    Args:
        document_page: Object used to resolve a block id into a block via
            its `get_block` method.
        block: The child block to start after, located by `block.id`.
            When None, the search starts at the first structure entry.
        ignored_block_types: Block types to skip while walking forwards.
            Defaults to skipping nothing.

    Returns:
        The first following block whose type is not ignored, or None when
        this block has no structure or no such entry exists.

    Raises:
        ValueError: If `block` is given and `block.id` is not present in
            the structure.
    """
    # `structure` is optional on the model; without children there is no successor.
    if self.structure is None:
        return None

    if ignored_block_types is None:
        ignored_block_types = []

    structure_idx = 0
    if block is not None:
        structure_idx = self.structure.index(block.id) + 1

    for next_block_id in self.structure[structure_idx:]:
        if next_block_id.block_type not in ignored_block_types:
            return document_page.get_block(next_block_id)

    return None  # No valid next block found

def add_structure(self, block: Block):
if self.structure is None:
self.structure = [block.id]
Expand Down
8 changes: 4 additions & 4 deletions marker/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")

class_attr = f" block-type='{self.block_type}'"
el_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
el_attr += " class='has-continuation'"

if self.blockquote:
# Add indentation for blockquote levels
blockquote_prefix = "<blockquote>" * self.blockquote_level
blockquote_suffix = "</blockquote>" * self.blockquote_level
return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}"
return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}"
else:
return f"<p{class_attr}>{template}</p>"
return f"<p{el_attr}>{template}</p>"
9 changes: 7 additions & 2 deletions marker/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,21 @@ def replace_bullets(child_blocks):
child_blocks = first_block.children

if first_block is not None and first_block.id.block_type == BlockTypes.Line:
bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○■▪▫–—-]( )"
bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )"
first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html)


class ListItem(Block):
block_type: BlockTypes = BlockTypes.ListItem
list_indent_level: int = 0

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
# Remove the first bullet character
replace_bullets(child_blocks)
return f"<li>{template}</li>"

el_attr = f" block-type='{self.block_type}'"
if self.list_indent_level:
return f"<ul><li{el_attr} class='list-indent-{self.list_indent_level}'>{template}</li></ul>"
return f"<li{el_attr}>{template}</li>"
8 changes: 4 additions & 4 deletions marker/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@ def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")

class_attr = f" block-type='{self.block_type}'"
el_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
el_attr += " class='has-continuation'"

if self.blockquote:
blockquote_prefix = "<blockquote>" * self.blockquote_level
blockquote_suffix = "</blockquote>" * self.blockquote_level
return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}"
return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}"
else:
return f"<p{class_attr}>{template}</p>"
return f"<p{el_attr}>{template}</p>"
6 changes: 3 additions & 3 deletions marker/schema/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class ListGroup(Group):
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)

class_attr = f" block-type='{self.block_type}'"
el_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"<p{class_attr}><ul>{template}</ul></p>"
el_attr += " class='has-continuation'"
return f"<p{el_attr}><ul>{template}</ul></p>"
Loading