Skip to content

Commit

Permalink
feat: Disambiguate section headings and list items from text items in DoclingDocument (#86)
Browse files Browse the repository at this point in the history

* Disambiguate section headings and list items from text items

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Group list items

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
  • Loading branch information
cau-git authored Oct 21, 2024
1 parent 53dfbff commit d2b1e60
Showing 1 changed file with 22 additions and 5 deletions.
27 changes: 22 additions & 5 deletions deepsearch_glm/utils/doc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ProvenanceItem,
Size,
TableCell,
Expand Down Expand Up @@ -82,6 +83,8 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
else:
props = pd.DataFrame()

current_list = None

for ix, pelem in enumerate(doc_glm["page-elements"]):
ptype = pelem["type"]
span_i = pelem["span"][0]
Expand All @@ -105,10 +108,12 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
obj = resolve_item(path, doc_glm)

if obj is None:
current_list = None
print(f"warning: undefined {path}")
continue

if ptype == "figure":
current_list = None
text = ""
caption_refs = []
for caption in obj["captions"]:
Expand Down Expand Up @@ -154,6 +159,7 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
pic.captions.extend(caption_refs)

elif ptype == "table":
current_list = None
text = ""
caption_refs = []
for caption in obj["captions"]:
Expand Down Expand Up @@ -263,13 +269,24 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
),
)
label = DocItemLabel(name_label)

# TODO: Decide on add_heading, add_list_item, or add_text according to label.
doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)
if label == DocItemLabel.LIST_ITEM:
if current_list is None:
current_list = doc.add_group(label=GroupLabel.LIST, name="list")

else:
pass
# This branch should not be reachable.
# TODO: Infer if this is a numbered or a bullet list item
doc.add_list_item(
text=text, enumerated=False, prov=prov, parent=current_list
)
elif label == DocItemLabel.SECTION_HEADER:
current_list = None

doc.add_heading(text=text, prov=prov)
else:
current_list = None

doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)

for page_dim in doc_glm["page-dimensions"]:
page_no = int(page_dim["page"])
Expand Down

0 comments on commit d2b1e60

Please sign in to comment.