
    kh&                         d Z ddlmZmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ  G d dee      Zy)zA wrapper of pdf page engine (e.g. PyMuPDF, pdfminer) to do the following work:

* extract source contents
* clean up blocks/shapes, e.g. elements out of page
* calculate page margin
* parse page structure roughly, i.e. section and column
    )ABCabstractmethod   )BasePage   )Section)Column)	Hyperlink)Shapes)Blocks)Fonts)TextSpan)
debug_plot)	constants)
Collectionc            	           e Zd ZdZddZed        Zed        Zed        Z	 e
d      d        Z e
d	      d
        ZdefdZd Zd Zededededefd       Zy)RawPagezA wrapper of page engine.Nc                     t        j                  |        || _        t        |       | _        t        |       | _        y)zc Initialize page layout.

        Args:
            page_engine (Object): Source pdf page.
        )parentN)r   __init__page_enginer   blocksr   shapes)selfr   s     Q/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/page/RawPage.pyr   zRawPage.__init__   s2     	$&D)D)    c                      y)a&  Extract source data with page engine. Return a dict with the following structure:
        ```
            {
                "width" : w,
                "height": h,
                "blocks": [{...}, {...}, ...],
                "shapes" : [{...}, {...}, ...]
            }
        ```
        N r   settingss     r   extract_raw_dictzRawPage.extract_raw_dict&   s    r   c                 r    dj                  | j                  D cg c]  }|j                   c}      S c c}w )zwAll extracted text in this page, with images considered as ``<image>``.
        Should be run after ``restore()`` data.
)joinr   textr   blocks     r   r%   zRawPage.text4   s)     yy$++>%**>??>   4c                 r    dj                  | j                  D cg c]  }|j                   c}      S c c}w )zKExtracted raw text in current page. Should be run after ``restore()`` data.r#   )r$   r   raw_textr&   s     r   r*   zRawPage.raw_text:   s)     yydkkBU%..BCCBr(   zSource Text Blocksc                      | j                   di |}| j                  j                  |j                  dg              | j                  j                  |j                  dg              | j                  S )z-Initialize layout extracted with ``PyMuPDF``.r   r   r   )r!   r   restoregetr   )r   r    raw_dicts      r   r,   zRawPage.restore@   s]     )4((484HLL267HLL267{{r   zCleaned Shapesc                     | j                   j                  |d   |d          | j                  j                  |d   |d          | j                  S )zClean up raw blocks and shapes, e.g.

        * remove negative or duplicated instances,
        * detect semantic type of shapes
        float_image_ignorable_gapline_overlap_thresholdmax_border_widthshape_min_dimension)r   clean_upr   r   s     r   r4   zRawPage.clean_upI   sZ     	01-.	0 	'(*+	- {{r   fontsc           
      j   g }| j                   D ]<  }|j                  |j                  D cg c]  }t        |t              s| c}       > |D ]\  }|j                  |j                        }|s!|j                  |_        |j                  s?|j                  |j                  z  |_        ^ yc c}w )zUpdate font properties, e.g. font name, font line height ratio, of ``TextSpan``.

        Args:
            fonts (Fonts): Fonts parsed by ``fonttools``.
        N)
r   extendspans
isinstancer   r-   fontnameline_heightsize)r   r5   r8   linespanr:   s         r   process_fontzRawPage.process_font[   s     KK 	VDLL4::T4D(9S$TU	V  	@D99TYY'D 		DI#'#3#3dii#? 	@ Us   B0
B0
c           
          t        | j                  D cg c]  }t        |t              r| c}      }| j                  s|st
        j                  fdz  S | j                  \  }}}}| j                  j                  |j                  z  \  }}	}
}t        ||z
  d      }t        ||
z
  t
        j                  z
  d      }t        |	|z
  d      }t        ||z
  d      }||d   z  }||d   z  }t        t
        j                  t        |d            t        t
        j                  t        |d            t        t
        j                  t        |d            t        t
        j                  t        |d            fS c c}w )zCalculate page margin.

        .. note::
            Ensure this method is run right after cleaning up the layout, so the page margin is
            calculated based on valid layout, and stay constant.
           g        page_margin_factor_toppage_margin_factor_bottomr   )r   r   r9   r
   r   r   ITPbboxmax
MINOR_DISTminround)r   r    shaper   x0y0x1y1u0v0u1v1leftrighttopbottoms                   r   calculate_marginzRawPage.calculate_marginq   sC    DKK\5z%QZ?[\] {{69==2Ca2G+GBB))FKK7BB 2b5#BrE)...4"R%oRUC 	x011(677 	uT1~.	uUA/	uS!}-	uVQ/0	2 	2' ]s
   E;E;c                 L    !  j                   \  } }t               }|j                   j                         |j                   j                         |syt               }g ! ! fd}d}|}|j                         D ]  }	|	j                         }
t        |
      }|dkD  rd}nq|dk(  rl|
d   j                  \  }}}}|
d   j                  \  }}}}||z   dz  }|z
   |z
  }}||z
  ||z
  }}d}d|z  ||z  cxk  r|k  rn n||z  dk  s||z  dk  rd}|dk(  rm|dk(  rh|j                         }
|
d   j                  d   }|	j                  d   |k  s|	j                  d   |kD  rd}ne|j                  \  }}}}||z
  |d   k  rId}nF|dk(  rA|dk(  r<t        |      }|j                  |	       t        |j                  d	
            dk(  rd}||k7  r0 ||||       !r!d   d   j                  d   }t        |	      }|}|j                  |	         |||       !S )zDetect and create page sections.

        .. note::
            - Only two-columns Sections are considered for now.
            - Page margin must be parsed before this step.
        Nc                     rFd   j                   | cxk(  rdk(  r.n n+d   d   }|j                  |       |j                  |       y j                  | |f|      }|rj	                  |       y y )Nr   r   )num_cols
union_bboxadd_elements_create_sectionappend)	num_colelementsy_refcolumnsectionX0X1sectionsr   s	        r   close_sectionz,RawPage.parse_section.<locals>.close_section   su    HRL117=A=!"a!!(+##H- ..w2r(EROOG, r   r   r   r          @gQ?min_section_heightF)sortedr[      )	working_bboxr   r7   r   r   group_by_rowsgroup_by_columnslenrF   )"r   r    Y0_rb   linesri   pre_num_colrc   rowcolscurrent_num_colrP   rQ   rR   rS   m0n0m1n1rL   c1c2w1w2fposrM   rN   rO   combinerf   rg   rh   s"   `                              @@@r   parse_sectionzRawPage.parse_section   s`    ))BA <$$ 
	- ))+ :	"C'')D!$iO q "# !!#!%aBB!%aBBeS[B2BB2BsBrE}1}2d
beDj&'O A~/1"4 --/1gll1o88A;#sxx{3&'O
 &+ZZNBB"uX&:;;&' aOQ$6$U+s#w//u/=>AUV? +k5%8$RL,11!4E #3- S!u:	"z 	oue4r   ra   rb   h_rangerc   c                 H   |sy|\  }}| dk(  rS|j                   \  }}}}	t               j                  ||||	f      }
|
j                  |       t	        d|
g      }||z
  }n|j                         }|d   j                   \  }}}}|d   j                   \  }}}}||z   dz  }t               j                  ||||f      }|j                  |       t               j                  ||||f      }|j                  |       t	        d||g      }||z
  }t        |d      |_        |S )zQCreate section based on column count, candidate elements and horizontal boundary.Nr   r   )spacecolumnsrj   )rF   r	   update_bboxr^   r   rp   rJ   before_space)ra   rb   r   rc   rf   rg   rL   rM   rN   rO   rd   re   r   rw   rP   rQ   rR   rS   ry   rz   r{   r|   ucolumn_1column_2s                            r   r_   zRawPage._create_section   s/    BA:%]]NBB X))2r2r*:;F)Ax8G:L,,.D!!W\\NBB!!W\\NBBBAx++RQO<H!!(+x++QBO<H!!(+A(/CDG:L$\15r   )N)__name__
__module____qualname____doc__r   r   r!   propertyr%   r*   r   r,   r4   r   r@   rX   r   staticmethodintr   tuplefloatr_   r   r   r   r   r      s    #	* 
 
 @ @
 D D
 $% &  ! ""@ @, 2FbJ  j % u  r   r   N)r   abcr   r   r   layout.Sectionr   layout.Columnr	   shape.Shaper
   shape.Shapesr   layout.Blocksr   
font.Fontsr   text.TextSpanr   common.sharer   commonr   common.Collectionr   r   r   r   r   <module>r      s@    &  $ " # ! "  $ %  *@h @r   