
    kh-H                         d Z ddlmZmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ dd	lmZ dd
l	mZmZ ddlmZ ddlmZ  G d de      Zy)aI  Text block objects based on PDF raw dict extracted with ``PyMuPDF``.

Data structure based on this `link <https://pymupdf.readthedocs.io/en/latest/textpage.html>`_::

    {
        # raw dict
        # --------------------------------
        'type': 0,
        'bbox': (x0,y0,x1,y1),
        'lines': [ lines ]

        # introduced dict
        # --------------------------------
        'before_space': bs,
        'after_space': as,
        'line_space': ls,

        'alignment': 0,
        'left_space': 10.0,
        'right_space': 0.0,

        'tab_stops': [15.4, 35.0]
    }
    )PtInches)WD_ALIGN_PARAGRAPH   )Lines   )	ImageSpan)RectTypeTextAlignmentlower_round)Block)rgb_component_from_namer   )	constants)docxc                        e Zd ZdZddef fdZed        Zed        Zed        Z	ed        Z
ed        Zed	        Z fd
Zd Z fdZd ZdedededededefdZd Zd Zd Zdededededef
dZ xZS )	TextBlockzText block.rawc                     |xs i }d|v r|j                  d       t        | 	  |       t        |       j	                  |j                  dg             | _        | j                          y )Nbbox)parentlines)popsuper__init__r   restoregetr   set_text_block)selfr   	__class__s     S/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/text/TextBlock.pyr   zTextBlock.__init__)   s^    iR S=#''&/ $'//0DE
 	    c                 t    | j                   D cg c]  }|j                   }}dj                  |      S c c}w )zJText content in block. Note image is counted as a placeholder ``<image>``. )r   textjoinr   line
lines_texts      r    r$   zTextBlock.text7   s3     -1JJ7Ddii7
7wwz"" 8   5c                 t    | j                   D cg c]  }|j                   }}dj                  |      S c c}w )z5Raw text content in block without considering images.r#   )r   raw_textr%   r&   s      r    r+   zTextBlock.raw_text=   s3     15

;dmm;
;wwz"" <r)   c                 :    t        d | j                  D              S )zZIf this block contains only white space or not. If True, this block is safe to be removed.c              3   4   K   | ]  }|j                     y wN)white_space_only).0r'   s     r    	<genexpr>z-TextBlock.white_space_only.<locals>.<genexpr>F   s     @T4((@   )allr   r   s    r    r/   zTextBlock.white_space_onlyC   s     @TZZ@@@r!   c                 .    | j                   j                  S )zsAll lines contained in text block must have same text direction. 
        Otherwise, set normal direction.
        )r   text_directionr4   s    r    r6   zTextBlock.text_directionH   s    
 zz(((r!   c                    | j                   rdnd| j                  j                         }t        |      }|dk(  ry| j                  dz      | j                     z
  }fd}t        t        ||            }||z
  |dz
  z  S )z4Average distance between adjacent two physical rows.r   r   Nr   c                 ,    t        fd| D              S )Nc              3   t   K   | ]/  }t        |j                  d z      |j                     z
         1 yw)r   N)absr   )r0   r'   idxs     r    r1   z>TextBlock.average_row_gap.<locals>.<lambda>.<locals>.<genexpr>[   s1     *`TX3tyyQ/?		#/N+O*`s   58max)rowr;   s    r    <lambda>z+TextBlock.average_row_gap.<locals>.<lambda>[   s    s*`\_*`'` r!   )is_horizontal_textr   group_by_physical_rowslenr   summap)r   rowsnumblock_heightf_max_row_heightsum_row_heightr;   s         @r    average_row_gapzTextBlock.average_row_gapO   s     **azz002$i 6$ yyQ'		#6`S!1489^+A66r!   c                 H    t        | j                  j                               S )zCount of physical rows.)rB   r   rA   r4   s    r    	row_countzTextBlock.row_count`   s     4::44677r!   c                 z    t         |          }|j                  d| j                  j                         i       |S )Nr   )r   storeupdater   )r   resr   s     r    rN   zTextBlock.storef   s8    gmo

TZZ%%'
 	 
r!   c                     t        |t        t        t        f      r#|D ]  }| j                  j                  |        y| j                  j                  |       y)zAdd line or lines to TextBlock.N)
isinstancer   listtupler   append)r   line_or_linesr'   s      r    addzTextBlock.addn   sH    meT5%9:% (

!!$'( JJm,r!   c                     t        d      }t        | 	  ||d       | j                  D ]O  }t        d      }|j                  ||       |j                  D ]   }t        d      }|j                  ||       " Q y)	ztPlot block/line/span area for debug purpose.
        
        Args:
            page (fitz.Page): pdf page.
        bluez[3.0 3.0] 0)strokedashesred)rZ   r#   )colorN)r   r   plotr   spans)r   pagerY   r'   r\   spancr   s          r    r^   zTextBlock.plotw   s     'v.T$}= JJ 	)D)%0CIId3I' 

 )+B/		$a	()	)r!   c                     d}|D ]r  }|j                  t        j                        s|j                  r/| j                  j                  |j                        sU| j                  j                  |      sqd}t |S )zParse text format with style represented by rectangles.
        
        Args:
            shapes (Shapes): Shapes representing potential styles applied on blocks.
        FT)equal_to_typer
   	HYPERLINKis_determinedr   
intersectsr   parse_text_format)r   shapesflagshapes       r    rh   zTextBlock.parse_text_format   su       	E &&x'9'9:u?R?RT\ 99''

3X zz++E2	 r!   line_separate_thresholdline_break_width_ratioline_break_free_space_ratiolines_left_aligned_thresholdlines_right_aligned_thresholdlines_center_aligned_thresholdc                 \   | j                   rdnd\  }}	}
| j                  |||	|
f||||      | _        | j                  t        j                  k(  r0t        j
                  | _        | j                  j                  |       | j                  }|dk(  r%| j                  t        j
                  k(  rd| _	        nZ|dk(  r%| j                  t        j                  k(  rd| _        n0|dk(  r+| j                  t        j                  k(  rd| _        d| _	        | j                  j                  |||       y)a   Set horizontal spacing based on lines layout and page bbox.
        
        * The general spacing is determined by paragraph alignment and indentation.
        * The detailed spacing of block lines is determined by tab stops.

        Multiple alignment modes may exist in block (due to improper organized lines
        from ``PyMuPDF``), e.g. some lines align left, and others right. In this case,
        **LEFT** alignment is set, and use ``TAB`` to position each line.
        )r   r   g      ?)   r   g      r   r   N)r@   _parse_alignment	alignmentr   NONELEFTr   parse_tab_stoprL   right_spaceRIGHT
left_spaceCENTERparse_line_break)r   r   rl   rm   rn   ro   rp   rq   idx0idx1frL   s               r    parse_horizontal_spacingz"TextBlock.parse_horizontal_spacing   s   $ (,'>'>LdA ..t4O#()*, >>]///*//DNJJ%%&=>
 NN	a<DNNm.@.@@ D\dnn0C0CCDO\dnn0D0DDDO D 	

##D"'	)r!   c                    | j                   D ]5  }t        d |j                  D              s t        j                  | _         y | j                  rdnd}| j                  |dz      | j                  |   z
  }| j                   j                         }d fdt        fd|D              }||z  }t        |      dkD  rt        |t        j                        }|| _        y)	al  Calculate relative line spacing, e.g. `spacing = 1.02`.  Relative line spacing is based on standard 
        single line height, which is font-related. 

        .. note::
            The line spacing could be updated automatically when changing the font size, while the layout might
            be broken in exact spacing mode, e.g. overlapping of lines.
        c              3   B   K   | ]  }t        |t              s|  y wr.   )rR   r	   r0   ra   s     r    r1   z8TextBlock.parse_relative_line_spacing.<locals>.<genexpr>   s     OT:dI3NDOs   Nr   r   r   c                 :    t        d | j                  D              S )Nc              3   4   K   | ]  }|j                     y wr.   )line_heightr   s     r    r1   zJTextBlock.parse_relative_line_spacing.<locals>.<lambda>.<locals>.<genexpr>   s     .WDt/?/?.Wr2   )r=   r_   )r'   s    r    r?   z7TextBlock.parse_relative_line_spacing.<locals>.<lambda>   s    3.WDJJ.W+W r!   c                 ,    t        fd| D              S )Nc              3   .   K   | ]  } |        y wr.    )r0   r'   fun_max_line_heights     r    r1   zJTextBlock.parse_relative_line_spacing.<locals>.<lambda>.<locals>.<genexpr>   s     ,W4-@-F,W   r<   )r>   r   s    r    r?   z7TextBlock.parse_relative_line_spacing.<locals>.<lambda>   s    ,WSV,W)W r!   c              3   .   K   | ]  } |        y wr.   r   )r0   r>   fun_max_row_heights     r    r1   z8TextBlock.parse_relative_line_spacing.<locals>.<genexpr>   s     F#05Fr   )r   rS   r_   r   DEFAULT_LINE_SPACING
line_spacer@   r   rA   rC   rB   r=   )	r   r'   r;   rG   rE   standard_heightr   r   r   s	          @@r    parse_relative_line_spacingz%TextBlock.parse_relative_line_spacing   s     JJ 	DOTZZOO"+"@"@	 **ayyQ'		#6 zz002WWFFF!/1
 t9Q;SY5S5S%T
$r!   c                    | j                   rdnd}| j                  d   j                  }||dz      ||   z
  }| j                  |dz      | j                  |   z
  }| j                  }|dkD  r||z
  |dz
  z  }n|}|| _        | xj
                  ||z
  z  c_        | j
                  dk  r*| xj                  | j
                  |z  z  c_        d| _        yy)a0  Calculate exact line spacing, e.g. `spacing = Pt(12)`. 

        The layout of pdf text block: line-space-line-space-line, excepting space before first line, 
        i.e. space-line-space-line, when creating paragraph in docx. So, an average line height is 
        ``space+line``. Then, the height of first line can be adjusted by updating paragraph before-spacing.

        .. note::
            Compared with the relative spacing mode, it has a more precise layout, but less flexible editing
            ability, especially changing the font size.
        r   r   r           N)r@   r   r   rL   r   before_space)r   r;   r   first_line_heightrG   countr   s          r    parse_exact_line_spacingz"TextBlock.parse_exact_line_spacing   s     **azz!}!! QK$s)3yyQ'		#6 19&'8857CJ%J$
 	.;; q OOt00588O #D !r!   c                     t        j                  |      }t        t        | j                  d      d      }t        t        | j
                  d      d      }t        |      |_        t        |      |_        | j                  dk(  r%t        t        | j                  d            |_        nt        | j                  d      |_        | j                  }| j                  dk  r|| j                  z  }t        |      |_        t        | j                        |_        t        | j                        |_        | j$                  t&        j(                  k(  rt*        j(                  |_        | j,                  D ]3  }|j,                  j/                  t        | j                  |z                5 t1        | j                  t2        j4                  z  d      }t7        |      |_        n| j$                  t&        j8                  k(  rCt*        j8                  |_        t1        |t2        j4                  z  d      }t7        |      |_        n| j$                  t&        j:                  k(  rzt*        j:                  |_        t1        |t2        j4                  z  d      }t7        |      |_        t1        | j                  t2        j4                  z  d      }t7        |      |_        nt*        j<                  |_        | j>                  D ]  }|jA                  |        |S )a  Create paragraph for a text block.

        Refer to ``python-docx`` doc for details on text format:

        * https://python-docx.readthedocs.io/en/latest/user/text.html
        * https://python-docx.readthedocs.io/en/latest/api/enum/WdAlignParagraph.html#wdparagraphalignment
        
        Args:
            p (Paragraph): ``python-docx`` paragraph instance.

        .. note::
            The left position of paragraph is set by paragraph indent, rather than ``TAB`` stop.
        r   r   r   r   )!r   reset_paragraph_formatr=   roundr   after_spacer   space_beforespace_afterline_space_typer   line_spacingr{   first_line_spaceleft_indentry   right_indentfirst_line_indentru   r   rw   r   	tab_stopsadd_tab_stopr   r   ITPr   rz   r|   JUSTIFYr   	make_docx)	r   ppfbefore_spacingafter_spacingr{   posdr'   s	            r    r   zTextBlock.make_docx   sH    ((+
 U4#4#4a8#>E$"2"2A6<^,M* " t!:;BO#DOOQ7BO oo
  "$///JZ.d../!$"7"78
 >>=----22BL~~ E))"T__s-B*CDE D,,Y]]:A>A$QiBO^^]000-33BL Jy}}4a8A#AYBN^^]111-44BL Jy}}4a8A#AYBND,,Y]]:A>A$QiBO .55BL
 JJ1Dq 11r!   text_direction_paramc                 h   |\  }}}	t        | j                  |   ||   z
  |	z  d      t        ||   | j                  |   z
  |	z  d      }
t        |
z
  dz  d      t        d      t        |
d      }
t        ||   ||   z
        | _        |
| _        | j                  j                         }|D ]|  }t        |      dk(  rt        dt        |            D cg c]0  }||   j                  |   ||dz
     j                  |   z
  |	z  |k\  2 }}t        |      slt        j                  c S  fd}t        |      dk(  r |       S |D cg c]  }|d   j                  |    }}|D cg c]  }|d   j                  |    }}t        ||      D cg c]  \  }}||z   dz   }}}t        |      dk\  r
|dd |dd }}t        t        |      t        |      z
        |k  }t        t        |      t        |      z
        |k  }t        t        |      t        |      z
        k  }|r(|r&t        |      dk\  rt        j                  n |       }ng|rt        j                   }nT|rt        j"                  }nA|r/t        |      dk\  rt        j$                  nt        j"                  }nt        j                  }|t        j"                  k(  s|t        j                  k(  r0|d   d   j                  |   |d   d   j                  |   z
  | _        |S c c}w c c}w c c}w c c}}w )	a  Detect text alignment mode based on layout of internal lines. It can't decide when only
        one line, in such case, the alignment mode is determined by externally check.
        
        Args:
            text_direction_param (tuple): ``(x0_index, x1_index, direction_factor)``, 
                e.g. ``(0, 2, 1)`` for horizontal text, while ``(3, 1, -1)`` for vertical text.
        r   g       @r   c                      t              k  rt        j                  S d z  k  rt        j                  S t        j                  S )Ng      ?)r:   r   r|   rw   rz   )Wd_centerd_leftrq   s   r    external_alignmentz6TextBlock._parse_alignment.<locals>.external_alignment  s@    8}==$+++46!$)))$***r!   r   rs   N)r   r   r=   r:   r{   ry   r   rA   rB   rangeanyr   rv   zipminr   r|   rw   rz   r   )r   r   r   rl   ro   rp   rq   r~   r   r   d_rightrE   r>   idisr   r   X0X1x0x1Xleft_alignedright_alignedcenter_alignedru   r   r   r   s         `                   @@@r    rt   zTextBlock._parse_alignmentt  s    -dA $))D/$t*4a7;$t*TYYt_4a7;&.#-q1vs#w$T
4:%& "" zz002 	*C3x{H!&q#c(!35 FKK%c!A#hmmD&991<>UU 5C 53x$)))	*	+ t9>"4"66 155ueAhmmD!55045ueBinnT"55*-b"+6hr2r"uck66t9>BqrFBqHr2SWSW_-/KKSWSW_-/LLSVCF]+-KKM14TA--CUCWI%,,I%**I/24y!|++ASASI &**I m(((I}7L7L,L$(GAJOOD$9DGAJOOD<Q$QD!5B 656s    5LL$6L) L.r.   )__name__
__module____qualname____doc__dictr   propertyr$   r+   r/   r6   rJ   rL   rN   rW   r^   rh   floatr   r   r   r   rT   rt   __classcell__)r   s   @r    r   r   '   s)   4  # #
 # #
 A A ) ) 7 7  8 8
-),22),12)+02) 162) 27	2)
 382) 492)j%>#$LPhc).c,1c 27c 38	c
 49cr!   r   N)r   docx.sharedr   r   docx.enum.textr   r   image.ImageSpanr	   common.sharer
   r   r   common.Blockr   r   commonr   r   r   r   r!   r    <module>r      s8   2 $ -  ' A A   A  p pr!   