
    khh(                     n    d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lm	Z	 dd
l
mZ  G d de      Zy)zA group of Line objects.
    N   )Line)TextSpan   )	ImageSpan)ElementCollection)TextAlignment)	constantsc                   z    e Zd ZdZed        ZdefdZed        Zde	de	fdZ
d	efd
Zd Zde	de	fdZde	fdZy)LineszCollection of text lines.c                 v    t        |       sy| j                  d   t        fd| j                  D              S )z-Whether all contained lines have same parent.Fr   c              3   @   K   | ]  }|j                          y wN)same_source_parent).0line
first_lines     O/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/text/Lines.py	<genexpr>z&Lines.unique_parent.<locals>.<genexpr>   s     S44**:6Ss   )bool
_instancesall)selfr   s    @r   unique_parentzLines.unique_parent   s1     Dz%__Q'
S4??SSS    rawsc                 L    |D ]  }t        |      }| j                  |         | S )z$Construct lines from raw dicts list.)r   append)r   r   rawr   s       r   restorezLines.restore   s-     	C9DKK	 r   c                 b    g }| j                   D ]  }|j                  |j                          |S )zGet all ImageSpan instances.)r   extendimage_spans)r   spansr   s      r   r#   zLines.image_spans$   s3     OO 	+DLL))*	+r   line_break_free_space_rationew_paragraph_free_space_ratioc                    | j                         }t        |      }|dk(  r|S t        d |dd D              }t        d |D              |z  }g }t	               }t        t        j                        }	dx}
}dx}}|D ]  }|d   j                  j                         j                  |	      }|d   j                  d   |d   j                  d   z
  }|r||z  d	|z
  k  rd
}n|r||z
  |z  |k\  rd
}
|r-|j                  |       |j                  |       t	               }n@|
r-|j                  |       t	               }|j                  |       n|j                  |       |}dx}
} |r|j                  |       |S )a  Split lines into separate paragraph by checking text. The parent text block consists of 
        lines with similar line spacing, while lines in other paragraph might be counted when the
        paragraph spacing is relatively small. So, it's necessary to split those lines by checking
        the text contents.

        .. note::
            Considered only normal reading direction, from left to right, from top
            to bottom.
        r   c              3   f   K   | ])  }|d    j                   d   |d   j                   d   z
   + yw)r   r   Nbboxr   rows     r   r   z1Lines.split_vertically_by_text.<locals>.<genexpr>>   s-     C3BQAA.C   /1Nc              3   f   K   | ])  }|d    j                   d   |d    j                   d   z
   + yw)r      r   Nr*   r,   s     r   r   z1Lines.split_vertically_by_text.<locals>.<genexpr>?   s-     >#AAs1v{{1~->r.   Fr)   r   r   g      ?T)group_by_physical_rowslenmaxsumr   tupler
   SENTENCE_END_PUNCtextstripendswithr+   r"   r   )r   r%   r&   rowsnumWHreslinespuncstart_of_paraend_of_parastart_of_sen
end_of_senr-   ws                   r   split_vertically_by_textzLines.split_vertically_by_text-   s    **, $i6$; C$qr(CC>>>D Y001&++$))z 	0CR++-66t<JRaQQ/A acS)D%DD" 1Q3'-K"K $ S!

5!

5!S!S! &L*//MK5	0: #**U#
r   delete_end_line_hyphenc                 ~   dj                  d t        j                  D              fd}t        | j                  dd       D ]  \  }}|j
                  d   }t        |t              s&|j                  }|s5|d   }| j                  |dz      j
                  d   }t        |t              sj|j                  }	|	sy|	d   }
|r<|j                  j                  d      r!|
j                  j                         rd|_	         ||j                        s ||
j                        s|xj                  d	z  c_	         y)
zAdjust word at the end of line:
        # - it might miss blank between words from adjacent lines
        # - it's optional to delete hyphen since it might not at the the end 
           of line after conversion
         c              3   ,   K   | ]  }|d k7  s	|  yw)-N )r   cs     r   r   z)Lines.adjust_last_word.<locals>.<genexpr>o   s      Iq!S& Is   
c                 T    | j                         j                         xs | xr | v S r   )encodeisalnum)rM   punc_ex_hyphens    r   is_end_of_english_wordz6Lines.adjust_last_word.<locals>.is_end_of_english_wordp   s'    88:%%'FA,E!~2EFr   Nr)   r   r   rK    )joinstringpunctuation	enumerater   r$   
isinstancer   charsrM   r9   islower)r   rG   rR   ir   end_span	end_charsend_char
start_spanstart_charsnext_start_charrQ   s              @r   adjust_last_wordzLines.adjust_last_wordi   s     IF,>,> II	G !"!56 	"GAtzz"~Hh18 Ih }H 1-33A6Jj(3X$**K)!nO &

##C(_->->-F-F-H

 &hjj16L_M^M^6_

c!
1	"r   c                     d}| j                   D ]  }|j                  t        j                        }|j                  j                  |      s1|j                  j                  |j                  j                  k  r |S ng }|j                  D ]S  }t        |t              r|j                  |       %|j                  ||j                        }|j                  |       d}U |j                  j                  |        |S )zParse text format with style represented by rectangle shape.
        
        Args:
            shape (Shape): Potential style shape applied on blocks.
        
        Returns:
            bool: Whether a valid text style.
        FT)r   get_expand_bboxr
   
MAJOR_DISTr+   
intersectsy1y0r$   rX   r   r   splitis_horizontal_textr"   reset)r   shapeflagr   expanded_bboxsplit_spansspanr$   s           r   parse_text_formatzLines.parse_text_format   s     OO 	*D 001E1EFM::((7::==499<</$ #  K

  dI.0B0B40H !JJud.E.EFE&&u-D  JJ[))	*, r   line_break_width_ratioc                 |   | j                   }|j                  rdnd\  }}t        |j                  |   |j                  |   z
        }||   ||   z
  }||z  |k  }	| j	                         }
|
D ]  }|D ]	  }d|_         |j                  t        j                  k(  r.|d   }t        |j                  |   |j                  |   z
        }n-|d   }t        |j                  |   |j                  |   z
        }|j                  t        j                  k(  r|dz  }|	s	||z  |kD  sd|_         |
d   D ]	  }d|_         y)a  Whether hard break each line. 

        Args:
            bbox (Rect): bbox of parent layout, e.g. page or cell.
            line_break_width_ratio (float): user defined threshold, break line if smaller than this value.
            line_break_free_space_ratio (float): user defined threshold, break line if exceeds this value.

        Hard line break helps ensure paragraph structure, but pdf-based layout calculation may
        change in docx due to different rendering mechanism like font, spacing. For instance, when
        one paragraph row can't accommodate a Line, the hard break leads to an unnecessary empty row.
        Since we can't 100% ensure a same structure, it's better to focus on the content - add line
        break only when it's necessary to, e.g. short lines.
        r   r   r0   r   r   r)   r   r   N)
parentrj   absr+   r1   
line_break	alignmentr	   RIGHTCENTER)r   r+   rr   r%   blockidx0idx1block_widthlayout_widthrx   r:   r?   r   end_line
free_spaces                  r   parse_line_breakzLines.parse_line_break   sH   " $77VV
d%**T*5::d+;;<DzDJ. !-1GG
 **, 	(E2t2 -"5"55 8 D!1(--2E!EF
 9 D!1(--2E!EF
-"6"66
a

 Z36QQ&'##	(( H1Ddo1r   line_separate_thresholdc                   
 | j                   

j                  rdnd\  }
fd}t        t        || j                              }t        t        d |            j                          
_        
j                  sy
fd}
j                     }t        | j                        D ]  \  }}|j                     |z
  }	|	|kD  r# ||j                            ||      z
  |_        || j                  d   k(  r y|j                  | j                  |dz            r|j                  |   n
j                     } y)	zCalculate tab stops for parent block and whether add TAB stop before each line. 

        Args:
            line_separate_threshold (float): Don't need a tab stop if the line gap less than this value.
        rt   ru   c                 h    t        t        | j                     j                     z
        d      S )Nr   )roundrw   r+   )r   r|   r}   s    r   <lambda>z&Lines.parse_tab_stop.<locals>.<lambda>   s)    5TYYt_UZZ5E%E!FJ r   c                 (    | t         j                  k\  S r   )r
   
MINOR_DIST)poss    r   r   z&Lines.parse_tab_stop.<locals>.<lambda>   s    C1E1E,E r   Nc                 X    d}| j                      z  } D ]  }| |k  r |S |dz  } |S )Nr   r   r*   )r   r[   tr|   r}   	tab_stopss      r   tab_positionz*Lines.parse_tab_stop.<locals>.tab_position   sJ    A5::d##C q5%H Q Hr   r)   r   )rv   rj   setmapr   listfiltersortr   r+   rW   tab_stopin_same_row)r   r   r~   funall_posr   refr[   r   distancer|   r}   r   s             @@@r   parse_tab_stopzLines.parse_tab_stop   s     $77VV
dJc#t/0 EwOP	# 	 jj 1 	bGAtyy,H// ,TYYt_ =l3>O O T__R((%%)%5%5dooac6J%K$))D/QVQ[Q[\`QaC	br   N)__name__
__module____qualname____doc__propertyr   r   r    r#   floatrF   r   rb   rq   r   r   rL   r   r   r   r      s    #T T4   95 9in 9x""d ""J!H/2',/2,1/2d(bU (br   r   )r   rU   r   r   image.ImageSpanr   common.Collectionr   common.sharer	   commonr
   r   rL   r   r   <module>r      s2       ' 1 ( |b |br   