
    kh=                         d Z ddlZddlmZmZ ddlmZ ddlmZ ddlm	Z	 dd	l
mZmZmZmZ dd
lmZ ddlmZ ddlmZ  G d de	      Zy)a  Text Span object based on PDF raw dict extracted with ``PyMuPDF``.

Data structure for Span refer to
this `link <https://pymupdf.readthedocs.io/en/latest/textpage.html>`_::

    {
        # raw dict
        ---------------------------
        'bbox': (x0,y0,x1,y1),
        'color': sRGB
        'font': fontname,
        'size': fontsize,
        'flags': fontflags,
        'chars': [ chars ],

        # added dict
        ----------------------------
        'text': text,
        'style': [
            {
                'type': int,
                'color': int,
                'uri': str    # for hyperlink
            },
            ...
        ]
    }
    N)PtRGBColor)qn   )Char   )Element)RectType	rgb_valuergb_componentdecode)	constants)docx)Shapec                        e Zd ZdZddef fdZed        Zej                  d        Zd Z	ed        Z
defd	Zd
efdZd Zd Z fdZdef fdZddedefdZddedefdZd Zd Zd Z xZS )TextSpanzObject representing text span.rawc                 L   |xs i }|j                  dd      | _        |j                  dd      | _        |j                  dg       D cg c]  }t        |       }}|D cg c]  }|j                  dvs| c}| _        |j                  dd      | _        t        |j                  dd            | _        |j                  d	d
      | _	        |j                  dd      | _
        |j                  dd      | _        |j                  dd      | _        |j                  dg       | _        |j                  dd      | _        t        | A  |       | j
                  r=d| j                  j#                         v r | j%                  t&        j(                         y y y c c}w c c}w )Ncolorr   flagschars) u   �textr   fontsizeg      (@ascender      ?	descender        line_heightstylechar_spacingUNNAMED)getr   r   r   cr   _textr   r   r   r   r   r    r"   r#   super__init__upper_change_font_and_update_bboxr   DEFAULT_FONT_NAME)selfr   r&   r   char	__class__s        R/var/www/teggl/fontify/venv/lib/python3.12/site-packages/pdf2docx/text/TextSpan.pyr)   zTextSpan.__init__+   s[   iRWWWa(
WWWa(
 #&'''2"67Qa77',Mtn0LdM
WWVR(

 37762./	GGFD)	
C0c277="5 WWWb)

  GGNC8 	 ::)tyy'88--i.I.IJ 9:7 8Ms   
F"F!6F!c                     | j                   r3dj                  | j                   D cg c]  }|j                   c}      S | j                  S c c}w )z:Get span text. Note joining chars is in a higher priority.r   )r   joinr&   r'   r-   r.   s     r0   r   zTextSpan.textP   s9     <@::rww4::6467U4::U6s   Ac                     || _         y)zLSet span text directly in case no chars are stores, e.g. restored from json.N)r'   )r-   values     r0   r   zTextSpan.textU   s     
    c                 n    t        j                         }| j                  D ]  }||j                  z  } |S )z,Calculate bbox based on contained instances.)fitzRectr   bbox)r-   r:   r.   s      r0   cal_bboxzTextSpan.cal_bboxZ   s.    yy{JJ1D		 11r6   c                      | j                   dk7  S )Nr!   )r    )r-   s    r0   is_valid_line_heightzTextSpan.is_valid_line_height`   s    +/+;+;R+?$?r6   	font_namec                    || _         t        j                  |      }|j                  | j                  | j
                        }|| j                  j                  kD  r,| xj
                  | j                  j                  |z  z  c_        | j                  \  }}}}t        j                  dd||f      }|j                  | j                  d   j                  | j                  || j
                        \  }	}
|	j                  | j
                  z
  dz  }|	j                  |z   }|	j                  |z
  }| j                  ||||f       | j                  D ](  }|j                  \  }}
}}
|j                  ||||f       * y)a  Set new font, and update font size, span/char bbox accordingly.

        It's generally used for span with unnamed fonts.
        See this `issue <https://github.com/pymupdf/PyMuPDF/issues/642>`_.

        In corner case, where the PDF file containing unnamed and not embedded fonts, the span bbox
        extracted from ``PyMuPDF`` is not correct. ``PyMuPDF`` provides feature to replace these
        unnamed fonts with specified fonts, then extract correct bbox from the updated PDF. Since we
        care less about the original PDF itself but its layout, the idea here is to set a default
        font for text spans with unnamed fonts, and estimate the updated bbox with method from
        ``fitz.TextWriter``.

        Args:
            font_name (str): Font name.
        )fontsizer   )r   r@          @N)r   r8   Fonttext_lengthr   r   r:   width
TextWriterappendr   originheighty0y1update_bbox)r-   r>   r   
new_lengthx0rI   x1rJ   twrect_buffr.   s                r0   r+   z%TextSpan._change_font_and_update_bboxd   sL   " 	 yy#%%dii$))%D
		'II:55I BB__aB^,))JJqM  IIYY	  
a DII%s*WWt^WWt^"b"b)* JJ 	/D99LB2qb"b"-.	/r6   r.   c                 \    | j                   j                  |       | j                  |       y)z%Add char and update bbox accordingly.N)r   rF   
union_bboxr3   s     r0   addzTextSpan.add   s     

$r6   c                     | j                   }|j                  d      syt        |      t        |j                               z
  }| j                  |dz
  d | _        | j                  | j                                y)z3Remove blanks at the left side, but keep one blank.  Fr   NrP   T)r   
startswithlenlstripr   rK   r;   r-   original_text
num_blankss      r0   r[   zTextSpan.lstrip   sj    		''.u '#m.B.B.D*EE
ZZ
1.
dmmo.r6   c                     | j                   }|j                  d      syt        |      t        |j                               z
  }| j                  dd|z
   | _        | j                  | j                                y)z4Remove blanks at the right side, but keep one blank.rW   FNr   rX   T)r   endswithrZ   rstripr   rK   r;   r\   s      r0   ra   zTextSpan.rstrip   sj    		%%e,U '#m.B.B.D*EE
ZZ:.
dmmo.r6   c                     t         |          }|j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  d       |S )N)r   r   r   r    r   r   r"   r#   )r(   storeupdater   r   r   r    r   r   r"   r#   )r-   resr/   s     r0   rc   zTextSpan.store   sb    gmo

ZZIIII++ZZIIZZ --	
 		 
r6   r   c                 ,    t         |   |||d       y )Nr   )strokefillrD   )r(   plot)r-   pager   r/   s      r0   ri   zTextSpan.plot   s    uw|DUZ[|'\r6   rP   
horizontalc                    j                   | j                   z  }|j                  r| gS g }r7| j                   j                  |_        | j                   j                  |_        n6| j                   j                  |_        | j                   j
                  |_        fd}t        t        |t        | j                                    }|r|d   d   nd}t        |      }t        ||z   d      }	|dkD  r߉rM| j                   j                  | j                   j                  |j                  | j                   j                  f}
nL| j                   j                  |j                  | j                   j
                  | j                   j                  f}
| j                         j                  |
      }| j                  d| |_	        |j                  |       |dkD  r|j                  |j                  |j
                  |j                  f}
| j                         j                  |
      }| j                  ||	 |_	        |j                         |j                  |       |	t        | j                        k  r߉rM|j
                  | j                   j                  | j                   j
                  | j                   j                  f}
nL| j                   j                  | j                   j                  | j                   j
                  |j                  f}
| j                         j                  |
      }| j                  |	d |_	        |j                  |       |S )a  Split span with the intersection: span-intersection-span.

        Args:
            rect (Shape): Target shape to split this text span.
            horizontal (bool, optional): Text direction. Defaults to True.

        Returns:
            list: Split text spans.
        c                 .    | d   j                        S )Nr   )contained_in_rect)itemsrk   rP   s    r0   <lambda>z TextSpan.split.<locals>.<lambda>   s    %(44T:F r6   r   r!   N)r:   is_emptyrI   rJ   rM   rN   listfilter	enumerater   rZ   maxcopyrK   rF   _parse_text_format)r-   rP   rk   intsecsplit_spansfindex_charsposlengthpos_endr:   
split_spans    ``         r0   splitzTextSpan.split   si    TYY& ??D6M
 
 		FI		FI		FI		FI G6!Ytzz%:;< $/k!nQB[!c&j!$ 7		diillFIItyy||L		fiityy||L006J#zz!C0Jz* A:IIvyy&))VYY?D006J#zz#g6J))$
;z* S_$		499<<tyy||L		diillDIILL&))L006J#zz'(3Jz*r6   c                    |j                  t        j                        s|j                  t        j                        ry|j                  t        j                        r>| j
                  j                  |j                  |j                  |j                  d       y|rdnd}|j                  |dz      |j                  |   z
  }| j                  |dz      | j                  |   z
  }t        | j                  |dz      |j                  |   z
        }|d|z  k\  rN|j                  t        d      k7  r}| j                  |t        j                        r]t        j                   |_        nG|d	|z  k  rt        j"                  |_        n)d
|z  |cxk  rd|z  k  rn nt        j$                  |_        |j&                  sy|j                  |j                  d}| j
                  j                  |       y)a?  Parse text style based on the position to a rect shape.

        Args:
            rect (Shape): Target rect shape representing potential text style.
            horizontal (bool, optional): Horizontal text direction. Defaults to True.

        Returns:
            bool: Parsed text style successfully or not.
        F)typer   uriTr   r   r   g      ?)r   r   r   g      ?gffffff?g      ?)r   r   )equal_to_typer
   BORDERSHADING	HYPERLINKr"   rF   r   r   r   r:   absr   get_main_bboxr   FACTOR_MAJOR	HIGHLIGHT	UNDERLINESTRIKEis_determined)r-   rP   rk   idxh_recth_spandr"   s           r0   rw   zTextSpan._parse_text_format  s    hoo.$2D2DXEUEU2V h001JJ		xx 
  a1 3q5!DIIcN23q5!DIIcN2 		#a% 499S>12 SZzzYw//""4)?)?@$..	 $v+ **DI &[1*tF{* DI !!% IIZZ
 	

% r6   c                    |j                  | j                        r| j                         S |j                  | j                        s
t	               S | j                         }|j
                  j                          |j                  d       | j
                  D ]O  }|j                  |t        j                        s$|j
                  j                  |       |j                  |       Q |S )zCreate new TextSpan object with chars contained in given bbox.

        Args:
            rect (fitz.Rect): Target bbox.
        )r   r   r   r   )containsr:   rv   
intersectsr   r   clearrK   r   r   FACTOR_A_HALFrF   rT   )r-   rP   spanr.   s       r0   r   zTextSpan.intersectsN  s     ==#99; tyy): yy{

*+JJ 	&D!!$	(?(?@

!!$'%	&
 r6   c                    | j                   D ]c  }|d   t        j                  j                  k(  s$| j                  j                         s?t        j                  ||d   | j                        } n |j                  | j                        }| j                  |       | j                  r!t        j                  || j                         yy)a  Add text span to a docx paragraph, and set text style, e.g.
        font, color, underline, hyperlink, etc.

        .. note::
            Hyperlink and its style is parsed separately from pdf. For instance, regarding a general
            hyperlink with an underline, the text and uri is parsed as hyperlink itself, while the
            underline is treated as a normal text style.
        r   r   N)r"   r
   r   r5   r   stripr   add_hyperlinkadd_run_set_text_formatr#   set_char_spacing)r-   	paragraphr"   docx_runs       r0   	make_docxzTextSpan.make_docxi  s     ZZ 	4EV}h00666499??;L--iutyyQ	4
 !((3H 	h' !!(D,=,=> r6   c                 n   t        | j                  dz        |_        t        | j                  dz        |_        t        | j                  dz        |_        | j
                  }||j
                  _        |j                  j                  j                  j                  t        d      |       t        t        | j                         |j
                  j                  _        t!        | j"                  dz        dz  }t%        |      |j
                  _        | j"                  |xs | j"                  xs dz  }t'        |dz
        dk\  rt)        j*                  ||       | j,                  D ]  }|d   }|t.        j0                  j2                  k(  rt)        j4                  ||d	          ?|t.        j6                  j2                  k(  r>| j                  |d	   k(  rd
|j
                  _        t)        j:                  ||d	          |t.        j<                  j2                  k(  sd
|j
                  _         y)z/Set text format for ``python-docx.run`` object.r   r      z
w:eastAsiarA   r   g{Gz?r   r   TN) boolr   superscriptitalicboldr   name_elementrPrrFontssetr   r   r   r   rgbroundr   r   r   r   set_char_scalingr"   r
   r   r5   set_char_shadingr   	underlineset_char_underliner   strike)r-   r   r>   	font_sizescaler"   ts          r0   r   zTextSpan._set_text_format  s     $DJJ$56tzzD01TZZ$./ II	&$$((L)99E"*M$**,E"F
 $))A+&s*		] 		Y8$))8q9uSy>4!!(E2 ZZ 	,EfA ($$***%%hg? H&&,,,::uW~-.2HMM+++HeGnE HOO)))'+$%	,r6   )N)T)__name__
__module____qualname____doc__dictr)   propertyr   setterr;   r=   strr+   r   rU   r[   ra   rc   tupleri   r   r   r   rw   r   r   r   __classcell__)r/   s   @r0   r   r   )   s    ("K4 "KJ V V 
[[  ? ?./S ./bt 		 ]u\H H4 HV;e ; ;|6?25,r6   r   )r   r8   docx.sharedr   r   docx.oxml.nsr   r   common.Elementr	   common.sharer
   r   r   r   commonr   r   shape.Shaper   r    r6   r0   <module>r      s8   :  $   $ G G   N,w N,r6   