#!/usr/bin/env python3
"""
:mod:`apsw.unicode` - Up to date Unicode aware methods and lookups
This module helps with :doc:`textsearch` and general Unicode,
addressing the following:
* The standard library :mod:`unicodedata` has limited information
  available (eg no information about emoji), and is only updated to
  new `Unicode versions
  <https://www.unicode.org/versions/enumeratedversions.html>`__ on a
  new Python version.
* Multiple consecutive codepoints can combine into a single user
  perceived character (grapheme cluster), such as combining accents,
  vowels and marks in some writing systems, variant selectors, joiners
  and linkers, etc.  That means you can't use indexes into
  :class:`str` safely without potentially breaking them.
* The standard library provides no help in splitting text into
  grapheme clusters, words, and sentences, or into breaking text into
  multiple lines.
* Text processing is performance sensitive - FTS5 easily handles
  hundreds of megabytes to gigabytes of text, and so should this
  module.  It also affects the latency of each query as that is
  tokenized, and results can highlight words and sentences.
This module is independent of the main apsw module, and loading it
does not load any database functionality.  The majority of the
functionality is implemented in C for size and performance reasons.
See :data:`unicode_version` for the implemented version.
Grapheme cluster, word, and sentence splitting
    `Unicode Technical Report #29
    <https://www.unicode.org/reports/tr29/>`__ rules for finding
    grapheme clusters, words, and sentences are implemented.  Tr29
    specifies break points which can be found via
    :func:`grapheme_next_break`, :func:`word_next_break`, and
    :func:`sentence_next_break`.
    Building on those are iterators providing optional offsets and the
    text.  This is used for tokenization (getting character and word
    boundaries correct), and for result highlighting (showing
    words/sentences before and after match).
Line break splitting
    `Unicode Technical Report #14
    <https://www.unicode.org/reports/tr14/>`__ rules for finding where
    text cam be broken and resumed on the next line.  Tr14 specifies
    break points which can be found via :func:`line_break_next_break`.
    Building on those are iterators providing optional offsets and the
    text.  This is used for :func:`text_wrap`.
Unicode lookups
   * Category information :func:`category`
   * Is an emoji or similar :func:`is_extended_pictographic`
   * Flag characters :func:`is_regional_indicator`
   * Codepoint names :func:`codepoint_name`
Case folding, accent removal
   * :func:`casefold` is used to do case insensitive comparisons.
   * :func:`strip` is used to remove accents, marks, punctuation,
     joiners etc
Helpers
These are aware of grapheme cluster boundaries which Python's builtin
string operations are not.  The text width functions take into account
how wide the text is when displayed on most terminals.
    * :func:`grapheme_length` to get the number of grapheme clusters
      in a string
    * :func:`grapheme_substr` to get substrings using grapheme cluster
      indexing
    * :func:`grapheme_startswith` and :func:`grapheme_endswith`
    * :func:`grapheme_find` to find a substring
    * :func:`split_lines` to split text into lines using all the
      Unicode hard line break codepoints
    * :func:`text_width` to count how wide the text is
    * :func:`expand_tabs` to expand tabs using text width
    * :func:`text_width_substr` to extract substrings based on text width
    * :func:`text_wrap` to wrap paragraphs using Unicode words, line
      breaking, and text width
    * :func:`guess_paragraphs` to establish paragraph boundaries for
      text that has line breaks in paragraphs like many plain text
      and similar markup formats.
Size
    Using the `ICU <https://icu.unicode.org/>`__ `extension
    <https://pypi.org/project/PyICU/>`__ is 5MB of code that then
    links to shared libraries containing another 5MB of code, and 30MB
    of data.  This module is 0.5MB, 5 to 50% faster, and has no
    dependencies.  (ICU includes numerous extra customisations,
    formatting, locale helpers etc.)
Performance
    There some pure Python alternatives, with less functionality.
    They take 5 to 15 times more CPU time to process the same text.
    Use ``python3 -m apsw.unicode benchmark --help``.
"""
from __future__ import annotations
from typing import Iterator, Iterable, Any
import re
### BEGIN UNICODE UPDATE SECTION ###
unicode_version = "16.0"
"""The `Unicode version <https://www.unicode.org/versions/enumeratedversions.html>`__
that the rules and data tables implement"""
class _Category:
    Cc = 2**0
    Cf = 2**1
    Cn = 2**2
    Co = 2**3
    Cs = 2**4
    Extended_Pictographic = 2**5
    Ll = 2**6
    Lm = 2**7
    Lo = 2**8
    Lt = 2**9
    Lu = 2**10
    Mc = 2**11
    Me = 2**12
    Mn = 2**13
    Nd = 2**14
    Nl = 2**15
    No = 2**16
    Pc = 2**17
    Pd = 2**18
    Pe = 2**19
    Pf = 2**20
    Pi = 2**21
    Po = 2**22
    Ps = 2**23
    Regional_Indicator = 2**24
    Sc = 2**25
    Sk = 2**26
    Sm = 2**27
    So = 2**28
    WIDTH_INVALID = 2**29
    WIDTH_TWO = 2**30
    WIDTH_ZERO = 2**31
    Zl = 2**32
    Zp = 2**33
    Zs = 2**34
### END UNICODE UPDATE SECTION ###
from . import _unicode
assert unicode_version == _unicode.unicode_version
_unicode_category = _unicode.category_category
def category(codepoint: int | str) -> str:
    """Returns the `general category <https://en.wikipedia.org/wiki/Unicode_character_property#General_Category>`__ - eg ``Lu`` for Letter Uppercase
    See :data:`apsw.fts5.unicode_categories` for descriptions mapping"""
    cat = _unicode_category(codepoint)
    if cat & _Category.Lu:
        return "Lu"  # Letter Uppercase
    elif cat & _Category.Ll:
        return "Ll"  # Letter Lowercase
    elif cat & _Category.Lt:
        return "Lt"  # Letter Titlecase
    elif cat & _Category.Lm:
        return "Lm"  # Letter Modifier
    elif cat & _Category.Lo:
        return "Lo"  # Letter Other
    elif cat & _Category.Mn:
        return "Mn"  # Mark NonSpacing
    elif cat & _Category.Mc:
        return "Mc"  # Mark SpacingCombining
    elif cat & _Category.Me:
        return "Me"  # Mark Enclosing
    elif cat & _Category.Nd:
        return "Nd"  # Number DecimalDigit
    elif cat & _Category.Nl:
        return "Nl"  # Number Letter
    elif cat & _Category.No:
        return "No"  # Number Other
    elif cat & _Category.Pc:
        return "Pc"  # Punctuation Connector
    elif cat & _Category.Pd:
        return "Pd"  # Punctuation Dash
    elif cat & _Category.Ps:
        return "Ps"  # Punctuation Open
    elif cat & _Category.Pe:
        return "Pe"  # Punctuation Close
    elif cat & _Category.Pi:
        return "Pi"  # Punctuation InitialQuote
    elif cat & _Category.Pf:
        return "Pf"  # Punctuation FinalQuote
    elif cat & _Category.Po:
        return "Po"  # Punctuation Other
    elif cat & _Category.Sm:
        return "Sm"  # Symbol Math
    elif cat & _Category.Sc:
        return "Sc"  # Symbol Currency
    elif cat & _Category.Sk:
        return "Sk"  # Symbol Modifier
    elif cat & _Category.So:
        return "So"  # Symbol Other
    elif cat & _Category.Zs:
        return "Zs"  # Separator Space
    elif cat & _Category.Zl:
        return "Zl"  # Separator Line
    elif cat & _Category.Zp:
        return "Zp"  # Separator Paragraph
    elif cat & _Category.Cc:
        return "Cc"  # Other Control
    elif cat & _Category.Cf:
        return "Cf"  # Other Format
    elif cat & _Category.Cs:
        return "Cs"  # Other Surrogate
    elif cat & _Category.Co:
        return "Co"  # Other PrivateUse
    assert cat & _Category.Cn
    return "Cn"  # Other NotAssigned 
def is_extended_pictographic(text: str) -> bool:
    "Returns True if any of the text has the extended pictographic property (Emoji and similar)"
    return _unicode.has_category(text, 0, len(text), _Category.Extended_Pictographic) 
def is_regional_indicator(text: str) -> bool:
    "Returns True if any of the text is one of the 26 `regional indicators <https://en.wikipedia.org/wiki/Regional_indicator_symbol>`__ used in pairs to represent country flags"
    return _unicode.has_category(text, 0, len(text), _Category.Regional_Indicator) 
def casefold(text: str) -> str:
    """Returns the text for equality comparison without case distinction
    Case folding maps text to a canonical form where case differences
    are removed allowing case insensitive comparison.  Unlike upper,
    lower, and title case, the result is not intended to be displayed
    to people.
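
    A short illustrative example (Unicode full case folding, where for
    example ``ß`` folds to ``ss``)::

        casefold("Hello") == casefold("HELLO")     # True
        casefold("straße") == casefold("STRASSE")  # True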
    """
    return _unicode.casefold(text) 
def strip(text: str) -> str:
    """Returns the text for less exact comparison with accents, punctuation, marks etc removed
    It will strip diacritics leaving the underlying characters so ``áççéñțś`` becomes ``accents``,
    punctuation so ``e.g.`` becomes ``eg`` and ``don't`` becomes ``dont``,  marks so ``देवनागरी``
    becomes ``दवनगर``, as well as all spacing, formatting, `variation selectors
    <https://en.wikipedia.org/wiki/Variation_Selectors_%28Unicode_block%29>`__ and similar codepoints.
    Codepoints are also converted to their compatibility representation.  For example
    the single codepoint Roman numeral ``Ⅲ`` becomes ``III`` (three separate regular upper case `I`),
    and ``🄷🄴🄻🄻🄾`` becomes ``HELLO``.
    The resulting text should not be shown to people, and is intended for doing relaxed equality
    comparisons, at the expense of false positives when the accents, marks, punctuation etc were
    intended.
    You should do :func:`case folding <casefold>` after this.
    Emoji are preserved but variation selectors, `fitzpatrick <https://en.wikipedia.org/wiki/Emoji#Skin_color>`__
    and `joiners <https://en.wikipedia.org/wiki/Zero-width_joiner>`__ are stripped.
    `Regional indicators <https://en.wikipedia.org/wiki/Regional_indicator_symbol>`__ are preserved.
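
    A sketch using the behaviours described above::

        strip("áççéñțś")  # -> "accents"
        strip("don't")    # -> "dont"
        strip("Ⅲ")        # -> "III"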
    """
    return _unicode.strip(text) 
def split_lines(text: str, offset: int = 0) -> Iterator[str]:
    """Each line, using hard line break rules
    This is a iterator yielding a line at a time.  The end of line
    yielded will not include the hard line break characters.
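
    A minimal sketch (``\\n`` is one of the hard break codepoints)::

        list(split_lines("one\\ntwo\\nthree"))  # ["one", "two", "three"]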
    """
    lt = len(text)
    while offset < lt:
        end = _unicode.line_next_hard_break(text, offset)
        for hard in range(-1, offset - end - 1, -1):
            if ord(text[end + hard]) not in _unicode.hard_breaks:
                yield text[offset : end + hard + 1]
                break
        else:
            # it was entirely hard break chars
            yield ""
        offset = end 
def expand_tabs(text: str, tabsize: int = 8, invalid: str = ".") -> str:
    """Turns tabs into spaces aligning on tabsize boundaries, similar to :meth:`str.expandtabs`
    This is aware of grapheme clusters and text width.  Codepoints
    that have an invalid width are also replaced by ``invalid``.
    Control characters are an example of an invalid character.  Line
    breaks are replaced with newline.
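
    A small example (8 column tab stops by default)::

        expand_tabs("ab\\tc")  # -> "ab      c" (6 spaces to reach column 8)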
    """
    res: list[str] = []
    for line in split_lines(text):
        # short cut
        if "\t" not in line and text_width(line) >= 0:
            res.append(line)
            continue
        # work on it cluster by cluster
        clusters: list[str] = []
        pos: int = 0
        for gr in grapheme_iter(line):
            if gr != "\t":
                w = text_width(gr)
                if w < 0:
                    gr = invalid
                    w = text_width(gr)
                pos += w
                clusters.append(gr)
            else:
                # str.expandtabs allows zero and negative numbers
                incr = tabsize - (pos % tabsize) if tabsize > 0 else 0
                clusters.append(" " * incr)
                pos += incr
        res.append("".join(clusters))
    return "\n".join(res) + ("\n" if len(res) > 1 else "") 
def grapheme_length(text: str, offset: int = 0) -> int:
    "Returns number of grapheme clusters in the text.  Unicode aware version of len"
    return _unicode.grapheme_length(text, offset) 
def grapheme_substr(text: str, start: int | None = None, stop: int | None = None) -> str:
    """Like ``text[start:end]`` but in grapheme cluster units
    ``start`` and ``end`` can be negative to index from the end, or
    outside the bounds of the text but are never an invalid
    combination (you get empty string returned).
    To get one grapheme cluster, make stop one more than start.
    For example to get the 3rd last grapheme cluster::
        grapheme_substr(text, -3, -3 + 1)
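
    An illustration with a combining accent, where ``é`` is the two
    codepoints ``e`` plus U+0301 COMBINING ACUTE ACCENT::

        grapheme_substr("e\\u0301xyz", 0, 1)  # the whole é cluster, not just "e"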
    """
    return _unicode.grapheme_substr(text, start, stop) 
def grapheme_endswith(text: str, substring: str) -> bool:
    "Returns True if `text` ends with `substring` being aware of grapheme cluster boundaries"
    # match str.endswith
    if len(substring) == 0:
        return True
    if text.endswith(substring):
        # it must end with the same codepoints, but also has to start at
        # a grapheme cluster boundary
        expected = len(text) - len(substring)
        boundary = 0
        while boundary < expected:
            boundary = _unicode.grapheme_next_break(text, boundary)
        return boundary == expected
    return False 
def grapheme_startswith(text: str, substring: str) -> bool:
    "Returns True if `text` starts with `substring` being aware of grapheme cluster boundaries"
    # match str.startswith
    if len(substring) == 0:
        return True
    if text.startswith(substring):
        # it must start with the same codepoints, but also has to end at
        # a grapheme cluster boundary
        expected = len(substring)
        boundary = 0
        while boundary < expected:
            boundary = _unicode.grapheme_next_break(text, boundary)
        return boundary == expected
    return False 
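
# A contrast with the builtin methods: "e\u0301" (e followed by combining
# acute accent) is one grapheme cluster, so "e\u0301".startswith("e") is
# True while grapheme_startswith("e\u0301", "e") is False - the match ends
# in the middle of the cluster.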
def grapheme_find(text: str, substring: str, start: int = 0, end: int | None = None) -> int:
    """Returns the offset in text where substring can be found, being aware of grapheme clusters.
    The start and end of the substring have to be at a grapheme cluster boundary.
    :param start: Where in text to start the search (default beginning)
    :param end: Where to stop the search exclusive (default remaining text)
    :returns: offset into text, or -1 if not found or substring is zero length
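
    A sketch of the difference from :meth:`str.find`, using ``e`` plus
    U+0301 COMBINING ACUTE ACCENT::

        "e\\u0301x".find("e")            # 0 - matches inside the cluster
        grapheme_find("e\\u0301x", "e")  # -1 - "e" alone is not a whole cluster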
    """
    # C version is 7.5X faster than Python version
    return _unicode.grapheme_find(text, substring, start, end if end is not None else len(text)) 
def text_width(text: str, offset: int = 0) -> int:
    """Returns how many columns the text would be if displayed in a terminal
    You should :func:`split_lines` first and then operate on each line
    separately.
    If the `text` contains new lines, control characters, and similar
    unrepresentable codepoints then minus 1 is returned.
    Terminals aren't entirely consistent with each other, and Unicode
    has many kinds of codepoints, and combinations.  Consequently this
    is right the vast majority of the time, but not always.
    Note that web browsers do variable widths even in monospaced
    sections like ``<pre>`` so they won't always agree with the terminal
    either.
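
    Some representative values (wide East Asian characters take two
    columns, control characters are unrepresentable)::

        text_width("hello")  # 5
        text_width("你好")    # 4
        text_width("a\\tb")   # -1 - use expand_tabs first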
    """
    # Some benchmarks in seconds running on 45MB of the UNDR
    # 8.34  wcwidth Python module
    # 8.14  The implementation of this in Python
    # 0.20  Calling libc wcswidth via ctypes
    # 0.14  The Python converted to C
    return _unicode.text_width(text, offset) 
def text_width_substr(text: str, width: int, offset: int = 0) -> tuple[int, str]:
    """Extracts substring width or less wide being aware of grapheme cluster boundaries.
    For example you could use this to get a substring that is 80 (or
    less) wide.
    :returns: A tuple of how wide the substring is, and the substring"""
    if not isinstance(width, int) or width < 1:
        raise ValueError("width must be an int at least 1")
    width_so_far = 0
    accepted = offset
    for _, end, grapheme in grapheme_iter_with_offsets(text, offset):
        seg_width = text_width(grapheme)
        if seg_width < 0:
            raise ValueError(f"text contains invalid codepoints {grapheme=}")
        if width_so_far + seg_width <= width:
            width_so_far += seg_width
            accepted = end
        else:
            break
        if width_so_far == width:
            break
    return width_so_far, text[offset:accepted] 
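
# A rough sketch: "你好" is two grapheme clusters each two columns wide, so
# asking for width 3 only fits the first one - text_width_substr("你好", 3)
# returns (2, "你").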
def guess_paragraphs(text: str, tabsize: int = 8) -> str:
    """Given text that contains paragraphs containing newlines, guesses where the paragraphs end.
    The returned :class:`str` will have ``\n`` removed where it was
    determined to not mark a paragraph end.
    .. code-block:: output
        If you have text like this, where paragraphs have newlines in
        them, then each line gets wrapped separately by text_wrap.
        This function tries to guess where the paragraphs end.
        Blank lines like above are definite.
          Indented lines that continue preserving the indent
          are considered the same paragraph, and a change of indent
          (in or out) is a new paragraph.
            So this will be a new paragraph,
        And this will be a new paragraph.
         * Punctuation/numbers at the start of line
           followed by indented text are considered the same
           paragraph
        2. So this is a new paragraph, while
           this line is part of the line above
        3. Optional numbers followed by punctuation then space
        - are considered new paragraphs
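
    A hedged usage sketch::

        guess_paragraphs("one\\ntwo\\n\\nthree\\n")
        # "one" and "two" are joined into a single paragraph, while the
        # blank line means "three" starts a new paragraph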
    """
    # regex to match what looks like an (optionally numbered) list
    # item
    list_item_re = r"^(?P<indent>\s*[0-9+=,\.*:-]+\s+).*"
    # what we turn definite end of paragraph into
    parasep = "\u2029"
    # Force unicode end of line, form feed, next line to parasep
    text = text.replace("\u2028", parasep)
    text = text.replace("\u000d", parasep)
    text = text.replace("\u0085", parasep)
    # tabify
    text = expand_tabs(text, tabsize)
    # Fix Windows EOL
    text = text.replace("\r\n", "\n")
    # Any stray CR become parasep
    text = text.replace("\r", parasep)
    # Two newlines is definite
    text = text.replace("\n\n", parasep + parasep)
    paragraphs: list[str] = []
    def append_paragraph(p: list[str]) -> None:
        # appends the list of strings as a paragraph
        # but we have to strip any indent from second and
        # succeeding line
        not_first = [line.lstrip(" ") for line in p[1:]]
        paragraphs.append(" ".join([p[0]] + not_first))
    # each segment is one or more paragraphs
    for segment in text.split(parasep):
        if "\n" not in segment:
            paragraphs.append(segment)
            continue
        para: list[str] = []
        for line in segment.split("\n"):
            if not para:
                # this is definitely a new paragraph
                para.append(line)
                continue
            # optional spaces, followed by digits|punctuation followed by space
            # is considered a new paragraph as a list item.
            if re.match(list_item_re, line):
                if para:
                    append_paragraph(para)
                para = [line]
                continue
            # Does indent match previous line
            if len(line) - len(line.lstrip(" ")) == len(para[-1]) - len(para[-1].lstrip(" ")):
                para.append(line)
                continue
            # Does indent match previous line as a list item indent?
            mo = re.match(list_item_re, para[-1])
            if mo:
                if len(mo.group("indent")) == len(line) - len(line.lstrip(" ")):
                    para.append(line)
                    continue
            # new paragraph
            append_paragraph(para)
            para = [line]
            continue
        if para:
            append_paragraph(para)
    # turn back into newline as the expected delimiter
    return "\n".join(paragraphs) + "\n" 
def text_wrap(
    text: str,
    width: int = 70,
    *,
    tabsize: int = 8,
    hyphen: str = "-",
    combine_space: bool = True,
    invalid: str = "?",
) -> Iterator[str]:
    """Similar to :func:`textwrap.wrap` but Unicode grapheme cluster and line break aware
    .. note::
       Newlines in the text are treated as end of paragraph.  If your text has paragraphs
       with newlines in them, then call :func:`guess_paragraphs` first.
    :param text: string to process
    :param width: width of yielded lines, if rendered using a monospace font such as to a terminal
    :param tabsize: Tab stop spacing as tabs are expanded
    :param hyphen: Used to show a segment was broken because it was wider than ``width``
    :param combine_space: Leading space on each (indent) is always preserved.  Other spaces where
          multiple occur are combined into one space.
    :param invalid: If invalid codepoints are encountered such as control characters and surrogates
          then they are replaced with this.
    This yields one line of :class:`str` at a time, which will be
    exactly ``width`` when output to a terminal.  It will be right
    padded with spaces if necessary and not have a trailing newline.
    :func:`apsw.ext.format_query_table` uses this method to ensure
    each column is the desired width.
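
    A minimal sketch::

        list(text_wrap("hello world", width=7))
        # ["hello  ", "world  "] - each line is exactly 7 columns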
    """
    hyphen_width = text_width(hyphen)
    text = expand_tabs(text, tabsize, invalid)
    for line in split_lines(text):
        accumulated: list[str] = []
        line_width = 0
        indent = None
        space = False
        for segment in line_break_iter(line):
            if indent is None:
                indent = " " * (len(segment) - len(segment.lstrip(" "))) if segment[0] == " " else ""
                if len(indent) >= width - hyphen_width:
                    # make space for double width char if indent wider than width
                    indent = indent[: max(0, width - hyphen_width - 2)]
                accumulated = [indent]
                line_width = len(indent)
                if line_width:
                    if len(indent) != len(segment):  # there was spaces and text
                        segment = segment[line_width:]
                    else:
                        continue
            if combine_space:
                new_segment = segment.rstrip(" ")
                new_space = new_segment != segment
                # we want to prepend a space if the previous segment
                # ended in space
                segment = (" " if space else "") + new_segment
                space = new_space
            seg_width = text_width(segment)
            assert seg_width >= 0
            while line_width + seg_width > width:
                if len(accumulated) == 1:  # only indent present
                    if combine_space and segment[0] == " ":
                        # we added a space, but don't need it on new line
                        segment = segment[1:]
                    # hyphenate too long
                    hyphen_out = hyphen
                    desired = width - hyphen_width - line_width
                    if desired < 1:
                        hyphen_out = ""
                        desired = width - line_width
                    seg_width, substr = text_width_substr(segment, desired)
                    if seg_width == 0:
                        # the first grapheme cluster is wider than desired so
                        # we will display '*' instead for that first grapheme cluster
                        segment = grapheme_substr(segment, 1)
                        substr = "*" * desired
                    else:
                        segment = segment[len(substr) :]
                        if desired - seg_width:  # did we get less than asked for?
                            substr += " " * (desired - seg_width)
                    yield indent + substr + hyphen_out
                    accumulated = [indent]
                    line_width = len(indent)
                    seg_width = text_width(segment)
                    continue
                yield "".join(accumulated) + " " * (width - line_width)
                if combine_space and segment[0] == " ":
                    # we added a space, but don't need it on new line
                    segment = segment[1:]
                    seg_width -= 1
                accumulated = [indent]
                line_width = len(indent)
                continue
            if segment:
                accumulated.append(segment)
            line_width += seg_width
        if len(accumulated) == 1:
            # only indent
            yield " " * width
        else:
            yield "".join(accumulated) + " " * (width - line_width) 
def codepoint_name(codepoint: int | str) -> str | None:
    """Name or ``None`` if it doesn't have one
    For example codepoint 65 is named ``LATIN CAPITAL LETTER A``
    while codepoint U+D1234 is not assigned and would return
    ``None``.
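
    A quick sketch::

        codepoint_name(65)   # "LATIN CAPITAL LETTER A"
        codepoint_name("A")  # same lookup, passing a str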
    """
    return _unicode.codepoint_name(codepoint) 
def version_added(codepoint: int | str) -> str | None:
    "Returns the unicode version the codepoint was added"
    return _unicode.version_added(codepoint) 
version_dates = {
    # Extracted from https://www.unicode.org/history/publicationdates.html
    "16.0": (2024, 9, 10),
    "15.1": (2023, 9, 12),
    "15.0": (2022, 9, 13),
    "14.0": (2021, 9, 14),
    "13.0": (2020, 3, 10),
    "12.1": (2019, 5, 7),
    "12.0": (2019, 3, 5),
    "11.0": (2018, 6, 5),
    "10.0": (2017, 6, 20),
    "9.0": (2016, 6, 21),
    "8.0": (2015, 6, 17),
    "7.0": (2014, 6, 16),
    "6.3": (2013, 9, 30),
    "6.2": (2012, 9, 26),
    "6.1": (2012, 1, 31),
    "6.0": (2010, 10, 11),
    "5.2": (2009, 10, 1),
    "5.1": (2008, 4, 4),
    "5.0": (2006, 7, 14),
    "4.1": (2005, 3, 31),
    # These releases have no day, so we use the first of the month
    "4.0": (2003, 4, 1),
    "3.2": (2002, 3, 1),
    "3.1": (2001, 3, 1),
    "3.0": (1999, 9, 1),
    "2.1": (1998, 5, 1),
    "2.0": (1996, 7, 1),
    "1.1": (1993, 6, 1),
    "1.0": (1991, 10, 1),
}
"""Release date (year, month, day) for each unicode version
intended for use with :meth:`version_added`"""
def grapheme_next_break(text: str, offset: int = 0) -> int:
    """Returns end of Grapheme cluster /  User Perceived Character
    For example regional indicators are in pairs, and a base codepoint
    can be combined with zero or more additional codepoints providing
    diacritics, marks, and variations.  Break points are defined in
    the `TR29 spec
    <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules>`__.
    :param text: The text to examine
    :param offset: The first codepoint to examine
    :returns:  Index of first codepoint not part of the grapheme cluster
        starting at offset. You should extract ``text[offset:span]``
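
    For example, with ``e`` followed by U+0301 COMBINING ACUTE ACCENT::

        grapheme_next_break("e\\u0301x")  # 2 - the accent stays with the e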
    """
    return _unicode.grapheme_next_break(text, offset) 
def grapheme_next(text: str, offset: int = 0) -> tuple[int, int]:
    "Returns span of next grapheme cluster"
    end = grapheme_next_break(text, offset)
    return offset, end 
def grapheme_iter(text: str, offset: int = 0) -> Iterator[str]:
    "Iterator providing text of each grapheme cluster"
    lt = len(text)
    meth = _unicode.grapheme_next_break
    start = offset
    while offset < lt:
        offset = meth(text, offset)
        yield text[start:offset]
        start = offset 
def grapheme_iter_with_offsets(text: str, offset: int = 0) -> Iterator[tuple[int, int, str]]:
    "Iterator providing start, end, text of each grapheme cluster"
    lt = len(text)
    meth = _unicode.grapheme_next_break
    start = offset
    while offset < lt:
        offset = meth(text, offset)
        yield (start, offset, text[start:offset])
        start = offset 
def grapheme_iter_with_offsets_filtered(
    text: str, offset: int = 0, *, categories: Iterable[str], emoji: bool = False, regional_indicator: bool = False
) -> Iterator[tuple[int, int, str]]:
    "Iterator providing start, end, text of each grapheme cluster, providing it includes codepoints from categories, emoji, or regional indicator"
    mask = _cats_to_mask(categories, emoji, regional_indicator)
    lt = len(text)
    meth = _unicode.grapheme_next_break
    catcheck = _unicode.has_category
    while offset < lt:
        end = meth(text, offset)
        if catcheck(text, offset, end, mask):
            yield (offset, end, text[offset:end])
        offset = end 
def word_next_break(text: str, offset: int = 0) -> int:
    """Returns end of next word or non-word
    Finds the next break point according to the `TR29 spec
    <https://www.unicode.org/reports/tr29/#Word_Boundary_Rules>`__.
    Note that the segment returned may be a word, or a non-word
    (spaces, punctuation etc).  Use :func:`word_next` to get words.
    :param text: The text to examine
    :param offset: The first codepoint to examine
    :returns:  Next break point
    """
    return _unicode.word_next_break(text, offset) 
_cats_to_mask_mapping = {
    "Lu": _Category.Lu,
    "Ll": _Category.Ll,
    "Lt": _Category.Lt,
    "Lm": _Category.Lm,
    "Lo": _Category.Lo,
    "Mn": _Category.Mn,
    "Mc": _Category.Mc,
    "Me": _Category.Me,
    "Nd": _Category.Nd,
    "Nl": _Category.Nl,
    "No": _Category.No,
    "Pc": _Category.Pc,
    "Pd": _Category.Pd,
    "Ps": _Category.Ps,
    "Pe": _Category.Pe,
    "Pi": _Category.Pi,
    "Pf": _Category.Pf,
    "Po": _Category.Po,
    "Sm": _Category.Sm,
    "Sc": _Category.Sc,
    "Sk": _Category.Sk,
    "So": _Category.So,
    "Zs": _Category.Zs,
    "Zl": _Category.Zl,
    "Zp": _Category.Zp,
    "Cc": _Category.Cc,
    "Cf": _Category.Cf,
    "Cs": _Category.Cs,
    "Co": _Category.Co,
    "Cn": _Category.Cn,
}
def _cats_to_mask(categories: Iterable[str], emoji: bool, regional_indicator: bool) -> int:
    mask = 0
    for cat in categories:
        mask |= _cats_to_mask_mapping[cat]
    if emoji:
        mask |= _Category.Extended_Pictographic
    if regional_indicator:
        mask |= _Category.Regional_Indicator
    return mask
word_default_categories = {"Lu", "Ll", "Lt", "Lm", "Lo", "Nd", "Nl", "No"}
"Default categories for selecting word segments - letters and numbers"
def word_next(
    text: str,
    offset: int = 0,
    *,
    categories: Iterable[str] = word_default_categories,
    emoji: bool = False,
    regional_indicator: bool = False,
) -> tuple[int, int]:
    """Returns span of next word
    A segment is considered a word if it contains at least one codepoint corresponding
    to any of the `categories`, plus:
    * emoji (Extended_Pictographic in Unicode specs)
    * regional indicator - two character sequence for flags like 🇧🇷🇨🇦
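
    A short sketch (punctuation and spaces are skipped because they are
    not in the default categories)::

        word_next("hello, world")     # (0, 5) - "hello"
        word_next("hello, world", 5)  # (7, 12) - "world"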
    """
    mask = _cats_to_mask(categories, emoji, regional_indicator)
    lt = len(text)
    meth = _unicode.word_next_break
    catcheck = _unicode.has_category
    while offset < lt:
        end = meth(text, offset)
        if catcheck(text, offset, end, mask):
            return offset, end
        offset = end
    return offset, offset 
def word_iter(
    text: str,
    offset: int = 0,
    *,
    categories: Iterable[str] = word_default_categories,
    emoji: bool = False,
    regional_indicator: bool = False,
) -> Iterator[str]:
    "Iterator providing text of each word"
    mask = _cats_to_mask(categories, emoji, regional_indicator)
    lt = len(text)
    meth = _unicode.word_next_break
    catcheck = _unicode.has_category
    while offset < lt:
        end = meth(text, offset)
        if catcheck(text, offset, end, mask):
            yield text[offset:end]
        offset = end 
def word_iter_with_offsets(
    text: str,
    offset: int = 0,
    *,
    categories: Iterable[str] = word_default_categories,
    emoji: bool = False,
    regional_indicator: bool = False,
) -> Iterator[tuple[int, int, str]]:
    "Iterator providing start, end, text of each word"
    mask = _cats_to_mask(categories, emoji, regional_indicator)
    lt = len(text)
    meth = _unicode.word_next_break
    catcheck = _unicode.has_category
    while offset < lt:
        end = meth(text, offset)
        if catcheck(text, offset, end, mask):
            yield (offset, end, text[offset:end])
        offset = end 
def sentence_next_break(text: str, offset: int = 0) -> int:
    """Returns end of sentence location.
    Finds the next break point according to the `TR29 spec
    <https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules>`__.
    Note that the segment returned includes leading and trailing white space.
    :param text: The text to examine
    :param offset: The first codepoint to examine
    :returns:  Next break point
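
    A hedged example (the two spaces belong to the first sentence)::

        sentence_next_break("One sentence.  Two")  # 15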
    """
    return _unicode.sentence_next_break(text, offset) 
def sentence_next(text: str, offset: int = 0) -> tuple[int, int]:
    """Returns span of next sentence"""
    lt = len(text)
    meth = _unicode.sentence_next_break
    while offset < lt:
        end = meth(text, offset=offset)
        return offset, end
    return offset, offset 
def sentence_iter(text: str, offset: int = 0) -> Iterator[str]:
    "Iterator providing text of each sentence"
    lt = len(text)
    meth = _unicode.sentence_next_break
    while offset < lt:
        end = meth(text, offset)
        yield text[offset:end]
        offset = end 
def sentence_iter_with_offsets(text: str, offset: int = 0) -> Iterator[tuple[int, int, str]]:
    "Iterator providing start, end, text of each sentence"
    lt = len(text)
    meth = _unicode.sentence_next_break
    while offset < lt:
        end = meth(text, offset)
        yield (offset, end, text[offset:end])
        offset = end 
def line_break_next_break(text: str, offset: int = 0) -> int:
    """Returns next opportunity to break a line
    Finds the next break point according to the `TR14 spec
    <https://www.unicode.org/reports/tr14/#LB1>`__.
    :param text: The text to examine
    :param offset: The first codepoint to examine
    :returns:  Next break point
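
    For example::

        line_break_next_break("hello world")  # 6 - the space stays with "hello"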
    """
    return _unicode.line_next_break(text, offset) 
def line_break_next(text: str, offset: int = 0) -> tuple[int, int]:
    """Returns span of next line"""
    lt = len(text)
    meth = _unicode.line_next_break
    while offset < lt:
        end = meth(text, offset=offset)
        return offset, end
    return offset, offset 
def line_break_iter(text: str, offset: int = 0) -> Iterator[str]:
    "Iterator providing text of each line"
    lt = len(text)
    meth = _unicode.line_next_break
    while offset < lt:
        end = meth(text, offset)
        yield text[offset:end]
        offset = end 
def line_break_iter_with_offsets(text: str, offset: int = 0) -> Iterator[tuple[int, int, str]]:
    "Iterator providing start, end, text of each line"
    lt = len(text)
    meth = _unicode.line_next_break
    while offset < lt:
        end = meth(text, offset)
        yield (offset, end, text[offset:end])
        offset = end 
if __name__ == "__main__":
    import argparse
    import os
    import sys
    import atexit
    import apsw.fts5
    # We output text that non Unicode capable terminals can't handle
    sys.stdout.reconfigure(errors="replace")
    width = 80
    if sys.stdout.isatty():
        width = os.get_terminal_size(sys.stdout.fileno()).columns
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-cc",
        "--compact-codepoints",
        dest="compact_codepoints",
        action="store_true",
        default=False,
        help="Only show hex codepoint values, not full details",
    )
    subparsers = parser.add_subparsers(required=True)
    p = subparsers.add_parser("breaktest", help="Run Unicode test file")
    p.set_defaults(function="breaktest")
    p.add_argument("-v", default=False, action="store_true", dest="verbose", help="Show each line as it is tested")
    p.add_argument("--fail-fast", default=False, action="store_true", help="Exit on first test failure")
    p.add_argument(
        "--fail-codepoints-separator",
        default=" ",
        help="What to separate the list of codepoints with on failure.  Useful for long test strings [%(default)s]",
    )
    p.add_argument("test", choices=("grapheme", "word", "sentence", "line_break"), help="What to test")
    p.add_argument(
        "file",
        help="break test text file.  They can be downloaded from https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/",
        type=argparse.FileType("rt", encoding="utf8"),
    )
    p = subparsers.add_parser("show", help="Run against provided text")
    p.set_defaults(function="show")
    p.add_argument("show", choices=("grapheme", "word", "sentence", "line_break"), help="What to show [%(default)s]")
    p.add_argument("--text-file", type=argparse.FileType("rt", encoding="utf8"))
    p.add_argument(
        "--categories",
        default="L* N*",
        help="For word, which segments are included.  You can use wildcards and ! for negation [%(default)s]",
    )
    p.add_argument(
        "--emoji",
        default=False,
        action="store_true",
        help="For word, if emoji segments are included [%(default)s]",
    )
    p.add_argument(
        "--regional-indicator",
        default=False,
        action="store_true",
        help="For word, if regional indicator segments are included [%(default)s]",
    )
    p.add_argument("text", nargs="*", help="Text to segment unless --text-file used")
    p = subparsers.add_parser("codepoint", help="Show information about codepoints")
    p.add_argument("text", nargs="+", help="If a hex constant then use that value, otherwise treat as text")
    p.set_defaults(function="codepoint")
    p = subparsers.add_parser(
        "benchmark",
        help="Measure how long segmentation takes to iterate each segment",
    )
    p.set_defaults(function="benchmark")
    p.add_argument(
        "--size",
        type=float,
        default=50,
        help="How many million characters (codepoints) of text to use [%(default)s]",
    )
    p.add_argument("--seed", type=int, default=0, help="Random seed to use [%(default)s]")
    p.add_argument(
        "--others",
        help="A comma separated list of other packages to also benchmark.  Use 'all' to get all available ones.  Supported are grapheme, uniseg, pyicu",
    )
    p.add_argument(
        "text_file",
        type=argparse.FileType("rt", encoding="utf8"),
        help="""Text source to use.
        The provided text will be repeatedly duplicated and shuffled then
        appended until the sized amount of text is available.
        A suggested source of text is to download the Universal Declaration of
        Human Rights Corpus from https://www.nltk.org/nltk_data/ and
        concatenate all the files together.  It contains the same text
        in most written languages.
        """,
    )
    p = subparsers.add_parser(
        "textwrap",
        help="""Wrap text to fit the specified number of columns.  Each line output
will be padded with spaces to the width.""",
    )
    p.set_defaults(function="textwrap")
    p.add_argument(
        "--measurement",
        default="apsw.unicode",
        choices=["wcswidth-c", "wcswidth-py"],
        help="""Instead of using the builtin function for measuring how wide text is
use the C library function wcswidth, or use the wcwidth Python package wcswidth function""",
    )
    p.add_argument(
        "--invalid",
        default="?",
        help="Replacement for invalid codepoints such as control characters and surrogates [%(default)s]",
    )
    p.add_argument(
        "--width",
        type=int,
        default=width,
        help="How many columns to wrap to [%(default)s]",
    )
    p.add_argument("--tabsize", type=int, default=8, help="Tabstop size [%(default)s]")
    p.add_argument("--hyphen", default="-", help="Text to use when a segment is longer that width [%(default)s]")
    p.add_argument(
        "--no-combine-space",
        dest="combine_space",
        default=True,
        action="store_false",
        help="Disable combining multiple spaces into one.  Note that leading indents are always preserved",
    )
    p.add_argument(
        "--start", default="", help="Text output at the beginning of each line.  It counts against the width"
    )
    p.add_argument("--end", default="", help="Text output at the end of each line.  It counts against the width")
    p.add_argument(
        "--use-stdlib",
        default=False,
        action="store_true",
        help="Uses the system textwrap library instead.  hyphen is ignored and start/end are applied by this code.",
    )
    p.add_argument(
        "--guess-paragraphs",
        default=False,
        action="store_true",
        help="Guess if newlines in text are the same paragraphs.  See the doc for apsw.unicode.guess_paragraphs for details",
    )
    p.add_argument(
        "text_file",
        type=argparse.FileType("rt", encoding="utf8"),
        help="""Text source to use encoded in UTF8. Newlines are considered to delimit each paragraph, so consider --guess-paragraphs.
        Use a name of a single dash to read from standard input.""",
    )
    p = subparsers.add_parser("casefold", help="Does casefolding on text")
    p.set_defaults(function="casefold")
    p.add_argument("input", default=sys.stdin, type=argparse.FileType("rt", encoding="utf8"), help="Input text [stdin]")
    p.add_argument(
        "output", default=sys.stdout, type=argparse.FileType("wt", encoding="utf8"), help="Output text [stdout]"
    )
    p = subparsers.add_parser("strip", help="Strips accents, uses compatibility codepoints etc")
    p.set_defaults(function="strip")
    p.add_argument("input", default=sys.stdin, type=argparse.FileType("rt", encoding="utf8"), help="Input text [stdin]")
    p.add_argument(
        "output", default=sys.stdout, type=argparse.FileType("wt", encoding="utf8"), help="Output text [stdout]"
    )
    p = subparsers.add_parser(
        "breaktestgen",
        help="""Extracts data strings to be added to test suite""",
    )
    p.set_defaults(function="breaktestgen")
    p.add_argument("grapheme", type=argparse.FileType("rt", encoding="utf8"), help="Grapheme break test file")
    p.add_argument("word", type=argparse.FileType("rt", encoding="utf8"), help="Word break test file")
    p.add_argument("sentence", type=argparse.FileType("rt", encoding="utf8"), help="Sentence break test file")
    p.add_argument("line_break", type=argparse.FileType("rt", encoding="utf8"), help="Line break test file")
    p = subparsers.add_parser(
        "width-check",
        help="""Check how this terminal differs from width database.
        Any differences are reported to stdout in csv format so you should redirect output to a file.
        Cursor positioning ANSI sequences are used.  Do not type in the terminal while it is running.
        It takes about a minute to run with most terminals, 1 hour for kitty, and 21+ hours for gnome
        terminals.""",
    )
    p.set_defaults(function="widthcheck")
    options = parser.parse_args()
    def codepoint_details(kind, c: str, counter=None) -> str:
        if options.compact_codepoints:
            return f"U+{ord(c):04x}"
        name = str(codepoint_name(c))
        cat = category(ord(c))
        counter = f"#{counter}:" if counter is not None else ""
        name += f" ({ cat } { apsw.fts5.unicode_categories[cat] })"
        uni_cat = " | ".join(_unicode.category_name(kind, ord(c)))
        return "{" + f"{counter}U+" + ("%04X" % ord(c)) + f" {name} : { uni_cat }" + "}"
    if options.function == "show":
        if not options.text_file and not options.text:
            p.error("You must specify at least --text-file or text arguments")
        params = {
            "categories": apsw.fts5.convert_unicode_categories(options.categories),
            "emoji": options.emoji,
            "regional_indicator": options.regional_indicator,
        }
        if options.show != "word":
            params = {}
        text = ""
        if options.text_file:
            text += options.text_file.read()
        if options.text:
            if text:
                text += " "
            text += " ".join(options.text)
        next_func = globals()[f"{ options.show }_next"]
        counter = 0
        offset = 0
        while offset < len(text):
            begin, end = next_func(text, offset, **params)
            print(
                f"#{ counter } offset { offset } span { begin }-{ end } codepoints { end - begin } value: { text[begin:end] }"
            )
            for i in range(begin, end):
                print(" ", codepoint_details(options.show, text[i]))
            offset = end
            counter += 1
    elif options.function == "textwrap":
        # stop debug interpreter whining about file not being closed
        atexit.register(lambda: options.text_file.close())
        if options.measurement == "wcswidth-c":
            import ctypes
            import ctypes.util
            libc = ctypes.cdll.LoadLibrary(ctypes.util.find_library("c"))  # C library on Unix/Linux platforms
            if not hasattr(libc, "wcswidth"):
                sys.exit("C library does not have wcswidth function")
            libc.wcswidth.argtypes = [ctypes.c_wchar_p, ctypes.c_size_t]
            libc.wcswidth.restype = ctypes.c_int
            def _text_width(text, offset=0):
                return libc.wcswidth(text[offset:], len(text) * 10)
            # shenanigans so sphinx doesn't try to document these
            setattr(sys.modules[__name__], "text_width", _text_width)
        elif options.measurement == "wcswidth-py":
            import wcwidth
            def _text_width(text, offset=0):
                return wcwidth.wcswidth(text[offset:])
            setattr(sys.modules[__name__], "text_width", _text_width)
        width = options.width
        width = width - text_width(options.start) - text_width(options.end)
        text = options.text_file.read()
        if options.guess_paragraphs:
            text = guess_paragraphs(text)
        if options.use_stdlib:
            import textwrap
            for line in textwrap.wrap(
                text,
                width,
                tabsize=options.tabsize,
                drop_whitespace=options.combine_space,
                replace_whitespace=False,
            ):
                for line in line.splitlines():
                    padding = max(0, width - text_width(line))
                    print(f"{options.start}{line}{' ' * padding}{options.end}")
        else:
            for line in text_wrap(
                text,
                width,
                tabsize=options.tabsize,
                hyphen=options.hyphen,
                combine_space=options.combine_space,
                invalid=options.invalid,
            ):
                print(f"{options.start}{line}{options.end}")
    elif options.function == "breaktest":
        import difflib
        # stop debug interpreter whining about file not being closed
        atexit.register(lambda: options.file.close())
        next_break_func = globals()[f"{ options.test }_next_break"]
        # ::TODO:: add option that inserts LB_CM/ZWJ chars after every
        # codepoint (except BK/CR/NL etc) to verify LB9 is always done
        ok = "÷"
        not_ok = "\u00d7"
        passed: int = 0
        fails: list[str] = []
        for line_num, line in enumerate(options.file, 1):
            orig_line = line
            if not line.strip() or line.startswith("#"):
                continue
            line = line.split("#")[0].strip().split()
            if options.verbose:
                print(f"{ line_num }: { orig_line.rstrip() }")
            expect = not_ok if options.test == "line_break" else ok
            assert line[0] == expect, f"Line { line_num } doesn't start with { expect }!"
            assert line[-1] == ok, f"Line { line_num } doesn't end with { ok }!"
            line = line[1:]
            text = ""
            breaks: list[int] = []
            while line:
                c = line.pop(0)
                if c == not_ok:
                    continue
                if c == ok:
                    breaks.append(len(text))
                    continue
                text += chr(int(c, 16))
            def add_failinfo():
                fails.append(orig_line.strip())
                codepoints = []
                for counter, c in enumerate(text):
                    codepoints.append(codepoint_details(options.test, c, counter))
                fails.append(options.fail_codepoints_separator.join(codepoints))
                fails.append("")
            offset = 0
            seen: list[int] = []
            lf = len(fails)
            while offset < len(text):
                span = next_break_func(text, offset)
                if span not in breaks:
                    fails.append(
                        f"Line { line_num } got unexpected break at { span } - expected are { breaks }.  Seen { seen }"
                    )
                    add_failinfo()
                    break
                seen.append(span)
                offset = span
            if options.fail_fast and fails:
                break
            if len(fails) != lf:
                continue
            if set(seen) != set(breaks):
                fails.append(f"Line { line_num } got breaks at { seen } expected at { breaks }")
                if max(len(seen), len(breaks)) > 5:
                    # use difflib to show difference
                    sm = difflib.SequenceMatcher(a=seen, b=breaks)
                    for tag, a1, a2, b1, b2 in sm.get_opcodes():
                        if tag == "equal":
                            continue
                        if a1 != a2:
                            fails[-1] += f"\n       seen {tag} {seen[a1:a2]}"
                        if b1 != b2:
                            fails[-1] += f"\n    expected {tag} {breaks[b1:b2]}"
                add_failinfo()
            if options.fail_fast and fails:
                break
            passed += 1
        if fails:
            print(f"{ len(fails)//4 } tests failed, {passed:,} passed:", file=sys.stderr)
            for fail in fails:
                print(fail, file=sys.stderr)
            sys.exit(2)
        else:
            print(f"{passed:,} passed")
    elif options.function == "codepoint":
        codepoints = []
        for t in options.text:
            try:
                codepoints.append(int(t, 16))
            except ValueError:
                codepoints.extend(ord(c) for c in t)
        def uniname(cp):
            return str(codepoint_name(cp))
        def deets(cp):
            cat = category(cp)
            return f"{ uniname(cp) } { cat }: { apsw.fts5.unicode_categories[cat] }"
        for i, cp in enumerate(codepoints):
            print(f"#{ i } U+{ cp:04X} - ", end="")
            try:
                print(chr(cp))
            except UnicodeEncodeError:
                print()
            added = version_added(cp)
            year = f"({version_dates[added][0]})" if added is not None else ""
            print(f"Name: { deets(cp) }   Version: { added } { year }")
            mangled = []
            for mangle in casefold(chr(cp)), strip(chr(cp)):
                if not mangle:
                    mangled.append("(nothing)")
                else:
                    mangled.append(", ".join(f"U+{ ord(v):04X} {uniname(ord(v))}" for v in mangle))
            print(f"casefold: { mangled[0] }   stripped: { mangled[1] }")
            print(
                f"Width: { text_width(chr(cp)) }  "
                f"TR29 grapheme: { ' | '.join(_unicode.category_name('grapheme', cp)) }   "
                f"word: { ' | '.join(_unicode.category_name('word', cp )) }   "
                f"sentence: { ' | '.join(_unicode.category_name('sentence', cp)) }  "
                f"TR14 line break: { ' | '.join(_unicode.category_name('line_break', cp)) }"
            )
            print()
    elif options.function == "benchmark":
        import random
        import time
        random.seed(options.seed)
        base_text = options.text_file.read()
        text = base_text
        # these are the non-ascii codepoints used in the various break tests
        interesting = "".join(
            chr(int(x, 16))
            for x in """0085 00A0 00AD 01BB 0300 0308 034F 0378 05D0 0600 062D 0631
                0644 0645 0646 064A 064E 0650 0651 0661 0671 06DD 070F 0710 0712 0717
                0718 0719 071D 0721 072A 072B 072C 0900 0903 0904 0915 0924 092F 093C
                094D 0A03 0D4E 1100 1160 11A8 200D 2018 2019 201C 201D 2060 231A 2701
                3002 3031 5B57 5B83 AC00 AC01 1F1E6 1F1E7 1F1E8 1F1E9 1F3FF 1F476
                1F6D1""".split()
        )
        # make interesting be 0.1% of base text
        base_text += interesting * int(len(base_text) * 0.001 / len(interesting))
        tests: list[Any] = [
            (
                "apsw.unicode",
                unicode_version,
                (
                    ("grapheme", grapheme_iter),
                    ("word", word_iter),
                    ("sentence", sentence_iter),
                    ("line", line_break_iter),
                ),
            )
        ]
        if options.others:
            if options.others == "all":
                ok = []
                try:
                    import uniseg
                    ok.append("uniseg")
                except ImportError:
                    pass
                try:
                    import grapheme
                    ok.append("grapheme")
                except ImportError:
                    pass
                try:
                    import icu
                    ok.append("pyicu")
                except ImportError:
                    pass
                if ok:
                    options.others = ",".join(ok)
                else:
                    options.others = None
        if options.others:
            for package in options.others.split(","):
                package = package.strip()
                if package == "grapheme":
                    import grapheme
                    import grapheme.finder
                    tests.append(
                        ("grapheme", grapheme.UNICODE_VERSION, (("grapheme", grapheme.finder.GraphemeIterator),))
                    )
                elif package == "uniseg":
                    import uniseg
                    import uniseg.graphemecluster
                    import uniseg.wordbreak
                    import uniseg.sentencebreak
                    import uniseg.linebreak
                    # note that uniseg words doesn't determine which
                    # segments are words or not so you just get all
                    # segments
                    tests.append(
                        (
                            "uniseg",
                            uniseg.unidata_version,
                            (
                                ("grapheme", uniseg.graphemecluster.grapheme_clusters),
                                ("word", uniseg.wordbreak.words),
                                ("sentence", uniseg.sentencebreak.sentences),
                                ("line", uniseg.linebreak.line_break_units),
                            ),
                        )
                    )
                elif package == "pyicu":
                    import icu
                    import functools
                    # api only returns breakpoints, so make it match
                    # the others.  It also does its own utf16 based
                    # strings so there is some conversion overhead
                    def icu_iterate(kind, text):
                        icu_it = getattr(icu.BreakIterator, f"create{kind}Instance")(icu.Locale.getEnglish())
                        icu_str = icu.UnicodeString(text)
                        icu_it.setText(icu_str)
                        offset = 0
                        for pos in icu_it:
                            yield str(icu_str[offset:pos])
                            offset = pos
                    tests.append(
                        (
                            "pyicu",
                            icu.UNICODE_VERSION,
                            (
                                ("grapheme", functools.partial(icu_iterate, "Character")),
                                ("word", functools.partial(icu_iterate, "Word")),
                                ("sentence", functools.partial(icu_iterate, "Sentence")),
                                ("line", functools.partial(icu_iterate, "Line")),
                            ),
                        )
                    )
                else:
                    sys.exit(f"Unknown third party package to benchmark '{package}'")
        print(f"Expanding text to { options.size } million chars ...", end="", flush=True)
        while len(text) < options.size * 1_000_000:
            text += "".join(random.sample(base_text, len(base_text)))
        text = text[: int(options.size * 1_000_000)]
        print("\nResults in codepoints per second processed, returning each segment.  Higher is faster.")
        for name, version, parts in tests:
            print(f"\nBenchmarking {name:20s} unicode version { version }")
            for kind, func in parts:
                print(f"{kind:>8}", end=" ", flush=True)
                count = 0
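                # process_time_ns counts CPU time for this process only;
                # exceptions are caught so the remaining benchmarks still run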
                start = time.process_time_ns()
                exc = None
                try:
                    for _ in func(text):
                        count += 1
                except Exception as exc2:
                    exc = exc2
                end = time.process_time_ns()
                if exc is not None:
                    print(f"       EXCEPTION {exc!r}")
                else:
                    seconds = (end - start) / 1e9
                    print(f"codepoints per second: { int(len(text)/seconds): 12,d}    segments: {count: 11,d}")
    elif options.function == "casefold":
        options.output.write(casefold(options.input.read()))
    elif options.function == "strip":
        options.output.write(strip(options.input.read()))
    elif options.function == "breaktestgen":
        # ÷ (U+00F7) marks positions in the test files where a break is
        # permitted, × (U+00D7) where a break is not permitted
        ok = "÷"
        not_ok = "×"
        def get_strings(fh):
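            # each test line is a sequence of ÷ / × markers and hex
            # codepoints followed by a '#' comment - keep the ÷ break
            # markers inline, drop the × no-break markers, and decode the
            # codepoints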
            for line in fh:
                if not line.strip() or line.startswith("#"):
                    continue
                line = line.split("#")[0].strip().split()
                line.pop(0)  # remove initial marker
                line.pop(-1)  # and final
                text = ""
                while line:
                    c = line.pop(0)
                    if c == not_ok:
                        continue
                    elif c == ok:
                        text += c
                    else:
                        text += chr(int(c, 16))
                        assert text[-1] != "÷"
                yield text
        def fmt(text):
            res = ""
            for c in text:
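                # keep codepoints in these easy to display categories
                # literal, and escape everything else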
                if category(c) in {"Lu", "Ll", "Nd", "Nl", "Pd", "Sm", "Sc", "So", "Zs"}:
                    res += c
                else:
                    c = ord(c)
                    if c <= 0xFFFF:
                        res += f"\\u{c:04X}"
                    else:
                        res += f"\\U{c:08X}"
            return '"' + res + '"'
        for name in ("grapheme", "word", "sentence", "line_break"):
            lines = list(get_strings(getattr(options, name)))
            lines.sort(key=len)
            print(f'"{name}":')
            print("(")
            # we always take the shortest and longest
            print(fmt(lines.pop(0)), ",")
            print(fmt(lines.pop(-1)), ",")
            # plus about 20 more, evenly spaced through the rest
            # guard against a zero step when fewer than 20 lines remain
            step = max(1, len(lines) // 20)
            for offset in range(step, len(lines), step):
                print(fmt(lines[offset]), ",")
            print("),")
    elif options.function == "widthcheck":
        import wcwidth # pip install wcwidth
        if sys.platform != "win32":
            import atexit
            import ctypes, ctypes.util
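            # call the platform C library's wcswidth directly so its answer
            # can be compared too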
            libc = ctypes.cdll.LoadLibrary(ctypes.util.find_library("c"))
            libc.wcswidth.argtypes = [ctypes.c_wchar_p, ctypes.c_size_t]
            libc.wcswidth.restype = ctypes.c_int
            tty_in = open("/dev/tty", "r")
            tty_out = open("/dev/tty", "w")
            import tty
            import termios
            term_mode = termios.tcgetattr(tty_in)
            def finish():
                termios.tcsetattr(tty_in, termios.TCSAFLUSH, term_mode)
                print("", flush=True, file=tty_out)
            atexit.register(finish)
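            # raw mode so cursor position replies are not echoed or line
            # buffered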
            tty.setraw(tty_in)
        else:
            import ctypes, msvcrt
            kernel32 = ctypes.windll.kernel32
            # we need console handle (stdout is often redirected)
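            # GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ |
            # FILE_SHARE_WRITE, OPEN_EXISTING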
            h_tty_out = kernel32.CreateFileW("CONOUT$", 0x80000000 | 0x40000000, 0x00000001|0x00000002, None, 3, 0, None)
            assert h_tty_out != -1
            # convince the console that raw bytes are UTF-8
            res = kernel32.SetConsoleCP(65001) # CP_UTF8
            assert res # zero means failure
            res = kernel32.SetConsoleOutputCP(65001) # CP_UTF8
            assert res # zero means failure
            # enable ANSI escape processing - ENABLE_PROCESSED_OUTPUT |
            # ENABLE_VIRTUAL_TERMINAL_PROCESSING
            res = kernel32.SetConsoleMode(h_tty_out, 5)
            assert res # zero means failure
            # fake I/O interfaces, using the classes themselves as
            # namespaces rather than instances
            class tty_out:
                def write(data):
                    data = data.encode("utf8")
                    res = kernel32.WriteFile(h_tty_out, data, len(data), None, None)
                    assert res # zero means failure
                def flush():
                    # we do no buffering so flush is a no-op
                    pass
            class tty_in:
                def read(how_much):
                    res = ""
                    while len(res) < how_much:
                        res += msvcrt.getwch()
                    return res
            # fake the libc wcswidth API
            class libc:
                def wcswidth(s, n):
                    # give same value as apsw.unicode
                    return text_width(s)
            def finish():
                # the UTF-8 code page changes above are process local, so
                # there is nothing to reset in the terminal
                pass
        def get_pos():
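            # DSR escape sequence - the terminal replies with the cursor
            # position as ESC [ row ; column R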
            print("\x1b[6n", flush=True, file=tty_out, end="")
            x = tty_in.read(2)
            assert x == "\x1b["  # anything else means a key was pressed while reading
            r = ""
            while True:
                c = tty_in.read(1)
                if c == "R":
                    break
                r += c
            return list(int(part) for part in r.split(";"))
        def set_pos(pos):
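            # CUP escape sequence - move the cursor to row;column (1 based)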
            print(f"\x1b[{pos[0]};{pos[1]}H", flush=True, file=tty_out, end="")
        print("\r\n", flush=True, file=tty_out)
        errors = []
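        # remember where the label starts and where the test output begins
        # so each codepoint can be drawn at the same screen position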
        start_pos = get_pos()
        print(f"{0:06X} -> ", flush=True, end="", file=tty_out)
        out_pos = get_pos()
        for cp in range(0, sys.maxunicode + 1):
            # surrogates can't be output
            if 0xD800 <= cp <= 0xDFFF:
                continue
            set_pos(start_pos)
            print(f"{cp:06X} -> ", flush=True, end="", file=tty_out)
            set_pos(out_pos)
            text = "a" + chr(cp) + "b"
            if cp == 0 or (text_width(text) < 0 and libc.wcswidth(text, 1000) < 0 and wcwidth.wcswidth(text) < 0):
                continue
            print(text, end="", flush=True, file=tty_out)
            new_pos = get_pos()
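            # subtract 2 for the a/b brackets; wrapping to another row means
            # the width could not be measured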
            width = new_pos[1] - out_pos[1] - 2 if new_pos[0] == out_pos[0] else -1
            if width != text_width(chr(cp)):
                errors.append([cp, width])
        finish()
        if errors:
            import csv
            w = csv.writer(sys.stdout)
            w.writerow(
                [
                    "codepoint",
                    "hex",
                    "width",
                    "text_width",
                    "wcswidth_c",
                    "wcswidth_py",
                    "name",
                    "version_added",
                    "category",
                ]
            )
            for row in errors:
                cp = row[0]
                w.writerow(
                    [
                        cp,
                        f"{cp:04X}",
                        row[1],
                        text_width(chr(cp)),
                        libc.wcswidth(chr(cp), 1000),
                        wcwidth.wcswidth(chr(cp)),
                        str(codepoint_name(cp)),
                        version_added(cp),
                        category(cp),
                    ]
                )