Source code for aymara.lima

#!/usr/bin/env python3

"""
The LIMA python bindings.

This python API gives access to the major features of the LIMA linguistic analyzer. To
make it easier to handle, it largely reproduces that of spaCy, including parts of the
documentation. See the GitHub project for spaCy's copyright notice.

Example::

    import aymara.lima
    nlp = aymara.lima.Lima()
    doc = nlp("Mr. Best flew to New York on Saturday morning.")
    print(doc)

Classes:

    Doc
    Lima
    Span
    Token

"""


# SPDX-FileCopyrightText: 2022 CEA LIST <gael.de-chalendar@cea.fr>
#
# SPDX-License-Identifier: MIT

# -*- coding: utf-8 -*-

import os
import pathlib
import sys

from distutils.dir_util import copy_tree
from pydantic import (parse_obj_as, ValidationError)
from typing import (Dict, Tuple, Union)

import aymaralima.cpplima


def _get_data_dir(appname: str) -> pathlib.Path:
    """
    Return the per-user data directory of an application, following OS conventions.

    * Windows: the "Local AppData" shell folder read from the registry.
    * macOS: ``~/Library/Application Support/``.
    * elsewhere: ``$XDG_DATA_HOME``, or ``~/.local/share`` when that variable is
      unset or empty.

    The directory is only computed here, it is not created.

    :param appname: the name of the application.
    :type appname: str
    :return: the application's data dir.
    :rtype: pathlib.Path
    """
    if sys.platform == "win32":  # pragma: no cover
        import winreg
        # The registry handle is a context manager: it is closed on exit instead of
        # being left open until garbage collection.
        with winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders"
        ) as key:
            dir_, _ = winreg.QueryValueEx(key, "Local AppData")
        base = pathlib.Path(dir_).resolve(strict=False)
    elif sys.platform == 'darwin':  # pragma: no cover
        base = pathlib.Path('~/Library/Application Support/').expanduser()
    else:
        # `or` (rather than a getenv default) so that an empty XDG_DATA_HOME also
        # falls back to the default instead of yielding a relative path.
        base = pathlib.Path(
            os.getenv('XDG_DATA_HOME') or "~/.local/share").expanduser()
    return base.joinpath(appname)


class Token:
    """A token: a thin read-only wrapper around a token of the C++ binding.

    TODO
    Some parts of the API are still not implemented

        sent     The sentence span that this token is a part of.
        Span

        lang    Language of the parent document’s vocabulary.
        str


    """

    # Single-character tokens recognised by is_bracket / is_quote. Sets are used so
    # that only an exact one-character match succeeds (a substring test would also
    # accept "" or "()").
    _BRACKETS = frozenset("()[]{}")
    _QUOTES = frozenset("\"'«»`")

    # Tokenization statuses that is_alpha treats as alphabetic.
    _ALPHA_STATUSES = ("t_alphanumeric", "t_capital", "t_capital_1st",
                       "t_capital_small", "t_small")

    def __init__(self, token: "aymaralima.cpplima.Token"):
        """Token's constructor

        :param token: the C++ binding Token class
        :type token: aymaralima.cpplima.Token
        """
        assert isinstance(token, aymaralima.cpplima.Token)
        self.token = token

    def __repr__(self) -> str:
        """
        The representation of this token in CoNLL-U format. Tab separated columns:
        ID: Word index, integer starting at 1 for each new sentence; may be a range for
            multiword tokens; may be a decimal number for empty nodes (decimal numbers
            can be lower than 1 but must be greater than 0).
        FORM: Word form or punctuation symbol.
        LEMMA: Lemma or stem of word form.
        UPOS: Universal part-of-speech tag.
        XPOS: always _ in LIMA. Language-specific part-of-speech tag; underscore if not
            available.
        FEATS: List of morphological features from the universal feature inventory or
            from a defined language-specific extension; underscore if not available.
        HEAD: Head of the current word, which is either a value of ID or zero (0).
            Written as _ here when the head is not strictly positive.
        DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a
            defined language-specific subtype of one.
        DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs.
            Always _ here.
        MISC: Any other annotation. Holds Pos=<idx>|Len=<len>, plus
            NE=<iob>-<type> when the token is part of a named entity.

        :return: return the CoNLL-U representation of this token
        :rtype: str
        """
        feats = self.features  # parsed once, the property rebuilds the dict each time
        # NOTE(review): features are written as key:value although CoNLL-U specifies
        # Name=Value; kept unchanged so that the produced output stays identical.
        feats_col = ("|".join(f'{k}:{v}' for k, v in feats.items())
                     if feats else "_")
        head_col = self.head if self.head > 0 else '_'
        dep_col = self.dep if self.dep else '_'
        misc_col = f"Pos={self.idx}|Len={len(self)}"
        if self.token.neIOB != 'O':
            misc_col += f"|NE={self.token.neIOB}-{self.token.neType}"
        return (f"{self.i}\t{self.token.text}\t{self.lemma}\t{self.pos}\t_\t"
                f"{feats_col}\t{head_col}\t{dep_col}\t_\t{misc_col}")

    def __len__(self) -> int:
        """Return the length of the token, as reported by the binding.

        :return: the length of the token
        :rtype: int
        """
        return self.token.len

    def __str__(self):
        """Return the original text of the token.

        :return: the original text of the token
        :rtype: str
        """
        return self.token.text

    @property
    def text(self):
        """The original text of the token."""
        return self.token.text

    @property
    def i(self):
        """The 1-based index of this token in its parent document (the index given by
        the binding plus one). It is used as the ID column of the CoNLL-U output."""
        return self.token.i + 1

    @property
    def lemma(self):
        """The token lemma."""
        return self.token.lemma

    @property
    def pos(self):
        """Coarse-grained part-of-speech from the Universal POS tag set."""
        return self.token.tag

    @property
    def head(self):
        """The syntactic parent, or “governor”, of this token."""
        return self.token.head

    @property
    def dep(self):
        """Syntactic dependency relation."""
        return self.token.dep

    @property
    def idx(self):
        """Position of this token in its document text (the position given by the
        binding minus one)."""
        return self.token.pos - 1

    @property
    def features(self):
        """Morphological features of this token, as a dict.

        The binding gives them as ``Name=Value`` pairs separated by ``|``, or ``_``
        when there are none. Only the first ``=`` of a pair separates the name from
        the value; a pair without ``=`` gets an empty value.
        """
        raw = self.token.features
        if not raw or raw == "_":
            return {}
        parsed = {}
        for pair in raw.split("|"):
            if pair:
                name, _, value = pair.partition("=")
                parsed[name] = value
        return parsed

    @property
    def ent_type(self):
        """Named entity type."""
        return self.token.neType

    @property
    def ent_iob(self):
        """IOB code of named entity tag. “B” means the token begins an entity,
        “I” means it is inside an entity, “O” means it is outside an entity,
        and "" means no entity tag is set."""
        return self.token.neIOB

    @property
    def t_status(self):
        """The tokenization status of this token. Can also be explored with the
        is_* properties. The possible values are::

          t_alphanumeric
          t_abbrev
          t_acronym
          t_capital
          t_capital_1st
          t_capital_small
          t_cardinal_roman
          t_comma_number
          t_dot_number
          t_fraction
          t_integer
          t_ordinal_integer
          t_ordinal_roman
          t_sentence_brk
          t_small
          t_word_brk

        """
        return self.token.tStatus

    @property
    def is_alpha(self):
        """Does the token consist of alphabetic characters? Decided from the
        tokenization status, not from the text itself."""
        return self.token.tStatus in self._ALPHA_STATUSES

    @property
    def is_digit(self):
        """Does the token consist of digits? True when the tokenization status is
        t_integer."""
        return self.token.tStatus == "t_integer"

    @property
    def is_lower(self):
        """Is the token in lowercase? Equivalent to token.text.islower()."""
        return self.token.text.islower()

    @property
    def is_upper(self):
        """Is the token in uppercase? Equivalent to token.text.isupper()."""
        return self.token.text.isupper()

    @property
    def is_punct(self):
        """Is the token punctuation? True when the tokenization status is
        t_sentence_brk or t_word_brk."""
        return self.token.tStatus in ("t_sentence_brk", "t_word_brk")

    @property
    def is_sent_start(self):
        """Does the token start a sentence? True for the first token in the Doc.

        TODO: implement for sentences other than the first one. This requires giving
        the token access to its document.
        """
        return self.token.i == 0

    @property
    def is_sent_end(self):
        """Does the token end a sentence? True when the tokenization status is
        t_sentence_brk."""
        return self.token.tStatus == "t_sentence_brk"

    @property
    def is_space(self):
        """Does the token consist of whitespace characters?
        Equivalent to token.text.isspace().
        Should always be False in LIMA as there is no space tokens."""
        return self.token.text.isspace()

    @property
    def is_bracket(self):
        """Is the token a bracket, i.e. exactly one of ( ) [ ] { }?"""
        return self.token.text in self._BRACKETS

    @property
    def is_quote(self):
        """Is the token a quotation mark, i.e. exactly one of " ' « » `?"""
        return self.token.text in self._QUOTES


class _SentencesIterator:
    """Iterator producing one Span per sentence of a Doc."""

    def __init__(self, doc):
        # The document whose sentences are walked, and the rank of the next
        # sentence to hand out.
        self._doc = doc
        self._index = 0

    def __iter__(self):
        """An iterator is its own iterable."""
        return self

    def __next__(self):
        """Build the Span of the next sentence, or stop after the last one.

        The token bounds come from the binding's sentence list: the end bound is
        shifted by one, and so is the start bound for every sentence but the first.
        """
        rank = self._index
        limadoc = self._doc.limadoc
        if rank >= len(limadoc.sentences()):
            # Every sentence has been returned already
            raise StopIteration
        first = limadoc.sentences()[rank].start
        if rank > 0:
            first = first + 1
        past_last = limadoc.sentences()[rank].end + 1
        sentence = Span(self._doc, first, past_last)
        self._index = rank + 1
        return sentence


class _SpanIterator:
    """Iterator handing out the items of a span one after the other."""

    def __init__(self, span):
        # The span being walked, and the position of the next item to hand out.
        self._span = span
        self._index = 0

    def __iter__(self):
        """An iterator is its own iterable."""
        return self

    def __next__(self):
        """Return the item at the current position and move one step forward."""
        position = self._index
        if position >= len(self._span):
            # Past the last item: iteration is over
            raise StopIteration
        item = self._span[position]
        self._index = position + 1
        return item


class Span:
    """Represents a continuous span of tokens in a Doc.

    A span covers the tokens of its document from ``start`` (included) to ``end``
    (excluded), both being 0-based token positions in the document.

    TODO
    Some parts of the API are still not implemented

        ents    The named entities that fall completely within the span. Returns a
            tuple of Span objects.
            Example::

                import aymara.lima
                nlp = aymara.lima.Lima()
                doc = nlp("Mr. Best flew to New York on Saturday morning.")
                span = doc[0:6]
                ents = list(span.ents)
                assert ents[0].label == 346
                assert ents[0].label_ == "PERSON"
                assert ents[0].text == "Mr. Best"

            RETURNS Entities in the span, one Span per entity.
            Tuple[Span, …]

        sent    The sentence span that this span is a part of.
            This property is only available when sentence boundaries have been set on
            the document by the pipeline. It will raise an error otherwise.

            If the span happens to cross sentence boundaries, only the first sentence
            will be returned. If it is required that the sentence always includes the
            full span, the result can be adjusted as such::

                sent = span.sent
                sent = doc[sent.start : max(sent.end, span.end)]

            Example::

                import aymara.lima
                nlp = aymara.lima.Lima()
                doc = nlp("Give it back! He pleaded.")
                span = doc[1:3]
                assert span.sent.text == "Give it back!"

            Span

        sents   Returns a generator over the sentences the span belongs to.
            This property is only available when sentence boundaries have been set on
            the document by the pipeline. It will raise an error otherwise.

            If the span happens to cross sentence boundaries, all sentences the span
            overlaps with will be returned.
            Example::

                import aymara.lima
                nlp = aymara.lima.Lima()
                doc = nlp("Give it back! He pleaded.")
                span = doc[2:4]
                assert len(span.sents) == 2

            Iterable[Span]


    """
    def __init__(self, doc, start: int, end: int, label: str = ""):
        """
        Constructor of a Span

        :param doc: The document on which is built the span.
        :type doc: Doc
        :param start: The position in the document of the first token of the span.
        :type start: int
        :param end: The position in the document just past the last token of the
            span.
        :type end: int
        :param label: A label to attach to the span, e.g. for named entities.
        :type label: str
        """
        self._doc = doc
        self._start = start
        self._end = end
        self._label = label

    def __iter__(self) -> "_SpanIterator":
        """Returns an iterator over the tokens of this span"""
        return _SpanIterator(self)

    def __len__(self) -> int:
        """
        Returns the number of tokens of this span

        Example::

            import aymara.lima
            nlp = aymara.lima.Lima()
            doc = nlp("Give it back! He pleaded.")
            span = doc[1:4]
            assert len(span) == 3

        :return: the number of tokens in this span, or 0 (with a message on stderr)
            if the end of the span is before its start.
        :rtype: int
        """
        if self._end < self._start:
            print(f"Error in Span.__len__. Span end ({self._end}) is before start "
                  f"({self._start}). Return 0", file=sys.stderr)
            return 0

        return self._end - self._start

    def __getitem__(self, i: Union[int, slice]):
        """
        Returns either the Token at position i in the span or the subspan defined by
        the slice i.

        Slice bounds follow the usual Python rules: they may be omitted or negative
        and are clipped to the span. The step of the slice is ignored. The subspan
        does not inherit the label of this span.

        Example::

            import aymara.lima
            nlp = aymara.lima.Lima()
            doc = nlp("Give it back! He pleaded.")
            span = doc[1:4]
            assert span[1].text == "back"
            assert span[1:3].text == "back!"

        :param i: the position in the span of the item to retrieve or a slice defining
            the subspan to retrieve.
        :type i: Union[int, slice]
        :return: either the Token at position i in the span or the subspan defined by
            the slice i.
        :rtype: Union[Token, Span]
        :raises IndexError: if i is an integer outside of the span.
        """
        size = len(self)
        if isinstance(i, slice):
            # slice.indices resolves None and negative bounds and clips them to
            # [0, size]; an omitted stop therefore means "up to the end of the span".
            first, past_last, _ = slice(i.start, i.stop).indices(size)
            # An inverted slice is an empty span located at its start
            past_last = max(first, past_last)
            return Span(self._doc, self._start + first, self._start + past_last)

        if i < 0:
            i += size
        position = self._start + i
        # i == size is already outside of the span
        if i < 0 or i >= size or position < 0 or position >= len(self._doc):
            raise IndexError("Span index out of range")
        return self._doc[position]

    def __repr__(self) -> str:
        """
        The representation of a span is one line for each token represented in the
        CoNLL-U format. Tokens are renumbered from 1 and the heads are shifted
        accordingly, so that the span can be read on its own.
        """
        if len(self) == 0:
            return ""
        # Token.i is 1-based: this is the number of tokens preceding the span
        offset = self[0].i - 1
        lines = []
        for new_id, token in enumerate(self, start=1):
            cols = repr(token).split("\t")
            cols[0] = str(new_id)
            if cols[6] != "_":
                cols[6] = str(int(cols[6]) - offset)
            lines.append("\t".join(cols))
        return "\n".join(lines)

    @property
    def text(self):
        """A string representation of the span text: the part of the document text
        going from the first character of the first token to the last character of
        the last token. An empty span has an empty text."""
        if len(self) == 0:
            return ""
        first = self._doc[self._start]
        last = self._doc[self._end - 1]
        return self._doc.text[first.idx:last.idx + len(last)]

    @property
    def doc(self):
        """The parent document."""
        return self._doc

    @property
    def start(self):
        """The token offset for the start of the span."""
        return self._start

    @property
    def end(self):
        """The token offset for the end of the span."""
        return self._end

    @property
    def start_char(self):
        """The character offset for the start of the span."""
        return self[0].idx

    @property
    def end_char(self):
        """The character offset for the end of the span."""
        last = self[-1]
        return last.idx + len(last)

    @property
    def label(self):
        """A label to attach to the span, e.g. for named entities."""
        return self._label

class _DocEntitiesIterator:
    """Iterator over the named entities of a Doc, producing one Span per entity.

    An entity starts on a token whose IOB code is "B" and goes on over the directly
    following tokens whose IOB code is "I". Its label is the entity type of its
    first token.
    """

    def __init__(self, doc):
        # Doc object reference
        self._doc = doc
        # position in the document of the next token to examine
        self._index = 0

    def __iter__(self):
        """Returns Iterator object"""
        return self

    def __next__(self) -> "Span":
        """Returns the next Span defining an entity in the document

        The span is expressed with 0-based document positions, its end being
        exclusive, as expected by Span. An entity reaching the end of the document
        is returned like any other.

        :raises StopIteration: when there is no more entity in the document.
        """
        size = len(self._doc)
        while self._index < size:
            begin = self._index
            self._index += 1
            opening = self._doc[begin]
            if opening.ent_iob != "B":
                continue
            # Swallow the tokens continuing the entity. The loop stops on the first
            # token not being inside it, which is where the next call resumes: an
            # entity directly following this one is thus not skipped.
            while self._index < size and self._doc[self._index].ent_iob == "I":
                self._index += 1
            return Span(self._doc, begin, self._index, label=opening.ent_type)
        # Iteration ends
        raise StopIteration


class _DocIterator:
    """Iterator over the tokens of a Doc, in document order."""

    def __init__(self, doc):
        # Doc object reference
        self._doc = doc
        # position of the next token to return
        self._index = 0

    def __iter__(self):
        """Returns Iterator object

        Required by the iterator protocol, so that the iterator can itself be used
        wherever an iterable is expected (``iter(it)``, ``for`` loops, ``list(it)``).
        """
        return self

    def __next__(self) -> "Token":
        """Returns the next token of the document

        :raises StopIteration: when all the tokens have been returned.
        """
        if self._index < len(self._doc):
            result = self._doc[self._index]
            self._index += 1
            return result
        # Iteration ends
        raise StopIteration


class Doc:
    """A document.

    This is mainly an iterable of tokens, wrapping the document object produced by
    the C++ binding.

    Example::

        import aymara.lima
        nlp = aymara.lima.Lima()
        doc = nlp("Give it back! He pleaded.")

    TODO
    Some parts of the API are still not implemented:

        compounds   The compounds found into the document text by the
            CompoundsBuilderFromSyntacticData LIMA pipeline unit
            List[Compound]


    """
    def __init__(self, doc: "aymaralima.cpplima.Doc"):
        """Doc's constructor

        :param doc: the document object of the C++ binding
        :type doc: aymaralima.cpplima.Doc
        """
        self.limadoc = doc

    def __iter__(self) -> "_DocIterator":
        """Returns an iterator over the tokens of this document"""
        return _DocIterator(self)

    def __len__(self) -> int:
        """Returns the number of tokens of this document

        :return: the number of tokens of this document.
        :rtype: int
        """
        return self.limadoc.len()

    def __getitem__(self, i: Union[int, slice]) -> Union["Token", "Span"]:
        """Returns the token at position i or a contiguous slice of tokens.

        Slice bounds follow the usual Python rules: they may be omitted or negative
        and are clipped to the document. The step of the slice is ignored.

        Example::

            doc = nlp("Give it back! He pleaded.")
            assert doc[0].text == "Give"
            assert doc[-1].text == "."
            span = doc[1:3]
            assert span.text == "it back"

        :param i: a position i or a contiguous slice of token to retrieve
        :type i: Union[int, slice]
        :return: the token at position i or a contiguous slice of tokens.
        :rtype: Union[Token, Span]
        :raises IndexError: if i is an integer outside of the document.
        """
        size = len(self)
        if isinstance(i, slice):
            # Resolve None and negative bounds here: Span needs plain positions
            first, past_last, _ = slice(i.start, i.stop).indices(size)
            return Span(self, first, max(first, past_last))
        if i < 0:
            i += size
        # Checked here rather than relying on what the binding does with a bad index
        if not 0 <= i < size:
            raise IndexError("Doc index out of range")
        return Token(self.limadoc.at(i))

    def __repr__(self) -> str:
        """
        The representation of a document is one line for each token represented in the
        CoNLL-U format, the sentences being separated by an empty line.
        """
        return "\n\n".join([repr(sent) for sent in self.sents])

    def __str__(self) -> str:
        """
        The string of a document is its original text.
        """
        return self.text

    @property
    def text(self):
        """The original text.

        :type: str
        """
        return self.limadoc.text()

    @property
    def sents(self):
        """Iterate over the sentences in the document.

        This property is only available when sentence boundaries have been set on
        the document by the pipeline. It will raise an error otherwise.

        Example::

            import aymara.lima
            nlp = aymara.lima.Lima()
            doc = nlp("This is a sentence. Here's another...")
            sents = list(doc.sents)
            assert len(sents) == 2

        :yields: Sentences in the document.
        :type: Span
        """
        return _SentencesIterator(self)

    @property
    def lang(self):
        """Language of the document."""
        return self.limadoc.language()

    @property
    def ents(self):
        """Iterate over the entities in the document. Returns an iterator yielding
        named entity Span objects.

        Example::

            import aymara.lima
            nlp = aymara.lima.Lima()
            doc = nlp("John Doe lives in New York")
            ents = list(doc.ents)
            assert ents[0].label == "Person.PERSON"
            assert ents[0].text == "John Doe"

        :yields: Entities in the document.
        :type: Span
        """
        return _DocEntitiesIterator(self)


class LimaInternalError(Exception):
    """Raised when the underlying LIMA analyzer, or the document it produced,
    reports an error. The message is the one given by the analyzer."""


class Lima:
    """A text-processing pipeline

    Usually you’ll load this once per process as nlp and pass the instance around your
    application. The Lima class is a wrapper around the LimaAnalyzer class which is
    itself a binding around the C++ classes necessary to analyze text.

    Example::

                import aymara.lima
                nlp = aymara.lima.Lima()
                doc = nlp("Give it back! He pleaded.")
                print(doc)

    """
    def __init__(self,
                 langs: str = "fre,eng",
                 pipes: str = "main,deepud,tfud",
                 user_config_path: str = "",
                 user_resources_path: str = "",
                 meta: Dict[str, str] = {}):
        """
        Initialize the Lima analyzer

        :param langs: a comma-separated list of language trigrams to initialize
            (Default value = "fre,eng")
        :type langs: str
        :param pipes: a comma-separated list of Lima pipelines to analyze (Default
            value = "main,deepud,tfud")
        :type pipes: str
        :param user_config_path: a path where Lima configuration files will be searched
            for. This allows to override default configurations. (Default value = an
            empty string)
        :type user_config_path: str
        :param user_resources_path: a path where Lima resource files will be searched
            for. This allows to override default configurations (Default value = an
            empty string)
        :type user_resources_path: str
        :param meta: named metadata values that will be used for each analysis. They
            can be completed or overridden at analysis time (Default value = an empty
            dictionary)
        :type meta:  Dict[str, str]
        :raises LimaInternalError: if the analyzer reports an error after its
            construction.
        """
        # The default meta dictionary is shared between calls: it is only read.
        self.analyzer = aymaralima.cpplima.LimaAnalyzer(
            langs,
            pipes,
            list(aymaralima.__path__)[-1],
            user_config_path,
            user_resources_path,
            ",".join(f"{k}:{v}" for k, v in meta.items())
            )
        if self.analyzer.error():
            raise LimaInternalError(self.analyzer.errorMessage())

        self.langs = langs
        self.pipes = pipes

    def _normalize_request(self, text, lang, pipeline, meta) -> Tuple[str, str, str]:
        """Validate the arguments of an analysis and resolve their default values.

        Shared by __call__ and analyzeText so that both accept exactly the same
        arguments.

        :return: the language, the pipeline and the metadata serialized as a
            comma-separated list of key:value pairs.
        :rtype: Tuple[str, str, str]
        :raises TypeError: if one of the arguments does not have the expected type.
        """
        if lang is None:
            lang = self.langs.split(",")[0] if self.langs else "eng"
        if not isinstance(lang, str):
            raise TypeError(f"Lima.analyzeText lang parameter must be str, "
                            f"not {type(lang)}")
        # Anything but the historic languages, "ud" and the already prefixed codes
        # gets the "ud-" prefix
        if (lang not in ("eng", "fre", "por") and lang != "ud"
                and not lang.startswith("ud-")):
            lang = "ud-" + lang
        if pipeline is None:
            if self.pipes:
                pipeline = self.pipes.split(",")[0]
            elif lang.startswith("ud-"):
                pipeline = "deepud"
            else:
                pipeline = "main"
        if not isinstance(pipeline, str):
            raise TypeError(f"Lima.analyzeText pipeline parameter must be str, "
                            f"not {type(pipeline)}")
        if not isinstance(text, str):
            raise TypeError(f"Lima.analyzeText text parameter must be str, "
                            f"not {type(text)}")
        try:
            # The validated value is the one used below, so that anything accepted
            # by the validation can actually be serialized.
            checked_meta = parse_obj_as(Dict[str, str], meta)
        except ValidationError as e:
            raise TypeError(f"Lima.analyzeText meta parameter must be Dict[str, str], "
                            f"not {type(meta)}") from e
        return (lang, pipeline,
                ",".join(f"{k}:{v}" for k, v in checked_meta.items()))

    def __call__(self,
                 text: str,
                 lang: str = None,
                 pipeline: str = None,
                 meta: Dict[str, str] = {}) -> "Doc":
        """
        Just 'call' your Lima instance to analyze the given text in the given language.
        The lang language must have been initialized when instantiating this object.

        Example::

                    import aymara.lima
                    nlp = aymara.lima.Lima()
                    doc = nlp("Give it back! He pleaded.")
                    print(doc)

        :param text: the text to analyze
        :type text: str
        :param lang: the language of the text. If None, falls back to the first element
            of the langs member or to eng if empty (Default value = `None`). Its value
            can be one of the three historic pre-Universal Dependencies languages
            ("eng", "fre" and "por") or the value "ud". In the latter case, the meta
            parameter must include a pair "udlang":"<trigram for the language>", e.g.:
            "udlang":"fra". Any other value is prefixed with "ud-" if it is not
            already.
        :type lang: str
        :param pipeline: the Lima pipeline to use for analysis. If None, falls back to
            the first element of the pipes member or, if it is empty, to deepud for
            the "ud-" languages and to main for the others (Default value = `None`).
        :type pipeline: str
        :param meta: a dict of named metadata values (Default value = an empty
            dictionary).
        :type meta: Dict[str, str]

        :return: a Doc object representing the result of the analysis.
        :rtype: Doc
        :raises TypeError: if one of the arguments does not have the expected type.
        :raises LimaInternalError: if the analyzer or the produced document reports
            an error.
        """
        lang, pipeline, meta_str = self._normalize_request(text, lang, pipeline, meta)
        lima_doc = self.analyzer(
            text, lang=lang, pipeline=pipeline, meta=meta_str)
        if self.analyzer.error() or lima_doc.error():
            raise LimaInternalError(self.analyzer.errorMessage()
                                    + " / " + lima_doc.errorMessage())
        return Doc(lima_doc)

    def analyzeText(self,
                    text: str,
                    lang: str = None,
                    pipeline: str = None,
                    meta: Dict[str, str] = {}) -> str:
        """Analyze the given text in the given language. The lang language must have been
        initialized when instantiating this object.

        Example::

                    import aymara.lima
                    nlp = aymara.lima.Lima()
                    result = nlp.analyzeText("Give it back! He pleaded.")
                    print(result)

        :param text: the text to analyze
        :type text: str
        :param lang: the language of the text. If None, falls back to the first element
            of the langs member or to eng if empty (Default value = `None`).
        :type lang: str
        :param pipeline: the Lima pipeline to use for analysis. If None, falls back to
            the first element of the pipes member or, if it is empty, to deepud for
            the "ud-" languages and to main for the others (Default value = `None`).
        :type pipeline: str
        :param meta: a dict of named metadata values (Default value = an empty
            dictionary).
        :type meta: Dict[str, str]
        :return: the content of the text written by the text dumper of Lima if any. An
            empty string otherwise
        :rtype: str
        :raises TypeError: if one of the arguments does not have the expected type.
        :raises LimaInternalError: if the analyzer reports an error, before or after
            the analysis.
        """
        if self.analyzer.error():
            # Not covering line below because it is not easy to make lima fail at will
            raise LimaInternalError(self.analyzer.errorMessage())  # pragma: no cover
        lang, pipeline, meta_str = self._normalize_request(text, lang, pipeline, meta)
        result = self.analyzer.analyzeText(
            text, lang=lang, pipeline=pipeline, meta=meta_str)
        if self.analyzer.error():
            raise LimaInternalError(self.analyzer.errorMessage())
        return result

    @staticmethod
    def export_system_conf(dir: Union[str, pathlib.Path] = None,
                           lang: str = None) -> bool:
        """Export LIMA configuration files from the module system path to the given
        dir in order to be able to easily change configuration files.

        If lang is given, only the configuration files concerning this language
        are exported (NOT IMPLEMENTED).

        Use this function to initiate a user configuration. For LIMA to take into
        account the configuration in the new path, you will have to add it in front of
        the LIMA_CONF environment variable (or define it if it does not exist).

        Please refer to the
        `LIMA documentation <https://github.com/aymara/lima/wiki/LIMA-User-Manual#configuring-lima>`_
        for how to configure the analysis:

        Example::

            import aymara.lima
            aymara.lima.Lima.export_system_conf("~/MyLima")

        :param dir: the directory where to export the configuration, as a path or a
            string in which a leading ``~`` is expanded. It is created if needed. If
            `None`, the data dir of the "lima" application is used (Default value =
            None)
        :type dir: Union[str, pathlib.Path]
        :param lang: the language whose configuration must be exported. If `None`, the
            whole configuration is exported (Default value = None)
        :type lang: str
        :return: True once the configuration is exported. A failure of the copy is
            reported by the exception it raises.
        :rtype: bool
        """
        # Accept plain strings too, as in the example above
        target = pathlib.Path(dir).expanduser() if dir else _get_data_dir("lima")
        target.mkdir(parents=True, exist_ok=True)

        module_dir = pathlib.Path(list(aymaralima.__path__)[-1])
        # NOTE(review): copy_tree comes from distutils, which is no longer part of
        # the standard library since Python 3.12.
        for subdir in ("config", "resources"):
            fromDirectory = module_dir / subdir
            toDirectory = target / subdir
            print(f"Copying {str(fromDirectory)} to {str(toDirectory)}")
            copy_tree(str(fromDirectory), str(toDirectory))
        return True

    @staticmethod
    def get_system_paths() -> Tuple[str, str]:
        """
        Get the system configuration and resources paths.

        Example::

            import aymara.lima
            aymara.lima.Lima.get_system_paths()

        :return: the path of the configuration files and the path of the linguistic
            resources that are shipped with the module, in this order. This function
            is useful to understand from which dirs data are loaded to debug
            configuration errors. It can also be used to know where to put or edit
            files.
        :rtype: Tuple[str, str]

        """
        module_dir = pathlib.Path(list(aymaralima.__path__)[-1])
        return (str(module_dir / "config"), str(module_dir / "resources"))