# Source code for pepys_import.file.highlighter.support.line

from re import finditer, search

from pepys_import.file.highlighter.level import HighlightLevel
from pepys_import.file.highlighter.support.utils import merge_adjacent_text_locations

from .token import SubToken, Token
from .usages import SingleUsage


class Line:
    """
    Object representing a line from a HighlightedDatafile.

    Has methods to get a list of Tokens in the line, and to record a usage of the whole line.
    """

    # Regexes that *match tokens* (not the delimiters between them); see `tokens`.
    WHITESPACE_TOKENISER = "\\S+"
    CSV_TOKENISER = r'(?:,"|^")(""|[\w\W]*?)(?=",|"$)|(?:,(?!")|^(?!"))([^,]*?)(?=$|,)|(\r\n|\n)'
    # Matches a section enclosed in matching single or double quotes.
    QUOTED_NAME_REGEX = r"([\"'])(?:(?=(\\?))\2.)*?\1"

    __slots__ = ("children", "highlighted_file")

    def __init__(self, list_of_subtokens, hf_instance):
        """
        Create a new line, giving it a list of SubToken objects as children of the line

        Usually this will be just a list of one item, but has the flexibility to have more
        for composite tokens.

        :param list_of_subtokens: SubToken objects making up this line
        :param hf_instance: The highlighted file object that this line belongs to
        """
        self.children = list_of_subtokens
        self.highlighted_file = hf_instance

    def __repr__(self):
        """Return a debug string listing each child as ``(line_start+span, text)``."""
        described_children = "".join(
            f"({child.line_start}+{child.span!r}, {child.text})" for child in self.children
        )
        return "Line: " + described_children

    @property
    def text(self):
        """Returns the entire text of the Line

        :return: Entire text content of the Line (the children's text, concatenated in order)
        :rtype: String
        """
        return "".join(child.text for child in self.children)

    def tokens(self, reg_exp=WHITESPACE_TOKENISER, strip_char="", quoted_name=QUOTED_NAME_REGEX):
        """Generates a list of Token objects for each token in the line.

        :param reg_exp: Regular expression used to split the line into tokens. Useful
                        constants are defined in this class, including `CSV_TOKENISER`, defaults
                        to `WHITESPACE_TOKENISER`. See notes below.
        :type reg_exp: String, optional
        :param strip_char: Characters to strip after splitting, defaults to ""
        :type strip_char: String, optional
        :param quoted_name: Regular expression used to find a quoted section in each child's
                            text (only the first match per child is used). Tokens that fall
                            entirely inside that section are replaced by a single Token holding
                            the quoted text with its quotation marks removed. Defaults to
                            `QUOTED_NAME_REGEX`.
        :type quoted_name: String, optional
        :return: List of Token objects
        :rtype: List

        Notes:
        The reg_exp given to this function should be a regular expression that extracts the individual tokens from the line,
        and *not* a regular expression that identifies the characters to split by. Thus, the WHITESPACE_TOKENISER regex is
        simply \\S+, which matches any amount of anything that isn't whitespace. The CSV_TOKENISER is more complex, as it deals
        with quotes and other issues that cause problems in CSV files.
        The regular expression can use groups, but the entire match of the regular expression should be the token - there is no
        capacity (currently at least) for extracting particular groups of the regular expression. Use can be made of look-ahead
        and look-behind expressions in the regex to constrain it so that the entire match covers just the token and nothing else.
        (For a good example of this see the SLASH_TOKENISER in the Nisida importer)
        """
        tokens_array = []

        for child in self.children:
            # Search and match values between quotation marks if there is any
            quoted_text_match = search(quoted_name, child.text)
            start, end = None, None
            subtokens_sensor_name = None
            if quoted_text_match:
                original_name = quoted_text_match.group()
                start, end = quoted_text_match.span()
                # Remove quotation marks and then strip the text
                quoted_sensor_name = original_name[1:-1].strip()
                subtoken_sensor_name = SubToken(
                    (start, end), quoted_sensor_name, int(child.line_start), child.chars
                )
                subtokens_sensor_name = [subtoken_sensor_name]

            for match in finditer(reg_exp, child.text):
                token_str = match.group()
                token_start, token_end = match.span()
                # If quoted text exists and it contains the split token, continue or
                # add the quoted text's Token object
                # NOTE(review): `start and end` is a truthiness test, so a quoted section
                # beginning at index 0 (start == 0) is not treated as quoted here. Behaviour
                # is deliberately left unchanged - confirm whether callers rely on it
                # before switching to an explicit `is not None` check.
                if start and end and token_start >= start and token_end <= end:
                    # Since quoted text might contain a few tokens and it should be added to the
                    # token arrays in the correct position, the following if clause used. It adds
                    # the quoted text if token's end is equal to the quoted text's end.
                    if token_end == end:
                        tokens_array.append(Token(subtokens_sensor_name, self.highlighted_file))
                    continue

                # special handling, we may need to strip a leading delimiter
                if strip_char != "":
                    if token_str.startswith(strip_char):
                        # Only a single leading character is dropped, and the recorded
                        # start position is moved along to match
                        token_str = token_str[1:]
                        token_start += 1
                    # and ditch any new whitespace
                    token_str = token_str.strip()

                subtoken = SubToken(
                    (token_start, token_end), token_str, int(child.line_start), child.chars
                )
                # the token object expects an array of SubTokens, as it could be a composite object
                tokens_array.append(Token([subtoken], self.highlighted_file))

        return tokens_array

    def record(self, tool: str, field: str, value: str, units: str = None):
        """
        Record a usage of the whole line

        :param tool: Name of the importer handling the import (eg. "NMEA Importer")
                     Should be set to `self.name` when called from an importer
        :param field: The field that the token is being interpreted as (eg. "speed")
        :param value: The parsed value of the token (eg. "5 knots") - where possible,
                      pass a Quantity object with associated units
        :param units: The units that the field was interpreted as using (optional - do not
                      include if the value was a Quantity as that holds unit information itself)

        Technical details:
        ------------------
        Adds a SingleUsage object to each of the relevant characters in the
        char array referenced by each SubToken child.

        Nothing is recorded if the highlighting level registered for `tool` is
        `HighlightLevel.NONE`. If it is `HighlightLevel.DATABASE`, an entry describing
        the extraction is also appended to the datafile's `pending_extracted_tokens`.
        """
        recording_level = self.highlighted_file.importer_highlighting_levels.get(tool)
        if recording_level == HighlightLevel.NONE:
            return

        self.highlighted_file.fill_char_array_if_needed()

        tool_field = tool + "/" + field
        message = f"Value:{value}"
        if units is not None:
            message += f" Units:{units}"

        text_locations = []

        for child in self.children:
            start = child.start()
            end = child.end()

            text_locations.append((start, end))

            # Each character gets its own SingleUsage instance
            for i in range(start, end):
                child.chars[i].usages.append(SingleUsage(tool_field, message))

        if recording_level == HighlightLevel.DATABASE:
            merged_text_locations = merge_adjacent_text_locations(text_locations)
            text_location_str = ",".join(f"{low}-{high}" for low, high in merged_text_locations)

            self.highlighted_file.datafile.pending_extracted_tokens.append(
                {
                    "text": self.text,
                    "interpreted_value": str(value),
                    "text_location": text_location_str,
                    "importer": tool,
                    "field": field,
                }
            )