# Source code for pepys_import.file.highlighter.support.token

from pepys_import.file.highlighter.level import HighlightLevel
from pepys_import.file.highlighter.support.utils import merge_adjacent_text_locations

from .usages import SingleUsage


class SubToken:
    """
    Object representing a single token at a lower level than Token.

    Usually there is a single SubToken object as a child of each Token object,
    but when tokens are combined (with the `combine_tokens` function) then
    there will be multiple SubToken children.

    Each SubToken object keeps track of the span (start and end characters) of the SubToken,
    the text that is contained within the SubToken, the character index that the line starts at
    and a reference to the overall character array created by HighlightedFile.
    """

    # Only these four attributes may be set on an instance (no per-instance __dict__)
    __slots__ = ("span", "text", "line_start", "chars")

    def __init__(self, span, text, line_start, chars):
        """
        :param span: Indexable pair whose first and second items are the start
                     and end offsets of this SubToken, relative to `line_start`
        :param text: The text contained within this SubToken
        :param line_start: Index into the character array at which the line starts
        :param chars: Reference to the character array created by HighlightedFile
        """
        self.span = span
        self.text = text
        self.line_start = line_start
        self.chars = chars

    def start(self):
        """
        Returns the index into the character array that this SubToken starts at

        Computed as the line's starting index plus the first element of the span.
        """
        return self.line_start + int(self.span[0])

    def end(self):
        """
        Returns the index into the character array that this SubToken ends at

        Computed as the line's starting index plus the second element of the span.
        `Token.record` uses this value as the exclusive upper bound of a `range`.
        """
        return self.line_start + int(self.span[1])

    def __repr__(self):
        return f"SubToken: ({self.line_start}+{self.span!r}, {self.text})"


class Token:
    """
    Object representing a single token extracted from a Line.

    This is the main object that the user will interact with, running
    the `record` method to record that this token has been used for a specific purpose.

    The `children` of this token are SubToken objects. Most of the time there will
    just be one SubToken object as a child of a Token object - however, when tokens are
    combined there can be multiple children.
    """

    __slots__ = ("children", "highlighted_file")

    def __init__(self, list_of_subtokens, hf_instance):
        """
        :param list_of_subtokens:  A list of SubToken objects
            to be kept as children of this object
        :param hf_instance: The object stored as `highlighted_file`; `record` reads its
            `importer_highlighting_levels` and `datafile` attributes and calls its
            `fill_char_array_if_needed` method
        """
        self.children = list_of_subtokens
        self.highlighted_file = hf_instance

    def __repr__(self):
        # Each child is rendered via str() and wrapped in its own parentheses
        return "Token: " + "".join(f"({child!s})" for child in self.children)

    @property
    def text(self):
        """Returns the entire text of the Token

        The text of every child SubToken is concatenated with no separator.

        :return: Entire text content of the Token
        :rtype: String
        """
        return "".join(child.text for child in self.children)

    @property
    def text_space_separated(self):
        """Returns the entire text of the Token, with spaces separating the different subtokens

        :return: Entire text content of the Token, one space between each subtoken's text
        :rtype: String
        """
        return " ".join(child.text for child in self.children)

    def record(self, tool: str, field: str, value: str, units: str = None):
        """
        Record the usage of this token for a specific purpose

        :param tool: Name of the importer handling the import (eg. "NMEA Importer")
                     Should be set to `self.name` when called from an importer
        :param field: The field that the token is being interpreted as (eg. "speed")
        :param value: The parsed value of the token (eg. "5 knots") - where possible,
                      pass a Quantity object with associated units
        :param units: The units that the field was interpreted as using (optional - do not
                      include if the value was a Quantity as that holds unit information itself)

        Technical details
        -----------------
        This adds SingleUsage objects to each of the relevant characters in the
        character array stored by the SubToken objects that are children of this object.

        Nothing is recorded when the highlighting level registered for `tool` is
        `HighlightLevel.NONE`. For any other level (including a `tool` with no registered
        level) the character array is updated. When the level is `HighlightLevel.DATABASE`,
        a dictionary describing the extracted token is additionally appended to
        `datafile.pending_extracted_tokens` on the highlighted file.
        """
        highlighted_file = self.highlighted_file

        recording_level = highlighted_file.importer_highlighting_levels.get(tool, None)
        if recording_level == HighlightLevel.NONE:
            return

        highlighted_file.fill_char_array_if_needed()

        tool_field = tool + "/" + field
        # `!s` forces str() so objects with a custom __format__ render as before
        message = f"Value:{value!s}"
        if units is not None:
            message += f" Units:{units!s}"

        # A single usage object is shared by every character covered by this token
        usage = SingleUsage(tool_field, message)

        text_locations = []

        # This loop gives us each SubToken that is a child of this Token
        for subtoken in self.children:
            start = subtoken.start()
            end = subtoken.end()

            text_locations.append((start, end))

            # Note: subtoken.chars is a reference to a single char array
            # that was originally created by the HighlightedFile class
            # So each time round the loop we're actually altering the same
            # char array, even though it is accessed via different SubToken
            # objects
            chars = subtoken.chars
            for i in range(start, end):
                chars[i].usages.append(usage)

        if recording_level == HighlightLevel.DATABASE:
            merged_text_locations = merge_adjacent_text_locations(text_locations)
            text_location_str = ",".join(f"{low}-{high}" for low, high in merged_text_locations)

            highlighted_file.datafile.pending_extracted_tokens.append(
                {
                    "text": self.text_space_separated,
                    "interpreted_value": str(value),
                    "text_location": text_location_str,
                    "importer": tool,
                    "field": field,
                }
            )