Source code for pepys_import.file.highlighter.support.line

from re import finditer, search

from pepys_import.file.highlighter.level import HighlightLevel
from pepys_import.file.highlighter.support.utils import merge_adjacent_text_locations

from .token import SubToken, Token
from .usages import SingleUsage


class Line:
    """
    Object representing a line from a HighlightedFile. Has methods to get a list of
    Tokens in the line, and to record a usage of the whole line.
    """

    WHITESPACE_TOKENISER = "\\S+"
    CSV_TOKENISER = r'(?:,"|^")(""|[\w\W]*?)(?=",|"$)|(?:,(?!")|^(?!"))([^,]*?)(?=$|,)|(\r\n|\n)'
    QUOTED_NAME_REGEX = r"([\"'])(?:(?=(\\?))\2.)*?\1"

    __slots__ = ("children", "highlighted_file")

    def __init__(self, list_of_subtokens, hf_instance):
        """
        Create a new line, giving it a list of SubToken objects as children of the line.

        Usually this will be a list of just one item, but it has the flexibility to
        hold more for composite tokens.
        """
        self.children = list_of_subtokens
        self.highlighted_file = hf_instance

    def __repr__(self):
        res = "Line: "
        for child in self.children:
            res += "(" + str(child.line_start) + "+" + repr(child.span) + ", " + child.text + ")"
        return res

    @property
    def text(self):
        """Returns the entire text of the Line

        :return: Entire text content of the Line
        :rtype: String
        """
        res = ""
        for child in self.children:
            res += child.text
        return res
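    # Illustrative sketch (comment only, not executed as part of this module): how the
    # two tokeniser constants above behave when used directly with re.finditer. The
    # sample strings are assumptions made up for this example.
    #
    #     from re import finditer
    #
    #     [m.group() for m in finditer(Line.WHITESPACE_TOKENISER, "10:02 VESSEL 12.5")]
    #     # -> ['10:02', 'VESSEL', '12.5']
    #
    #     [m.group() for m in finditer(Line.CSV_TOKENISER, "a,b,c")]
    #     # -> ['a', ',b', ',c']  (the leading commas on later matches are removed
    #     #     by calling tokens(..., strip_char=","))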
    def tokens(self, reg_exp=WHITESPACE_TOKENISER, strip_char="", quoted_name=QUOTED_NAME_REGEX):
        """Generates a list of Token objects for each token in the line.

        :param reg_exp: Regular expression used to split the line into tokens. Useful constants
            are defined in this class, including `CSV_TOKENISER`, defaults to
            `WHITESPACE_TOKENISER`. See notes below.
        :type reg_exp: String, optional
        :param strip_char: Characters to strip after splitting, defaults to ""
        :type strip_char: String, optional
        :param quoted_name: Regular expression used to match quoted text within the line,
            defaults to `QUOTED_NAME_REGEX`
        :type quoted_name: String, optional
        :return: List of Token objects
        :rtype: List

        Notes:
        The reg_exp given to this function should be a regular expression that extracts the
        individual tokens from the line, and *not* a regular expression that identifies the
        characters to split by. Thus, the WHITESPACE_TOKENISER regex is simply \\S+, which
        matches any amount of anything that isn't whitespace. The CSV_TOKENISER is more
        complex, as it deals with quotes and other issues that cause problems in CSV files.

        The regular expression can use groups, but the entire match of the regular expression
        should be the token - there is no capacity (currently at least) for extracting
        particular groups of the regular expression. Look-ahead and look-behind expressions
        can be used in the regex to constrain it so that the entire match covers just the
        token and nothing else. (For a good example of this see the SLASH_TOKENISER in the
        Nisida importer.)
        """
        tokens_array = []
        for child in self.children:
            # Search for text between quotation marks, if there is any
            quoted_text_match = search(quoted_name, child.text)
            start, end = None, None
            if quoted_text_match:
                original_name = quoted_text_match.group()
                start, end = quoted_text_match.span()
                # Remove the quotation marks and then strip the text
                quoted_sensor_name = original_name[1:-1].strip()
                subtoken_sensor_name = SubToken(
                    (start, end), quoted_sensor_name, int(child.line_start), child.chars
                )
                subtokens_sensor_name = [subtoken_sensor_name]
            for match in finditer(reg_exp, child.text):
                token_str = match.group()
                token_start, token_end = match.span()
                # If quoted text exists and this token falls inside it, either skip the token
                # or add the quoted text's Token object. (Compare against None explicitly,
                # as a quoted match can legitimately start at index 0.)
                if (
                    start is not None
                    and end is not None
                    and token_start >= start
                    and token_end <= end
                ):
                    # Since the quoted text might contain several tokens, and it should be
                    # added to the tokens array in the correct position, only add it when
                    # this token's end is equal to the quoted text's end.
                    if token_end == end:
                        tokens_array.append(Token(subtokens_sensor_name, self.highlighted_file))
                    continue
                # Special handling: we may need to strip a leading delimiter
                if strip_char != "":
                    char_index = token_str.find(strip_char)
                    if char_index == 0:
                        token_str = token_str[1:]
                        token_start += 1
                # ...and ditch any new whitespace
                token_str = token_str.strip()
                subtoken = SubToken(
                    (token_start, token_end), token_str, int(child.line_start), child.chars
                )
                # The Token object expects a list of SubTokens, as it could be a composite object
                list_of_subtokens = [subtoken]
                tokens_array.append(Token(list_of_subtokens, self.highlighted_file))
        return tokens_array
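    # Hedged usage sketch (comment only): how an importer typically calls tokens().
    # The surrounding importer, the `line` variable, and the CSV column layout are
    # assumptions for illustration, not part of this module.
    #
    #     tokens = line.tokens(line.CSV_TOKENISER, strip_char=",")
    #     if len(tokens) >= 3:
    #         vessel_name_token = tokens[1]
    #         # Each entry is a Token; its .text property gives the stripped string
    #         vessel_name = vessel_name_token.text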
    def record(self, tool: str, field: str, value: str, units: str = None):
        """
        Record a usage of the whole line.

        :param tool: Name of the importer handling the import (e.g. "NMEA Importer").
            Should be set to `self.name` when called from an importer
        :param field: The field that the token is being interpreted as (e.g. "speed")
        :param value: The parsed value of the token (e.g. "5 knots") - where possible, pass
            a Quantity object with associated units
        :param units: The units that the field was interpreted as using (optional - do not
            include if the value was a Quantity, as that holds unit information itself)

        Technical details:
        ------------------
        Adds a SingleUsage object to each of the relevant characters in the char array
        referenced by each SubToken child.
        """
        recording_level = self.highlighted_file.importer_highlighting_levels.get(tool, None)

        if recording_level == HighlightLevel.NONE:
            return

        self.highlighted_file.fill_char_array_if_needed()

        tool_field = tool + "/" + field

        if units is not None:
            message = "Value:" + str(value) + " Units:" + str(units)
        else:
            message = "Value:" + str(value)

        text_locations = []

        for child in self.children:
            start = child.start()
            end = child.end()
            text_locations.append((start, end))
            for i in range(start, end):
                usage = SingleUsage(tool_field, message)
                child.chars[i].usages.append(usage)

        if recording_level == HighlightLevel.DATABASE:
            merged_text_locations = merge_adjacent_text_locations(text_locations)
            text_location_str = ",".join([f"{low}-{high}" for low, high in merged_text_locations])
            self.highlighted_file.datafile.pending_extracted_tokens.append(
                {
                    "text": self.text,
                    "interpreted_value": str(value),
                    "text_location": text_location_str,
                    "importer": tool,
                    "field": field,
                }
            )
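# Hedged usage sketch (comment only): recording a whole-line usage from within an
# importer. `self.name` comes from the importer base class per the docstring above;
# the field names and value variables are assumptions for illustration.
#
#     line.record(self.name, "comment", comment_text)
#
#     # With explicit units (only when the value is not already a Quantity
#     # carrying its own unit information):
#     line.record(self.name, "speed", speed_value, units="knots")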