from tqdm import tqdm
from pepys_import.file.highlighter.support.line import Line
from ...utils.text_formatting_utils import custom_print_formatted_text, format_error_message
from .support.char import Char
from .support.export import export_report
from .support.token import SubToken
class HighlightedFile:
    """
    Class that can load/tokenize a datafile, record changes to the file,
    then export a highlighted version of the file that indicates extraction.

    The character index (``self.chars``) is filled lazily by
    ``fill_char_array_if_needed``; every ``SubToken`` created by
    ``create_lines`` is handed a reference to that same list object.
    """

    def __init__(self, filename: str, number_of_lines=None, datafile=None):
        """
        Constructor for this object

        Args:
            filename (str): The name of the file to be parsed/reported upon
            number_of_lines (int): Number of lines that should be shown
                in the output (all lines if None)
            datafile: Stored unchanged as ``self.datafile``. It is not read
                by any method of this class.
        """
        # One Char entry per character of the file, filled on demand
        self.chars = []
        self.filename = filename
        # Passed through to export_report when exporting
        self.dict_color = {}
        self.number_of_lines = number_of_lines
        self.datafile = datafile
        # Not read by this class; initialised empty here
        self.importer_highlighting_levels = {}

    def reinitialise(self, filename, datafile):
        """Re-initialise the highlighted file object, to set a new filename as the source
        of data to be highlighted.

        Designed to be called with a new filename from within `_load_this_file` in an importer
        if the original file needs transforming before being processed and highlighted.

        Note that this re-runs the constructor, so ``self.chars`` becomes a new
        empty list and ``number_of_lines`` is reset to None.

        Args:
            filename (str): The name of the new file to be parsed/reported upon
            datafile: Stored unchanged as ``self.datafile``
        """
        # The constructor already stores datafile, so no separate assignment is needed
        self.__init__(filename=filename, datafile=datafile)

    def chars_debug(self):
        """
        Debug method, to check contents of chars

        Returns:
            The list of Char objects, filled from the file first if it was empty
        """
        self.fill_char_array_if_needed()
        return self.chars

    def lines(self):
        """
        Slice the file into lines and return a list of Line objects

        Returns:
            A list of Line objects: one per line of the file when
            ``number_of_lines`` is None, otherwise at most ``number_of_lines``

        Raises:
            SystemExit: with exit code 1, after printing an error message,
                if ``number_of_lines`` is zero or negative
        """
        if self.number_of_lines is None:
            return self.not_limited_lines()
        if self.number_of_lines <= 0:
            custom_print_formatted_text(
                format_error_message("Non-positive number of lines. Please provide positive number")
            )
            # Raise SystemExit directly rather than calling the exit() builtin,
            # which is only injected by the site module and so may be missing
            raise SystemExit(1)
        return self.limited_lines()

    def export(self, filename: str, include_key=False):
        """
        Provide highlighted summary for this file

        Nothing is exported if the char array is empty.

        Args:
            filename (str): The name of the destination for the HTML output
            include_key (bool): Whether to include a key at the bottom of the output
                showing what each colour refers to
        """
        if self.chars:
            export_report(filename, self.chars, self.dict_color, include_key)

    def _read_text(self):
        """Return the whole contents of self.filename, read in text mode."""
        with open(self.filename, "r") as file:
            return file.read()

    def limited_contents(self):
        """
        Read the file and keep only the first self.number_of_lines lines

        Returns:
            A tuple of (the kept lines joined with newline characters,
            the list of kept lines)
        """
        lines_list = self._read_text().splitlines()[: self.number_of_lines]
        return "\n".join(lines_list), lines_list

    def limited_lines(self):
        """
        Return a list of Line objects for each line in the file,
        producing only self.number_of_lines objects (to limit length
        of output for very large files)
        """
        limited_contents, lines_list = self.limited_contents()
        return self.create_lines(limited_contents, lines_list)

    def not_limited_lines(self):
        """
        Return a list of Line objects for each line in the file
        """
        file_contents = self._read_text()
        return self.create_lines(file_contents, file_contents.splitlines())

    def fill_char_array_if_needed(self):
        """
        Fill self.chars with one Char per character of the file, unless
        it already has contents

        Also stores the raw bytes of the whole file in self.file_byte_contents.

        Raises:
            ValueError: if ``number_of_lines`` is zero or negative
        """
        if self.chars:
            # Char array already filled, so no need to do anything
            return

        if self.number_of_lines is None:
            file_contents = self._read_text()
        elif self.number_of_lines <= 0:
            raise ValueError("Non-positive number of lines. Please provide positive number")
        else:
            file_contents, _ = self.limited_contents()

        # We need to store the contents of the file as bytes so that
        # the record call on an XML element can convert from bytes offsets to character offsets
        with open(self.filename, "rb") as f:
            self.file_byte_contents = f.read()

        # Initialise the char index (self.chars), with one Char entry for
        # each character in the file. (Note: a reference to this char array is
        # given to each SubToken)
        # We do this as a list comprehension as it's more efficient, but we have to
        # add it to list that already exists in self.chars, as references have already
        # been made to this list
        self.chars += [Char(c) for c in tqdm(file_contents)]

    def set_usages_for_slice(self, start, end, usage):
        """
        Append a usage to every Char in the half-open range [start, end)

        Args:
            start (int): Index into self.chars of the first Char to mark
            end (int): Index into self.chars one past the last Char to mark
            usage: The object appended to each Char's ``usages`` list

        Returns:
            The tuple (start, end), unchanged
        """
        for i in range(start, end):
            self.chars[i].usages.append(usage)
        return (start, end)

    def create_lines(self, file_contents, lines_list):
        """
        Create individual Line objects
        for each line, with appropriate references to the character array

        Args:
            file_contents (str): Not used by this method; kept so that
                existing callers continue to work
            lines_list (list): The text of each line, without line endings

        Returns:
            A list with one Line object per entry in lines_list
        """
        # Keeps track of which character in the file a line starts on
        line_start_counter = 0
        lines = []

        # For each line in the file create a Line object with a SubToken
        # object as its child, keeping track of the length of the line
        # and which character of the file this line starts on
        for this_line in lines_list:
            line_length = len(this_line)
            # Create SubToken object to keep track of the line length, the line itself
            # the start character of the line in the file, and a reference to the overall
            # list of characters
            sub_token = SubToken((0, line_length), this_line, line_start_counter, self.chars)
            lines.append(Line([sub_token], self))
            # Update the starting character of the line ready for next time
            # (the extra 1 accounts for the line separator)
            line_start_counter += line_length + 1

        return lines