Source code for gptables.core.gptable

import pandas as pd
import re
from xlsxwriter.format import Format

[docs]class GPTable: """ A Good Practice Table. Stores a table and metadata for writing a table to excel. Attributes ---------- table : pandas.DataFrame table to be written to an Excel workbook table_name : str name for table. Should be unique with no spaces and start with a character or underscore. title : str title of the table subtitles : List[str], optional subtitles as a list of strings instructions : str, optional instructions on how to read the sheet. If not provided, defaults to "This worksheet contains one table. Some cells may refer to notes, which can be found on the notes worksheet." scope : str, optional description of scope/basis of data in table if not included in title source : str, optional description of the source of the data in table if not included in cover units : dict, optional units used in each (dict) column of table legend : list, optional descriptions of special notation used in table index_columns : dict, optional mapping an index level to a 0-indexed column as {level: column}. Default is a level two index in the first column ({2: 0}). additional_formatting : dict, optional table-specific formatting for columns, rows or individual cells """ def __init__(self, table, table_name, title, scope=None, source=None, units=None, table_notes=None, subtitles=[], instructions="", legend=[], index_columns={2:0}, additional_formatting=[], ): # Attributes self.title = None self.subtitles = [] self.units = None # str or {units (str):column index (int)} dict self.table_notes = None # str or {units (str):column index (int)} dict self._VALID_INDEX_LEVELS = [1, 2, 3] self.index_levels = 0 self.index_columns = {} # {index level (int): column index (int)} self._column_headings = set() # Non-index column headings self.table = pd.DataFrame() self.table_name = None self.data_range = [0] * 4 self.scope = None self.source = None self.legend = [] self._annotations = [] self.additional_formatting = [] # Valid format labels from XlsxWriter self._valid_format_labels = [ attr.replace("set_", "") for attr in Format().__dir__() if attr.startswith('set_') and callable(getattr(Format(), attr)) ] # Call methods to set attributes self.set_title(title) self.set_subtitles(subtitles) self.set_instructions(instructions) self.set_additional_formatting(additional_formatting) self.set_table(table, index_columns, units, table_notes) self.set_table_name(table_name) self.set_scope(scope) self.set_source(source) self.set_legend(legend) self._set_data_range()
[docs] def set_table(self, new_table, new_index_columns = None, new_units = None, new_table_notes = None): """ Set the `table`, `index_columns`, `units` and `table_notes` attributes. Overwrites existing values for these attributes. """ if not isinstance(new_table, pd.DataFrame): raise TypeError("`table` must be a pandas DataFrame") default_index = pd.Index(range(new_table.shape[0])) if not all(new_table.index == default_index) and not new_table.empty: msg = ("`table` index must not contain index data. It can be reset" " before adding to a GPTable (see DataFrame.reset_index())." " Please ensure that index data is stored in the first 1-3" " columns of `table` and is indicated in `index_columns`.") raise ValueError(msg) self.table = new_table.reset_index(drop=True) self._validate_all_column_names_have_text() self._validate_no_duplicate_column_names() if new_index_columns is None: new_index_columns = self.index_columns self.set_index_columns(new_index_columns) if new_units is None: new_units = self.units self.set_units(new_units) if new_table_notes is None: new_table_notes = self.table_notes self.set_table_notes(new_table_notes)
[docs] def set_index_columns(self, new_index_columns): """ Set the `index_columns` attribute. Overwrites any existing values. A dict must be supplied. This dict should map index level to a single 0-indexed column number. All other columns will be considered as data columns. """ if isinstance(new_index_columns, dict): # Check if levels and values are valid valid_levels = all(level in self._VALID_INDEX_LEVELS for level in new_index_columns.keys()) if not valid_levels: msg = ("`index_columns` dictionary keys must be valid index" f" levels: {self._VALID_INDEX_LEVELS}") raise ValueError(msg) if not all(isinstance(col, int) for col in new_index_columns.values()): # Convert col name to numeric index for key, value in new_index_columns.items(): col_iloc = self.table.columns.get_loc(value) new_index_columns.update({key: col_iloc}) column_indexes = [col for col in new_index_columns.values()] valid_columns = all(self._valid_column_index(col) for col in column_indexes) if not valid_columns: msg = ("Out of range - `index_columns` dictionary values must" "be valid, 0-indexed column numbers") raise ValueError(msg) self.index_levels = len(new_index_columns.keys()) self.index_columns = new_index_columns self._set_column_headings() else: msg = ("`index_columns` must be a dict mapping a valid index level" " to a 0-indexed column number") raise ValueError(msg)
def _valid_column_index(self, column_index): """ Check if `column_index` is valid, given the `table` shape. """ return column_index in range(self.table.shape[1]) def _set_column_headings(self): # TODO: check custom formatting in headers """ Sets the `column_headings` attribute to the set of column indexes that are not assigned to `index_columns`. """ index_cols = set(self.index_columns.values()) self._column_headings = {x for x in range(self.table.shape[1])} - index_cols def _validate_all_column_names_have_text(self): """ Validate that all column names in header row have text. """ for column_name in self.table.columns: if len(column_name) > 0: continue else: msg = ("Empty column name found in table data - column names must all have text") raise ValueError(msg) def _validate_no_duplicate_column_names(self): """ Validate that there are no duplicate column names in table data. """ if len(self.table.columns) != len(set(self.table.columns)): msg = ("Duplicate column names found in table data - column names must be unique") raise ValueError(msg)
[docs] def set_table_name(self, new_table_name): """ Set the `table_name` attribute. """ if not isinstance(new_table_name, str): msg = ("table_name should be provided as a string") raise TypeError(msg) elif len(new_table_name) != len("".join(new_table_name.split())): msg = ("Whitespace found in table_name, remove or replace with underscores") raise ValueError(msg) else: self.table_name = new_table_name
[docs] def set_title(self, new_title): """ Set the `title` attribute. """ self._validate_text(new_title, "title") if isinstance(new_title, list): new_title = FormatList(new_title) self.title = new_title
[docs] def add_subtitle(self, new_subtitle): """ Add a single subtitle to the existing list of `subtitles`. """ self._validate_text(new_subtitle, "subtitles") if isinstance(new_subtitle, list): new_subtitle = FormatList(new_subtitle) self.subtitles.append(new_subtitle)
[docs] def set_subtitles(self, new_subtitles, overwrite=True): """ Set a list of subtitles to the `subtitles` attribute. Overwrites existing ist of subtitles by default. If `overwrite` is False, new list is appended to existing list of subtitles. """ if new_subtitles is None: new_subtitles = [] if not isinstance(new_subtitles, (list)): msg =("`subtitles` must be provided as a list containing strings" " and/or lists of strings and format dictionaries" " (rich text)") raise TypeError(msg) for text in new_subtitles: self._validate_text(text, "subtitles") new_subtitles = [FormatList(text) if isinstance(text, list) else text for text in new_subtitles] if overwrite: self.subtitles = new_subtitles else: self.subtitles += new_subtitles
[docs] def set_instructions(self, new_instructions): """ Set `instructions` attribute. """ self._validate_text(new_instructions, "instructions") if isinstance(new_instructions, list): new_instructions = FormatList(new_instructions) if len(new_instructions) == 0: self.instructions = "This worksheet contains one table. Some cells may refer to notes, which can be found on the notes worksheet." else: self.instructions = new_instructions
[docs] def set_scope(self, new_scope): """ Set the `scope` attribute. """ if new_scope == None: new_scope = "" return self._validate_text(new_scope, "scope") if isinstance(new_scope, list): new_scope = FormatList(new_scope) self.scope = new_scope
[docs] def set_units(self, new_units): # TODO: custom formatting in units? """ Adds units to column headers. Units should be in the format {column: units_text}. Column can be column name or 0-indexed column number in `table`. """ if isinstance(new_units, dict) and len(new_units) > 0: for value in new_units.values(): self._validate_text(value, "units") # Convert numeric keys to column names new_headers_keys = [self.table.columns.values.tolist()[key] if isinstance(key, int) else key for key in new_units.keys()] new_headers_values = [f"{key} \n({value})" for key, value in zip(new_headers_keys, new_units.values())] new_headers = dict(zip(new_headers_keys, new_headers_values)) self.table = self.table.rename(columns = new_headers) if len(self.additional_formatting) > 0: self._add_units_to_additional_formatting(new_headers) elif not new_units is None: msg = ("`units` attribute must be a dictionary or None" " ({column: units_text})") raise TypeError(msg) self.units = new_units
def _add_units_to_additional_formatting(self, col_names): """ Parameters ---------- col_names: dict with keys old names and values new names, where new names are old names plus units Return ------ None """ formatting_list = self.additional_formatting for dictionary in formatting_list: if list(dictionary.keys()) == ["column"]: format = list(dictionary.values())[0] # new_name if name==old_name else name for name in col_names format["columns"] = [col_names[name] if name in list(col_names.keys()) else name for name in format["columns"]] self.additional_formatting = formatting_list
[docs] def set_table_notes(self, new_table_notes): # TODO: custom formatting in column headers? """ Adds note references to column headers. `table_notes` should be in the format {column: "$$note_reference$$"}. Column can be column name or 0-indexed column number in `table`. """ if isinstance(new_table_notes, dict) and len(new_table_notes) > 0: for value in new_table_notes.values(): self._validate_text(value, "table_notes") # Convert numeric keys to column names new_headers_keys = [self.table.columns.values.tolist()[key] if isinstance(key, int) else key for key in new_table_notes.keys()] new_headers_values = [f"{key} \n{value}" for key, value in zip(new_headers_keys, new_table_notes.values())] new_headers = dict(zip(new_headers_keys, new_headers_values)) self.table = self.table.rename(columns = new_headers) elif not new_table_notes is None: msg = ("`table_notes` attribute must be a dictionary or None" " ({column: '$$note_reference$$'})") raise TypeError(msg) self.table_notes = new_table_notes
[docs] def set_source(self, new_source): """ Set the source attribute to the specified str. """ if new_source == None: new_source = "" return self._validate_text(new_source, "source") if isinstance(new_source, list): new_source = FormatList(new_source) self.source = new_source
[docs] def add_legend(self, new_legend): """ Add a single legend entry to the existing `legend` list. """ self._validate_text(new_legend, "legend") if isinstance(new_legend, list): new_legend = FormatList(new_legend) self.legend.append(new_legend)
[docs] def set_legend(self, new_legend, overwrite=True): """ Set a list of legend entries to the `legend` attribute. Overwrites existing legend entries by default. If overwrite is False, new entries are appended to the `legend` list. """ if new_legend is None: self.legend = [] return if not isinstance(new_legend, list): msg = ("`legend` must be provided as a list of text elements") raise TypeError(msg) for text in new_legend: self._validate_text(text, "legend") new_legend = [FormatList(text) if isinstance(text, list) else text for text in new_legend] if overwrite: self.legend = new_legend else: self.legend += new_legend
def _set_annotations(self): """ Set a list of note references to the `_annotations` attribute. """ elements = [ "title", "subtitles", "legend", "source", "scope", "units", ] ordered_refs = [] for attr in elements: attr_current = getattr(self, attr) references = self._get_references_from_attr(attr_current) ordered_refs.extend(references) table_refs = self._get_references_from_table() ordered_refs.extend(table_refs) # remove duplicates from ordered_refs and assign to self._annotations self._annotations = list(dict.fromkeys(ordered_refs)) def _get_references_from_attr(self, data): """ Finds references in a string or list/dict of strings. Works recursively on list elements and dict values. Other types are ignored. Returns ordered list of references from attribute. Parameters ---------- data : string or list/dict of strings object containing strings to replace references in Returns ------- list of str """ ordered_refs = [] if isinstance(data, str): ordered_refs.extend(self._get_references(data)) if isinstance(data, list): for n in range(len(data)): ordered_refs.extend(self._get_references_from_attr(data[n])) if isinstance(data, dict): for key in data.keys(): ordered_refs.extend(self._get_references(data[key])) if isinstance(data, FormatList): data_list = data.list for n in range(len(data_list)): if isinstance(data_list[n], str): ordered_refs.extend(self._get_references(data_list[n])) return ordered_refs def _get_references_from_table(self): """ Get note references in the table column headings and index columns. """ table = self.table ordered_refs = [] column_references = self._get_references_from_attr(table.columns.to_list()) ordered_refs.extend(column_references) index_columns = self.index_columns.values() for col in index_columns: index_column = table.iloc[:, col] index_column_references = self._get_references_from_attr(index_column.to_list()) ordered_refs.extend(index_column_references) return ordered_refs @staticmethod def _get_references(string): """ Given a single string, return occurrences of note references (denoted by flanking dollar signs [$$reference$$]). Parameters ---------- string : str the string to find references within Returns ------- list of str list of note references """ ordered_refs = [] refs_raw = re.findall(r"[$]{2}.*?[$]{2}", string) refs_clean = [w.replace("$", "") for w in refs_raw] ordered_refs.extend(refs_clean) return ordered_refs
[docs] def set_additional_formatting(self, new_formatting): """ Set a dictionary of additional formatting to be applied to this table. """ if not isinstance(new_formatting, list): msg = ("`additional_formatting` must be a list of dictionaries") raise TypeError(msg) keys = [key for item in new_formatting for key in item.keys()] for key in keys: if key not in ["column", "row", "cell"]: msg = (f"`{key}` is not a supported format type. Please use" " `column`, `row` or `cell`") raise ValueError(msg) self._validate_format_labels(new_formatting) self.additional_formatting = new_formatting
def _validate_format_labels(self, format_list): """ Validate that format labels are valid property of XlsxWriter Format. """ labels = [label for item in format_list for key in item.keys() for label in item[key]["format"] ] for label in labels: if label not in self._valid_format_labels: msg = (f"`{label}` is not a valid XlsxWriter Format property") raise ValueError(msg) def _set_data_range(self): """ Get the top-left and bottom-right cell reference of the table data. """ #TODO: ugly code row_offset = sum([ int(self.title is not None), int(self.scope is not None), int(self.source is not None), ]) + 1 #corresponds to instructions which are included by default if self.subtitles is not None: row_offset += len(self.subtitles) if self.legend is not None: row_offset += len(self.legend) self.data_range = [ row_offset, 0, self.table.shape[0] + row_offset, self.table.shape[1] - 1 ] @staticmethod def _validate_text(obj, attr): """ Validate that an object contains valid text elements. These are either strings or list of strings and dictionaries. """ if isinstance(obj, str) or obj is None: return None msg = (f"{attr} text should be provided as strings or lists of" f" strings and dictionaries (rich-text). {type(obj)} are" " not valid text elements.") if isinstance(obj, list): for element in obj: if not isinstance(element, (str, dict)): msg = (f"{attr} text should be provided as strings or" " lists of strings and dictionaries (rich-text)." f" {type(element)} are not valid rich text" " elements.") raise TypeError(msg) else: msg = (f"{attr} text should be provided as strings or lists of" f" strings and dictionaries (rich-text). {type(obj)} are" " not valid text elements.") raise TypeError(msg)
class FormatList: """ Class for storing list of alternating string and dictionary objects. Dictionaries specify additional formatting to be applied to the following string. """ def __init__(self, list): self.list = list self._set_string_property() def _set_string_property(self): string = "" for entry in self.list: if isinstance(entry, str): string += entry self.string = string