Source code for gptables.core.gptable

import pandas as pd
import re
from xlsxwriter.format import Format

[docs]class GPTable:
    """
    A Good Practice Table. Stores a table and metadata for writing a table
    to excel.
    
    Attributes
    ----------
    table : pandas.DataFrame
        table to be written to an Excel workbook
    table_name : str
        name for table. Should be unique with no spaces and start with a
        character or underscore.
    title : str
        title of the table
    subtitles : List[str], optional
        subtitles as a list of strings
    instructions : str, optional
        instructions on how to read the sheet. If not provided, defaults to
        "This worksheet contains one table. Some cells may refer to notes,
        which can be found on the notes worksheet."
    scope : str, optional
        description of scope/basis of data in table if not included in title
    source : str, optional
        description of the source of the data in table if not included in cover
    units : dict, optional
        units used in each (dict) column of table
    legend : list, optional
        descriptions of special notation used in table
    index_columns : dict, optional
        mapping an index level to a 0-indexed column as {level: column}.
        Default is a level two index in the first column ({2: 0}).
    additional_formatting : dict, optional
        table-specific formatting for columns, rows or individual cells
    """

    def __init__(self,
                 table,
                 table_name,
                 title,
                 scope=None,
                 source=None,
                 units=None,
                 table_notes=None,
                 subtitles=[],
                 instructions="",
                 legend=[],
                 index_columns={2:0},
                 additional_formatting=[],
                 ):
        
        # Attributes
        self.title = None
        self.subtitles = []
        
        self.units = None  # str or {units (str):column index (int)} dict
        self.table_notes = None  # str or {units (str):column index (int)} dict
        
        self._VALID_INDEX_LEVELS = [1, 2, 3]
        self.index_levels = 0
        self.index_columns = {}  # {index level (int): column index (int)}
        self._column_headings = set() # Non-index column headings
        self.table = pd.DataFrame()
        self.table_name = None
        self.data_range = [0] * 4
        
        self.scope = None
        self.source = None
        self.legend = []
        self._annotations = []
        
        self.additional_formatting = []
        
        # Valid format labels from XlsxWriter
        self._valid_format_labels = [
                attr.replace("set_", "")
                for attr in Format().__dir__() 
                if attr.startswith('set_')
                and callable(getattr(Format(), attr))
                ]
        
        # Call methods to set attributes        
        self.set_title(title)
        self.set_subtitles(subtitles)
        self.set_instructions(instructions)
        self.set_additional_formatting(additional_formatting)
        self.set_table(table, index_columns, units, table_notes)
        self.set_table_name(table_name)
        self.set_scope(scope)
        self.set_source(source)
        self.set_legend(legend)
        self._set_data_range()
        

[docs]    def set_table(self, new_table, new_index_columns = None, new_units = None, new_table_notes = None):
        """
        Set the `table`, `index_columns`, `units` and `table_notes` attributes. Overwrites
        existing values for these attributes.
        """
        if not isinstance(new_table, pd.DataFrame):
            raise TypeError("`table` must be a pandas DataFrame")
        
        default_index = pd.Index(range(new_table.shape[0]))
        if not all(new_table.index == default_index) and not new_table.empty:
            msg = ("`table` index must not contain index data. It can be reset"
                   " before adding to a GPTable (see DataFrame.reset_index())."
                   " Please ensure that index data is stored in the first 1-3"
                   " columns of `table` and is indicated in `index_columns`.")
            raise ValueError(msg)

        self.table = new_table.reset_index(drop=True)

        self._validate_all_column_names_have_text()
        self._validate_no_duplicate_column_names()

        if new_index_columns is None:
            new_index_columns = self.index_columns
        self.set_index_columns(new_index_columns)

        if new_units is None:
            new_units = self.units
        self.set_units(new_units)

        if new_table_notes is None:
            new_table_notes = self.table_notes
        self.set_table_notes(new_table_notes)


[docs]    def set_index_columns(self, new_index_columns):
        """
        Set the `index_columns` attribute. Overwrites any existing values.
        A dict must be supplied. This dict should map index level to a
        single 0-indexed column number. All other columns will be considered
        as data columns.
        """
        if isinstance(new_index_columns, dict):
            # Check if levels and values are valid
            valid_levels = all(level in self._VALID_INDEX_LEVELS for level in new_index_columns.keys())
            if not valid_levels:
                msg = ("`index_columns` dictionary keys must be valid index"
                       f" levels: {self._VALID_INDEX_LEVELS}")
                raise ValueError(msg)
            
            if not all(isinstance(col, int) for col in new_index_columns.values()):
                # Convert col name to numeric index
                for key, value in new_index_columns.items():
                    col_iloc = self.table.columns.get_loc(value)
                    new_index_columns.update({key: col_iloc})
                    
            column_indexes = [col for col in new_index_columns.values()]
                
            valid_columns = all(self._valid_column_index(col) for col in column_indexes)
            if not valid_columns:
                msg = ("Out of range - `index_columns` dictionary values must"
                       "be valid, 0-indexed column numbers")
                raise ValueError(msg)
            
            self.index_levels = len(new_index_columns.keys())
            
            self.index_columns = new_index_columns
            self._set_column_headings()
        else:
            msg = ("`index_columns` must be a dict mapping a valid index level"
                   " to a 0-indexed column number")
            raise ValueError(msg)
    

    def _valid_column_index(self, column_index):
        """
        Check if `column_index` is valid, given the `table` shape.
        """
        return column_index in range(self.table.shape[1])
        

    def _set_column_headings(self): # TODO: check custom formatting in headers
        """
        Sets the `column_headings` attribute to the set of column indexes that
        are not assigned to `index_columns`.
        """
        index_cols = set(self.index_columns.values())
        self._column_headings = {x for x in range(self.table.shape[1])} - index_cols
    
    
    def _validate_all_column_names_have_text(self):
        """
        Validate that all column names in header row have text.
        """
        for column_name in self.table.columns:
            if len(column_name) > 0:
                continue
            else:
                msg = ("Empty column name found in table data - column names must all have text")
                raise ValueError(msg)


    def _validate_no_duplicate_column_names(self):
        """
        Validate that there are no duplicate column names in table data.
        """
        if len(self.table.columns) != len(set(self.table.columns)):
            msg = ("Duplicate column names found in table data - column names must be unique")
            raise ValueError(msg)


[docs]    def set_table_name(self, new_table_name):
        """
        Set the `table_name` attribute.
        """
        if not isinstance(new_table_name, str):
            msg = ("table_name should be provided as a string")
            raise TypeError(msg)

        elif len(new_table_name) != len("".join(new_table_name.split())):
            msg = ("Whitespace found in table_name, remove or replace with underscores")
            raise ValueError(msg)

        else:
            self.table_name = new_table_name


[docs]    def set_title(self, new_title):
        """
        Set the `title` attribute.
        """
        self._validate_text(new_title, "title")

        if isinstance(new_title, list):
            new_title = FormatList(new_title)

        self.title = new_title


[docs]    def add_subtitle(self, new_subtitle):
        """
        Add a single subtitle to the existing list of `subtitles`.
        """
        self._validate_text(new_subtitle, "subtitles")

        if isinstance(new_subtitle, list):
            new_subtitle = FormatList(new_subtitle)

        self.subtitles.append(new_subtitle)


[docs]    def set_subtitles(self, new_subtitles, overwrite=True):
        """
        Set a list of subtitles to the `subtitles` attribute. Overwrites
        existing ist of subtitles by default. If `overwrite` is False, new list
        is appended to existing list of subtitles.
        """
        if new_subtitles is None:
            new_subtitles = []

        if not isinstance(new_subtitles, (list)):
            msg =("`subtitles` must be provided as a list containing strings"
                  " and/or lists of strings and format dictionaries"
                  " (rich text)")
            raise TypeError(msg)

        for text in new_subtitles:
            self._validate_text(text, "subtitles")

        new_subtitles = [FormatList(text) if isinstance(text, list) else text for text in new_subtitles]

        if overwrite:
            self.subtitles = new_subtitles
        else:
            self.subtitles += new_subtitles
            
[docs]    def set_instructions(self, new_instructions):
        """
        Set `instructions` attribute.
        """
        self._validate_text(new_instructions, "instructions")

        if isinstance(new_instructions, list):
            new_instructions = FormatList(new_instructions)

        if len(new_instructions) == 0:
            self.instructions = "This worksheet contains one table. Some cells may refer to notes, which can be found on the notes worksheet."
        else:
            self.instructions = new_instructions


[docs]    def set_scope(self, new_scope):
        """
        Set the `scope` attribute.
        """
        if new_scope == None:
            new_scope = ""
            return

        self._validate_text(new_scope, "scope")

        if isinstance(new_scope, list):
            new_scope = FormatList(new_scope)

        self.scope = new_scope


[docs]    def set_units(self, new_units): # TODO: custom formatting in units?
        """
        Adds units to column headers.
        Units should be in the format {column: units_text}. Column can be column name or 0-indexed column
        number in `table`. 
        """    
        if isinstance(new_units, dict) and len(new_units) > 0:
            for value in new_units.values():
                self._validate_text(value, "units")

            # Convert numeric keys to column names
            new_headers_keys = [self.table.columns.values.tolist()[key] if isinstance(key, int) else key for key in new_units.keys()] 
            new_headers_values = [f"{key} \n({value})" for key, value in zip(new_headers_keys, new_units.values())]
            new_headers = dict(zip(new_headers_keys, new_headers_values))

            self.table = self.table.rename(columns = new_headers)

            if len(self.additional_formatting) > 0:
                self._add_units_to_additional_formatting(new_headers)

        elif not new_units is None:
            msg = ("`units` attribute must be a dictionary or None"
                   " ({column: units_text})")
            
            raise TypeError(msg)

        self.units = new_units

    def _add_units_to_additional_formatting(self, col_names):
        """
        
        Parameters
        ----------
        col_names: dict
            with keys old names and values new names, where new names are old names plus units
           
        Return
        ------
        None
        """
        formatting_list = self.additional_formatting
        for dictionary in formatting_list:
            if list(dictionary.keys()) == ["column"]:
                format = list(dictionary.values())[0]
                
                # new_name if name==old_name else name for name in col_names
                format["columns"] = [col_names[name] if name in list(col_names.keys()) else name for name in format["columns"]]

        self.additional_formatting = formatting_list

[docs]    def set_table_notes(self, new_table_notes): # TODO: custom formatting in column headers?
        """
        Adds note references to column headers.
        `table_notes` should be in the format {column: "$$note_reference$$"}.
        Column can be column name or 0-indexed column number in `table`.
        """
        if isinstance(new_table_notes, dict) and len(new_table_notes) > 0:
            for value in new_table_notes.values():
                self._validate_text(value, "table_notes")

            # Convert numeric keys to column names
            new_headers_keys = [self.table.columns.values.tolist()[key] if isinstance(key, int) else key for key in new_table_notes.keys()]
            new_headers_values = [f"{key} \n{value}" for key, value in zip(new_headers_keys, new_table_notes.values())]
            new_headers = dict(zip(new_headers_keys, new_headers_values))

            self.table = self.table.rename(columns = new_headers)

        elif not new_table_notes is None:
            msg = ("`table_notes` attribute must be a dictionary or None"
                   " ({column: '$$note_reference$$'})")
            raise TypeError(msg)

        self.table_notes = new_table_notes


[docs]    def set_source(self, new_source):
        """
        Set the source attribute to the specified str.
        """
        if new_source == None:
            new_source = ""
            return

        self._validate_text(new_source, "source")

        if isinstance(new_source, list):
            new_source = FormatList(new_source)

        self.source = new_source
    

[docs]    def add_legend(self, new_legend):
        """
        Add a single legend entry to the existing `legend` list.
        """
        self._validate_text(new_legend, "legend")

        if isinstance(new_legend, list):
            new_legend = FormatList(new_legend)

        self.legend.append(new_legend)
    

[docs]    def set_legend(self, new_legend, overwrite=True):
        """
        Set a list of legend entries to the `legend` attribute. Overwrites
        existing legend entries by default. If overwrite is False, new entries 
        are appended to the `legend` list.
        """
        if new_legend is None:
            self.legend = []
            return
        if not isinstance(new_legend, list):
            msg = ("`legend` must be provided as a list of text elements")
            raise TypeError(msg)
        for text in new_legend:
            self._validate_text(text, "legend")

        new_legend = [FormatList(text) if isinstance(text, list) else text for text in new_legend]

        if overwrite:
            self.legend = new_legend
        else:
            self.legend += new_legend

    def _set_annotations(self):
        """
        Set a list of note references to the `_annotations` attribute.
        """
        elements = [
                "title",
                "subtitles",
                "legend",
                "source",
                "scope",
                "units",
                ]
        
        ordered_refs = []
        
        for attr in elements:
            attr_current = getattr(self, attr)
            references = self._get_references_from_attr(attr_current)
            ordered_refs.extend(references)

        table_refs = self._get_references_from_table()
        ordered_refs.extend(table_refs)

        # remove duplicates from ordered_refs and assign to self._annotations
        self._annotations = list(dict.fromkeys(ordered_refs))


    def _get_references_from_attr(self, data):
        """
        Finds references in a string or list/dict of strings. Works
        recursively on list elements and dict values. Other types are ignored.
        Returns ordered list of references from attribute.
        
        Parameters
        ----------
        data : string or list/dict of strings
            object containing strings to replace references in

        Returns
        -------
        list of str
        """
        ordered_refs = []
        if isinstance(data, str):
            ordered_refs.extend(self._get_references(data))
        if isinstance(data, list):
            for n in range(len(data)):
                ordered_refs.extend(self._get_references_from_attr(data[n]))
        if isinstance(data, dict):
            for key in data.keys():
                ordered_refs.extend(self._get_references(data[key]))
        if isinstance(data, FormatList):
            data_list = data.list
            for n in range(len(data_list)):
                if isinstance(data_list[n], str):
                    ordered_refs.extend(self._get_references(data_list[n]))

        return ordered_refs
        

    def _get_references_from_table(self):
        """
        Get note references in the table column headings and index columns.
        """
        table = self.table
        
        ordered_refs = []
        column_references = self._get_references_from_attr(table.columns.to_list())
        ordered_refs.extend(column_references)

        index_columns = self.index_columns.values()
        for col in index_columns:
            index_column = table.iloc[:, col]
            index_column_references = self._get_references_from_attr(index_column.to_list())
            ordered_refs.extend(index_column_references)

        return ordered_refs


    @staticmethod
    def _get_references(string):
        """
        Given a single string, return occurrences of note references (denoted by
        flanking dollar signs [$$reference$$]).
        
        Parameters
        ----------
        string : str
            the string to find references within

        Returns
        -------
        list of str
            list of note references
        """
        ordered_refs = []
        refs_raw = re.findall(r"[$]{2}.*?[$]{2}", string)
        refs_clean = [w.replace("$", "") for w in refs_raw]
        ordered_refs.extend(refs_clean)

        return ordered_refs


[docs]    def set_additional_formatting(self, new_formatting):
        """
        Set a dictionary of additional formatting to be applied to this table.
        """
        if not isinstance(new_formatting, list):
            msg = ("`additional_formatting` must be a list of dictionaries")
            raise TypeError(msg)
        keys = [key for item in new_formatting for key in item.keys()]
        for key in keys:
            if key not in ["column", "row", "cell"]:
                msg = (f"`{key}` is not a supported format type. Please use"
                       " `column`, `row` or `cell`")
                raise ValueError(msg)
        
        self._validate_format_labels(new_formatting)
            
        self.additional_formatting = new_formatting
    

    def _validate_format_labels(self, format_list):
        """
        Validate that format labels are valid property of XlsxWriter Format.
        """
        labels = [label
                  for item in format_list
                  for key in item.keys()
                  for label in item[key]["format"]
                  ]
        for label in labels:
            if label not in self._valid_format_labels:
                msg = (f"`{label}` is not a valid XlsxWriter Format property")
                raise ValueError(msg)


    def _set_data_range(self):
        """
        Get the top-left and bottom-right cell reference of the table data.
        """
        #TODO: ugly code
        row_offset = sum([
            int(self.title is not None),
            int(self.scope is not None),
            int(self.source is not None),
        ]) + 1 #corresponds to instructions which are included by default

        if self.subtitles is not None:
            row_offset += len(self.subtitles)
        if self.legend is not None:
            row_offset += len(self.legend)
        
        self.data_range = [
            row_offset,
            0,
            self.table.shape[0] + row_offset,
            self.table.shape[1] - 1
        ]

    @staticmethod
    def _validate_text(obj, attr):
        """
        Validate that an object contains valid text elements. These are either
        strings or list of strings and dictionaries.
        """
        if isinstance(obj, str) or obj is None:
            return None
        
        msg = (f"{attr} text should be provided as strings or lists of"
                   f" strings and dictionaries (rich-text). {type(obj)} are"
                   " not valid text elements.")
        if isinstance(obj, list):
            for element in obj:
                if not isinstance(element, (str, dict)):
                    msg = (f"{attr} text should be provided as strings or"
                           " lists of strings and dictionaries (rich-text)."
                           f" {type(element)} are not valid rich text"
                           " elements.")
                    raise TypeError(msg)
        else:
            msg = (f"{attr} text should be provided as strings or lists of"
                   f" strings and dictionaries (rich-text). {type(obj)} are"
                   " not valid text elements.")
            raise TypeError(msg)

class FormatList:
    """
    Class for storing list of alternating string and dictionary objects.
    Dictionaries specify additional formatting to be applied to the following string.
    """
    def __init__(self, list):
        self.list = list
        self._set_string_property()

    def _set_string_property(self):
        string = ""
        for entry in self.list:
            if isinstance(entry, str):
                string += entry

        self.string = string
gptables

Navigation

Related Topics

Source code for gptables.core.gptable