Source code for gptables.core.wrappers

import os
import re
import warnings
import pandas as pd
import numpy as np
from copy import deepcopy

from xlsxwriter.workbook import Workbook
from xlsxwriter.worksheet import Worksheet

from gptables.core.cover import Cover

from .theme import Theme
from .gptable import GPTable, FormatList
from gptables.utils.unpickle_themes import gptheme


[docs]class GPWorksheet(Worksheet): """ Wrapper for an XlsxWriter Worksheet object. Provides a method for writing a good practice table (GPTable) to a Worksheet. """ def write_cover(self, cover): """ Write a cover page to the Worksheet. Uses text from a Cover object and details of the Workbook contents. Parameters ---------- cover : gptables.Cover object containing cover sheet text """ theme = self.theme pos = [0, 0] self._parse_urls(cover) pos = self._write_element(pos, cover.title, theme.cover_title_format) if cover.intro is not None: pos = self._write_element(pos, "Introductory information", theme.cover_subtitle_format) pos = self._write_element_list(pos, cover.intro, theme.cover_text_format) if cover.about is not None: pos = self._write_element(pos, "About these data", theme.cover_subtitle_format) pos = self._write_element_list(pos, cover.about, theme.cover_text_format) if cover.contact is not None: pos = self._write_element(pos, "Contact", theme.cover_subtitle_format) pos = self._write_element_list(pos, cover.contact, theme.cover_text_format) def write_gptable(self, gptable, auto_width, reference_order=[]): """ Write data from a GPTable object to the worksheet using the workbook Theme object for formatting. Parameters ---------- gptable : gptables.GPTable object containing elements of the gptable to be written to the Worksheet reference_order : list, optional order of annotations in workbook must be provided if gptable uses annotations Returns ------- None """ if not isinstance(gptable, GPTable): raise TypeError("`gptable` must be a gptables.GPTable object") if len(gptable._annotations)>0 and len(reference_order)==0: msg = "reference_order must be provided if gptable contains annotations" raise ValueError(msg) theme = self.theme # Write each GPTable element using appropriate Theme attr pos = [0, 0] self._reference_annotations(gptable, reference_order) self._parse_urls(gptable) gptable = deepcopy(gptable) pos = self._write_element( pos, gptable.title, theme.title_format ) pos = self._write_element_list( pos, gptable.subtitles, theme.subtitle_format ) description = theme.description_order for element in description: pos = getattr(self, "_write_" + element)( pos, getattr(gptable, element), getattr(theme, element + "_format") ) pos = self._write_table_elements( pos, gptable, auto_width, ) self._mark_data_as_worksheet_table(gptable, theme.column_heading_format) def _reference_annotations(self, gptable, reference_order): """ Replace note references with numbered references and move to end of element. Acts on `title`, `subtitles`, `table` and `notes` attributes of a GPTable. References are numbered from top left of spreadsheet, working across each row. Parameters ---------- gptable : gptables.GPTable object containing data with references to notes reference_order : list order of annotations in workbook Returns ------- None """ elements = [ "title", "subtitles", "legend", "source", "scope", "units", ] # Loop through elements, replacing references in strings for attr in elements: attr_current = getattr(gptable, attr) setattr( gptable, attr, self._replace_reference_in_attr( attr_current, reference_order ) ) self._reference_table_annotations(gptable, reference_order) def _reference_table_annotations(self, gptable, reference_order): # TODO: properly integrate this with table_notes parameter """ Reference annotations in the table column headings and index columns. """ table = getattr(gptable, "table") table.columns = self._replace_reference_in_attr( [x for x in table.columns], reference_order ) index_columns = gptable.index_columns.values() for col in index_columns: table.iloc[:, col] = table.iloc[:, col].apply( lambda x: self._replace_reference_in_attr(x, reference_order) ) setattr(gptable, "table", table) def _replace_reference_in_attr(self, data, reference_order): """ Replaces references in a string or list/dict of strings. Works recursively on list elements and dict values. Other types are returned without modification. Parameters ---------- data : any type object containing strings to replace references in reference_order : list order of annotations in workbook Returns ------- string : str input string with references replaced with numerical reference (n), where n is the order of appearance in the resulting document """ if isinstance(data, str): data = self._replace_reference(data, reference_order) if isinstance(data, list): for n in range(len(data)): data[n] = self._replace_reference_in_attr( data[n], reference_order ) if isinstance(data, dict): for key in data.keys(): data[key] = self._replace_reference_in_attr( data[key], reference_order ) if isinstance(data, FormatList): data_list = data.list for n in range(len(data_list)): data_list[n] = self._replace_reference_in_attr( data_list[n], reference_order ) data = FormatList(data_list) return data @staticmethod def _replace_reference(string, reference_order): """ Given a single string, record occurrences of new references (denoted by flanking dollar signs [$$reference$$]) and replace with number reference reflecting order of detection. Parameters ---------- string : str the string to replace references within reference_order : list order of annotations in workbook Returns ------- string : str input string with references replaced with numerical reference (n), where n is the order of appearence in the resulting document """ text_refs = re.findall(r"[$]{2}.*?[$]{2}", string) dict_refs = [w.replace("$", "") for w in text_refs] for n in range(len(dict_refs)): num_ref = "[note " + str(reference_order.index(dict_refs[n]) + 1) + "]" string = string.replace(text_refs[n], "") + num_ref return string def _parse_urls(self, sheet): """ Convert markdown URL formatting into URL, string tuple Parameters ---------- sheet : gptables.GPTable, gptables.Cover object containing data with urls """ if isinstance(sheet, GPTable): elements = [ "title", "subtitles", "legend", "source", "scope", "units", ] elif isinstance(sheet, Cover): elements = [ "title", "intro", "about", "contact", ] # Loop through elements, replacing urls in strings for attr in elements: attr_current = getattr(sheet, attr) setattr( sheet, attr, self._replace_url_in_attr( attr_current, ) ) if isinstance(sheet, GPTable): self._parse_table_urls(sheet) def _parse_table_urls(self, gptable): """ Parse URLs in table. """ table = getattr(gptable, "table") rows, columns = table.shape for c in range(columns): for r in range(rows): cell = self._replace_url_in_attr(table.iloc[r, c]) if isinstance(cell, dict): table.iloc[r, c] = [cell] else: table.iloc[r, c] = cell setattr(gptable, "table", table) def _replace_url_in_attr(self, data): """ Replaces urls in a string or list/dict of strings. Works recursively on list elements and dict values. Other types are returned without modification. Parameters ---------- data : any type object containing strings to replace references in """ if isinstance(data, str): data = self._replace_url(data) if isinstance(data, list): for n in range(len(data)): data[n] = self._replace_url_in_attr( data[n], ) if isinstance(data, dict): for key in data.keys(): data[key] = self._replace_url_in_attr( data[key], ) return data @staticmethod def _replace_url(string): """ Given a single string, record occurrences of markdown style urls (formatted as `"[url](display_text)"`) and replace with tuples of `(url, string)` Parameters ---------- string : str the string to replace references within Returns ------- string or dict if no markdown style urls found, returns sting if found, return dictionary with key `string` and value `url`, where markdown style url in `string` is replaced with `display_text` """ f_url_pattern = r"\[.+\]\(.+\)" # "[display_text](url)" f_urls = re.findall(f_url_pattern, string) if len(f_urls) == 0: return string if len(f_urls) > 1: msg = "More than one link found in cell. Excel only permits one link per cell" raise ValueError(msg) else: f_url = f_urls[0] url = re.split(r"\(", f_url)[1].replace(")", "") display_text = re.split(r"\]", f_url)[0].replace("[", "") string = re.sub(f_url_pattern, display_text, string) return {string: url} def _write_element(self, pos, element, format_dict): """ Write a single text element of a GPTable to the GPWorksheet. Parameters ---------- element : str or list the string or list of rich string elements to be written format_dict : dict format to be applied to string pos : list the position of the worksheet cell to write the element to Returns ------- pos : list new position to write next element from """ if element: self._smart_write(*pos, element, format_dict) pos[0] += 1 return pos def _write_element_list(self, pos, element_list, format_dict): """ Writes a list of elements row-wise. Parameters ---------- element_list : list list of strings or nested list of rich string elements to write, one per row format_dict : dict format to be applied to string pos : list the position of the worksheet cell to write the elements to Returns ------- pos: list new position to write next element from """ if element_list: for element in element_list: pos = self._write_element(pos, element, format_dict) return pos def _write_instructions(self, pos, element, format_dict): """ Alias for writting description elements by name. """ return self._write_element(pos, element, format_dict) def _write_source(self, pos, element, format_dict): """ Alias for writting description elements by name. """ return self._write_element(pos, element, format_dict) def _write_scope(self, pos, element, format_dict): """ Alias for writting description elements by name. """ return self._write_element(pos, element, format_dict) def _write_legend(self, pos, element_list, format_dict): """ Alias for writting description elements by name. """ return self._write_element_list(pos, element_list, format_dict) def _write_notes(self, pos, element_list, format_dict): """ Alias for writting description elements by name. """ return self._write_element_list(pos, element_list, format_dict) def _write_table_elements(self, pos, gptable, auto_width): """ Writes the table and units elements of a GPTable. Uses the Workbook Theme, plus any additional formatting associated with the GPTable. Parameters ---------- gptable : gptables.GPTable object containing the table and additional formatting data pos : list the position of the worksheet cell to write the units to auto_width : bool select if column widths should be determined automatically using length of text in index and columns Returns ------- pos : list new position to write next element from """ # Raise error if any table element is null or whitespace gptable.table.replace(regex=r'^\s*$', value=np.NaN, inplace=True) if gptable.table.isna().values.any(): msg = (""" Empty or null cell found in table, the reason for missingness should be included above the table before inputting to gptables. There should only be one reason otherwise a shorthand should be provided. Guidance on shorthand can be found at: https://analysisfunction.civilservice.gov.uk/policy-store/symbols-in-tables-definitions-and-help/ """) warnings.warn(msg) # Raise error if any table element is only special characters if gptable.table.stack().str.contains('^[^a-zA-Z0-9_]*$').any(): msg = (""" Cell found containing only special characters, replace with alphanumeric characters before inputting to gptables. Guidance on symbols in tables can be found at: https://analysisfunction.civilservice.gov.uk/policy-store/symbols-in-tables-definitions-and-help/ """) raise ValueError(msg) # Get theme theme = self.theme # Reset position to left col on next row pos[1] = 0 ## Create data array index_levels = gptable.index_levels index_columns = [col for col in gptable.index_columns.values()] data = pd.DataFrame(gptable.table, copy=True) # Create row containing column headings data.loc[-1] = data.columns data.index = data.index + 1 data.sort_index(inplace=True) ## Create formats array # pandas.DataFrame did NOT want to hold dictionaries, so be wary formats = pd.DataFrame().reindex_like(data) dict_row = [{} for n in range(formats.shape[1])] for row in range(formats.shape[0]): dict_row = [{} for n in range(formats.shape[1])] formats.iloc[row] = dict_row ## Add Theme formatting to formats dataframe format_headings_from = 0 self._apply_format( formats.iloc[0, format_headings_from:], theme.column_heading_format ) self._apply_format( formats.iloc[1:, index_levels:], theme.data_format ) index_level_formats = [ theme.index_1_format, theme.index_2_format, theme.index_3_format ] for level, col in gptable.index_columns.items(): self._apply_format( formats.iloc[1:, col], index_level_formats[level - 1] # Account for 0-indexing ) ## Add additional table-specific formatting from GPTable self._apply_additional_formatting( formats, gptable.additional_formatting, gptable.index_levels ) ## Write table pos = self._write_array(pos, data, formats) ## Set columns widths if auto_width: widths = self._calculate_column_widths(data, formats) self._set_column_widths(widths) return pos def _apply_additional_formatting( self, formats_table, additional_formatting, index_levels ): """ Apply row, column and cell formatting to dataframe of formats. """ for item in additional_formatting: fmt_type = list(item.keys())[0] format_desc = item[fmt_type] if fmt_type == "cell": formatting = format_desc["format"] cell_ilocs = format_desc["cells"] if isinstance(cell_ilocs, tuple): cell_ilocs = [cell_ilocs] for row, col in cell_ilocs: formats_table_slice = formats_table.iloc[row, col] self._apply_format( formats_table_slice, formatting ) return None if fmt_type == "column": cols_iloc = [ formats_table.columns.get_loc(col) if isinstance(col, str) else col for col in format_desc["columns"] ] row_start = 0 if "include_names" in format_desc.keys(): row_start = 0 if format_desc["include_names"] else 1 formats_table_slice = formats_table.iloc[row_start:, cols_iloc] formatting = format_desc["format"] elif fmt_type == "row": rows_iloc = format_desc["rows"] col_start = 0 if "include_names" in format_desc.keys(): col_start = 0 if format_desc["include_names"] else index_levels formats_table_slice = formats_table.iloc[rows_iloc, col_start:] formatting = format_desc["format"] self._apply_format( formats_table_slice, formatting ) def _write_array(self, pos, data, formats): """ Write a two-dimensional array to the current Worksheet, starting from the specified position. Parameters ---------- data : pandas.DataFrame array of data to be written to Worksheet formats : pandas.DataFrame array of dictionaries that specify the formatting to be applied to each cell of data pos : list the position of the top left cell to start writing the array from Returns ------- pos : list new position to write next element from """ if data.shape != formats.shape: raise ValueError("data and formats arrays must be of equal shape") rows, cols = data.shape for row in range(rows): for col in range(cols): cell_data = data.iloc[row, col] cell_format_dict = formats.iloc[row, col] self._smart_write( pos[0] + row, pos[1] + col, cell_data, cell_format_dict ) pos = [pos[0] + rows, 0] return pos def _mark_data_as_worksheet_table(self, gptable, column_header_format_dict): """ Marks the data to be recognised as a Worksheet Table in Excel. """ data_range = gptable.data_range column_header_format = self._workbook.add_format(column_header_format_dict) column_list = gptable.table.columns.tolist() column_headers = [{'header': column, 'header_format': column_header_format} for column in column_list] self.add_table(*data_range, {'header_row': True, 'autofilter': False, 'columns': column_headers, 'style': None, 'name': gptable.table_name }) def _smart_write(self, row, col, data, format_dict, *args): """ Depending on the input data, this function will write rich strings or use the standard `write()` method. For rich strings, the base format is merged with each rich format supplied within data. Parameters ---------- row : int 0-indexed row of cell to write to col : int 0-indexed column of cell to write to data : str or list or dict Simple string to be written with `format_dict` formatting. Or a list of alternating string and dictionary objects. Dictionaries specify additional formatting to be applied to the following string in the list. Dictionaries will be written with (first) key as display text and (first) value as URL. format_dict : dict Dictionary containing base format for the string. Returns ------- None """ wb = self._workbook # Reference to Workbook that contains sheet if isinstance(data, list): if len(data) == 1: data = data[0] self._smart_write(row, col, data, format_dict, *args) elif any([isinstance(element, FormatList) for element in data]): self._write_with_newlines_and_custom_formats(wb, row, col, data, format_dict, *args) else: self._write_with_newlines(wb, row, col, data, format_dict, *args) elif isinstance(data, FormatList): self._write_with_custom_formats(wb, row, col, data, format_dict, *args) elif isinstance(data, dict): self._write_dict_as_url(wb, row, col, data, format_dict, *args) elif pd.isna(data): self.write_blank(row, col, None, wb.add_format(format_dict)) else: # Write handles all other write types dynamically self.write(row, col, data, wb.add_format(format_dict), *args) def _write_with_newlines_and_custom_formats(self, wb, row, col, data, format_dict, *args): """ Take list of FormatList (and str), join with newline characters and smart write """ data_with_newlines = [] first_element = data.copy()[0] if isinstance(first_element, FormatList): first_element = first_element.list else: first_element = [first_element] data_with_newlines.extend(first_element) for element in data[1:]: if isinstance(element, FormatList): element = element.list element_stings = [item for item in element if isinstance(item, str)] first_string = element_stings[0] new_string = "\n" + first_string element_with_newline = [new_string if item == first_string else item for item in element] else: element_with_newline = ["\n" + str(element)] data_with_newlines.extend(element_with_newline) self._write_with_custom_formats( wb, row, col, FormatList(data_with_newlines), format_dict, *args ) def _write_with_newlines(self, wb, row, col, data, format_dict, *args): """ Take list of str, join with newline character and write """ data_string = "\n".join(data) self.write( row, col, data_string, wb.add_format(format_dict), *args ) def _write_with_custom_formats(self, wb, row, col, data, format_dict, *args): data_with_custom_formats = [] for item in data.list: # Convert dicts to Format (with merge onto base format) if isinstance(item, dict): rich_format = format_dict.copy() rich_format.update(item) data_with_custom_formats.append(wb.add_format(rich_format)) else: data_with_custom_formats.append(item) data_with_all_formats = [] for n in range(len(data_with_custom_formats)-1): data_with_all_formats.append(data_with_custom_formats[n]) if isinstance(data_with_custom_formats[n], str): if isinstance(data_with_custom_formats[n+1], str): data_with_all_formats.append(wb.add_format(format_dict)) data_with_all_formats.append(data_with_custom_formats[-1]) self.write_rich_string( row, col, *data_with_all_formats, wb.add_format(format_dict), *args ) def _write_dict_as_url(self, workbook, row, col, data, format_dict, *args): url = list(data.values())[0] display_text = list(data.keys())[0] url_format = format_dict.copy() url_format.update({"underline": True, "font_color": "blue"}) self.write_url( row, col, url, workbook.add_format(url_format), display_text, *args ) @staticmethod def _apply_format(format_table_slice, format_dict): """ Update all cells of a given dataframe slice with the format dictionary. Handles dict, series or dataframes. """ if isinstance(format_table_slice, pd.Series): (format_table_slice .apply(lambda d: d.update(format_dict)) ) elif isinstance(format_table_slice, pd.DataFrame): # Vectorised for 2D (format_table_slice .apply(np.vectorize(lambda d: d.update(format_dict))) ) elif isinstance(format_table_slice, dict): format_table_slice.update(format_dict) def _set_column_widths(self, widths): """ Set the column widths using a list of widths. """ for col_number in range(len(widths)): self.set_column( col_number, col_number, widths[col_number] ) def _calculate_column_widths(self, table, formats_table): """ Calculate Excel column widths using maximum length of strings and the maximum font size in each column of the data table. Parameters ---------- table : pd.DataFrame data table to calculate widths from formats_table: pd.DataFrame formats table to retrieve font size from Returns ------- col_widths : list width to apply to Excel columns """ cols = table.shape[1] max_lengths = [ table.iloc[:, col].apply(self._longest_line_length).max() for col in range(cols) ] max_font_sizes = [ formats_table.iloc[:, col] .apply(lambda x: x.get("font_size") or 10).max() for col in range(cols) ] col_widths = [ self._excel_string_width(l, f) for l, f in zip(max_lengths, max_font_sizes) ] return col_widths @staticmethod def _excel_string_width(string_len, font_size): """ Calculate the rough length of a string in Excel character units. This crude estimate does not account for font name or other font format (e.g. wrapping). Parameters ---------- string_len : int length of string to calculate width in Excel for font_size : int size of font Returns ------- excel_width : float width of equivalent string in Excel """ if string_len == 0: excel_width = 0 else: excel_width = string_len * ((font_size * 0.12) - 0.09) return excel_width @staticmethod def _longest_line_length(cell_val): """ Calculate the length of the longest line within a cell. If the cell contains a string, the longest length between line breaks is returned. If the cell contains a link formatted as [{display_text: link}], the longest length is calculated from the display text. If the cell contains a list of strings, the length of the longest string in the list is returned. Expects new lines to be marked with "\n", "\r\n" or new lines in multiline strings. Parameters ---------- cell_val: cell value Returns ------- max_length: int the length of the longest line within the string """ split_strings = """ |\r\n|\n""" if isinstance(cell_val, str): return(max([len(line) for line in re.split(split_strings, cell_val)])) elif isinstance(cell_val, list): if isinstance(cell_val[0], dict): # text with links are stored as {text: link}, extract key to calculate text length return(max([len(line) for line in re.split(split_strings, list(cell_val[0])[0])])) elif isinstance(cell_val[0], FormatList): string = cell_val[0].string return(max([len(line) for line in re.split(split_strings, string)])) else: return(max([len(line) for line in cell_val])) else: return(0)
[docs]class GPWorkbook(Workbook): """ Wrapper for and XlsxWriter Workbook object. The Worksheets class has been replaced by an alternative with a method for writting GPTable objects. """ def __init__(self, filename=None, options={}): super(GPWorkbook, self).__init__(filename=filename, options=options) self.theme = None self._annotations = None # Set default theme self.set_theme(gptheme) def add_worksheet(self, name=None): """ Overwrite add_worksheet() to create a GPWorksheet object. Parameters ---------- name : str (optional) name of the the worksheet to be created Returns ------- worksheet : gptables.GPWorksheet a worksheet object, which supports writing of GPTable objects """ worksheet = super(GPWorkbook, self).add_worksheet(name, GPWorksheet) worksheet.theme = self.theme worksheet._workbook = self # Create reference to wb, for formatting worksheet.hide_gridlines(2) return worksheet def set_theme(self, theme): """ Sets the theme for all GPTable objects written to the Workbook. Parameters ---------- theme : gptables.Theme a Theme object containing the formatting to be applied to GPTable objects written to Worksheets within this Workbook Returns ------- None """ if not isinstance(theme, Theme): raise TypeError(f"`theme` must be a gptables.Theme object, not: {type(theme)}") self.theme = theme def _update_annotations(self, sheets): ordered_refs = [] for gptable in sheets.values(): gptable._set_annotations() ordered_refs.extend(gptable._annotations) # remove duplicates from ordered_refs and assign to self._annotations self._annotations = list(dict.fromkeys(ordered_refs)) def make_table_of_contents( self, sheets, additional_elements = None, column_names = None, table_name = None, title = None, subtitles = None, instructions = None, ): """ Generate table of contents from sheet and optional customisation parameters. Parameters ---------- sheets : dict mapping worksheet labels to gptables.GPTable objects additional_elements : List[str], optional additional GPTable elements to display in the contents table. Allowed elements are "subtitles", "scope", "source" and "instructions". column_names : List[str], optional table of contents column names, defaults to "Sheet name", "Table description" table_name: str, optional contents table name, defaults to "contents_table" title : str, optional table of contents title, defaults to "Table of contents" subtitles: List[str], optional list of subtitles as strings instructions: str, optional description of the page layout defaults to "This worksheet contains one table." Return ------ gpt.GPTable """ if column_names is None: column_names = ["Sheet name", "Table description"] if table_name is None: table_name = "contents_table" if title is None: title = "Table of contents" if instructions is None: instructions = "This worksheet contains one table." if additional_elements is not None: valid_elements = ["subtitles", "scope", "source", "instructions"] if not all(element in valid_elements for element in additional_elements): msg = ("Cover `additional_elements` list can only contain" "'subtitles', 'scope', 'source' and 'instructions'") raise ValueError(msg) contents_dict = {} for label, gptable in sheets.items(): contents_entry = [] contents_entry.append(self._strip_annotation_references(gptable.title)) if additional_elements is not None: for element in additional_elements: content = getattr(gptable, element) if element == "subtitles": [contents_entry.append(self._strip_annotation_references(element)) for element in content] else: contents_entry.append(self._strip_annotation_references(content)) link = {label: f"internal:'{label}'!A1"} contents_dict[label] = [link, contents_entry] contents_table = pd.DataFrame.from_dict(contents_dict, orient="index").reset_index(drop=True) contents_table.columns = column_names return GPTable( table=contents_table, table_name=table_name, title=title, subtitles=subtitles, instructions=instructions ) @staticmethod def _strip_annotation_references(text): """ Strip annotation references (as $$ $$) from a str or list text element. """ pattern = r"\$\$.*?\$\$" if isinstance(text, str): no_annotations = re.sub(pattern, "", text) elif isinstance(text, FormatList): no_annotations = FormatList([ re.sub(pattern, "", part) if isinstance(part, str) else part for part in text.list ]) elif isinstance(text, list): # TODO: this shouldn't get used - check and delete no_annotations = [ re.sub(pattern, "", part) if isinstance(part, str) else part for part in text ] return no_annotations def make_notesheet( self, notes_table, table_name = None, title = None, instructions = None, ): """ Generate notes table sheets from notes table and optional customisation parameters. Parameters ---------- notes_table : pd.DataFrame table with notes reference, text and (optional) link columns table_name: str, optional notes table name, defaults to "notes_table" title : str, optional notes page title, defaults to "Notes" instructions: str, optional description of the page layout defaults to "This worksheet contains one table." Return ------ gpt.GPTable """ # set defaults if table_name is None: table_name = "notes_table" if title is None: title = "Notes" if instructions is None: instructions = "This worksheet contains one table." # order notes table by worksheet reference order ordered_refs = self._annotations order_df = pd.DataFrame({"order": ordered_refs}) notes = notes_table.copy() notes = notes.rename(columns={notes.columns[0]: "order"}) ordered_notes = order_df.merge(notes, on="order", how="left") unreferenced_notes = notes[~notes["order"].isin(ordered_notes["order"])] if not unreferenced_notes.empty: warnings.warn(f"The following notes are not referenced: {list(unreferenced_notes['order'])}") ordered_notes = pd.concat([ordered_notes, unreferenced_notes]) # replace note references with note number ordered_notes = (ordered_notes .reset_index() .rename(columns={"index": "Note number"}) .drop(columns=["order"]) ) # convert from python 0-indexing ordered_notes["Note number"] = ordered_notes["Note number"] + 1 return GPTable( table=ordered_notes, table_name=table_name, title=title, instructions=instructions, index_columns={} )