Spaces:
Running
Running
| from io import StringIO | |
| import lxml.html | |
| import pandas as pd | |
| import requests | |
| from typing import List | |
| from queries.footballguys import constants as fbgc | |
| def url_to_pandas(url) -> List[pd.DataFrame]: | |
| page = requests.get(url) | |
| table = pd.read_html(StringIO(page.text.replace("<br>", "-"))) | |
| return table | |
| def create_html_table_from_header_body(header_html_str: str, body_html_str: str): | |
| return f""" | |
| <table> | |
| {header_html_str} | |
| {body_html_str} | |
| </table> | |
| """ | |
| def extract_snaps_to_pandas(url: str): | |
| root = lxml.html.document_fromstring(requests.get(url).text) | |
| table_element_list = root.xpath("""//*[@id="stats_snapcounts_data"]/div/table""") | |
| assert isinstance(table_element_list, list) | |
| table_element = table_element_list[0] | |
| assert isinstance(table_element, lxml.html.HtmlElement) | |
| table_child_list = table_element.getchildren() | |
| assert len(table_child_list) % 2 == 0 # check is even | |
| half_len = int(len(table_child_list) / 2) | |
| df_list = [] | |
| for i in range(half_len): | |
| table_html = create_html_table_from_header_body( | |
| lxml.html.tostring(table_child_list[2 * i]), lxml.html.tostring(table_child_list[2 * i + 1]) | |
| ).replace("\\n", "") | |
| df = pd.read_html(table_html)[0] | |
| # First column contains name and is initially labeled as each position, example "Quarterback" | |
| # Insert column at front called POS and fill with current first column label | |
| position_name = df.columns[0] | |
| df.insert(0, "POS", position_name) | |
| df.rename(columns={position_name: "name"}, inplace=True) | |
| df_list.append(df) | |
| return df_list | |
| def add_snap_off_def_column(team_snap_df: pd.DataFrame): | |
| off_def = team_snap_df["POS"].apply(lambda x: fbgc.POSITIONS_TO_OFFENSE_DEFENSE[x]) | |
| team_snap_df.insert(0, "OFF/DEF", off_def) | |
| def add_snap_position_column( | |
| team_snap_df_list: List[pd.DataFrame], | |
| position_name_array: List[str] = fbgc.SNAP_PAGE_POSITON_ORDER, | |
| ): | |
| # blank player names between positions, so we can use cumsum | |
| # 8/22/23 - We are currently failing here because snap counts are incorrectly not split by position atm | |
| assert len(team_snap_df_list) == len(position_name_array) | |
| for pos_idx, pos_df in enumerate(team_snap_df_list): | |
| pos_df.insert(0, "POS", position_name_array[pos_idx]) | |
| def set_multilevel_columns(df): | |
| new_cols = [tuple(x.split("-")) if "-" in x else (x, x) for x in df.columns] | |
| df.columns = pd.MultiIndex.from_tuples(new_cols) | |
| def parse_snaps(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame: | |
| print(f"Attempting to parse snaps for {team_short_name}") | |
| team_snap_df_list = parse_team_page(team_short_name, base_url, "snap-counts", year) | |
| team_snap_df = pd.concat(team_snap_df_list) | |
| # add_snap_off_def_column(team_snap_df) | |
| split_snap_count_percents(team_snap_df) | |
| team_snap_df.dropna(subset=["name"], inplace=True) | |
| # set_multilevel_columns(team_snap_df) | |
| return team_snap_df | |
| def add_targets_position(team_df: pd.DataFrame): | |
| # fill blanks up by reversing index, fill down, and re-reversing | |
| positions = team_df.name.apply(lambda x: x.replace(" Totals", "") if " Totals" in x else None)[::-1].ffill()[::-1] | |
| team_df.insert(0, "POS", positions) | |
| def parse_targets(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame: | |
| # snaps are index 2 | |
| print(f"Attempting to parse targets for {team_short_name}") | |
| team_df = parse_team_page(team_short_name, base_url, "targets", year)[0] | |
| add_targets_position(team_df) | |
| return team_df[team_df.name.notna()] | |
| def parse_redzone(team_short_name: str, base_url: str = fbgc.BASE_URL, year: int = fbgc.YEAR) -> pd.DataFrame: | |
| # snaps are index 3 | |
| print(f"Attempting to parse redzone for {team_short_name}") | |
| team_df = parse_team_page(team_short_name, base_url, "redzone", year)[0] | |
| add_targets_position(team_df) | |
| return team_df[team_df.name.notna()] | |
| def split_snap_count_percents(team_snap_df: pd.DataFrame): | |
| for week in range(1, 18): | |
| if f"Wk {week}" not in team_snap_df.columns: | |
| continue | |
| # if values are all NaN column will be dtype float 64 and should skip | |
| if team_snap_df[f"Wk {week}"].dtype == float: | |
| team_snap_df[f"{week}-count"] = 0 | |
| team_snap_df[f"{week}-%"] = 0.0 | |
| else: | |
| week_split = team_snap_df[f"Wk {week}"].astype(str).str.split("-") | |
| week_count = week_split.apply(lambda x: 0 if len(x) == 1 or x[0] == "" else int(x[0])) | |
| week_pct = week_split.apply(lambda x: 0.0 if len(x) == 1 else float(x[1].strip("%")) / 100.0) | |
| team_snap_df[f"{week}-count"] = week_count | |
| team_snap_df[f"{week}-%"] = week_pct | |
| team_snap_df.drop(columns=f"Wk {week}", inplace=True) | |
| def parse_team_page( | |
| team_short_name: str, | |
| base_url: str, | |
| stat_name: str, | |
| year: int, | |
| ) -> List[pd.DataFrame]: | |
| url = f"{base_url}/{stat_name}/teams?team={team_short_name}&year={year}" | |
| if stat_name == "snap-counts": | |
| all_tables = extract_snaps_to_pandas(url) | |
| else: | |
| all_tables = url_to_pandas(url) | |
| return all_tables | |