Spaces:
Running
Running
Jon Solow
committed on
Commit
·
54505df
1
Parent(s):
a033f2f
Refactor player_news parsing to go player by player rather than parsing separate lists
Browse files
src/queries/nbcsports/player_news.py
CHANGED
|
@@ -7,39 +7,33 @@ from typing import Mapping
|
|
| 7 |
NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
|
| 8 |
|
| 9 |
|
| 10 |
-
def
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
return
|
| 15 |
|
| 16 |
|
| 17 |
-
def
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
|
| 25 |
url = f"{NEWS_URL}?p={page_number}"
|
| 26 |
request_page = requests.get(url)
|
| 27 |
soup = BeautifulSoup(request_page.content)
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
datetime_div_list = soup.find_all("div", {"class": "PlayerNewsPost-date"})
|
| 34 |
-
assert datetime_div_list
|
| 35 |
-
datetime_list = [x["data-date"] for x in datetime_div_list]
|
| 36 |
-
assert (
|
| 37 |
-
len(player_names_list) == len(team_abbr_list) == len(position_list) == len(headline_list) == len(analysis_list)
|
| 38 |
-
)
|
| 39 |
-
df = pd.DataFrame(
|
| 40 |
-
zip(datetime_list, player_names_list, team_abbr_list, position_list, headline_list, analysis_list),
|
| 41 |
-
columns=["Date/Time", "Name", "Team", "Position", "Headline", "Analysis"],
|
| 42 |
-
)
|
| 43 |
df["Date/Time"] = pd.to_datetime(df["Date/Time"])
|
| 44 |
return df
|
| 45 |
|
|
|
|
| 7 |
# Base URL of the NBC Sports fantasy-football player-news listing;
# pages are selected via the `?p=<n>` query parameter (see get_nfl_player_news).
NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
|
| 8 |
|
| 9 |
|
| 10 |
+
def find_soup_text_with_default(soup, element: str, find_search_map: Mapping[str, str]):
    """Return the stripped text of the first matching element, or "" when absent.

    Args:
        soup: a BeautifulSoup node to search within.
        element: tag name to look for (e.g. "div", "span").
        find_search_map: attribute filters passed to ``soup.find``.
    """
    match = soup.find(element, find_search_map)
    return match.text.strip() if match else ""
| 15 |
|
| 16 |
|
| 17 |
+
def parse_player_div(player_div):
    """Parse one ``PlayerNewsPost`` div into a flat record.

    Args:
        player_div: BeautifulSoup element for a single news post.

    Returns:
        dict with keys ``Date/Time``, ``Name``, ``Team``, ``Position``,
        ``Headline`` and ``Analysis``. Missing sub-elements yield "" for
        text fields and None for ``Date/Time`` instead of raising.
    """
    # Guard the date lookup: .find() returns None when the div is absent,
    # and the previous chained .get("data-date") would raise AttributeError.
    # None flows through pd.to_datetime as NaT downstream.
    date_div = player_div.find("div", {"class": "PlayerNewsPost-date"})
    return {
        "Date/Time": date_div.get("data-date") if date_div else None,
        "Name": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-name"}),
        "Team": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-team-abbr"}).upper(),
        "Position": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-position"}).title(),
        "Headline": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-headline"}),
        "Analysis": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-analysis"}),
    }
|
| 26 |
|
| 27 |
|
| 28 |
def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
    """Fetch one page of NFL player news from NBC Sports.

    Args:
        page_number: 1-based page of the news listing to fetch.

    Returns:
        DataFrame with columns Date/Time, Name, Team, Position, Headline,
        Analysis (one row per news post); an empty DataFrame when the page
        contains no ``PlayerNewsPost`` divs.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    url = f"{NEWS_URL}?p={page_number}"
    # A timeout prevents an unresponsive server from hanging the caller forever.
    request_page = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    request_page.raise_for_status()
    # Name the parser explicitly: without it bs4 emits GuessedAtParserWarning
    # and the parse tree depends on which parsers happen to be installed.
    soup = BeautifulSoup(request_page.content, "html.parser")
    player_div_list = soup.find_all("div", {"class": "PlayerNewsPost"})
    if not player_div_list:
        return pd.DataFrame()
    parsed_player_list = [parse_player_div(d) for d in player_div_list]
    df = pd.DataFrame(parsed_player_list)
    df["Date/Time"] = pd.to_datetime(df["Date/Time"])
    return df
|
| 39 |
|