From 7b766c15c466e9068c88a9483568a907bb790363 Mon Sep 17 00:00:00 2001 From: Marek Slipski Date: Tue, 27 Jul 2021 06:03:20 -0700 Subject: [PATCH] Scrape MLB IDs from Baseball-Reference (#222) * add MLB ID column to BRef player stats * increased expected length of columns by 1 in tests --- pybaseball/league_batting_stats.py | 4 ++++ pybaseball/league_pitching_stats.py | 4 ++++ tests/integration/pybaseball/test_league_batting_stats.py | 2 +- tests/integration/pybaseball/test_league_pitching_stats.py | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pybaseball/league_batting_stats.py b/pybaseball/league_batting_stats.py index 9818a72e..c0bb73a9 100644 --- a/pybaseball/league_batting_stats.py +++ b/pybaseball/league_batting_stats.py @@ -26,12 +26,16 @@ def get_table(soup: BeautifulSoup) -> pd.DataFrame: table = soup.find_all('table')[0] data = [] headings = [th.get_text() for th in table.find("tr").find_all("th")][1:] + headings.append("mlbID") data.append(headings) table_body = table.find('tbody') rows = table_body.find_all('tr') for row in rows: cols = row.find_all('td') + row_anchor = row.find("a") + mlbid = row_anchor["href"].split("mlb_ID=")[-1] if row_anchor else pd.NA # ID str or nan cols = [ele.text.strip() for ele in cols] + cols.append(mlbid) data.append([ele for ele in cols]) df = pd.DataFrame(data) df = df.rename(columns=df.iloc[0]) diff --git a/pybaseball/league_pitching_stats.py b/pybaseball/league_pitching_stats.py index a3284906..f25198d3 100644 --- a/pybaseball/league_pitching_stats.py +++ b/pybaseball/league_pitching_stats.py @@ -25,12 +25,16 @@ def get_table(soup): table = soup.find_all('table')[0] data = [] headings = [th.get_text() for th in table.find("tr").find_all("th")][1:] + headings.append("mlbID") data.append(headings) table_body = table.find('tbody') rows = table_body.find_all('tr') for row in rows: cols = row.find_all('td') + row_anchor = row.find("a") + mlbid = row_anchor["href"].split("mlb_ID=")[-1] if row_anchor else pd.NA # ID str or nan cols = [ele.text.strip() for ele in cols] + cols.append(mlbid) data.append([ele for ele in cols]) data = pd.DataFrame(data) data = data.rename(columns=data.iloc[0]) diff --git a/tests/integration/pybaseball/test_league_batting_stats.py b/tests/integration/pybaseball/test_league_batting_stats.py index 3ac88b28..f17450b9 100644 --- a/tests/integration/pybaseball/test_league_batting_stats.py +++ b/tests/integration/pybaseball/test_league_batting_stats.py @@ -10,7 +10,7 @@ def test_batting_stats_bref() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 27 + assert len(result.columns) == 28 assert(len(result)) == 991 diff --git a/tests/integration/pybaseball/test_league_pitching_stats.py b/tests/integration/pybaseball/test_league_pitching_stats.py index 6ee9621c..ede052cf 100644 --- a/tests/integration/pybaseball/test_league_pitching_stats.py +++ b/tests/integration/pybaseball/test_league_pitching_stats.py @@ -19,7 +19,7 @@ def test_pitching_stats_bref() -> None: assert result is not None assert not result.empty - assert len(result.columns) == 40 + assert len(result.columns) == 41 assert(len(result)) == 831