Skip to content

Commit

Permalink
Scrape MLB IDs from Baseball-Reference (#222)
Browse files (browse the repository at this point in the history)
* add MLB ID column to BRef player stats

* increase expected column count by 1 in tests
  • Loading branch information
marek-slipski authored Jul 27, 2021
1 parent 582981c commit 7b766c1
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 2 deletions.
4 changes: 4 additions & 0 deletions pybaseball/league_batting_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,16 @@ def get_table(soup: BeautifulSoup) -> pd.DataFrame:
table = soup.find_all('table')[0]
data = []
headings = [th.get_text() for th in table.find("tr").find_all("th")][1:]
headings.append("mlbID")
data.append(headings)
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
cols = row.find_all('td')
row_anchor = row.find("a")
mlbid = row_anchor["href"].split("mlb_ID=")[-1] if row_anchor else pd.NA # ID str or nan
cols = [ele.text.strip() for ele in cols]
cols.append(mlbid)
data.append([ele for ele in cols])
df = pd.DataFrame(data)
df = df.rename(columns=df.iloc[0])
Expand Down
4 changes: 4 additions & 0 deletions pybaseball/league_pitching_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,16 @@ def get_table(soup):
table = soup.find_all('table')[0]
data = []
headings = [th.get_text() for th in table.find("tr").find_all("th")][1:]
headings.append("mlbID")
data.append(headings)
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
cols = row.find_all('td')
row_anchor = row.find("a")
mlbid = row_anchor["href"].split("mlb_ID=")[-1] if row_anchor else pd.NA # ID str or nan
cols = [ele.text.strip() for ele in cols]
cols.append(mlbid)
data.append([ele for ele in cols])
data = pd.DataFrame(data)
data = data.rename(columns=data.iloc[0])
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/pybaseball/test_league_batting_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_batting_stats_bref() -> None:
assert result is not None
assert not result.empty

assert len(result.columns) == 27
assert len(result.columns) == 28
assert(len(result)) == 991


Expand Down
2 changes: 1 addition & 1 deletion tests/integration/pybaseball/test_league_pitching_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_pitching_stats_bref() -> None:
assert result is not None
assert not result.empty

assert len(result.columns) == 40
assert len(result.columns) == 41
assert(len(result)) == 831


Expand Down

0 comments on commit 7b766c1

Please sign in to comment.