Skip to content

Commit

Permalink
Scrape and save event images from Meetup
Browse files Browse the repository at this point in the history
  • Loading branch information
joeriddles committed Sep 13, 2024
1 parent e2cfdb7 commit 28808c5
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 29 deletions.
69 changes: 49 additions & 20 deletions src/web/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
import pathlib
import re
import urllib.parse
import zoneinfo
from datetime import datetime, timedelta
from typing import Any, Protocol, TypeAlias, TypeVar

import eventbrite.access_methods
import requests
import zoneinfo
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup
from bs4.element import Tag
from django.conf import settings
from django.utils import timezone
from eventbrite import Eventbrite
Expand Down Expand Up @@ -39,7 +40,8 @@ def scrape(self, url: str) -> ST:
...


EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag]]
ImageResult: TypeAlias = tuple[str, bytes]
EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag], ImageResult | None]


class MeetupScraperMixin:
Expand Down Expand Up @@ -84,8 +86,8 @@ def scrape(self, url: str) -> list[str]:
else:
upcoming_section = soup.find_all(id="upcoming-section")[0]
events = upcoming_section.find_all_next(id=re.compile(r"event-card-"))
filtered_event_containers = [event for event in events if self._filter_event_tag(event)]
event_urls = [event_container["href"] for event_container in filtered_event_containers]
filtered_event_containers: list[Tag] = [event for event in events if self._filter_event_tag(event)] # type: ignore
event_urls: list[str] = [event_container["href"] for event_container in filtered_event_containers] # type: ignore

return [url for url in event_urls if self._filter_repeating_events(url)]

Expand Down Expand Up @@ -136,27 +138,31 @@ def scrape(self, url: str) -> EventScraperResult:
location_data = apollo_state[event_json["venue"]["__ref"]]
location = f"{location_data['address']}, {location_data['city']}, {location_data['state']}"
external_id = event_json["id"]
event_photo = event_json["featuredEventPhoto"]["__ref"]
image_url = apollo_state[event_photo].get("highResUrl", apollo_state[event_photo]["baseUrl"])
except KeyError:
name = self._parse_name(soup)
description = self._parse_description(soup)
date_time = self._parse_date_time(soup)
duration = self._parse_duration(soup)
location = self._parse_location(soup)
external_id = self._parse_external_id(url)
image_url = self._parse_image(soup)

if image_url:
image_result = self._get_image(image_url)

tags = self._parse_tags(soup)
return (
models.Event(
name=name,
description=description,
date_time=date_time,
duration=duration,
location=location,
external_id=external_id,
url=url,
),
tags,
event = models.Event(
name=name,
description=description,
date_time=date_time,
duration=duration,
location=location,
external_id=external_id,
url=url,
)
return (event, tags, image_result)

def _parse_name(self, soup: BeautifulSoup) -> str:
name: str = soup.find_all("h1")[0].text
Expand All @@ -171,10 +177,16 @@ def _parse_description(self, soup: BeautifulSoup) -> str:
return description

def _parse_date_time(self, soup: BeautifulSoup) -> datetime:
return datetime.fromisoformat(soup.find_all("time")[0]["datetime"])
time: Tag | None = soup.find_next("time") # type: ignore
if not time:
raise ValueError("could not find time")
dt: str = time["datetime"] # type: ignore
return datetime.fromisoformat(dt)

def _parse_duration(self, soup: BeautifulSoup) -> timedelta:
time: Tag = soup.find_all("time")[0]
time: Tag | None = soup.find_next("time") # type: ignore
if not time:
raise ValueError("could not find time")
matches = self.DURATION_PATTERN.findall(time.text)
if not matches:
raise ValueError("Could not find duration from:", time.text)
Expand All @@ -199,6 +211,23 @@ def _parse_tags(self, soup: BeautifulSoup) -> list[models.Tag]:
tags = [re.sub(r"\s+", " ", t.text) for t in tags] # Some tags have newlines & extra spaces
return [models.Tag(value=t) for t in tags]

def _parse_image(self, soup: BeautifulSoup) -> str | None:
    """Find the event-description image in *soup* and return its ``src`` URL.

    Returns ``None`` when the page has no element tagged
    ``data-testid="event-description-image"`` or when that element
    contains no ``<img>`` tag.
    """
    picture_tag = soup.find(attrs={"data-testid": "event-description-image"})
    if not picture_tag:
        return None
    image_tag: Tag | None = picture_tag.find("img")  # type: ignore
    if not image_tag:
        return None
    image_src: str = image_tag["src"]  # type: ignore
    return image_src

def _get_image(self, image_url: str) -> ImageResult:
    """Download the image at *image_url* and return ``(filename, content)``.

    The filename is the final path segment of the URL with any query
    string or fragment stripped, e.g.
    ``.../highres_519844270.jpeg?w=750`` -> ``highres_519844270.jpeg``.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    # Let urllib strip the query string AND fragment, instead of manually
    # splitting on "?" (which would leave a "#fragment" in the name).
    url_path = urllib.parse.urlsplit(image_url).path
    image_name = url_path.rsplit("/", maxsplit=1)[-1]
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()
    return image_name, response.content


class EventbriteScraper(Scraper[list[EventScraperResult]]):
def __init__(self, api_token: str | None = None):
Expand All @@ -213,7 +242,7 @@ def scrape(self, organization_id: str) -> list[EventScraperResult]:
events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
return events_and_tags

def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[models.Tag]]:
def map_to_event(self, eventbrite_event: dict) -> EventScraperResult:
name = eventbrite_event["name"]["text"]
start = datetime.fromisoformat(eventbrite_event["start"]["utc"])
end = datetime.fromisoformat(eventbrite_event["end"]["utc"])
Expand Down Expand Up @@ -249,7 +278,7 @@ def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[model
# if subcategory_name:
# tags.append(models.Tag(value=subcategory_name))

return event, []
return event, [], None

@functools.lru_cache
def _get_venue_location(self, venue_id: str) -> str:
Expand Down
20 changes: 18 additions & 2 deletions src/web/services.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import timedelta
from typing import Protocol

from django.core.files.base import ContentFile
from django.forms.models import model_to_dict
from django.utils import timezone

Expand All @@ -24,13 +25,15 @@ def save_events(self) -> None:
for tech_group in models.TechGroup.objects.filter(homepage__icontains="meetup.com"):
event_urls = self.homepage_scraper.scrape(tech_group.homepage) # type: ignore
for event_url in event_urls: # TODO: parallelize (with async?)
event, tags = self.event_scraper.scrape(event_url)
event, tags, image_result = self.event_scraper.scrape(event_url)
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group

del defaults["tags"] # Can't apply Many-to-Many relationship until after the event has been saved.
del defaults["image"]

new_event, _ = models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
Expand All @@ -39,6 +42,19 @@ def save_events(self) -> None:
tag, _ = models.Tag.objects.get_or_create(value=tag)
new_event.tags.add(tag)

if image_result is not None:
image_name, image = image_result

# If images are the same, don't re-upload
has_existing_image = bool(new_event.image)
if has_existing_image:
existing_image = new_event.image.read()
if existing_image == image:
continue

file = ContentFile(image, name=image_name)
new_event.image.save(image_name, file)


class EventbriteService:
events_scraper: scrapers.Scraper[list[scrapers.EventScraperResult]]
Expand All @@ -58,7 +74,7 @@ def save_events(self) -> None:
for eventbrite_organization in models.EventbriteOrganization.objects.prefetch_related("tech_group"):
tech_group = eventbrite_organization.tech_group
events_and_tags = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
for event, _ in events_and_tags:
for event, _, _ in events_and_tags:
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
Expand Down
Binary file added src/web/tests/data/meetup-image.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/web/tests/data/meetup-image.webp
Binary file not shown.
46 changes: 39 additions & 7 deletions src/web/tests/test_scrapers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import pathlib
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import freezegun
import responses
import pytest
import responses
from django.test import TestCase

from web import models, scrapers
from zoneinfo import ZoneInfo


class TestMeetupHomepageScraper(TestCase):
Expand Down Expand Up @@ -59,16 +60,26 @@ def test_scraper_without_json(self):
class TestMeetupEventScraper(TestCase):
@responses.activate
def test_scraper_with_json(self):
fin = open(pathlib.Path(__file__).parent / "data" / "meetup-with-json.html")
body = fin.read()
fin.close()
# Arrange
with open(pathlib.Path(__file__).parent / "data" / "meetup-with-json.html") as fin:
body = fin.read()
responses.get(
"https://www.meetup.com/python-spokane/events/298213205/",
body=body,
)

with open(pathlib.Path(__file__).parent / "data" / "meetup-image.jpeg", "rb") as fin:
body = fin.read()
responses.get(
"https://secure.meetupstatic.com/photos/event/1/0/a/e/highres_519844270.jpeg",
body=body,
)

# Act
scraper = scrapers.MeetupEventScraper()
actual, actual_tags = scraper.scrape("https://www.meetup.com/python-spokane/events/298213205/")
actual, actual_tags, actual_image_result = scraper.scrape(
"https://www.meetup.com/python-spokane/events/298213205/"
)

assert actual.name == "Dagger with Spokane Tech 🚀"
assert actual.description and actual.description.startswith("Join us for our monthly SPUG meetup!")
Expand All @@ -77,6 +88,7 @@ def test_scraper_with_json(self):
assert actual.location == "1720 W 4th Ave Unit B, Spokane, WA"
assert actual.url == "https://www.meetup.com/python-spokane/events/298213205/"
assert actual.external_id == "298213205"

assert len(actual_tags) == 5
assert {t.value for t in actual_tags} == {
"Linux",
Expand All @@ -86,8 +98,13 @@ def test_scraper_with_json(self):
"Agile and Scrum",
}

assert actual_image_result
assert actual_image_result[0] == "highres_519844270.jpeg"
assert len(actual_image_result[1]) > 0

@responses.activate
def test_scraper_without_json(self):
# Arrange
fin = open(pathlib.Path(__file__).parent / "data" / "meetup-without-json.html")
body = fin.read()
fin.close()
Expand All @@ -96,9 +113,20 @@ def test_scraper_without_json(self):
body=body,
)

with open(pathlib.Path(__file__).parent / "data" / "meetup-image.webp", "rb") as fin:
body = fin.read()
responses.get(
"https://secure.meetupstatic.com/photos/event/1/0/a/e/600_519844270.webp?w=750",
body=body,
)

# Act
scraper = scrapers.MeetupEventScraper()
actual, actual_tags = scraper.scrape("https://www.meetup.com/python-spokane/events/298213205/")
actual, actual_tags, actual_image_result = scraper.scrape(
"https://www.meetup.com/python-spokane/events/298213205/"
)

# Assert
assert actual.name == "Dagger with Spokane Tech 🚀"
assert actual.description and actual.description.startswith("Join us for our monthly SPUG meetup!")
assert actual.date_time == datetime(2024, 3, 19, 18, 0, 0, tzinfo=ZoneInfo("America/Los_Angeles"))
Expand All @@ -115,6 +143,10 @@ def test_scraper_without_json(self):
"Agile and Scrum",
}

assert actual_image_result
assert actual_image_result[0] == "600_519844270.webp"
assert len(actual_image_result[1]) > 0


@pytest.mark.eventbrite
class TestEventbriteScraper(TestCase):
Expand Down

0 comments on commit 28808c5

Please sign in to comment.