Skip to content

Commit

Permalink
Add image scraping to EventbriteScraper
Browse files Browse the repository at this point in the history
  • Loading branch information
joeriddles committed Sep 13, 2024
1 parent 28808c5 commit da0f6a4
Show file tree
Hide file tree
Showing 8 changed files with 311 additions and 78 deletions.
46 changes: 31 additions & 15 deletions src/web/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def get_venue(self, id, **data):


def get_event_description(self, id, **data):
return self.get("/events/{0}/description//".format(id), data=data)
return self.get("/events/{0}/description/".format(id), data=data)


setattr(eventbrite.access_methods.AccessMethodsMixin, "get_venue", get_venue)
Expand All @@ -44,7 +44,19 @@ def scrape(self, url: str) -> ST:
EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag], ImageResult | None]


class MeetupScraperMixin:
class ScraperMixin:
    """Image helpers shared by the scrapers in this module."""

    def _get_image(self, image_url: str) -> "ImageResult":
        """Download the image at ``image_url``.

        Returns:
            A ``(file name, raw bytes)`` pair. The file name is derived from
            the URL by ``_parse_image_name``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        image_name = self._parse_image_name(image_url)
        # A timeout keeps one slow image host from stalling the whole scrape.
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        return image_name, response.content

    def _parse_image_name(self, image_url: str) -> str:
        """Return the file name encoded in ``image_url``.

        The query string and fragment are dropped, and the path is
        percent-decoded before the last segment is taken. Some image proxies
        embed the upstream URL as a single percent-encoded path segment
        (``.../https%3A%2F%2Fcdn...%2Foriginal.123?...``); decoding first
        yields ``original.123`` instead of the whole encoded URL.
        """
        from urllib.parse import unquote, urlsplit

        decoded_path = unquote(urlsplit(image_url).path)
        return decoded_path.rsplit("/", maxsplit=1)[-1]


class MeetupScraperMixin(ScraperMixin):
"""Common Meetup scraping functionality."""

def _parse_apollo_state(self, soup: BeautifulSoup) -> dict:
Expand Down Expand Up @@ -177,14 +189,14 @@ def _parse_description(self, soup: BeautifulSoup) -> str:
return description

def _parse_date_time(self, soup: BeautifulSoup) -> datetime:
time: Tag | None = soup.find_next("time") # type: ignore
time: Tag | None = soup.find("time") # type: ignore
if not time:
raise ValueError("could not find time")
dt: str = time["datetime"] # type: ignore
return datetime.fromisoformat(dt)

def _parse_duration(self, soup: BeautifulSoup) -> timedelta:
time: Tag | None = soup.find_next("time") # type: ignore
time: Tag | None = soup.find("time") # type: ignore
if not time:
raise ValueError("could not find time")
matches = self.DURATION_PATTERN.findall(time.text)
Expand Down Expand Up @@ -221,15 +233,8 @@ def _parse_image(self, soup: BeautifulSoup) -> str | None:
src: str = img["src"] # type: ignore
return src

def _get_image(self, image_url: str) -> ImageResult:
image_name = image_url.rsplit("/", maxsplit=1)[-1].split("?", maxsplit=1)[0]
response = requests.get(image_url, timeout=10)
response.raise_for_status()
image = response.content
return image_name, image


class EventbriteScraper(Scraper[list[EventScraperResult]]):
class EventbriteScraper(ScraperMixin, Scraper[list[EventScraperResult]]):
def __init__(self, api_token: str | None = None):
self.client = Eventbrite(api_token or settings.EVENTBRITE_API_TOKEN)
self._location_by_venue_id: dict[str, str] = {}
Expand All @@ -238,9 +243,10 @@ def scrape(self, organization_id: str) -> list[EventScraperResult]:
response = self.client.get_organizer_events(
organization_id,
status="live",
expand="logo",
)
events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
return events_and_tags
results = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
return results

def map_to_event(self, eventbrite_event: dict) -> EventScraperResult:
name = eventbrite_event["name"]["text"]
Expand All @@ -259,6 +265,16 @@ def map_to_event(self, eventbrite_event: dict) -> EventScraperResult:
# short description
description = eventbrite_event["description"]["html"]

try:
image_url = eventbrite_event["logo"]["original"]["url"]
image_result = self._get_image(image_url)
except (KeyError, requests.HTTPError):
try:
image_url = eventbrite_event["logo"]["url"]
image_result = self._get_image(image_url)
except KeyError:
image_result = None

event = models.Event(
name=name,
description=description,
Expand All @@ -278,7 +294,7 @@ def map_to_event(self, eventbrite_event: dict) -> EventScraperResult:
# if subcategory_name:
# tags.append(models.Tag(value=subcategory_name))

return event, [], None
return event, [], image_result

@functools.lru_cache
def _get_venue_location(self, venue_id: str) -> str:
Expand Down
109 changes: 67 additions & 42 deletions src/web/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,52 +8,84 @@
from web import models, scrapers


class EventService:
    """Persist scraper results: the event row, its tags, and its image."""

    def save_event_from_result(
        self,
        result: scrapers.EventScraperResult,
        tech_group: models.TechGroup,
    ) -> None:
        """Save one scraped ``(event, tags, image)`` result for ``tech_group``.

        The event is created or updated first; tags and the optional image
        are then attached to the saved row.
        """
        event, tags, image_result = result
        saved_event = self._save_event(event, tech_group)
        self._save_tags(saved_event, tags)
        if image_result is not None:
            self._save_image(saved_event, image_result)

    def _save_event(
        self,
        event: models.Event,
        tech_group: models.TechGroup,
    ) -> models.Event:
        """Create or update the event matching ``event.external_id``.

        The scraped event is assigned to ``tech_group`` and stamped with the
        current local time as ``approved_at`` before being written.

        Returns:
            The persisted event returned by ``update_or_create``.
        """
        event.group = tech_group
        event.approved_at = timezone.localtime()

        # Tags are many-to-many and can't be applied until after the event has
        # been saved; the image is written separately by ``_save_image``.
        defaults = model_to_dict(event, exclude=["id", "tags", "image"])
        # model_to_dict flattens the foreign key, so restore the instance.
        defaults["group"] = tech_group

        updated_event, _ = models.Event.objects.update_or_create(
            external_id=event.external_id,
            defaults=defaults,
        )
        return updated_event

    def _save_tags(
        self,
        event: models.Event,
        tags: list[models.Tag],
    ) -> None:
        """Attach each tag to ``event``, creating missing tag rows."""
        for tag in tags:
            # Look the tag up by its text rather than by the (unsaved) model
            # instance; plain strings are still accepted as-is.
            tag_value = tag.value if isinstance(tag, models.Tag) else tag
            saved_tag, _ = models.Tag.objects.get_or_create(value=tag_value)
            event.tags.add(saved_tag)

    def _save_image(
        self,
        event: models.Event,
        image_result: scrapers.ImageResult,
    ) -> None:
        """Store the scraped image on ``event`` unless it is already stored."""
        image_name, image = image_result

        # If images are the same, don't re-upload. Opening explicitly rewinds
        # the stored file and closes the handle once the comparison is done.
        if event.image:
            with event.image.open("rb") as existing_file:
                if existing_file.read() == image:
                    return

        event.image.save(image_name, ContentFile(image, name=image_name))


class MeetupService:
def __init__(
self,
homepage_scraper: scrapers.Scraper[list[str]] | None = None,
event_scraper: scrapers.Scraper[scrapers.EventScraperResult] | None = None,
event_service: EventService | None = None,
) -> None:
self.homepage_scraper: scrapers.Scraper[list[str]] = homepage_scraper or scrapers.MeetupHomepageScraper()
self.event_scraper: scrapers.Scraper[scrapers.EventScraperResult] = (
event_scraper or scrapers.MeetupEventScraper()
)
self.event_service = event_service or EventService()

def save_events(self) -> None:
"""Scrape upcoming events from Meetup and save them to the database."""
now = timezone.localtime()
for tech_group in models.TechGroup.objects.filter(homepage__icontains="meetup.com"):
event_urls = self.homepage_scraper.scrape(tech_group.homepage) # type: ignore
for event_url in event_urls: # TODO: parallelize (with async?)
event, tags, image_result = self.event_scraper.scrape(event_url)
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group

del defaults["tags"] # Can't apply Many-to-Many relationship until after the event has been saved.
del defaults["image"]

new_event, _ = models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
)
for tag in tags:
tag, _ = models.Tag.objects.get_or_create(value=tag)
new_event.tags.add(tag)

if image_result is not None:
image_name, image = image_result

# If images are the same, don't re-upload
has_existing_image = bool(new_event.image)
if has_existing_image:
existing_image = new_event.image.read()
if existing_image == image:
continue

file = ContentFile(image, name=image_name)
new_event.image.save(image_name, file)
result = self.event_scraper.scrape(event_url)
self.event_service.save_event_from_result(result, tech_group)


class EventbriteService:
Expand All @@ -62,28 +94,21 @@ class EventbriteService:
def __init__(
self,
events_scraper: scrapers.Scraper[list[scrapers.EventScraperResult]] | None = None,
event_service: EventService | None = None,
) -> None:
self.events_scraper = events_scraper or scrapers.EventbriteScraper()
self.event_service = event_service or EventService()

def save_events(self) -> None:
"""Fetch upcoming events from Eventbrite and save them.
Note: this uses an API and doesn't actually web scrape.
"""
now = timezone.localtime()
for eventbrite_organization in models.EventbriteOrganization.objects.prefetch_related("tech_group"):
tech_group = eventbrite_organization.tech_group
events_and_tags = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
for event, _, _ in events_and_tags:
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group
del defaults["tags"] # Can't apply Many-to-Many relationship until after the event has been saved.
models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
)
results = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
for result in results:
self.event_service.save_event_from_result(result, tech_group)


class Sender(Protocol):
Expand Down
3 changes: 3 additions & 0 deletions src/web/tests/data/eventbrite/event_description.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"description": "<div>Full Day of Panels, Speakers and Vendors on Cybersecurity, AI and Compliance. FREE with pre-registration - space limited - Oct 2, 2024"
}
Binary file added src/web/tests/data/eventbrite/event_image.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions src/web/tests/data/eventbrite/event_venue.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"address": {
"address_1": "702 East Desmet Avenue",
"address_2": "",
"city": "Spokane",
"region": "WA",
"postal_code": "99202",
"country": "US",
"latitude": "47.6672448",
"longitude": "-117.3999126",
"localized_address_display": "702 East Desmet Avenue, Spokane, WA 99202",
"localized_area_display": "Spokane, WA",
"localized_multi_line_address_display": [
"702 East Desmet Avenue",
"Spokane, WA 99202"
]
},
"resource_uri": "https://www.eventbriteapi.com/v3/venues/214450569/",
"id": "214450569",
"age_restriction": null,
"capacity": null,
"name": "John J. Hemmingson Center",
"latitude": "47.6672448",
"longitude": "-117.3999126"
}
83 changes: 83 additions & 0 deletions src/web/tests/data/eventbrite/organizer_events.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"pagination": {
"object_count": 1,
"page_number": 1,
"page_size": 50,
"page_count": 1,
"has_more_items": false
},
"events": [
{
"name": {
"text": "3rd Annual - INCH360 Regional Cybersecurity Conference",
"html": "3rd Annual - INCH360 Regional Cybersecurity Conference"
},
"description": {
"text": "Full Day of Panels, Speakers and Vendors on Cybersecurity, AI and Compliance. FREE with pre-registration - space limited - Oct 2, 2024",
"html": "Full Day of Panels, Speakers and Vendors on Cybersecurity, AI and Compliance. FREE with pre-registration - space limited - Oct 2, 2024"
},
"url": "https://www.eventbrite.com/e/3rd-annual-inch360-regional-cybersecurity-conference-tickets-909447069667",
"start": {
"timezone": "America/Los_Angeles",
"local": "2024-10-02T08:30:00",
"utc": "2024-10-02T15:30:00Z"
},
"end": {
"timezone": "America/Los_Angeles",
"local": "2024-10-02T16:00:00",
"utc": "2024-10-02T23:00:00Z"
},
"organization_id": "1773924472233",
"created": "2024-05-19T22:29:10Z",
"changed": "2024-09-06T16:49:53Z",
"published": "2024-05-20T20:03:26Z",
"capacity": null,
"capacity_is_custom": null,
"status": "live",
"currency": "USD",
"listed": true,
"shareable": true,
"online_event": false,
"tx_time_limit": 1200,
"hide_start_date": false,
"hide_end_date": false,
"locale": "en_US",
"is_locked": false,
"privacy_setting": "unlocked",
"is_series": false,
"is_series_parent": false,
"inventory_type": "limited",
"is_reserved_seating": false,
"show_pick_a_seat": false,
"show_seatmap_thumbnail": false,
"show_colors_in_seatmap_thumbnail": false,
"source": "auto_create",
"is_free": true,
"version": null,
"summary": "Full Day of Panels, Speakers and Vendors on Cybersecurity, AI and Compliance. FREE with pre-registration - space limited - Oct 2, 2024",
"facebook_event_id": null,
"logo_id": "843746309",
"organizer_id": "72020528223",
"venue_id": "214450569",
"category_id": "102",
"subcategory_id": "2004",
"format_id": "2",
"id": "909447069667",
"resource_uri": "https://www.eventbriteapi.com/v3/events/909447069667/",
"is_externally_ticketed": false,
"logo": {
"crop_mask": null,
"original": {
"url": "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F843746309%2F530357704049%2F1%2Foriginal.20240906-164727?auto=format%2Ccompress&q=75&sharp=10&s=09370c02bd3ab62907337f2e1ca8a61d",
"width": 6912,
"height": 3456
},
"id": "843746309",
"url": "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F843746309%2F530357704049%2F1%2Foriginal.20240906-164727?h=200&w=450&auto=format%2Ccompress&q=75&sharp=10&s=4b5f5340dcfe0cc78bec9f19b08f45f7",
"aspect_ratio": "2",
"edge_color": "#516c79",
"edge_color_set": true
}
}
]
}
Loading

0 comments on commit da0f6a4

Please sign in to comment.