Skip to content

Commit

Permalink
added the app for letterboxd scraper and dashboard
Browse files Browse the repository at this point in the history
  • Loading branch information
juanjuanjuanfer committed Oct 3, 2024
1 parent 02ceb9d commit 6a41554
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 5 deletions.
32 changes: 32 additions & 0 deletions lbxd/Home.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import streamlit as st
# Landing page of the Letterboxd Film Tracker app: configures the page,
# applies a dark background to the main Streamlit containers and shows the title.
st.set_page_config(page_title="Letterboxd Film Tracker", page_icon=":chart_with_upwards_trend:", layout="wide")

# CSS for the page with background color #1A232C
# Palette: #1A232C #FF8100 #FFFFFF #3EBDF4 #00E153
# Every rule is explicitly closed and no line-continuation backslashes are used,
# so the string Streamlit receives is exactly the CSS written here.
page_css = """
<style>
[data-testid="stHeader"]{
background-color: #1A232C;
color: #FFFFFF;
}
[data-testid="stMainBlockContainer"]{
background-color: #1A232C;
color: #FFFFFF;
}
[data-testid="stMain"]{
background-color: #1A232C;
}
</style>
"""

# unsafe_allow_html is required for the <style> tag to be passed to the browser.
st.markdown(page_css, unsafe_allow_html=True)

# Set padding for the page
# NOTE(review): padding_top is not referenced anywhere in this script — confirm
# whether it should be wired into the CSS above or removed.
padding_top = 2

# Set page title
st.title('Letterboxd Film Tracker')
120 changes: 120 additions & 0 deletions lbxd/pages/Dashboard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import connection_mongo
import film as scraper

# Dashboard page: lets the user pick a movie, shows its poster, the most recent
# stored reviews and a per-date ratings distribution read from MongoDB.

# Fallback used when the "number of reviews" field does not hold a positive integer.
DEFAULT_REVIEW_COUNT = 10
REVIEW_COLUMNS = ["username", "rating", "review_text", "date"]

# Establish MongoDB connection
client = connection_mongo.connect_to_mongo("-", "-")
db = client.get_database("Letterboxd")

# Set page title
st.set_page_config(page_title="Letterboxd Film Tracker", page_icon=":chart_with_upwards_trend:")

# Main title
st.title('Letterboxd Film Tracker')

# Main content
st.header('Movie Stats')

# Text input (default empty) and store the movie in session state
if 'movie' not in st.session_state:
    st.session_state['movie'] = ''

user_input = st.text_input('Enter the movie you want:', st.session_state['movie'])

# Button to change the selected movie and store it in session state.
# The collection is NOT opened here: an empty name is not a valid collection
# name, and the guarded section below opens it once a movie is actually set.
if st.button('Click to change'):
    st.session_state['movie'] = user_input.strip()
    st.write(f'Selected film: {st.session_state["movie"]}')

# Only proceed if a movie has been selected
if st.session_state['movie']:
    collection = db[st.session_state['movie']]
    film = scraper.Film()
    film.set_film_name(st.session_state['movie'])
    film_poster = film.scrape_film_poster(film.filmMainSoup, film.filmName)

    # Render the poster centered; skip the tag entirely when no URL was found
    # so the page does not show a broken image.
    if film_poster:
        st.markdown(
            f'<div style="text-align:center;"><img src="{film_poster}" alt="Movie Image" width="300"></div>',
            unsafe_allow_html=True
        )

    # Text input for number of reviews. The value is free text, so it is
    # validated here instead of letting int() raise and abort the page.
    n_text = st.text_input("Number of recent reviews to show", str(DEFAULT_REVIEW_COUNT))
    try:
        n = int(n_text)
    except ValueError:
        n = 0
    if n < 1:
        # A limit of 0 would mean "no limit" to MongoDB, so it is rejected too.
        st.warning(
            f"Please enter a positive whole number; showing the last {DEFAULT_REVIEW_COUNT} reviews instead."
        )
        n = DEFAULT_REVIEW_COUNT

    # Checkbox to display reviews
    if st.checkbox(f'Show last {n} reviews'):
        # '$natural' descending returns the most recently stored documents first.
        recent_reviews = (
            collection.find({'rating': {'$exists': True}})
            .sort([('$natural', -1)])
            .limit(n)
        )
        # .get() keeps the table rendering even if a stored review lacks a field.
        rows = [[doc.get(column) for column in REVIEW_COLUMNS] for doc in recent_reviews]
        st.dataframe(pd.DataFrame(rows, columns=REVIEW_COLUMNS))

    # Get dates from the database (only for rated reviews, ignoring blank dates)
    dates = collection.distinct('date', {'rating': {'$exists': True}})
    dates = [d for d in dates if d != ""]

    # Manage the selected date using session state
    if 'selected_date' not in st.session_state:
        st.session_state['selected_date'] = None

    if dates:
        st.session_state['selected_date'] = st.selectbox('Select a date', options=dates, index=0)

        # Count how many reviews gave each rating on the selected date
        rating_counts = collection.aggregate([
            # Filter documents based on the given date and check if rating exists
            {'$match': {'date': st.session_state['selected_date'], 'rating': {'$exists': True}}},

            # Group by rating and count the occurrences of each rating
            {'$group': {'_id': '$rating', 'count': {'$sum': 1}}}
        ])

        # Convert the result to a dictionary {rating: count}, ordered by rating
        ratings_dict = dict(sorted((item['_id'], item['count']) for item in rating_counts))

        if ratings_dict:  # Proceed if there are ratings to show
            ratings = list(ratings_dict.keys())
            counts = list(ratings_dict.values())

            # Create the horizontal bar plot
            st.subheader(f'Plot for {collection.name} on {st.session_state["selected_date"]}')
            st.write("The movie may have few scraped reviews, so the plot may not be accurate.")
            fig, ax = plt.subplots()
            ax.barh(ratings, counts)

            # Add labels and title
            ax.set_xlabel('Count')
            ax.set_ylabel('Rating')
            ax.set_title('Ratings Distribution')

            # Display the plot in Streamlit, then release the figure: pyplot keeps
            # every figure it creates alive until it is explicitly closed.
            st.pyplot(fig)
            plt.close(fig)
        else:
            st.write("No ratings data available for the selected date.")
    else:
        st.write("No dates available for the selected movie.")
else:
    st.write("Please enter a movie and click 'Click to change' to see the data.")

# Footer
st.markdown("---")

# Transform footer into two columns
col1, col2 = st.columns(2)
col1.write("Canto Arcona Alexis")
col1.write("Castro Echeverria Samantha")

col2.write("Cumi Llanez Christopher")
col2.write("Fernandez Cruz Juan")
col2.write("Ramayo Cardoso Juliana")
96 changes: 96 additions & 0 deletions lbxd/pages/Scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import streamlit as st
import connection_mongo
import film as scraper

# Scraper page: scrapes a film's basic data and poster, then scrapes a
# user-chosen number of recent reviews and stores the new ones in MongoDB.

# Set page title
st.title('Letterboxd Scraper')

# Establish MongoDB connection
client = connection_mongo.connect_to_mongo("-", "-")
db = client.get_database("Letterboxd")

# get_film_reviews() is called with a page count; this script converts the
# requested number of reviews to pages assuming 12 reviews per page.
REVIEWS_PER_PAGE = 12
NO_SCRAPE_STATUS = 'No movie scraped yet.'

# Initialize session state for movie, scraping status, and scrape amount if not already done
for state_key, initial_value in (
    ('movie', ''),
    ('scrape_status', NO_SCRAPE_STATUS),
    ('scrape_amount', ''),
    ('film_data', {}),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value

# User input for movie name
user_input = st.text_input('Enter the movie you want to scrape:', st.session_state['movie'])
st.session_state['movie'] = user_input  # Save movie to session state

# Button to scrape the movie
if st.button('Click to scrape'):
    if user_input:
        film = scraper.Film()
        film.set_film_name(user_input)

        # Scrape movie poster
        film_poster = film.scrape_film_poster(film.filmMainSoup, film.filmName)

        # Save film data to session state. 'slug' remembers the exact name the
        # user typed for this scrape, so later steps keep targeting the same
        # MongoDB collection even if the text box is edited afterwards.
        st.session_state['film_data'] = {
            'name': film.filmName,
            'year': film.filmReleaseYear,
            'directors': film.filmDirectors["Directors"],
            'rating': film.filmAverageRating,
            'poster': film_poster,
            'slug': user_input,
        }

        # Update scraping status
        st.session_state['scrape_status'] = f'Successfully scraped {film.filmName}'
    else:
        st.warning("Please enter a movie name first.")

# Display movie information if it exists
film_data = st.session_state['film_data']
if film_data:
    st.write(f"{film_data['name']} | {film_data['year']}")
    st.write(f'Directed by: {film_data["directors"]}')
    st.write(f'Rating: {film_data["rating"]:.1f} / 10')
    st.write(f'https://letterboxd.com/film/{film_data["name"]}/')

    # Display the scraped image
    image_url = film_data['poster']
    st.markdown(f'<br><div style="text-align:center;"><img src="{image_url}" alt="Movie Image" width="300"></div><br>', unsafe_allow_html=True)

# Text input for scrape amount
st.write("If the movie is not popular recently, it is recommended to scrape more reviews at once, as if the scraper is run again, it will probably scrape the same reviews.")
scrape_amount_input = st.text_input("Number of recent reviews to scrape:", st.session_state['scrape_amount'])
st.session_state['scrape_amount'] = scrape_amount_input  # Save scrape amount to session state

# Button to scrape reviews
if st.button('Scrape Reviews'):
    if not film_data:
        st.warning("Please scrape a movie first before scraping reviews.")
    else:
        # Only the int() conversion is guarded, so a ValueError raised by the
        # scraper or the database is not misreported as a bad number.
        try:
            requested_reviews = int(scrape_amount_input)
        except ValueError:
            requested_reviews = 0

        if requested_reviews < 1:
            st.error("Please enter a valid number for scraping reviews.")
        else:
            # Round up to whole pages (ceiling division).
            page_count = -(-requested_reviews // REVIEWS_PER_PAGE)
            st.write(f"Scraping {page_count * REVIEWS_PER_PAGE} recent reviews.")

            # Scrape reviews
            reviews = scraper.Film.FilmReview(film_data['name'])
            reviews.get_film_reviews(page_count)
            data = reviews.filmReviews

            # Store under the name used when the film was scraped; fall back to
            # the text box for session data saved before 'slug' existed.
            collection = db[film_data.get('slug') or st.session_state['movie']]

            # Insert only reviews whose review_id is not stored yet
            for review in data:
                if not collection.find_one({"review_id": review['review_id']}):
                    collection.insert_one(review)
                    print(f"Review with ID {review['review_id']} inserted into the database.")
                else:
                    print(f"Review with ID {review['review_id']} already exists in the database.")

            # Update scraping status
            st.session_state['scrape_status'] = f'Successfully scraped {len(data)} reviews for {film_data["name"]}.'

# If scraping was done, display the last scraped movie
if st.session_state['scrape_status'] != NO_SCRAPE_STATUS:
    last_scraped = film_data.get('slug') or st.session_state['movie']
    st.write(f"Last scraped movie: {last_scraped}")
14 changes: 9 additions & 5 deletions lbxd/scrape_reviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@
import connection_mongo as mongo
import time

client= mongo.connect_to_mongo("", "")
client= mongo.connect_to_mongo("-", "-")
db = client.get_database("Letterboxd")
collection = db["reviews_the-substance"]
moviename = "beetlejuice-beetlejuice"
collection = db[moviename]

# number of pages to scrape
n = 1

while True:
print("Scraping reviews...")
movie = scraper.Film()
movie.set_film_name("the-substance")
movie.set_film_name(moviename)

reviews = scraper.Film.FilmReview(movie.filmName)
reviews.get_film_reviews(1)
reviews.get_film_reviews(n)
data = reviews.filmReviews


Expand All @@ -24,5 +28,5 @@
else:
print(f"Review with ID {review['review_id']} already exists in the database.")

print("Waiting for 30 seconds before the next iteration...")
print(f"Waiting for {n} seconds before the next iteration...")
time.sleep(1)

0 comments on commit 6a41554

Please sign in to comment.