added the app for letterboxd scraper and dashboard

juanjuanjuanfer · Oct 3, 2024 · 6a41554 · 6a41554
1 parent 02ceb9d
commit 6a41554
Show file tree

Hide file tree

Showing 4 changed files with 257 additions and 5 deletions.
diff --git a/lbxd/Home.py b/lbxd/Home.py
@@ -0,0 +1,32 @@
+import streamlit as st
+st.set_page_config(page_title="Letterboxd Film Tracker", page_icon=":chart_with_upwards_trend:", layout="wide")
+
+# css for the page with background color #1A232C
+# pallete :#1A232C #FF8100 #FFFFFF #3EBDF4 #00E153
+page_css = """
+
+<style>
+[data-testid="stHeader"]{
+    background-color: #1A232C;
+    color: #FFFFFF;
+}
+
+[data-testid="stMainBlockContainer"]{
+
+    background-color: #1A232C;\
+    color: #FFFFFF;
+    }
+
+[data-testid="stMain"]{
+
+    background-color: #1A232C;\
+</style>
+"""
+
+st.markdown(page_css, unsafe_allow_html=True)
+
+# Intended top padding for the page.
+# NOTE(review): this value is not referenced anywhere in the visible part of this file — confirm it is used or drop it.
+padding_top = 2
+# Render the main heading of the landing page.
+
+st.title('Letterboxd Film Tracker')
diff --git a/lbxd/pages/Dashboard.py b/lbxd/pages/Dashboard.py
@@ -0,0 +1,120 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import connection_mongo
+import film as scraper
+
+# Establish the MongoDB connection through the project helper and select the "Letterboxd" database.
+# NOTE(review): the two "-" arguments look like redacted placeholders (presumably credentials) — confirm
+# and load the real values from configuration rather than from source.
+client = connection_mongo.connect_to_mongo("-", "-")
+db = client.get_database("Letterboxd")
+
+# Set the browser tab title and icon.
+# NOTE(review): Streamlit expects set_page_config to run before any other Streamlit command on the page,
+# so keep this call ahead of st.title / st.header below.
+st.set_page_config(page_title="Letterboxd Film Tracker", page_icon=":chart_with_upwards_trend:")
+
+# Main title
+st.title('Letterboxd Film Tracker')
+
+# Section heading for the per-movie statistics rendered further down
+st.header('Movie Stats')
+
+# Text input (default empty) and store the movie in session state
+if 'movie' not in st.session_state:
+    st.session_state['movie'] = ''
+
+user_input = st.text_input('Enter the movie you want:', st.session_state['movie'])
+
+# Button to change the selected movie and store it in session state
+if st.button('Click to change'):
+    st.session_state['movie'] = user_input
+    st.write(f'Selected film: {st.session_state["movie"]}')
+    collection = db[st.session_state['movie']]
+
+# Only proceed if a movie has been selected
+if st.session_state['movie']:
+    collection = db[st.session_state['movie']]
+    film = scraper.Film()
+    film.set_film_name(st.session_state['movie'])
+    film_poster = film.scrape_film_poster(film.filmMainSoup, film.filmName)
+
+
+    image_url = film_poster  # Replace with your image URL
+    st.markdown(
+        f'<div style="text-align:center;"><img src="{image_url}" alt="Movie Image" width="300"></div>',
+        unsafe_allow_html=True
+    )
+    # Text input for number of reviews
+    n = st.text_input("Number of recent reviews to show", "10")
+    n = int(n)
+
+    # Checkbox to display reviews
+    if st.checkbox(f'Show last {n} reviews'):
+        data = list(
+            collection.find({'rating': {'$exists': True}})
+            .sort([('$natural', -1)])
+            .limit(n)
+        )
+        result  = []
+        for x in data:
+            result.append([x["username"], x["rating"], x["review_text"], x["date"]])
+        st.dataframe(result)
+
+    # Get dates from the database
+    dates = collection.distinct('date', {'rating': {'$exists': True}})
+    dates = [x for x in dates if x != ""]
+
+    # Manage the selected date using session state
+    if 'selected_date' not in st.session_state:
+        st.session_state['selected_date'] = None
+
+    if dates:
+        st.session_state['selected_date'] = st.selectbox('Select a date', options=dates, index=0)
+
+        # Plot ratings distribution for the selected date
+        ratings = list(collection.aggregate([
+            # Filter documents based on the given date and check if rating exists
+            {'$match': {'date': st.session_state['selected_date'], 'rating': {'$exists': True}}},
+
+            # Group by rating and count the occurrences of each rating
+            {'$group': {'_id': '$rating', 'count': {'$sum': 1}}}
+        ]))
+
+        # Convert the result to a dictionary where key is rating and value is the count
+        ratings_dict = {item['_id']: item['count'] for item in ratings}
+        ratings_dict = dict(sorted(ratings_dict.items()))
+
+        if ratings_dict:  # Proceed if there are ratings to show
+            ratings = list(ratings_dict.keys())
+            counts = list(ratings_dict.values())
+
+            # Create the horizontal bar plot
+            st.subheader(f'Plot for {collection.name} on {st.session_state["selected_date"]}')
+            st.write("The movie may have few scraped reviews, so the plot may not be accurate.")
+            fig, ax = plt.subplots()  # Create a figure and axis
+            ax.barh(ratings, counts)  # Create horizontal bar chart
+
+            # Add labels and title
+            ax.set_xlabel('Count')
+            ax.set_ylabel('Rating')
+            ax.set_title('Ratings Distribution')
+
+            # Display the plot in Streamlit
+            st.pyplot(fig)
+        else:
+            st.write("No ratings data available for the selected date.")
+    else:
+        st.write("No dates available for the selected movie.")
+else:
+    st.write("Please enter a movie and click 'Click to change' to see the data.")
+
+# Footer
+st.markdown("---")
+
+# Transform footer into two columns
+col1, col2 = st.columns(2)
+col1.write("Canto Arcona Alexis")
+col1.write("Castro Echeverria Samantha")
+
+col2.write("Cumi Llanez Christopher")
+col2.write("Fernandez Cruz Juan")
+col2.write("Ramayo Cardoso Juliana")
diff --git a/lbxd/pages/Scraper.py b/lbxd/pages/Scraper.py
@@ -0,0 +1,96 @@
+import streamlit as st
+import connection_mongo
+import film as scraper
+
+# Set page title
+st.title('Letterboxd Scraper')
+
+# Establish MongoDB connection
+client = connection_mongo.connect_to_mongo("-", "-")
+db = client.get_database("Letterboxd")
+
+# Initialize session state for movie, scraping status, and scrape amount if not already done
+if 'movie' not in st.session_state:
+    st.session_state['movie'] = ''
+if 'scrape_status' not in st.session_state:
+    st.session_state['scrape_status'] = 'No movie scraped yet.'
+if 'scrape_amount' not in st.session_state:
+    st.session_state['scrape_amount'] = ''
+if 'film_data' not in st.session_state:
+    st.session_state['film_data'] = {}
+
+# User input for movie name
+user_input = st.text_input('Enter the movie you want to scrape:', st.session_state['movie'])
+st.session_state['movie'] = user_input  # Save movie to session state
+
+
+
+# Button to scrape the movie
+if st.button('Click to scrape'):
+    if user_input:
+        film = scraper.Film()
+        film.set_film_name(st.session_state['movie'])
+
+        # Scrape movie poster
+        film_poster = film.scrape_film_poster(film.filmMainSoup, film.filmName)
+
+        # Save film data to session state
+        st.session_state['film_data'] = {
+            'name': film.filmName,
+            'year': film.filmReleaseYear,
+            'directors': film.filmDirectors["Directors"],
+            'rating': film.filmAverageRating,
+            'poster': film_poster
+        }
+
+        # Update scraping status
+        st.session_state['scrape_status'] = f'Successfully scraped {film.filmName}'
+
+# Display movie information if it exists
+if st.session_state['film_data']:
+    st.write(f"{st.session_state['film_data']['name']} | {st.session_state['film_data']['year']}")
+    st.write(f'Directed by: {st.session_state["film_data"]["directors"]}')
+    st.write(f'Rating: {st.session_state["film_data"]["rating"]:.1f} / 10')
+    st.write(f'https://letterboxd.com/film/{st.session_state["film_data"]["name"]}/')
+
+    # Display the scraped image
+    image_url = st.session_state['film_data']['poster']
+    st.markdown(f'<br><div style="text-align:center;"><img src="{image_url}" alt="Movie Image" width="300"></div><br>', unsafe_allow_html=True)
+
+# Text input for scrape amount
+st.write("If the movie is not popular recently, it is recommended to scrape more reviews at once, as if the scraper is run again, it will probably scrape the same reviews.")
+scrape_amount_input = st.text_input("Number of recent reviews to scrape:", st.session_state['scrape_amount'])
+st.session_state['scrape_amount'] = scrape_amount_input  # Save scrape amount to session state
+
+# Button to scrape reviews
+if st.button('Scrape Reviews'):
+    if st.session_state['scrape_amount'] and st.session_state['film_data']:
+        try:
+            scrape_amount = int(st.session_state['scrape_amount']) // 12 + (int(st.session_state['scrape_amount']) % 12 > 0)
+            st.write(f"Scraping {scrape_amount * 12} recent reviews.")
+
+            # Scrape reviews
+            reviews = scraper.Film.FilmReview(st.session_state['film_data']['name'])
+            reviews.get_film_reviews(scrape_amount)
+            data = reviews.filmReviews
+            collection = db[st.session_state['movie']]
+
+            for review in data:
+                if not collection.find_one({"review_id": review['review_id']}):
+                    collection.insert_one(review)
+                    print(f"Review with ID {review['review_id']} inserted into the database.")
+                else:
+                    print(f"Review with ID {review['review_id']} already exists in the database.")
+
+            # Update scraping status
+            st.session_state['scrape_status'] = f'Successfully scraped {len(data)} reviews for {st.session_state["film_data"]["name"]}.'
+
+
+        except ValueError:
+            st.error("Please enter a valid number for scraping reviews.")
+    else:
+        st.warning("Please scrape a movie first before scraping reviews.")
+
+# If scraping was done, display the last scraped movie
+if 'scrape_status' in st.session_state and st.session_state['scrape_status'] != 'No movie scraped yet.':
+    st.write(f"Last scraped movie: {st.session_state['movie']}")
diff --git a/lbxd/scrape_reviews.py b/lbxd/scrape_reviews.py
@@ -2,17 +2,21 @@
 import connection_mongo as mongo
 import time
 
-client= mongo.connect_to_mongo("", "")
+client= mongo.connect_to_mongo("-", "-")
 db = client.get_database("Letterboxd")
-collection = db["reviews_the-substance"]
+moviename = "beetlejuice-beetlejuice"
+collection = db[moviename]
+
+# number of pages to scrape
+n = 1
 
 while True:
     print("Scraping reviews...")
     movie = scraper.Film()
-    movie.set_film_name("the-substance")
+    movie.set_film_name(moviename)
 
     reviews = scraper.Film.FilmReview(movie.filmName)
-    reviews.get_film_reviews(1)
+    reviews.get_film_reviews(n)
     data = reviews.filmReviews
 
 
@@ -24,5 +28,5 @@
         else:
             print(f"Review with ID {review['review_id']} already exists in the database.")
 
-    print("Waiting for 30 seconds before the next iteration...")
+    print(f"Waiting for {n} seconds before the next iteration...")
     time.sleep(1)