Skip to content

Commit

Permalink
finished .py files
Browse files Browse the repository at this point in the history
  • Loading branch information
lauragreemko committed Oct 8, 2022
1 parent 2e32d6a commit b70b44a
Show file tree
Hide file tree
Showing 15 changed files with 346 additions and 62 deletions.
1 change: 0 additions & 1 deletion data/data1.json

This file was deleted.

1 change: 0 additions & 1 deletion data/data2.json

This file was deleted.

Binary file added data/twitter.db
Binary file not shown.
242 changes: 242 additions & 0 deletions notebooks/app.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,248 @@
"source": [
"# print(json.dumps(json_response, indent=4, sort_keys=True))"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sqlite3"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"connection = sqlite3.connect(\"../data/twitter.db\")\n",
"crsr = connection.cursor()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def sql_query(query):\n",
"\n",
" # Ejecuta la query\n",
" crsr.execute(query)\n",
"\n",
" # Almacena los datos de la query \n",
" ans = crsr.fetchall()\n",
"\n",
" # Obtenemos los nombres de las columnas de la tabla\n",
" names = [description[0] for description in crsr.description]\n",
"\n",
" return pd.DataFrame(ans,columns=names)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"query_match = '''SELECT * FROM users'''"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>username</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1578095844569514011</td>\n",
" <td>Rocket</td>\n",
" <td>75_Rocket_</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1003872445</td>\n",
" <td>Pau Mugarra</td>\n",
" <td>PauMugarra</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2529499620</td>\n",
" <td>Formación Fundación Universidad Carlos III</td>\n",
" <td>formacion_fuc3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>706520411551494145</td>\n",
" <td>JoseSola</td>\n",
" <td>Jose_Sola_</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>604485175</td>\n",
" <td>Berta Otero</td>\n",
" <td>bertinha84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1391838162830385165</td>\n",
" <td>Apiux Tecnología</td>\n",
" <td>Apiuxtecnologia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1494355208473874438</td>\n",
" <td>PyData Madrid</td>\n",
" <td>PyDataMadrid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>12085582</td>\n",
" <td>diegodl</td>\n",
" <td>diegodl</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2907682397</td>\n",
" <td>graphext</td>\n",
" <td>graphext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1131933723681800193</td>\n",
" <td>Biohub VLC</td>\n",
" <td>BiohubVLC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1073622740</td>\n",
" <td>COITCV/AVIT</td>\n",
" <td>COITCV</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>41110484</td>\n",
" <td>Raúl Cotrina</td>\n",
" <td>raulcotrina</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>210026854</td>\n",
" <td>Ana Amar🎃</td>\n",
" <td>AnaWhitewolf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2390191837</td>\n",
" <td>LaSeñoenlaRadioTdF</td>\n",
" <td>LaSenioenRadio</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>394734366</td>\n",
" <td>Sandra Huerga</td>\n",
" <td>sanhuerga</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1327588137346998273</td>\n",
" <td>DIGITAL INNOVATION NEWS</td>\n",
" <td>innovation_news</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>581925228</td>\n",
" <td>Fujitsu España</td>\n",
" <td>Fujitsu_ES</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name \\\n",
"0 1578095844569514011 Rocket \n",
"1 1003872445 Pau Mugarra \n",
"2 2529499620 Formación Fundación Universidad Carlos III \n",
"3 706520411551494145 JoseSola \n",
"4 604485175 Berta Otero \n",
"5 1391838162830385165 Apiux Tecnología \n",
"6 1494355208473874438 PyData Madrid \n",
"7 12085582 diegodl \n",
"8 2907682397 graphext \n",
"9 1131933723681800193 Biohub VLC \n",
"10 1073622740 COITCV/AVIT \n",
"11 41110484 Raúl Cotrina \n",
"12 210026854 Ana Amar🎃 \n",
"13 2390191837 LaSeñoenlaRadioTdF \n",
"14 394734366 Sandra Huerga \n",
"15 1327588137346998273 DIGITAL INNOVATION NEWS \n",
"16 581925228 Fujitsu España \n",
"\n",
" username \n",
"0 75_Rocket_ \n",
"1 PauMugarra \n",
"2 formacion_fuc3 \n",
"3 Jose_Sola_ \n",
"4 bertinha84 \n",
"5 Apiuxtecnologia \n",
"6 PyDataMadrid \n",
"7 diegodl \n",
"8 graphext \n",
"9 BiohubVLC \n",
"10 COITCV \n",
"11 raulcotrina \n",
"12 AnaWhitewolf \n",
"13 LaSenioenRadio \n",
"14 sanhuerga \n",
"15 innovation_news \n",
"16 Fujitsu_ES "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sql_query(query_match)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
Empty file added notebooks/extract_data.ipynb
Empty file.
Binary file added twitter.db
Binary file not shown.
Binary file removed utils/__pycache__/classes.cpython-37.pyc
Binary file not shown.
Binary file removed utils/__pycache__/functions.cpython-37.pyc
Binary file not shown.
Binary file removed utils/__pycache__/variables.cpython-37.pyc
Binary file not shown.
10 changes: 6 additions & 4 deletions utils/app.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from functions import *
from variables import *
from classes import *
from classes.extract_data import *
from classes.create_df import *
from functions import *

obj = Create_df()

# get_all_tweets(files, start_date, max_results, tweet_fields, expansions)
df_list = [obj.tweet_df, obj.author_df]

Extract_data()
create_database(df_list)
69 changes: 69 additions & 0 deletions utils/classes/create_df.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from classes.extract_data import *
import pandas as pd

class Create_df:
    """Build tweet and author DataFrames from the two Extract_data API responses.

    Exposes after construction:
        tweet_df  -- tweets from both responses concatenated, with the nested
                     'public_metrics' / 'edit_history_tweet_ids' columns
                     replaced by flat retweet/reply/like/quote count columns.
        author_df -- author records from both responses concatenated.
        data_1, data_2 -- the raw JSON responses (kept for compatibility).
    """

    def __init__(self):
        # NOTE(review): previously Extract_data() was called in the class
        # body, so the API request fired at import/class-definition time,
        # and all the lists below were class attributes shared by every
        # instance — the metric lists kept growing with each new
        # Create_df(), corrupting add_lists_to_df(). Everything is now
        # created per instance.
        obj = Extract_data()
        self.data_1 = obj.response_json_1
        self.data_2 = obj.response_json_2
        self.data_list = [self.data_1, self.data_2]

        # Per-instance accumulators for the per-tweet engagement counts.
        self.retweet_count_list = []
        self.reply_count = []
        self.like_count = []
        self.quote_count = []

        # Per-response partial frames, concatenated in main_*_df().
        self.df_tweet_list = [pd.DataFrame(), pd.DataFrame()]
        self.df_author_list = [pd.DataFrame(), pd.DataFrame()]

        # Same pipeline order as before: build frames, drop nested columns,
        # then re-attach the metrics in flat form.
        self.main_tweet_df()
        self.main_author_df()
        self.drop_columns()
        self.get_detail_list()
        self.add_lists_to_df()

    def main_tweet_df(self):
        """Build self.tweet_df from the 'data' section of each response."""
        for index, data in enumerate(self.data_list):
            self.df_tweet_list[index] = pd.DataFrame(data['data'])
        self.tweet_df = pd.concat(self.df_tweet_list)

    def main_author_df(self):
        """Build self.author_df from the expanded users of each response."""
        for index, data in enumerate(self.data_list):
            self.df_author_list[index] = pd.DataFrame(data['includes']['users'])
        self.author_df = pd.concat(self.df_author_list)

    def drop_columns(self):
        """Drop nested columns; the metrics are re-added flat later."""
        self.tweet_df.drop(columns='public_metrics', inplace=True)
        self.tweet_df.drop(columns='edit_history_tweet_ids', inplace=True)

    def get_detail_list(self):
        """Collect the public_metrics counters for every tweet, in order.

        Iterates the raw responses (not the DataFrame), so it still works
        after drop_columns() removed the nested column from tweet_df.
        """
        for data in self.data_list:
            for tweet in data['data']:
                metrics = tweet['public_metrics']
                self.retweet_count_list.append(metrics['retweet_count'])
                self.reply_count.append(metrics['reply_count'])
                self.like_count.append(metrics['like_count'])
                self.quote_count.append(metrics['quote_count'])

    def add_lists_to_df(self):
        """Attach the flattened engagement counts as columns of tweet_df."""
        self.tweet_df['retweet_count'] = self.retweet_count_list
        self.tweet_df['reply_count'] = self.reply_count
        self.tweet_df['like_count'] = self.like_count
        self.tweet_df['quote_count'] = self.quote_count
19 changes: 10 additions & 9 deletions utils/classes.py → utils/classes/extract_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class Extract_data:
expansions = expansions
end_time = None
user = user
next_token = None
files = files

def __init__(self):
self.auth()
Expand All @@ -31,21 +31,22 @@ def create_url(self):
'start_time': self.start_date,
'end_time' : self.end_time,
'max_results': self.max_results,
'expansions': self.expansions,
'next_token': {}}
'expansions': self.expansions}

def connect_to_endpoint(self):
self.params['next_token'] = self.next_token
response = requests.request("GET", self.url, headers = self.headers, params = self.params)
self.response_json = response.json()
print("Endpoint Response Code: " + str(response.status_code))
if response.status_code != 200:
raise Exception(response.status_code, response.text)

def get_all_tweets(self):
self.create_url()
self.connect_to_endpoint()
self.params["next_token"] = self.response_json["meta"]["next_token"]
print(self.params["next_token"])

for i in self.files:
self.create_url()
self.connect_to_endpoint()
self.end_time = self.response_json['data'][-1]['created_at']
if i == 1:
self.response_json_1 = self.response_json
else:
self.response_json_2 = self.response_json

Loading

0 comments on commit b70b44a

Please sign in to comment.