diff --git a/data/twitter.db b/data/twitter.db deleted file mode 100644 index 562e4c2..0000000 Binary files a/data/twitter.db and /dev/null differ diff --git a/notebooks/extract_data.ipynb b/notebooks/extract_data.ipynb index 40ae36c..96b43bd 100644 --- a/notebooks/extract_data.ipynb +++ b/notebooks/extract_data.ipynb @@ -2,13 +2,15 @@ "cells": [ { "cell_type": "code", - "execution_count": 26, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "import os\n", "import requests\n", - "import json" + "import json\n", + "import pandas as pd\n", + "import sqlite3" ] }, { @@ -1983,10 +1985,524 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_list = [response_json_1, response_json_2]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "df_tweet_1 = pd.DataFrame()\n", + "df_tweet_2 = pd.DataFrame()\n", + "df_tweet_list = [df_tweet_1, df_tweet_2]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "for index, data in enumerate(data_list):\n", + " data = data['data']\n", + " df_tweet_list[index] = pd.DataFrame(data)\n", + " tweet_df = pd.concat(df_tweet_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 154 entries, 0 to 53\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 text 154 non-null object\n", + " 1 edit_history_tweet_ids 154 non-null object\n", + " 2 created_at 154 non-null object\n", + " 3 author_id 154 non-null object\n", + " 4 public_metrics 154 non-null object\n", + " 5 id 154 non-null object\n", + "dtypes: object(6)\n", + "memory usage: 8.4+ KB\n" + ] + } + ], + "source": [ + "tweet_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textedit_history_tweet_idscreated_atauthor_idpublic_metricsid
0Gente muy agradable en @TheBridge_Tech , te ri...[1578613094191796224]2022-10-08T05:07:45.000Z1578095844569514011{'retweet_count': 0, 'reply_count': 0, 'like_c...1578613094191796224
1Recordaros que la semana que viene tenemos la ...[1577334577701453827]2022-10-04T16:27:23.000Z1003872445{'retweet_count': 2, 'reply_count': 0, 'like_c...1577334577701453827
2El desarrollador web es uno de los perfiles má...[1576902991507922944]2022-10-03T11:52:25.000Z2529499620{'retweet_count': 3, 'reply_count': 0, 'like_c...1576902991507922944
3@jorgegrev @TheBridge_Tech Enhorabuena crack![1576856703349374976]2022-10-03T08:48:29.000Z706520411551494145{'retweet_count': 0, 'reply_count': 0, 'like_c...1576856703349374976
4Hoy por fin ve la luz mi primer Case Study púb...[1575217357105946624]2022-09-28T20:14:18.000Z604485175{'retweet_count': 3, 'reply_count': 0, 'like_c...1575217357105946624
\n", + "
" + ], + "text/plain": [ + " text edit_history_tweet_ids \\\n", + "0 Gente muy agradable en @TheBridge_Tech , te ri... [1578613094191796224] \n", + "1 Recordaros que la semana que viene tenemos la ... [1577334577701453827] \n", + "2 El desarrollador web es uno de los perfiles má... [1576902991507922944] \n", + "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! [1576856703349374976] \n", + "4 Hoy por fin ve la luz mi primer Case Study púb... [1575217357105946624] \n", + "\n", + " created_at author_id \\\n", + "0 2022-10-08T05:07:45.000Z 1578095844569514011 \n", + "1 2022-10-04T16:27:23.000Z 1003872445 \n", + "2 2022-10-03T11:52:25.000Z 2529499620 \n", + "3 2022-10-03T08:48:29.000Z 706520411551494145 \n", + "4 2022-09-28T20:14:18.000Z 604485175 \n", + "\n", + " public_metrics id \n", + "0 {'retweet_count': 0, 'reply_count': 0, 'like_c... 1578613094191796224 \n", + "1 {'retweet_count': 2, 'reply_count': 0, 'like_c... 1577334577701453827 \n", + "2 {'retweet_count': 3, 'reply_count': 0, 'like_c... 1576902991507922944 \n", + "3 {'retweet_count': 0, 'reply_count': 0, 'like_c... 1576856703349374976 \n", + "4 {'retweet_count': 3, 'reply_count': 0, 'like_c... 1575217357105946624 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweet_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "tweet_df.drop(columns='public_metrics', axis=1, inplace=True)\n", + "tweet_df.drop(columns='edit_history_tweet_ids', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "retweet_count_list = []\n", + "reply_count = []\n", + "like_count = []\n", + "quote_count = []" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "for data in data_list:\n", + " for tweet in data['data']:\n", + " retweet_count_list.append(tweet['public_metrics']['retweet_count'])\n", + " reply_count.append(tweet['public_metrics']['reply_count'])\n", + " like_count.append(tweet['public_metrics']['like_count'])\n", + " quote_count.append(tweet['public_metrics']['quote_count'])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "tweet_df['retweet_count'] = retweet_count_list\n", + "tweet_df['reply_count'] = reply_count\n", + "tweet_df['like_count'] = like_count\n", + "tweet_df['quote_count'] = quote_count" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textcreated_atauthor_ididretweet_countreply_countlike_countquote_count
0Gente muy agradable en @TheBridge_Tech , te ri...2022-10-08T05:07:45.000Z157809584456951401115786130941917962240000
1Recordaros que la semana que viene tenemos la ...2022-10-04T16:27:23.000Z1003872445157733457770145382720120
2El desarrollador web es uno de los perfiles má...2022-10-03T11:52:25.000Z252949962015769029915079229443030
3@jorgegrev @TheBridge_Tech Enhorabuena crack!2022-10-03T08:48:29.000Z70652041155149414515768567033493749760010
4Hoy por fin ve la luz mi primer Case Study púb...2022-09-28T20:14:18.000Z60448517515752173571059466243040
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Gente muy agradable en @TheBridge_Tech , te ri... \n", + "1 Recordaros que la semana que viene tenemos la ... \n", + "2 El desarrollador web es uno de los perfiles má... \n", + "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n", + "4 Hoy por fin ve la luz mi primer Case Study púb... \n", + "\n", + " created_at author_id id \\\n", + "0 2022-10-08T05:07:45.000Z 1578095844569514011 1578613094191796224 \n", + "1 2022-10-04T16:27:23.000Z 1003872445 1577334577701453827 \n", + "2 2022-10-03T11:52:25.000Z 2529499620 1576902991507922944 \n", + "3 2022-10-03T08:48:29.000Z 706520411551494145 1576856703349374976 \n", + "4 2022-09-28T20:14:18.000Z 604485175 1575217357105946624 \n", + "\n", + " retweet_count reply_count like_count quote_count \n", + "0 0 0 0 0 \n", + "1 2 0 12 0 \n", + "2 3 0 3 0 \n", + "3 0 0 1 0 \n", + "4 3 0 4 0 " + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweet_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "df_author_1 = pd.DataFrame()\n", + "df_author_2 = pd.DataFrame()\n", + "df_author_list = [df_author_1, df_author_2]" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "for index, data in enumerate(data_list):\n", + " data = data['includes']['users']\n", + " df_author_list[index] = pd.DataFrame(data)\n", + " author_df = pd.concat(df_author_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameusername
01578095844569514011Rocket75_Rocket_
11003872445Pau MugarraPauMugarra
22529499620Formación Fundación Universidad Carlos IIIformacion_fuc3
3706520411551494145JoseSolaJose_Sola_
4604485175Berta Oterobertinha84
\n", + "
" + ], + "text/plain": [ + " id name \\\n", + "0 1578095844569514011 Rocket \n", + "1 1003872445 Pau Mugarra \n", + "2 2529499620 Formación Fundación Universidad Carlos III \n", + "3 706520411551494145 JoseSola \n", + "4 604485175 Berta Otero \n", + "\n", + " username \n", + "0 75_Rocket_ \n", + "1 PauMugarra \n", + "2 formacion_fuc3 \n", + "3 Jose_Sola_ \n", + "4 bertinha84 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "author_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 90 entries, 0 to 26\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 90 non-null object\n", + " 1 name 90 non-null object\n", + " 2 username 90 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 2.8+ KB\n" + ] + } + ], + "source": [ + "author_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "connection = sqlite3.connect('../data/twitter.db')" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "tweet_df.to_sql('tweets', con=connection, index=False)\n", + "author_df.to_sql('users', con=connection, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "connection.close()" + ] } ], "metadata": {