diff --git a/data/twitter.db b/data/twitter.db index 8e2e86c..21c2b6c 100644 Binary files a/data/twitter.db and b/data/twitter.db differ diff --git a/notebooks/eda.ipynb b/notebooks/eda.ipynb index 17d041c..08ec5b1 100644 --- a/notebooks/eda.ipynb +++ b/notebooks/eda.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 99, + "execution_count": 168, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 169, "metadata": {}, "outputs": [], "source": [ @@ -22,47 +22,17 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 170, "metadata": {}, "outputs": [], "source": [ - "def sql_query(query):\n", - "\n", - " # Ejecuta la query\n", - " crsr.execute(query)\n", - "\n", - " # Almacena los datos de la query \n", - " ans = crsr.fetchall()\n", - "\n", - " # Obtenemos los nombres de las columnas de la tabla\n", - " names = [description[0] for description in crsr.description]\n", - "\n", - " return pd.DataFrame(ans,columns=names)" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "query_tweet = '''SELECT * FROM tweets'''\n", - "query_user = '''SELECT * FROM users'''" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [], - "source": [ - "df_tweets = sql_query(query_tweet)\n", - "df_users = sql_query(query_user)" + "df_tweets = pd.read_sql('SELECT * FROM tweets', connection, parse_dates=['created_at'])\n", + "df_users = pd.read_sql('SELECT * FROM users', connection)" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 171, "metadata": {}, "outputs": [ { @@ -86,10 +56,10 @@ " \n", " \n", " \n", - " created_at\n", - " text\n", " id\n", + " text\n", " author_id\n", + " created_at\n", " retweet_count\n", " reply_count\n", " like_count\n", @@ -99,10 +69,10 @@ " \n", " \n", " 0\n", - " 2022-10-08T05:07:45.000Z\n", - " Gente muy agradable en @TheBridge_Tech , te ri...\n", " 1578613094191796224\n", + " Gente muy agradable en @TheBridge_Tech , te ri...\n", " 1578095844569514011\n", + " 2022-10-08 05:07:45\n", " 0\n", " 0\n", " 0\n", @@ -110,10 +80,10 @@ " \n", " \n", " 1\n", - " 2022-10-04T16:27:23.000Z\n", - " Recordaros que la semana que viene tenemos la ...\n", " 1577334577701453827\n", + " Recordaros que la semana que viene tenemos la ...\n", " 1003872445\n", + " 2022-10-04 16:27:23\n", " 2\n", " 0\n", " 12\n", @@ -121,10 +91,10 @@ " \n", " \n", " 2\n", - " 2022-10-03T11:52:25.000Z\n", - " El desarrollador web es uno de los perfiles má...\n", " 1576902991507922944\n", + " El desarrollador web es uno de los perfiles má...\n", " 2529499620\n", + " 2022-10-03 11:52:25\n", " 3\n", " 0\n", " 3\n", @@ -132,10 +102,10 @@ " \n", " \n", " 3\n", - " 2022-10-03T08:48:29.000Z\n", - " @jorgegrev @TheBridge_Tech Enhorabuena crack!\n", " 1576856703349374976\n", + " @jorgegrev @TheBridge_Tech Enhorabuena crack!\n", " 706520411551494145\n", + " 2022-10-03 08:48:29\n", " 0\n", " 0\n", " 1\n", @@ -143,10 +113,10 @@ " \n", " \n", " 4\n", - " 2022-09-28T20:14:18.000Z\n", - " Hoy por fin ve la luz mi primer Case Study púb...\n", " 1575217357105946624\n", + " Hoy por fin ve la luz mi primer Case Study púb...\n", " 604485175\n", + " 2022-09-28 20:14:18\n", " 3\n", " 0\n", " 4\n", @@ -157,29 +127,29 @@ "" ], "text/plain": [ - " created_at \\\n", - "0 2022-10-08T05:07:45.000Z \n", - "1 2022-10-04T16:27:23.000Z \n", - "2 2022-10-03T11:52:25.000Z \n", - "3 2022-10-03T08:48:29.000Z \n", - "4 2022-09-28T20:14:18.000Z \n", + " id text \\\n", + "0 1578613094191796224 Gente muy agradable en @TheBridge_Tech , te ri... \n", + "1 1577334577701453827 Recordaros que la semana que viene tenemos la ... \n", + "2 1576902991507922944 El desarrollador web es uno de los perfiles má... \n", + "3 1576856703349374976 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n", + "4 1575217357105946624 Hoy por fin ve la luz mi primer Case Study púb... \n", "\n", - " text id \\\n", - "0 Gente muy agradable en @TheBridge_Tech , te ri... 1578613094191796224 \n", - "1 Recordaros que la semana que viene tenemos la ... 1577334577701453827 \n", - "2 El desarrollador web es uno de los perfiles má... 1576902991507922944 \n", - "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! 1576856703349374976 \n", - "4 Hoy por fin ve la luz mi primer Case Study púb... 1575217357105946624 \n", + " author_id created_at retweet_count reply_count \\\n", + "0 1578095844569514011 2022-10-08 05:07:45 0 0 \n", + "1 1003872445 2022-10-04 16:27:23 2 0 \n", + "2 2529499620 2022-10-03 11:52:25 3 0 \n", + "3 706520411551494145 2022-10-03 08:48:29 0 0 \n", + "4 604485175 2022-09-28 20:14:18 3 0 \n", "\n", - " author_id retweet_count reply_count like_count quote_count \n", - "0 1578095844569514011 0 0 0 0 \n", - "1 1003872445 2 0 12 0 \n", - "2 2529499620 3 0 3 0 \n", - "3 706520411551494145 0 0 1 0 \n", - "4 604485175 3 0 4 0 " + " like_count quote_count \n", + "0 0 0 \n", + "1 12 0 \n", + "2 3 0 \n", + "3 1 0 \n", + "4 4 0 " ] }, - "execution_count": 104, + "execution_count": 171, "metadata": {}, "output_type": "execute_result" } @@ -190,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 172, "metadata": {}, "outputs": [ { @@ -200,17 +170,17 @@ "\n", "RangeIndex: 154 entries, 0 to 153\n", "Data columns (total 8 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 created_at 154 non-null object\n", - " 1 text 154 non-null object\n", - " 2 id 154 non-null object\n", - " 3 author_id 154 non-null object\n", - " 4 retweet_count 154 non-null int64 \n", - " 5 reply_count 154 non-null int64 \n", - " 6 like_count 154 non-null int64 \n", - " 7 quote_count 154 non-null int64 \n", - "dtypes: int64(4), object(4)\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 154 non-null object \n", + " 1 text 154 non-null object \n", + " 2 author_id 154 non-null object \n", + " 3 created_at 154 non-null datetime64[ns]\n", + " 4 retweet_count 154 non-null int64 \n", + " 5 reply_count 154 non-null int64 \n", + " 6 like_count 154 non-null int64 \n", + " 7 quote_count 154 non-null int64 \n", + "dtypes: datetime64[ns](1), int64(4), object(3)\n", "memory usage: 9.8+ KB\n" ] } @@ -221,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 173, "metadata": {}, "outputs": [ { @@ -301,7 +271,7 @@ "4 bertinha84 " ] }, - "execution_count": 106, + "execution_count": 173, "metadata": {}, "output_type": "execute_result" } @@ -312,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 174, "metadata": {}, "outputs": [ { @@ -338,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 175, "metadata": {}, "outputs": [ { @@ -355,25 +325,7 @@ }, { "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [], - "source": [ - "df_tweets['text'] = df_tweets['text'].str.replace('\\n',' ')" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "df_tweets['social_repercussion'] = df_tweets['retweet_count'] + df_tweets['reply_count'] + df_tweets['like_count'] + df_tweets['quote_count']" - ] - }, - { - "cell_type": "code", - "execution_count": 111, + "execution_count": 176, "metadata": {}, "outputs": [ { @@ -386,42 +338,28 @@ } ], "source": [ + "# The column social_repercussion contains the sum of the 4 metrics columns\n", + "df_tweets['social_repercussion'] = df_tweets['retweet_count'] + df_tweets['reply_count'] + df_tweets['like_count'] + df_tweets['quote_count']\n", + "\n", "print(df_tweets[df_tweets['social_repercussion']==df_tweets['social_repercussion'].max()]['text'])" ] }, { "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'1255794072280842240'" - ] - }, - "execution_count": 127, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_tweets['author_id'].value_counts().idxmax()" - ] - }, - { - "cell_type": "code", - "execution_count": 128, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ + "# Author with most tweets\n", "most_repeated_user = df_tweets['author_id'].value_counts().idxmax()\n", + "\n", + "# Times this author posted\n", "times_user_repeated = df_tweets['author_id'].value_counts()[most_repeated_user]" ] }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 165, "metadata": {}, "outputs": [ { @@ -466,40 +404,36 @@ "48 1255794072280842240 Heavy Mental HeavyMental_es" ] }, - "execution_count": 129, + "execution_count": 165, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# Row which contains author information\n", "df_users[df_users['id'] == most_repeated_user]" ] }, { "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "most_repeated_user_name = df_users[df_users['id'] == most_repeated_user]['name']" - ] - }, - { - "cell_type": "code", - "execution_count": 132, + "execution_count": 180, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "48 The user Heavy Mental posted 12 times about Th...\n", + "48 The user Heavy Mental mentioned us 12 times.\n", "Name: name, dtype: object\n" ] } ], "source": [ - "print('The user ' + most_repeated_user_name + ' posted ' + str(times_user_repeated) + ' times about The Bridge Tech')" + "# Name of author with most posts\n", + "most_repeated_user_name = df_users[df_users['id'] == most_repeated_user]['name']\n", + "\n", + "\n", + "print('The user ' + most_repeated_user_name + ' mentioned us ' + str(times_user_repeated) + ' times.')" ] }, { @@ -548,7 +482,20 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def sql_query(query):\n", + "\n", + " # Ejecuta la query\n", + " crsr.execute(query)\n", + "\n", + " # Almacena los datos de la query \n", + " ans = crsr.fetchall()\n", + "\n", + " # Obtenemos los nombres de las columnas de la tabla\n", + " names = [description[0] for description in crsr.description]\n", + "\n", + " return pd.DataFrame(ans,columns=names)" + ] } ], "metadata": { diff --git a/notebooks/extract_data.ipynb b/notebooks/extract_data.ipynb index cb13dda..1b5f518 100644 --- a/notebooks/extract_data.ipynb +++ b/notebooks/extract_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 53, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 121, "metadata": {}, "outputs": [ { @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 122, "metadata": {}, "outputs": [ { @@ -1929,7 +1929,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -1960,7 +1960,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 124, "metadata": {}, "outputs": [ { @@ -1985,7 +1985,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -1994,7 +1994,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -2005,7 +2005,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -2017,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 128, "metadata": {}, "outputs": [ { @@ -2029,12 +2029,12 @@ "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", - " 0 text 154 non-null object\n", - " 1 edit_history_tweet_ids 154 non-null object\n", - " 2 created_at 154 non-null object\n", - " 3 author_id 154 non-null object\n", + " 0 id 154 non-null object\n", + " 1 text 154 non-null object\n", + " 2 author_id 154 non-null object\n", + " 3 created_at 154 non-null object\n", " 4 public_metrics 154 non-null object\n", - " 5 id 154 non-null object\n", + " 5 edit_history_tweet_ids 154 non-null object\n", "dtypes: object(6)\n", "memory usage: 8.4+ KB\n" ] @@ -2046,7 +2046,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 129, "metadata": {}, "outputs": [ { @@ -2070,88 +2070,88 @@ " \n", " \n", " \n", + " id\n", " text\n", - " edit_history_tweet_ids\n", - " created_at\n", " author_id\n", + " created_at\n", " public_metrics\n", - " id\n", + " edit_history_tweet_ids\n", " \n", " \n", " \n", " \n", " 0\n", + " 1578613094191796224\n", " Gente muy agradable en @TheBridge_Tech , te ri...\n", - " [1578613094191796224]\n", - " 2022-10-08T05:07:45.000Z\n", " 1578095844569514011\n", + " 2022-10-08T05:07:45.000Z\n", " {'retweet_count': 0, 'reply_count': 0, 'like_c...\n", - " 1578613094191796224\n", + " [1578613094191796224]\n", " \n", " \n", " 1\n", + " 1577334577701453827\n", " Recordaros que la semana que viene tenemos la ...\n", - " [1577334577701453827]\n", - " 2022-10-04T16:27:23.000Z\n", " 1003872445\n", + " 2022-10-04T16:27:23.000Z\n", " {'retweet_count': 2, 'reply_count': 0, 'like_c...\n", - " 1577334577701453827\n", + " [1577334577701453827]\n", " \n", " \n", " 2\n", + " 1576902991507922944\n", " El desarrollador web es uno de los perfiles má...\n", - " [1576902991507922944]\n", - " 2022-10-03T11:52:25.000Z\n", " 2529499620\n", + " 2022-10-03T11:52:25.000Z\n", " {'retweet_count': 3, 'reply_count': 0, 'like_c...\n", - " 1576902991507922944\n", + " [1576902991507922944]\n", " \n", " \n", " 3\n", + " 1576856703349374976\n", " @jorgegrev @TheBridge_Tech Enhorabuena crack!\n", - " [1576856703349374976]\n", - " 2022-10-03T08:48:29.000Z\n", " 706520411551494145\n", + " 2022-10-03T08:48:29.000Z\n", " {'retweet_count': 0, 'reply_count': 0, 'like_c...\n", - " 1576856703349374976\n", + " [1576856703349374976]\n", " \n", " \n", " 4\n", + " 1575217357105946624\n", " Hoy por fin ve la luz mi primer Case Study púb...\n", - " [1575217357105946624]\n", - " 2022-09-28T20:14:18.000Z\n", " 604485175\n", + " 2022-09-28T20:14:18.000Z\n", " {'retweet_count': 3, 'reply_count': 0, 'like_c...\n", - " 1575217357105946624\n", + " [1575217357105946624]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " text edit_history_tweet_ids \\\n", - "0 Gente muy agradable en @TheBridge_Tech , te ri... [1578613094191796224] \n", - "1 Recordaros que la semana que viene tenemos la ... [1577334577701453827] \n", - "2 El desarrollador web es uno de los perfiles má... [1576902991507922944] \n", - "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! [1576856703349374976] \n", - "4 Hoy por fin ve la luz mi primer Case Study púb... [1575217357105946624] \n", + " id text \\\n", + "0 1578613094191796224 Gente muy agradable en @TheBridge_Tech , te ri... \n", + "1 1577334577701453827 Recordaros que la semana que viene tenemos la ... \n", + "2 1576902991507922944 El desarrollador web es uno de los perfiles má... \n", + "3 1576856703349374976 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n", + "4 1575217357105946624 Hoy por fin ve la luz mi primer Case Study púb... \n", "\n", - " created_at author_id \\\n", - "0 2022-10-08T05:07:45.000Z 1578095844569514011 \n", - "1 2022-10-04T16:27:23.000Z 1003872445 \n", - "2 2022-10-03T11:52:25.000Z 2529499620 \n", - "3 2022-10-03T08:48:29.000Z 706520411551494145 \n", - "4 2022-09-28T20:14:18.000Z 604485175 \n", + " author_id created_at \\\n", + "0 1578095844569514011 2022-10-08T05:07:45.000Z \n", + "1 1003872445 2022-10-04T16:27:23.000Z \n", + "2 2529499620 2022-10-03T11:52:25.000Z \n", + "3 706520411551494145 2022-10-03T08:48:29.000Z \n", + "4 604485175 2022-09-28T20:14:18.000Z \n", "\n", - " public_metrics id \n", - "0 {'retweet_count': 0, 'reply_count': 0, 'like_c... 1578613094191796224 \n", - "1 {'retweet_count': 2, 'reply_count': 0, 'like_c... 1577334577701453827 \n", - "2 {'retweet_count': 3, 'reply_count': 0, 'like_c... 1576902991507922944 \n", - "3 {'retweet_count': 0, 'reply_count': 0, 'like_c... 1576856703349374976 \n", - "4 {'retweet_count': 3, 'reply_count': 0, 'like_c... 1575217357105946624 " + " public_metrics edit_history_tweet_ids \n", + "0 {'retweet_count': 0, 'reply_count': 0, 'like_c... [1578613094191796224] \n", + "1 {'retweet_count': 2, 'reply_count': 0, 'like_c... [1577334577701453827] \n", + "2 {'retweet_count': 3, 'reply_count': 0, 'like_c... [1576902991507922944] \n", + "3 {'retweet_count': 0, 'reply_count': 0, 'like_c... [1576856703349374976] \n", + "4 {'retweet_count': 3, 'reply_count': 0, 'like_c... [1575217357105946624] " ] }, - "execution_count": 43, + "execution_count": 129, "metadata": {}, "output_type": "execute_result" } @@ -2162,7 +2162,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -2172,7 +2172,35 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "tweet_df['text'] = tweet_df['text'].str.replace('\\n',' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: The default value of regex will change from True to False in a future version.\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "tweet_df['created_at'] = tweet_df['created_at'].str.replace('.000Z',' ')\n", + "tweet_df[\"created_at\"] = pd.to_datetime(tweet_df['created_at'], format=\"%Y-%m-%d %H:%M:%S\")" + ] + }, + { + "cell_type": "code", + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -2184,7 +2212,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -2198,7 +2226,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -2210,7 +2238,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 136, "metadata": {}, "outputs": [ { @@ -2234,10 +2262,10 @@ " \n", " \n", " \n", + " id\n", " text\n", - " created_at\n", " author_id\n", - " id\n", + " created_at\n", " retweet_count\n", " reply_count\n", " like_count\n", @@ -2247,10 +2275,10 @@ " \n", " \n", " 0\n", + " 1578613094191796224\n", " Gente muy agradable en @TheBridge_Tech , te ri...\n", - " 2022-10-08T05:07:45.000Z\n", " 1578095844569514011\n", - " 1578613094191796224\n", + " 2022-10-08 05:07:45\n", " 0\n", " 0\n", " 0\n", @@ -2258,10 +2286,10 @@ " \n", " \n", " 1\n", + " 1577334577701453827\n", " Recordaros que la semana que viene tenemos la ...\n", - " 2022-10-04T16:27:23.000Z\n", " 1003872445\n", - " 1577334577701453827\n", + " 2022-10-04 16:27:23\n", " 2\n", " 0\n", " 12\n", @@ -2269,10 +2297,10 @@ " \n", " \n", " 2\n", + " 1576902991507922944\n", " El desarrollador web es uno de los perfiles má...\n", - " 2022-10-03T11:52:25.000Z\n", " 2529499620\n", - " 1576902991507922944\n", + " 2022-10-03 11:52:25\n", " 3\n", " 0\n", " 3\n", @@ -2280,10 +2308,10 @@ " \n", " \n", " 3\n", + " 1576856703349374976\n", " @jorgegrev @TheBridge_Tech Enhorabuena crack!\n", - " 2022-10-03T08:48:29.000Z\n", " 706520411551494145\n", - " 1576856703349374976\n", + " 2022-10-03 08:48:29\n", " 0\n", " 0\n", " 1\n", @@ -2291,10 +2319,10 @@ " \n", " \n", " 4\n", + " 1575217357105946624\n", " Hoy por fin ve la luz mi primer Case Study púb...\n", - " 2022-09-28T20:14:18.000Z\n", " 604485175\n", - " 1575217357105946624\n", + " 2022-09-28 20:14:18\n", " 3\n", " 0\n", " 4\n", @@ -2305,29 +2333,29 @@ "" ], "text/plain": [ - " text \\\n", - "0 Gente muy agradable en @TheBridge_Tech , te ri... \n", - "1 Recordaros que la semana que viene tenemos la ... \n", - "2 El desarrollador web es uno de los perfiles má... \n", - "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n", - "4 Hoy por fin ve la luz mi primer Case Study púb... \n", + " id text \\\n", + "0 1578613094191796224 Gente muy agradable en @TheBridge_Tech , te ri... \n", + "1 1577334577701453827 Recordaros que la semana que viene tenemos la ... \n", + "2 1576902991507922944 El desarrollador web es uno de los perfiles má... \n", + "3 1576856703349374976 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n", + "4 1575217357105946624 Hoy por fin ve la luz mi primer Case Study púb... \n", "\n", - " created_at author_id id \\\n", - "0 2022-10-08T05:07:45.000Z 1578095844569514011 1578613094191796224 \n", - "1 2022-10-04T16:27:23.000Z 1003872445 1577334577701453827 \n", - "2 2022-10-03T11:52:25.000Z 2529499620 1576902991507922944 \n", - "3 2022-10-03T08:48:29.000Z 706520411551494145 1576856703349374976 \n", - "4 2022-09-28T20:14:18.000Z 604485175 1575217357105946624 \n", + " author_id created_at retweet_count reply_count \\\n", + "0 1578095844569514011 2022-10-08 05:07:45 0 0 \n", + "1 1003872445 2022-10-04 16:27:23 2 0 \n", + "2 2529499620 2022-10-03 11:52:25 3 0 \n", + "3 706520411551494145 2022-10-03 08:48:29 0 0 \n", + "4 604485175 2022-09-28 20:14:18 3 0 \n", "\n", - " retweet_count reply_count like_count quote_count \n", - "0 0 0 0 0 \n", - "1 2 0 12 0 \n", - "2 3 0 3 0 \n", - "3 0 0 1 0 \n", - "4 3 0 4 0 " + " like_count quote_count \n", + "0 0 0 \n", + "1 12 0 \n", + "2 3 0 \n", + "3 1 0 \n", + "4 4 0 " ] }, - "execution_count": 48, + "execution_count": 136, "metadata": {}, "output_type": "execute_result" } @@ -2338,7 +2366,38 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 154 entries, 0 to 53\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 154 non-null object \n", + " 1 text 154 non-null object \n", + " 2 author_id 154 non-null object \n", + " 3 created_at 154 non-null datetime64[ns]\n", + " 4 retweet_count 154 non-null int64 \n", + " 5 reply_count 154 non-null int64 \n", + " 6 like_count 154 non-null int64 \n", + " 7 quote_count 154 non-null int64 \n", + "dtypes: datetime64[ns](1), int64(4), object(3)\n", + "memory usage: 10.8+ KB\n" + ] + } + ], + "source": [ + "tweet_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -2349,7 +2408,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -2361,7 +2420,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 140, "metadata": {}, "outputs": [ { @@ -2441,7 +2500,7 @@ "4 bertinha84 " ] }, - "execution_count": 51, + "execution_count": 140, "metadata": {}, "output_type": "execute_result" } @@ -2452,7 +2511,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 141, "metadata": {}, "outputs": [ { @@ -2478,7 +2537,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ @@ -2488,7 +2547,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ @@ -2497,7 +2556,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ @@ -2507,7 +2566,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ diff --git a/utils/classes/create_df.py b/utils/classes/create_df.py index d54969c..66db803 100644 --- a/utils/classes/create_df.py +++ b/utils/classes/create_df.py @@ -27,6 +27,8 @@ def __init__(self): self.main_tweet_df() self.main_author_df() self.drop_columns() + self.transform_text() + self.transform_date() self.get_detail_list() self.add_lists_to_df() self.drop_duplicates() @@ -50,6 +52,15 @@ def drop_columns(self): self.tweet_df.drop(columns='public_metrics', axis=1, inplace=True) self.tweet_df.drop(columns='edit_history_tweet_ids', axis=1, inplace=True) + + def transform_text(self): + self.tweet_df['text'] = self.tweet_df['text'].str.replace('\n',' ') + + + def transform_date(self): + self.tweet_df['created_at'] = self.tweet_df['created_at'].str.replace('.000Z',' ') + self.tweet_df["created_at"] = pd.to_datetime(self.tweet_df['created_at'], format="%Y-%m-%d %H:%M:%S") + def get_detail_list(self): for data in self.data_list: