diff --git a/data/twitter.db b/data/twitter.db
index 8e2e86c..21c2b6c 100644
Binary files a/data/twitter.db and b/data/twitter.db differ
diff --git a/notebooks/eda.ipynb b/notebooks/eda.ipynb
index 17d041c..08ec5b1 100644
--- a/notebooks/eda.ipynb
+++ b/notebooks/eda.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 99,
+ "execution_count": 168,
"metadata": {},
"outputs": [],
"source": [
@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
- "execution_count": 100,
+ "execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
@@ -22,47 +22,17 @@
},
{
"cell_type": "code",
- "execution_count": 101,
+ "execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
- "def sql_query(query):\n",
- "\n",
- " # Ejecuta la query\n",
- " crsr.execute(query)\n",
- "\n",
- " # Almacena los datos de la query \n",
- " ans = crsr.fetchall()\n",
- "\n",
- " # Obtenemos los nombres de las columnas de la tabla\n",
- " names = [description[0] for description in crsr.description]\n",
- "\n",
- " return pd.DataFrame(ans,columns=names)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 102,
- "metadata": {},
- "outputs": [],
- "source": [
- "query_tweet = '''SELECT * FROM tweets'''\n",
- "query_user = '''SELECT * FROM users'''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 103,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_tweets = sql_query(query_tweet)\n",
- "df_users = sql_query(query_user)"
+ "df_tweets = pd.read_sql('SELECT * FROM tweets', connection, parse_dates=['created_at'])\n",
+ "df_users = pd.read_sql('SELECT * FROM users', connection)"
]
},
{
"cell_type": "code",
- "execution_count": 104,
+ "execution_count": 171,
"metadata": {},
"outputs": [
{
@@ -86,10 +56,10 @@
" \n",
" \n",
" | \n",
- " created_at | \n",
- " text | \n",
" id | \n",
+ " text | \n",
" author_id | \n",
+ " created_at | \n",
" retweet_count | \n",
" reply_count | \n",
" like_count | \n",
@@ -99,10 +69,10 @@
"
\n",
" \n",
" 0 | \n",
- " 2022-10-08T05:07:45.000Z | \n",
- " Gente muy agradable en @TheBridge_Tech , te ri... | \n",
" 1578613094191796224 | \n",
+ " Gente muy agradable en @TheBridge_Tech , te ri... | \n",
" 1578095844569514011 | \n",
+ " 2022-10-08 05:07:45 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
@@ -110,10 +80,10 @@
"
\n",
" \n",
" 1 | \n",
- " 2022-10-04T16:27:23.000Z | \n",
- " Recordaros que la semana que viene tenemos la ... | \n",
" 1577334577701453827 | \n",
+ " Recordaros que la semana que viene tenemos la ... | \n",
" 1003872445 | \n",
+ " 2022-10-04 16:27:23 | \n",
" 2 | \n",
" 0 | \n",
" 12 | \n",
@@ -121,10 +91,10 @@
"
\n",
" \n",
" 2 | \n",
- " 2022-10-03T11:52:25.000Z | \n",
- " El desarrollador web es uno de los perfiles má... | \n",
" 1576902991507922944 | \n",
+ " El desarrollador web es uno de los perfiles má... | \n",
" 2529499620 | \n",
+ " 2022-10-03 11:52:25 | \n",
" 3 | \n",
" 0 | \n",
" 3 | \n",
@@ -132,10 +102,10 @@
"
\n",
" \n",
" 3 | \n",
- " 2022-10-03T08:48:29.000Z | \n",
- " @jorgegrev @TheBridge_Tech Enhorabuena crack! | \n",
" 1576856703349374976 | \n",
+ " @jorgegrev @TheBridge_Tech Enhorabuena crack! | \n",
" 706520411551494145 | \n",
+ " 2022-10-03 08:48:29 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
@@ -143,10 +113,10 @@
"
\n",
" \n",
" 4 | \n",
- " 2022-09-28T20:14:18.000Z | \n",
- " Hoy por fin ve la luz mi primer Case Study púb... | \n",
" 1575217357105946624 | \n",
+ " Hoy por fin ve la luz mi primer Case Study púb... | \n",
" 604485175 | \n",
+ " 2022-09-28 20:14:18 | \n",
" 3 | \n",
" 0 | \n",
" 4 | \n",
@@ -157,29 +127,29 @@
""
],
"text/plain": [
- " created_at \\\n",
- "0 2022-10-08T05:07:45.000Z \n",
- "1 2022-10-04T16:27:23.000Z \n",
- "2 2022-10-03T11:52:25.000Z \n",
- "3 2022-10-03T08:48:29.000Z \n",
- "4 2022-09-28T20:14:18.000Z \n",
+ " id text \\\n",
+ "0 1578613094191796224 Gente muy agradable en @TheBridge_Tech , te ri... \n",
+ "1 1577334577701453827 Recordaros que la semana que viene tenemos la ... \n",
+ "2 1576902991507922944 El desarrollador web es uno de los perfiles má... \n",
+ "3 1576856703349374976 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n",
+ "4 1575217357105946624 Hoy por fin ve la luz mi primer Case Study púb... \n",
"\n",
- " text id \\\n",
- "0 Gente muy agradable en @TheBridge_Tech , te ri... 1578613094191796224 \n",
- "1 Recordaros que la semana que viene tenemos la ... 1577334577701453827 \n",
- "2 El desarrollador web es uno de los perfiles má... 1576902991507922944 \n",
- "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! 1576856703349374976 \n",
- "4 Hoy por fin ve la luz mi primer Case Study púb... 1575217357105946624 \n",
+ " author_id created_at retweet_count reply_count \\\n",
+ "0 1578095844569514011 2022-10-08 05:07:45 0 0 \n",
+ "1 1003872445 2022-10-04 16:27:23 2 0 \n",
+ "2 2529499620 2022-10-03 11:52:25 3 0 \n",
+ "3 706520411551494145 2022-10-03 08:48:29 0 0 \n",
+ "4 604485175 2022-09-28 20:14:18 3 0 \n",
"\n",
- " author_id retweet_count reply_count like_count quote_count \n",
- "0 1578095844569514011 0 0 0 0 \n",
- "1 1003872445 2 0 12 0 \n",
- "2 2529499620 3 0 3 0 \n",
- "3 706520411551494145 0 0 1 0 \n",
- "4 604485175 3 0 4 0 "
+ " like_count quote_count \n",
+ "0 0 0 \n",
+ "1 12 0 \n",
+ "2 3 0 \n",
+ "3 1 0 \n",
+ "4 4 0 "
]
},
- "execution_count": 104,
+ "execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
@@ -190,7 +160,7 @@
},
{
"cell_type": "code",
- "execution_count": 105,
+ "execution_count": 172,
"metadata": {},
"outputs": [
{
@@ -200,17 +170,17 @@
"\n",
"RangeIndex: 154 entries, 0 to 153\n",
"Data columns (total 8 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 created_at 154 non-null object\n",
- " 1 text 154 non-null object\n",
- " 2 id 154 non-null object\n",
- " 3 author_id 154 non-null object\n",
- " 4 retweet_count 154 non-null int64 \n",
- " 5 reply_count 154 non-null int64 \n",
- " 6 like_count 154 non-null int64 \n",
- " 7 quote_count 154 non-null int64 \n",
- "dtypes: int64(4), object(4)\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 154 non-null object \n",
+ " 1 text 154 non-null object \n",
+ " 2 author_id 154 non-null object \n",
+ " 3 created_at 154 non-null datetime64[ns]\n",
+ " 4 retweet_count 154 non-null int64 \n",
+ " 5 reply_count 154 non-null int64 \n",
+ " 6 like_count 154 non-null int64 \n",
+ " 7 quote_count 154 non-null int64 \n",
+ "dtypes: datetime64[ns](1), int64(4), object(3)\n",
"memory usage: 9.8+ KB\n"
]
}
@@ -221,7 +191,7 @@
},
{
"cell_type": "code",
- "execution_count": 106,
+ "execution_count": 173,
"metadata": {},
"outputs": [
{
@@ -301,7 +271,7 @@
"4 bertinha84 "
]
},
- "execution_count": 106,
+ "execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
@@ -312,7 +282,7 @@
},
{
"cell_type": "code",
- "execution_count": 107,
+ "execution_count": 174,
"metadata": {},
"outputs": [
{
@@ -338,7 +308,7 @@
},
{
"cell_type": "code",
- "execution_count": 108,
+ "execution_count": 175,
"metadata": {},
"outputs": [
{
@@ -355,25 +325,7 @@
},
{
"cell_type": "code",
- "execution_count": 109,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_tweets['text'] = df_tweets['text'].str.replace('\\n',' ')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 110,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_tweets['social_repercussion'] = df_tweets['retweet_count'] + df_tweets['reply_count'] + df_tweets['like_count'] + df_tweets['quote_count']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 111,
+ "execution_count": 176,
"metadata": {},
"outputs": [
{
@@ -386,42 +338,28 @@
}
],
"source": [
+ "# The column social_repercussion contains the sum of the 4 metrics columns\n",
+ "df_tweets['social_repercussion'] = df_tweets['retweet_count'] + df_tweets['reply_count'] + df_tweets['like_count'] + df_tweets['quote_count']\n",
+ "\n",
"print(df_tweets[df_tweets['social_repercussion']==df_tweets['social_repercussion'].max()]['text'])"
]
},
{
"cell_type": "code",
- "execution_count": 127,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'1255794072280842240'"
- ]
- },
- "execution_count": 127,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_tweets['author_id'].value_counts().idxmax()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 128,
+ "execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
+ "# Author with most tweets\n",
"most_repeated_user = df_tweets['author_id'].value_counts().idxmax()\n",
+ "\n",
+ "# Times this author posted\n",
"times_user_repeated = df_tweets['author_id'].value_counts()[most_repeated_user]"
]
},
{
"cell_type": "code",
- "execution_count": 129,
+ "execution_count": 165,
"metadata": {},
"outputs": [
{
@@ -466,40 +404,36 @@
"48 1255794072280842240 Heavy Mental HeavyMental_es"
]
},
- "execution_count": 129,
+ "execution_count": 165,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "# Row which contains author information\n",
"df_users[df_users['id'] == most_repeated_user]"
]
},
{
"cell_type": "code",
- "execution_count": 130,
- "metadata": {},
- "outputs": [],
- "source": [
- "most_repeated_user_name = df_users[df_users['id'] == most_repeated_user]['name']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 132,
+ "execution_count": 180,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "48 The user Heavy Mental posted 12 times about Th...\n",
+ "48 The user Heavy Mental mentioned us 12 times.\n",
"Name: name, dtype: object\n"
]
}
],
"source": [
- "print('The user ' + most_repeated_user_name + ' posted ' + str(times_user_repeated) + ' times about The Bridge Tech')"
+ "# Name of author with most posts\n",
+ "most_repeated_user_name = df_users[df_users['id'] == most_repeated_user]['name']\n",
+ "\n",
+ "\n",
+ "print('The user ' + most_repeated_user_name + ' mentioned us ' + str(times_user_repeated) + ' times.')"
]
},
{
@@ -548,7 +482,20 @@
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "def sql_query(query):\n",
+ "\n",
+ " # Ejecuta la query\n",
+ " crsr.execute(query)\n",
+ "\n",
+ " # Almacena los datos de la query \n",
+ " ans = crsr.fetchall()\n",
+ "\n",
+ " # Obtenemos los nombres de las columnas de la tabla\n",
+ " names = [description[0] for description in crsr.description]\n",
+ "\n",
+ " return pd.DataFrame(ans,columns=names)"
+ ]
}
],
"metadata": {
diff --git a/notebooks/extract_data.ipynb b/notebooks/extract_data.ipynb
index cb13dda..1b5f518 100644
--- a/notebooks/extract_data.ipynb
+++ b/notebooks/extract_data.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
@@ -26,7 +26,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
@@ -49,7 +49,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
@@ -63,7 +63,7 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 121,
"metadata": {},
"outputs": [
{
@@ -84,7 +84,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 122,
"metadata": {},
"outputs": [
{
@@ -1929,7 +1929,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
@@ -1960,7 +1960,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 124,
"metadata": {},
"outputs": [
{
@@ -1985,7 +1985,7 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
@@ -1994,7 +1994,7 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
@@ -2005,7 +2005,7 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
@@ -2017,7 +2017,7 @@
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 128,
"metadata": {},
"outputs": [
{
@@ -2029,12 +2029,12 @@
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
- " 0 text 154 non-null object\n",
- " 1 edit_history_tweet_ids 154 non-null object\n",
- " 2 created_at 154 non-null object\n",
- " 3 author_id 154 non-null object\n",
+ " 0 id 154 non-null object\n",
+ " 1 text 154 non-null object\n",
+ " 2 author_id 154 non-null object\n",
+ " 3 created_at 154 non-null object\n",
" 4 public_metrics 154 non-null object\n",
- " 5 id 154 non-null object\n",
+ " 5 edit_history_tweet_ids 154 non-null object\n",
"dtypes: object(6)\n",
"memory usage: 8.4+ KB\n"
]
@@ -2046,7 +2046,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 129,
"metadata": {},
"outputs": [
{
@@ -2070,88 +2070,88 @@
" \n",
" \n",
" | \n",
+ " id | \n",
" text | \n",
- " edit_history_tweet_ids | \n",
- " created_at | \n",
" author_id | \n",
+ " created_at | \n",
" public_metrics | \n",
- " id | \n",
+ " edit_history_tweet_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
+ " 1578613094191796224 | \n",
" Gente muy agradable en @TheBridge_Tech , te ri... | \n",
- " [1578613094191796224] | \n",
- " 2022-10-08T05:07:45.000Z | \n",
" 1578095844569514011 | \n",
+ " 2022-10-08T05:07:45.000Z | \n",
" {'retweet_count': 0, 'reply_count': 0, 'like_c... | \n",
- " 1578613094191796224 | \n",
+ " [1578613094191796224] | \n",
"
\n",
" \n",
" 1 | \n",
+ " 1577334577701453827 | \n",
" Recordaros que la semana que viene tenemos la ... | \n",
- " [1577334577701453827] | \n",
- " 2022-10-04T16:27:23.000Z | \n",
" 1003872445 | \n",
+ " 2022-10-04T16:27:23.000Z | \n",
" {'retweet_count': 2, 'reply_count': 0, 'like_c... | \n",
- " 1577334577701453827 | \n",
+ " [1577334577701453827] | \n",
"
\n",
" \n",
" 2 | \n",
+ " 1576902991507922944 | \n",
" El desarrollador web es uno de los perfiles má... | \n",
- " [1576902991507922944] | \n",
- " 2022-10-03T11:52:25.000Z | \n",
" 2529499620 | \n",
+ " 2022-10-03T11:52:25.000Z | \n",
" {'retweet_count': 3, 'reply_count': 0, 'like_c... | \n",
- " 1576902991507922944 | \n",
+ " [1576902991507922944] | \n",
"
\n",
" \n",
" 3 | \n",
+ " 1576856703349374976 | \n",
" @jorgegrev @TheBridge_Tech Enhorabuena crack! | \n",
- " [1576856703349374976] | \n",
- " 2022-10-03T08:48:29.000Z | \n",
" 706520411551494145 | \n",
+ " 2022-10-03T08:48:29.000Z | \n",
" {'retweet_count': 0, 'reply_count': 0, 'like_c... | \n",
- " 1576856703349374976 | \n",
+ " [1576856703349374976] | \n",
"
\n",
" \n",
" 4 | \n",
+ " 1575217357105946624 | \n",
" Hoy por fin ve la luz mi primer Case Study púb... | \n",
- " [1575217357105946624] | \n",
- " 2022-09-28T20:14:18.000Z | \n",
" 604485175 | \n",
+ " 2022-09-28T20:14:18.000Z | \n",
" {'retweet_count': 3, 'reply_count': 0, 'like_c... | \n",
- " 1575217357105946624 | \n",
+ " [1575217357105946624] | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " text edit_history_tweet_ids \\\n",
- "0 Gente muy agradable en @TheBridge_Tech , te ri... [1578613094191796224] \n",
- "1 Recordaros que la semana que viene tenemos la ... [1577334577701453827] \n",
- "2 El desarrollador web es uno de los perfiles má... [1576902991507922944] \n",
- "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! [1576856703349374976] \n",
- "4 Hoy por fin ve la luz mi primer Case Study púb... [1575217357105946624] \n",
+ " id text \\\n",
+ "0 1578613094191796224 Gente muy agradable en @TheBridge_Tech , te ri... \n",
+ "1 1577334577701453827 Recordaros que la semana que viene tenemos la ... \n",
+ "2 1576902991507922944 El desarrollador web es uno de los perfiles má... \n",
+ "3 1576856703349374976 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n",
+ "4 1575217357105946624 Hoy por fin ve la luz mi primer Case Study púb... \n",
"\n",
- " created_at author_id \\\n",
- "0 2022-10-08T05:07:45.000Z 1578095844569514011 \n",
- "1 2022-10-04T16:27:23.000Z 1003872445 \n",
- "2 2022-10-03T11:52:25.000Z 2529499620 \n",
- "3 2022-10-03T08:48:29.000Z 706520411551494145 \n",
- "4 2022-09-28T20:14:18.000Z 604485175 \n",
+ " author_id created_at \\\n",
+ "0 1578095844569514011 2022-10-08T05:07:45.000Z \n",
+ "1 1003872445 2022-10-04T16:27:23.000Z \n",
+ "2 2529499620 2022-10-03T11:52:25.000Z \n",
+ "3 706520411551494145 2022-10-03T08:48:29.000Z \n",
+ "4 604485175 2022-09-28T20:14:18.000Z \n",
"\n",
- " public_metrics id \n",
- "0 {'retweet_count': 0, 'reply_count': 0, 'like_c... 1578613094191796224 \n",
- "1 {'retweet_count': 2, 'reply_count': 0, 'like_c... 1577334577701453827 \n",
- "2 {'retweet_count': 3, 'reply_count': 0, 'like_c... 1576902991507922944 \n",
- "3 {'retweet_count': 0, 'reply_count': 0, 'like_c... 1576856703349374976 \n",
- "4 {'retweet_count': 3, 'reply_count': 0, 'like_c... 1575217357105946624 "
+ " public_metrics edit_history_tweet_ids \n",
+ "0 {'retweet_count': 0, 'reply_count': 0, 'like_c... [1578613094191796224] \n",
+ "1 {'retweet_count': 2, 'reply_count': 0, 'like_c... [1577334577701453827] \n",
+ "2 {'retweet_count': 3, 'reply_count': 0, 'like_c... [1576902991507922944] \n",
+ "3 {'retweet_count': 0, 'reply_count': 0, 'like_c... [1576856703349374976] \n",
+ "4 {'retweet_count': 3, 'reply_count': 0, 'like_c... [1575217357105946624] "
]
},
- "execution_count": 43,
+ "execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
@@ -2162,7 +2162,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
@@ -2172,7 +2172,35 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tweet_df['text'] = tweet_df['text'].str.replace('\\n',' ')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: The default value of regex will change from True to False in a future version.\n",
+ " \"\"\"Entry point for launching an IPython kernel.\n"
+ ]
+ }
+ ],
+ "source": [
+ "tweet_df['created_at'] = tweet_df['created_at'].str.replace('.000Z',' ')\n",
+ "tweet_df[\"created_at\"] = pd.to_datetime(tweet_df['created_at'], format=\"%Y-%m-%d %H:%M:%S\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
@@ -2184,7 +2212,7 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
@@ -2198,7 +2226,7 @@
},
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
@@ -2210,7 +2238,7 @@
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 136,
"metadata": {},
"outputs": [
{
@@ -2234,10 +2262,10 @@
" \n",
" \n",
" | \n",
+ " id | \n",
" text | \n",
- " created_at | \n",
" author_id | \n",
- " id | \n",
+ " created_at | \n",
" retweet_count | \n",
" reply_count | \n",
" like_count | \n",
@@ -2247,10 +2275,10 @@
"
\n",
" \n",
" 0 | \n",
+ " 1578613094191796224 | \n",
" Gente muy agradable en @TheBridge_Tech , te ri... | \n",
- " 2022-10-08T05:07:45.000Z | \n",
" 1578095844569514011 | \n",
- " 1578613094191796224 | \n",
+ " 2022-10-08 05:07:45 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
@@ -2258,10 +2286,10 @@
"
\n",
" \n",
" 1 | \n",
+ " 1577334577701453827 | \n",
" Recordaros que la semana que viene tenemos la ... | \n",
- " 2022-10-04T16:27:23.000Z | \n",
" 1003872445 | \n",
- " 1577334577701453827 | \n",
+ " 2022-10-04 16:27:23 | \n",
" 2 | \n",
" 0 | \n",
" 12 | \n",
@@ -2269,10 +2297,10 @@
"
\n",
" \n",
" 2 | \n",
+ " 1576902991507922944 | \n",
" El desarrollador web es uno de los perfiles má... | \n",
- " 2022-10-03T11:52:25.000Z | \n",
" 2529499620 | \n",
- " 1576902991507922944 | \n",
+ " 2022-10-03 11:52:25 | \n",
" 3 | \n",
" 0 | \n",
" 3 | \n",
@@ -2280,10 +2308,10 @@
"
\n",
" \n",
" 3 | \n",
+ " 1576856703349374976 | \n",
" @jorgegrev @TheBridge_Tech Enhorabuena crack! | \n",
- " 2022-10-03T08:48:29.000Z | \n",
" 706520411551494145 | \n",
- " 1576856703349374976 | \n",
+ " 2022-10-03 08:48:29 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
@@ -2291,10 +2319,10 @@
"
\n",
" \n",
" 4 | \n",
+ " 1575217357105946624 | \n",
" Hoy por fin ve la luz mi primer Case Study púb... | \n",
- " 2022-09-28T20:14:18.000Z | \n",
" 604485175 | \n",
- " 1575217357105946624 | \n",
+ " 2022-09-28 20:14:18 | \n",
" 3 | \n",
" 0 | \n",
" 4 | \n",
@@ -2305,29 +2333,29 @@
""
],
"text/plain": [
- " text \\\n",
- "0 Gente muy agradable en @TheBridge_Tech , te ri... \n",
- "1 Recordaros que la semana que viene tenemos la ... \n",
- "2 El desarrollador web es uno de los perfiles má... \n",
- "3 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n",
- "4 Hoy por fin ve la luz mi primer Case Study púb... \n",
+ " id text \\\n",
+ "0 1578613094191796224 Gente muy agradable en @TheBridge_Tech , te ri... \n",
+ "1 1577334577701453827 Recordaros que la semana que viene tenemos la ... \n",
+ "2 1576902991507922944 El desarrollador web es uno de los perfiles má... \n",
+ "3 1576856703349374976 @jorgegrev @TheBridge_Tech Enhorabuena crack! \n",
+ "4 1575217357105946624 Hoy por fin ve la luz mi primer Case Study púb... \n",
"\n",
- " created_at author_id id \\\n",
- "0 2022-10-08T05:07:45.000Z 1578095844569514011 1578613094191796224 \n",
- "1 2022-10-04T16:27:23.000Z 1003872445 1577334577701453827 \n",
- "2 2022-10-03T11:52:25.000Z 2529499620 1576902991507922944 \n",
- "3 2022-10-03T08:48:29.000Z 706520411551494145 1576856703349374976 \n",
- "4 2022-09-28T20:14:18.000Z 604485175 1575217357105946624 \n",
+ " author_id created_at retweet_count reply_count \\\n",
+ "0 1578095844569514011 2022-10-08 05:07:45 0 0 \n",
+ "1 1003872445 2022-10-04 16:27:23 2 0 \n",
+ "2 2529499620 2022-10-03 11:52:25 3 0 \n",
+ "3 706520411551494145 2022-10-03 08:48:29 0 0 \n",
+ "4 604485175 2022-09-28 20:14:18 3 0 \n",
"\n",
- " retweet_count reply_count like_count quote_count \n",
- "0 0 0 0 0 \n",
- "1 2 0 12 0 \n",
- "2 3 0 3 0 \n",
- "3 0 0 1 0 \n",
- "4 3 0 4 0 "
+ " like_count quote_count \n",
+ "0 0 0 \n",
+ "1 12 0 \n",
+ "2 3 0 \n",
+ "3 1 0 \n",
+ "4 4 0 "
]
},
- "execution_count": 48,
+ "execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
@@ -2338,7 +2366,38 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 154 entries, 0 to 53\n",
+ "Data columns (total 8 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 154 non-null object \n",
+ " 1 text 154 non-null object \n",
+ " 2 author_id 154 non-null object \n",
+ " 3 created_at 154 non-null datetime64[ns]\n",
+ " 4 retweet_count 154 non-null int64 \n",
+ " 5 reply_count 154 non-null int64 \n",
+ " 6 like_count 154 non-null int64 \n",
+ " 7 quote_count 154 non-null int64 \n",
+ "dtypes: datetime64[ns](1), int64(4), object(3)\n",
+ "memory usage: 10.8+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "tweet_df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
@@ -2349,7 +2408,7 @@
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
@@ -2361,7 +2420,7 @@
},
{
"cell_type": "code",
- "execution_count": 51,
+ "execution_count": 140,
"metadata": {},
"outputs": [
{
@@ -2441,7 +2500,7 @@
"4 bertinha84 "
]
},
- "execution_count": 51,
+ "execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
@@ -2452,7 +2511,7 @@
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 141,
"metadata": {},
"outputs": [
{
@@ -2478,7 +2537,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
@@ -2488,7 +2547,7 @@
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
@@ -2497,7 +2556,7 @@
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
@@ -2507,7 +2566,7 @@
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
diff --git a/utils/classes/create_df.py b/utils/classes/create_df.py
index d54969c..66db803 100644
--- a/utils/classes/create_df.py
+++ b/utils/classes/create_df.py
@@ -27,6 +27,8 @@ def __init__(self):
self.main_tweet_df()
self.main_author_df()
self.drop_columns()
+ self.transform_text()
+ self.transform_date()
self.get_detail_list()
self.add_lists_to_df()
self.drop_duplicates()
@@ -50,6 +52,15 @@ def drop_columns(self):
self.tweet_df.drop(columns='public_metrics', axis=1, inplace=True)
self.tweet_df.drop(columns='edit_history_tweet_ids', axis=1, inplace=True)
+
def transform_text(self):
    """Flatten every tweet onto a single line.

    Each newline character in the ``text`` column of ``self.tweet_df`` is
    replaced by one space. The column is overwritten in place and nothing
    is returned.
    """
    tweets = self.tweet_df
    single_line = tweets['text'].str.replace('\n', ' ')
    tweets['text'] = single_line
+
+
def transform_date(self):
    """Convert the ``created_at`` column of ``self.tweet_df`` to datetimes.

    The column holds ISO-8601 UTC strings such as
    ``2022-10-08T05:07:45.000Z``. They are parsed directly instead of
    first stripping the ``.000Z`` suffix with ``str.replace``: that call
    depended on the pandas-version-specific ``regex`` default (the ``.``
    was a wildcard under the old default and raised a FutureWarning), left
    a trailing space, and only coped with a zero-millisecond suffix.

    The result is a timezone-naive datetime column expressed in UTC, i.e.
    the same values the previous implementation produced. The column is
    overwritten in place and nothing is returned. Calling the method again
    on an already converted column leaves the values unchanged.
    """
    # utc=True makes pandas honour the trailing "Z" on every pandas version.
    parsed = pd.to_datetime(self.tweet_df['created_at'], utc=True)
    # Drop the timezone so downstream code keeps receiving naive UTC
    # timestamps, exactly as before.
    self.tweet_df['created_at'] = parsed.dt.tz_localize(None)
+
def get_detail_list(self):
for data in self.data_list: