scrape_twitter.py
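"""
A short module docstring summarizing what the script below does (derived from the
code itself): stream tweets from the Twitter Streaming API with tweepy, batch them
to the Azure Text Analytics key-phrase endpoint, append the extracted key phrases
to a local file, and periodically upload that file to Azure Blob Storage.
"""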
import json
import time
import os
import pandas as pd
import http.client, urllib.request, urllib.parse, urllib.error, base64
from azure.storage.blob import BlockBlobService, PublicAccess
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
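# Note (assumed environment): this script uses the pre-4.0 tweepy streaming API
# (StreamListener / Stream(auth, listener)) and the legacy azure-storage-blob SDK
# (< 12.0, which provides BlockBlobService); newer versions of either library
# expose different classes.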
# Variables that contain the user credentials to access the Twitter API
access_token = "<YOUR_ACCESS_TOKEN>"
access_token_secret = "<YOUR_SECRET_ACCESS_TOKEN>"
consumer_key = "<YOUR_CONSUMER_KEY>"
consumer_secret = "<YOUR_SECRET_ACCESS_TOKEN>" if False else "<YOUR_CONSUMER_SECRET>"
# Microsoft Azure keys
cognitive_sciences_key = "<YOUR_COGNITIVE_SCIENCES_KEY>"
blob_storage_key = "<YOUR_BLOB_STORAGE_ACCOUNT_KEY>"
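# Sketch: instead of hard-coding the placeholders above, the keys could be read
# from environment variables; the variable names below are illustrative only and
# are not defined anywhere else in this script, e.g.:
#   access_token = os.environ["TWITTER_ACCESS_TOKEN"]
#   cognitive_sciences_key = os.environ["AZURE_TEXT_ANALYTICS_KEY"]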
# Local file that accumulates the key phrases extracted from the streamed tweets
filtered_tweets = "filteredTweets.txt"
# Request body for the Text Analytics key-phrase endpoint; tweets are batched into it
req = {
    "documents": []
}
MAX_COUNT = 100000  # reset the per-document id counter after this many tweets
MAX_SIZE = 100000   # upload and rotate the local file once it exceeds ~100 KB
count = 1           # running tweet counter, also used as the document id
# Stream listener that batches incoming tweets, sends them to the Azure Text
# Analytics key-phrase endpoint, and appends the extracted phrases to a local file.
class StdOutListener(StreamListener):

    def on_data(self, data):
        global filtered_tweets, req, MAX_COUNT, MAX_SIZE, count
        data = json.loads(data)
        tweet = {}
        if 'lang' in data and 'text' in data and 'id' in data:
            tweet = {
                "language": data["lang"],
                "id": str(count),
                "text": data["text"]
            }
            req["documents"].append(tweet)
            count += 1
            # Every 100 tweets, send the accumulated batch for key-phrase extraction
            if count % 100 == 0:
                print('Counter: ', str(count))
                headers = {
                    # Request headers
                    'Content-Type': 'application/json',
                    'Ocp-Apim-Subscription-Key': cognitive_sciences_key,
                }
                try:
                    req_json = json.dumps(req)
                    conn = http.client.HTTPSConnection('eastus2.api.cognitive.microsoft.com')
                    conn.request("POST", "/text/analytics/v2.0/keyPhrases", req_json, headers)
                    response = conn.getresponse()
                    data = response.read().decode()
                    data = json.loads(data)
                    print(data)
                    #time.sleep(1)
                    # Append each document's key phrases as one comma-separated line
                    with open(filtered_tweets, "a", encoding="utf-8") as t:
                        if 'documents' in data:
                            for doc in data["documents"]:
                                for phrase in doc["keyPhrases"]:
                                    t.write(phrase + ",")
                                t.write("\n")
                    conn.close()
                    # Start a fresh batch
                    req = {
                        "documents": []
                    }
                    statinfo = os.stat(filtered_tweets)
                    if statinfo.st_size > MAX_SIZE:
                        # Upload data to Azure
                        self.upload_to_azure()
                        # Remove the local file
                        if os.path.exists(filtered_tweets):
                            os.remove(filtered_tweets)
                        else:
                            print("The file does not exist")
                    if count >= MAX_COUNT:
                        count = 1
                except Exception as ex:
                    template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                    message = template.format(type(ex).__name__, ex.args)
                    print(message)
        return True
    def on_error(self, status):
        print('Error during streaming:', status)
    def upload_to_azure(self):
        global filtered_tweets
        # Create the BlockBlobService that is used to call the Blob service for the storage account
        block_blob_service = BlockBlobService(account_name='project3twitter', account_key=blob_storage_key)
        # Create a container called 'project3'
        container_name = 'project3'
        block_blob_service.create_container(container_name)
        # Set the permission so the blobs are public
        block_blob_service.set_container_acl(container_name, public_access=PublicAccess.Container)
        local_path = os.getcwd()
        full_path_to_file = os.path.join(local_path, filtered_tweets)
        print(full_path_to_file)
        print("\nUploading to Blob storage as blob: " + filtered_tweets)
        # Upload the local file, using the file name as the blob name
        block_blob_service.create_blob_from_path(container_name, filtered_tweets, full_path_to_file)
if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)
    # Filter the Twitter stream to English tweets; tracking the single letter 'a'
    # effectively captures a broad sample of tweets rather than specific keywords
    stream.filter(languages=['en'], track=['a'])
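# Usage (sketch): fill in the credential placeholders above, then run
#   python scrape_twitter.py
# The stream runs until interrupted; key phrases accumulate in filteredTweets.txt
# and the file is uploaded to Blob Storage whenever it grows past MAX_SIZE.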