-
Notifications
You must be signed in to change notification settings - Fork 27
/
app.py
229 lines (198 loc) · 11.4 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# YouTubeTranscriptAPI Imports
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, VideoUnavailable, TooManyRequests, \
TranscriptsDisabled, NoTranscriptAvailable
from youtube_transcript_api.formatters import TextFormatter
# Flask Imports
from flask import Flask, jsonify, request, send_from_directory, render_template, redirect
# NLTK Imports
import nltk
# Other Imports
import os
import sys
# Summarizer Import (Our Another File: summarizer.py)
from summarizer import gensim_summarize, spacy_summarize, nltk_summarize, sumy_lsa_summarize, sumy_luhn_summarize, \
sumy_text_rank_summarize
# Waitress Import for Serving at Heroku
from waitress import serve
def create_app():
    """Build and return the configured Flask application.

    Makes sure the NLTK data sets looked up below are available (downloading
    any that are missing), then registers the summarization endpoint, the
    favicon/template routes and the HTTPS redirect hook.

    Returns:
        The Flask application object with every route attached.
    """
    app = Flask(__name__)

    # (lookup path, download name, console label, quiet flag) for each NLTK
    # data set that is checked at start-up. The per-resource quiet flag and
    # label reproduce what the application has always printed.
    nltk_resources = (
        ('tokenizers/punkt', 'punkt', 'punkt', True),
        ('corpora/wordnet', 'wordnet', 'wordnet', False),
        ('corpora/stopwords', 'stopwords', 'Stopwords', True),
    )
    for resource_path, package, label, quiet in nltk_resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            print('Downloading ' + label)
            nltk.download(package, quiet=quiet)

    # Maps the `choice` query parameter onto the summarizer implementing it.
    # Using a table keeps validation and dispatch in one place, so a choice
    # that passes validation can never be left without a summarizer.
    summarizers = {
        "gensim-sum": gensim_summarize,  # Gensim Library for TextRank Based Summary.
        "spacy-sum": spacy_summarize,  # Spacy Library for frequency-based summary.
        "nltk-sum": nltk_summarize,  # NLTK Library used for frequency-based summary.
        "sumy-lsa-sum": sumy_lsa_summarize,  # Sumy for extractive summary using LSA.
        "sumy-luhn-sum": sumy_luhn_summarize,  # Sumy Library for TF-IDF Based Summary.
        "sumy-text-rank-sum": sumy_text_rank_summarize,  # Sumy for Text Rank Based Summary.
    }

    def failure(message, status):
        # Every error response shares the same JSON envelope.
        return jsonify(success=False, message=message, response=None), status

    @app.route('/summarize/', methods=['GET'])
    def transcript_fetched_query():
        """Fetch the transcript of a YouTube video and return its summary.

        Query parameters:
            id: video id of the YouTube video.
            percent: percentage of the summary; must parse as an integer.
            choice: summarization choice, one of the keys of `summarizers`.

        Returns:
            A ``(json, status)`` pair. On success (200) the ``response`` field
            holds the summary together with character and sentence counts of
            the transcript and of the summary. Missing or invalid parameters,
            transcripts that are too short, and transcript lookup failures are
            answered with 400; ``TooManyRequests`` and unexpected errors with
            500.
        """
        video_id = request.args.get('id')
        percent = request.args.get('percent')
        choice = request.args.get('choice')

        # Guard clauses: report the first missing parameter, in the order
        # id -> percent -> choice.
        if not video_id:
            return failure("Video ID is not present in the request. "
                           "Please check that you have added id in your request correctly.", 400)
        if not percent:
            return failure("No Percentage value is present in the request. "
                           "Please check whether your request is correct.", 400)
        if not choice:
            return failure("No Choice parameter is present in the request. "
                           "Please request along with your choice correctly.", 400)
        if choice not in summarizers:
            return failure("Invalid Choice: Please create your request with correct choice.", 400)

        # Reject a non-numeric percent up front: it is a client error (400),
        # and checking it here avoids fetching a transcript that could never
        # be summarized.
        try:
            percent_value = int(percent)
        except ValueError:
            return failure("Invalid Percentage: Please create your request with percent as a whole number.",
                           400)

        try:
            # Flatten the fetched subtitles into one line of plain text.
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            formatted_text = TextFormatter().format_transcript(transcript).replace("\n", " ")

            # Pre-check that the requested percentage yields at least one sentence.
            num_sent_text = len(nltk.sent_tokenize(formatted_text))
            select_length = int(num_sent_text * (percent_value / 100))
            if select_length <= 0:
                return failure("Number of lines in the subtitles of your video is not "
                               "enough to generate a summary. Number of sentences in your video: {}"
                               .format(num_sent_text), 400)
            if num_sent_text <= 1:
                return failure("Subtitles are not formatted properly for this video. Unable to "
                               "summarize. There is a possibility that there is no punctuation in "
                               "subtitles of your video.", 400)

            # The summarizers receive the raw query-string value, exactly as
            # it arrived in the request.
            summary = summarizers[choice](formatted_text, percent)
            num_sent_summary = len(nltk.sent_tokenize(summary))

            response_list = {
                'processed_summary': summary,
                'length_original': len(formatted_text),  # characters, not words
                'length_summary': len(summary),
                'sentence_original': num_sent_text,
                'sentence_summary': num_sent_summary,
            }
            return jsonify(success=True,
                           message="Subtitles for this video was fetched and summarized successfully.",
                           response=response_list), 200
        except VideoUnavailable:
            return failure("VideoUnavailable: The video is no longer available.", 400)
        except TooManyRequests:
            return failure("TooManyRequests: YouTube is receiving too many requests from this IP."
                           " Wait until the ban on server has been lifted.", 500)
        except TranscriptsDisabled:
            return failure("TranscriptsDisabled: Subtitles are disabled for this video.", 400)
        except NoTranscriptAvailable:
            return failure("NoTranscriptAvailable: No transcripts are available for this video.", 400)
        except NoTranscriptFound:
            return failure("NoTranscriptAvailable: No transcripts were found.", 400)
        except Exception as e:
            # Last-resort boundary: log the error and answer with a generic
            # message instead of letting the request crash.
            print(e, flush=True)
            return failure("Some error occurred."
                           " Contact the administrator if it is happening too frequently.", 500)

    @app.route('/favicon.ico')
    def favicon():
        """Serve the favicon stored in the static folder."""
        # NOTE(review): a .png file is sent with the .ico mimetype - confirm this is intended.
        return send_from_directory(os.path.join(app.root_path, 'static'), 'favicon.png',
                                   mimetype='image/vnd.microsoft.icon')

    @app.route('/')
    def root_function():
        """Render root.html, the landing page."""
        return render_template('root.html')

    @app.route('/web/')
    def summarizer_web():
        """Render web.html, the page for summarizing a given video URL online."""
        return render_template('web.html')

    @app.route('/api/')
    def summarizer_api_info_route():
        """Render api.html."""
        return render_template('api.html')

    @app.before_request
    def enforce_https_in_heroku():
        """Redirect plain-HTTP requests to HTTPS when running on Heroku.

        The redirect only happens when the DYNO environment variable is set
        and the X-Forwarded-Proto header says the request arrived over http;
        otherwise nothing is returned and the request proceeds normally.
        """
        if 'DYNO' in os.environ and request.headers.get('X-Forwarded-Proto') == 'http':
            secure_url = request.url.replace('http://', 'https://', 1)
            return redirect(secure_url, code=301)

    return app
if __name__ == '__main__':
    # Serve the application with waitress rather than Flask's built-in server.
    flask_app = create_app()
    # Honour the PORT environment variable when the platform provides one;
    # fall back to 80, the port this script has always used.
    listen_port = int(os.environ.get('PORT', 80))
    # `debug` is a Flask app.run() option, not a documented waitress
    # adjustment, so it is not passed to serve().
    serve(flask_app, host='0.0.0.0', port=listen_port, url_scheme='https')