main.py
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import cmudict

# Download required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')
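# Note: newer NLTK releases may additionally require
# nltk.download('punkt_tab') for sent_tokenize/word_tokenize to work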
# Function to extract article text from a URL
def extract_article_text(url):
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Concatenate the text of every <p> tag in the page body
    article_text = ''
    article_body = soup.find('body')
    for paragraph in article_body.find_all('p'):
        article_text += paragraph.text.strip() + '\n'
    return article_text
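# Usage sketch (hypothetical URL):
#   text = extract_article_text('https://example.com/article')
# returns the stripped text of every paragraph, one per line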
# Load input data
input_data = pd.read_excel('input.xlsx')
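# Note: pandas reads and writes .xlsx files through the openpyxl package,
# which must be installed alongside pandas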
# Sentiment analysis stub: it only tokenizes the text and is never called
# in the pipeline below (no sentiment scores are computed)
def calculate_sentiment(text):
    # Tokenize text into sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    return sentences, words
# Readability metrics based on the Gunning Fog index
def calculate_readability(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    # Average sentence length
    avg_sentence_length = len(words) / len(sentences)
    # Percentage of complex words (the Fog index defines these as words
    # with three or more syllables, not words longer than two characters)
    complex_words = [word for word in words if count_syllables(word) >= 3]
    percentage_complex_words = (len(complex_words) / len(words)) * 100
    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index
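# Worked example: an average sentence length of 15 words with 10% complex
# words gives a Fog Index of 0.4 * (15 + 10) = 10.0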
# Function to compute word, sentence, and syllable statistics
def compute_variables(text):
    # Tokenize text
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    # Number of words and sentences
    word_count = len(words)
    sentence_count = len(sentences)
    # Average number of words per sentence
    avg_words_per_sentence = word_count / sentence_count
    # Syllables per word via the CMU pronouncing dictionary: count the
    # stress-marked phones (those ending in a digit) of the first
    # pronunciation only, so words with several dictionary entries are
    # not overcounted; words missing from the dictionary contribute zero
    d = cmudict.dict()
    syllable_count = sum(
        len([phone for phone in d[word.lower()][0] if phone[-1].isdigit()])
        for word in words if word.lower() in d
    )
    syllable_per_word = syllable_count / word_count
    # Other variables from the assignment could be computed here as well
    return word_count, avg_words_per_sentence, syllable_per_word
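# e.g. cmudict entries are phone lists such as ['R', 'IY1', 'D', 'IH0', 'NG']
# for "reading"; the two phones ending in a digit mark two syllables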
def average_words_per_sentence(text):
    sentences = sent_tokenize(text)
    total_words = len(word_tokenize(text))
    return total_words / len(sentences)
def count_complex_words(text):
    words = word_tokenize(text)
    # Complex words are those with three or more syllables
    complex_words = [word for word in words if count_syllables(word) >= 3]
    return len(complex_words)
def count_total_words(text):
    words = word_tokenize(text)
    # Count only alphanumeric tokens, excluding punctuation
    total_words = len([word for word in words if word.isalnum()])
    return total_words
def count_syllables(word):
    # Heuristic syllable count: each run of consecutive vowels counts once
    vowels = 'aeiou'
    count = 0
    prev_was_vowel = False
    for char in word.lower():
        is_vowel = char in vowels
        if is_vowel and not prev_was_vowel:
            count += 1
        prev_was_vowel = is_vowel
    # Handle exceptions for words ending with "es" or "ed", which usually
    # do not add a syllable
    if word.lower().endswith(('es', 'ed')) and count > 1:
        count -= 1
    return count
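# Quick check of the heuristic:
#   count_syllables('reading') -> 2 ("ea" counts once, "i" once)
#   count_syllables('baked')   -> 1 (the trailing "ed" is subtracted)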
def calculate_syllable_per_word(text):
    words = word_tokenize(text)
    total_syllables = sum(count_syllables(word) for word in words)
    total_words = len(words)
    return total_syllables / total_words
def count_personal_pronouns(text):
    personal_pronouns = ['i', 'we', 'my', 'ours', 'us']
    # Remove the country abbreviation "US" so it is not counted as "us"
    text = re.sub(r'\bUS\b', '', text)
    pronoun_count = sum(1 for word in word_tokenize(text) if word.lower() in personal_pronouns)
    return pronoun_count
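# e.g. "We shipped my order to the US" -> 2 ("We" and "my"; the country
# abbreviation "US" is stripped before matching)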
def calculate_average_word_length(text):
    words = word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    return total_characters / total_words
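# e.g. "a big cat" tokenizes to three words totalling 7 characters,
# giving an average word length of 7 / 3 ≈ 2.33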
# Extract and compute variables for each URL
output_data = []
for _, row in input_data.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    article_text = extract_article_text(url)
    # Readability analysis
    avg_sentence_length, percentage_complex_words, fog_index = calculate_readability(article_text)
    # Other metrics; compute_variables supplies the CMU-dictionary syllable
    # count and the average words per sentence, so each value is computed
    # exactly once
    word_count, avg_words_per_sentence, syllable_per_word = compute_variables(article_text)
    complex_word_count = count_complex_words(article_text)
    total_word_count = count_total_words(article_text)
    personal_pronouns_count = count_personal_pronouns(article_text)
    average_word_length = calculate_average_word_length(article_text)
    # Append the computed metrics as scalars so each DataFrame cell holds
    # a plain number rather than a single-element list
    output_data.append({
        'URL_ID': url_id,
        'URL': url,
        'Average Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        'Average Words Per Sentence': avg_words_per_sentence,
        'Complex Word Count': complex_word_count,
        'Total Word Count': total_word_count,
        'Syllable Per Word': syllable_per_word,
        'Personal Pronouns Count': personal_pronouns_count,
        'Average Word Length': average_word_length,
    })

# Create a DataFrame from the output data and save it to an Excel file
output_df = pd.DataFrame(output_data)
output_df.to_excel('Output.xlsx', index=False)