-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilehandle.py
57 lines (40 loc) · 1.36 KB
/
filehandle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import PyPDF2 # PDF Reader Library
import docx # WORD Reader Library
def pdftolist(filename):
    """Convert the text content of a PDF document into a list of strings.

    Opens ``Documents_Uploaded/<filename>``, extracts the text of every page
    in order, splits each page's text on double newlines and collects the
    resulting pieces. Empty strings and strings that consist of exactly one
    space are dropped from the result.

    Args:
        filename: Name of the PDF file, relative to the
            ``Documents_Uploaded/`` directory.

    Returns:
        A list of the non-blank text pieces, in page order.

    Raises:
        OSError: If the file cannot be opened (for example when it does
            not exist). Errors raised by PyPDF2 while parsing propagate
            unchanged.
    """
    txtlist = []
    # Extract inside the `with` block: the reader still needs the open
    # stream while pages are being read, and the context manager guarantees
    # the handle is closed even if parsing fails part-way through.
    with open("Documents_Uploaded/" + filename, 'rb') as pdf_file:
        # NOTE(review): PdfFileReader/numPages/getPage/extractText are the
        # legacy PyPDF2 names; kept as-is so the required library version
        # does not change - confirm against the pinned PyPDF2 release.
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        for i in range(read_pdf.numPages):
            page_text = read_pdf.getPage(i).extractText()
            txtlist.extend(str(page_text).split("\n\n"))
    # Drop empty pieces and pieces that are a single space.
    textlist = [item for item in txtlist if item and item != ' ']
    print("Reading PDF Complete")
    return textlist
def wordtolist(filename):
    """Convert the paragraphs of a WORD document into a list of strings.

    Opens ``Documents_Uploaded/<filename>``, reads it with ``docx.Document``
    and collects the text of each paragraph in document order. Paragraphs
    whose text is empty or exactly one space are dropped from the result.

    Args:
        filename: Name of the WORD file, relative to the
            ``Documents_Uploaded/`` directory.

    Returns:
        A list of the non-blank paragraph texts, in document order.

    Raises:
        OSError: If the file cannot be opened (for example when it does
            not exist). Errors raised by ``docx`` while parsing propagate
            unchanged.
    """
    # The context manager guarantees the handle is closed even if parsing
    # fails; paragraph text is collected while the file is still open.
    with open("Documents_Uploaded/" + filename, 'rb') as word_file:
        read_word = docx.Document(word_file)
        listText = [paragraph.text for paragraph in read_word.paragraphs]
    # Drop empty paragraphs and paragraphs that are a single space.
    textlist = [item for item in listText if item and item != ' ']
    print("Reading WORD Complete")
    return textlist