Nov-30-2020, 08:35 AM
Hello all,
I am trying to make a audiobook project where I am using different libraries. In here I am trying to do:
pdf -> images -> text -> gTTS -> audiofile -> pygame mixer -> playing audio.
This is my general structure of project but last two processes are happening but not giving me audio. It didn't show me error also. Here is the code,
I am trying to make a audiobook project where I am using different libraries. In here I am trying to do:
pdf -> images -> text -> gTTS -> audiofile -> pygame mixer -> playing audio.
This is my general structure of project but last two processes are happening but not giving me audio. It didn't show me error also. Here is the code,
# Import the required module for text to speech conversion
from gtts import gTTS
import pygame
from PIL import Image
import pytesseract
import os
import glob
import PySimpleGUI as sg
import tkinter as tk
import fitz
# In this function we get first and last page, which we want the software to read
def get_text(value):
string = value
string = string.strip()
if "-" in string:
first_page_number = int(string.split("-")[0])
last_page_number = int(string.split("-")[1])
else:
first_page_number = int(string)
last_page_number = 0
return first_page_number, last_page_number
def main():
global e, first_page_number, last_page_number
##### Create directory for Text to speech software
current_directory = os.getcwd()
final_directory = os.path.join(current_directory, r'Text_to_speech_software')
if not os.path.exists(final_directory):
os.makedirs(final_directory)
print(current_directory)
print(final_directory)
#### GUI Part #####
# All the stuff inside your window.
layout = [[sg.Text('Choose PDF File to read'), sg.Input(), sg.FileBrowse()],
[sg.Text('Enter PDF Page number or range separated by - '), sg.InputText()],
[sg.Button('Ok'), sg.Button('Cancel')]
]
# Create the GUI Window Prompt
window = sg.Window('Input', layout)
valid = False
# Event Loop to process "events" and get the "values" of the inputs
while True:
event, values = window.read()
# Here we read the path of the pdf file
pdf_to_read = values[0]
if event in (None, 'Cancel'): # if user closes window or clicks cancel
print("Exitting")
window.close()
exit()
if event == "Ok":
if values[0] == "":
sg.Popup("Enter value", "Enter PDF file to be transcribed ")
if values[1] == "":
sg.Popup("Enter value", "Enter page number(s) to be transcribed")
if values[0] != "" and values[1] != "":
for char in values[1]:
if char.isdigit() == False:
sg.Popup("Invalid value", "Enter valid number or numbers separated by -")
break
else:
valid = True
break
# Break while loop if valid first and last page numbers received
if valid == True:
print('You entered ', values[1])
break
window.close()
first_page_number, last_page_number = get_text(values[1])
# In this bunch of code, we get permission to delete the folder if it already exists, where we intend to save our PDF images and audio
image_directory = glob.glob(final_directory)
for file in os.listdir(final_directory):
filepath = os.path.join(final_directory, file)
print(filepath)
os.chmod(filepath, 0o777)
os.remove(filepath)
# Here we read desired PDF pages and store them as images in a folder
doc = fitz.open(pdf_to_read)
k = 1
# If user wants to read a single page
if last_page_number == 0:
page = doc.loadPage(first_page_number - 1) # number of page
zoom_x = 2.0
zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y)
pix = page.getPixmap(matrix=mat)
output = os.path.join(final_directory, r"image_to_read.png")
pix.writePNG(output)
# If user wants to read range of pages
else:
for i in range(first_page_number - 1, last_page_number):
page = doc.loadPage(i) # number of page
zoom_x = 2.0
zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y)
pix = page.getPixmap(matrix=mat)
output = os.path.join(final_directory, r"image_" + str(k) + "_to_read.png")
pix.writePNG(output)
k += 1
print("Done")
# Initialize the Pytesseract OCR software
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
mytext = []
# Here we load the image(s) created in Text_to_speech folder and read the text in image via pytesseract Optical Character Recognition (OCR) software
# thus reading text in images and giving us a string
for file in os.listdir(final_directory):
data = pytesseract.image_to_string(Image.open(os.path.join(final_directory, file)), lang="eng")
data = data.replace("|",
"I") # For some reason the image to text translation would put | instead of the letter I. So we replace | with I
data = data.split('\n')
mytext.append(data)
# Language in which you want to convert
language = 'en'
print(mytext)
# Here we make sure that the text is read correctly and we read it line by line. Because sometimes, text would end abruptly
newtext = ""
for text in mytext:
for line in text:
line = line.strip()
# If line is small, ignore it
if len(line.split(" ")) < 10 and len(line.split(" ")) > 0:
newtext = newtext + " " + str(line) + "\n"
elif len(line.split(" ")) < 2:
pass
else:
if line[-1] != ".":
newtext = newtext + " " + str(line)
else:
newtext = newtext + " " + line + "\n"
print(newtext)
# Passing the text and language to the engine,
# here we have marked slow=False. Which tells
# the module that the converted audio should
# have a high speed
myobj = gTTS(text=newtext, lang=language, slow=False)
# Saving the converted audio in a mp3 file named pdf_audio.mp3
myobj.save(os.path.join(final_directory, "pdf_audio.mp3"))
# Here we load and play the audio file
pygame.mixer.init()
pygame.mixer.music.load(os.path.join(final_directory, "pdf_audio.mp3"))
pygame.mixer.music.play(-1)
########## GUI END ########
if __name__ == '__main__':
main()Kindly, help me.
