PYTHON: textract.exceptions.ShellError: The command antiword d:/doc\300 Dpi.doc failed with exit code 127

Рейтинг: 0Ответов: 1Опубликовано: 18.06.2023

Я использую этот код для преобразования .doc и .docx в txt

import os
import textract

# Path to the folder containing .doc and .docx files
input_folder = "d:/doc"

# Path to the folder where .txt files will be saved
output_folder = "d:/doc"

# Get a list of all .doc and .docx files in the input folder
files = [f for f in os.listdir(input_folder) if f.endswith((".doc", ".docx"))]

# Loop through each file and convert it to .txt using Textract
for file in files:
    file_path = os.path.join(input_folder, file)
    output_file_name = os.path.splitext(file)[0] + ".txt"
    output_file_path = os.path.join(output_folder, output_file_name)

    text = textract.process(file_path).decode("utf-8")

    with open(output_file_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(text)

print("Conversion complete!")

И У МЕНЯ ЭТА ОШИБКА НА ВЫВОДЕ

utils.py

*** Remote Interpreter Reinitialized ***
Traceback (most recent call last):
  File "C:\Users\Castel\AppData\Roaming\Python\Python310\site-packages\textract\parsers\utils.py", line 87, in run
    pipe = subprocess.Popen(
  File "C:\Program Files\Python39\lib\subprocess.py", line 966, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\Python39\lib\subprocess.py", line 1435, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
FileNotFoundError: [WinError 2] The system cannot find the file specified

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Convert Docx and Doc to TXT with textract.py", line 19, in <module>
    text = textract.process(file_path).decode("utf-8")
  File "C:\Users\Castel\AppData\Roaming\Python\Python310\site-packages\textract\parsers\__init__.py", line 79, in process
    return parser.process(filename, input_encoding, output_encoding, **kwargs)
  File "C:\Users\Castel\AppData\Roaming\Python\Python310\site-packages\textract\parsers\utils.py", line 46, in process
    byte_string = self.extract(filename, **kwargs)
  File "C:\Users\Castel\AppData\Roaming\Python\Python310\site-packages\textract\parsers\doc_parser.py", line 9, in extract
    stdout, stderr = self.run(['antiword', filename])
  File "C:\Users\Castel\AppData\Roaming\Python\Python310\site-packages\textract\parsers\utils.py", line 95, in run
    raise exceptions.ShellError(
textract.exceptions.ShellError: The command `antiword d:/doc\300 Dpi.doc` failed with exit code 127
------------- stdout -------------
------------- stderr -------------

>>>

Ответы

▲ -1
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader

# Path to the folder containing PDF files
input_folder = "d:/doc/doc"

# Path to the folder where text files will be saved
output_folder = "d:/doc/doc"

# Path to the Tesseract OCR executable (change if necessary)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Get a list of all PDF files in the input folder
files = [f for f in os.listdir(input_folder) if f.endswith(".pdf")]

# Loop through each PDF file and convert it to text using OCR
for file in files:
    pdf_path = os.path.join(input_folder, file)
    txt_path = os.path.join(output_folder, os.path.splitext(file)[0] + ".txt")

    # Convert PDF pages to images
    images = convert_from_path(pdf_path)

    # Perform OCR on images and extract text
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image)

    # Save the extracted text to a text file
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(text)

print("Conversion complete!")