国产av日韩一区二区三区精品,成人性爱视频在线观看,国产,欧美,日韩,一区,www.成色av久久成人,2222eeee成人天堂

? ??? ?? ??? ???? ??? ?? PDF?? ??? ?? ??: pytesseract OCR ????

??? ?? PDF?? ??? ?? ??: pytesseract OCR ????

Dec 01, 2024 pm 06:47 PM

Unlocking Text from Embedded-Font PDFs: A pytesseract OCR Tutorial

PDF?? ???? ???? ??? ????? PDF? ??? ?? ?? ??? ???? ?? ?? ?????. ??? ??? ??? ???? pdfminer ?? pdfplumber? ?? ?? Python ?????? ???? ?? ??????. ????? ????? ??PDF?? ???? ???? ??, ??, ?? ?? ??? ??? JSON ???? ???? ??? ?????.

PDF ??? ??? ??? ?? ??? ??-????? ???? ??? pdfplumber? ???? ?? ? ?? ?? ???? ?????. ??? ? PDF ???? ???? ??? ?? ???? ??? ?? ?? pytesseract ?????? ???? OCR? ???? ???? "??"?? ????. ? ??????? ?? ???? ??? ?????.

??? ?

  • pdfplumber(Python ?????)
  • pdf2image(Python ?????)
  • pytesseract(Python ?????)
  • ????-ocr

??? ?? pip ??? ???? Python ?????? ??? ? ????. Tesseract-OCR? ?? ?? ????? ?????? ?????? ?????. pytesseract? tesseract ?????? ??? ??? ????.

pip install pdfplumber
pip install pdf2image
pip install pytesseract

PDF ???? ???? ??

? ?? ??? PDF ???? ???? ???? ????. ? extract_text_from_pdf() ??? PDF ??? page_num(??? 0)? ????? ???? ?? ??? ?? ??? ?????. ???? ?? ?? ???? ???? ???? ??? ?? ?? ?????.

# Extract text from a specific page of a PDF
def extract_text_from_pdf(pdf_path, page_num):
    """OCR a single PDF page and pass the recognised text to process_text().

    The page is rendered to an image, reduced to black and white, run through
    OCR, dumped to ``out.txt`` for inspection, and then parsed.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Zero-based page index (the renderer is called with
            ``page_num + 1`` because it is given 1-based page numbers).

    Raises:
        IndexError: If ``page_num`` is outside the document's page list.
    """
    # pdfplumber is used only to confirm the requested page exists; the
    # context manager releases the file handle as soon as that is known.
    with pdfplumber.open(pdf_path) as pdf:
        print(f"extracting page {page_num}..")
        _ = pdf.pages[page_num]

    images = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
    image = images[0]
    # Convert to black and white before OCR
    bw_image = convert_to_bw(image)
    # Save the B&W image for debugging (optional)
    #bw_image.save("bw_page.png")
    # Perform OCR on the B&W image
    e_text = ocr_image(bw_image)
    if e_text is None:
        # ocr_image() has already reported the failure; there is nothing to
        # write or parse, so stop here instead of failing on write(None).
        print("no OCR text produced, skipping page..")
        return

    # Keep the raw OCR output on disk so parsing problems can be inspected.
    with open('out.txt', 'w', encoding='utf-8') as out_file:
        out_file.write(e_text)
    #print("output written to file.")
    try:
        process_text(page_num, e_text)
    except Exception as e:
        # Best-effort: a parsing failure on one page is reported, not raised.
        print("Error occurred:", e)
    print("done..")

# Convert image to black and white
def convert_to_bw(image, threshold=128):
    """Return a pure black-and-white (mode ``'1'``) version of ``image``.

    Args:
        image: An object offering ``convert(mode)`` whose result offers
            ``point(function, mode)`` (a PIL image in this script).
        threshold: Grey level cut-off. Levels strictly below it become
            black (0); all others become white (255). Defaults to 128.

    Returns:
        Whatever ``point(...)`` returns for the thresholded greyscale image.
    """
    def to_black_or_white(level):
        return 0 if level < threshold else 255

    # Greyscale first so the threshold is applied to a single channel.
    gray = image.convert('L')
    return gray.point(to_black_or_white, '1')

# Run Tesseract OCR over an image and hand back the recognised text
def ocr_image(image_path):
    """Return the text pytesseract recognises in ``image_path``.

    Despite its name, the argument is passed straight to
    ``pytesseract.image_to_string``; the caller in this script passes an
    image object rather than a path.

    Returns:
        The recognised text, or ``None`` if anything goes wrong (the error
        is printed rather than raised).
    """
    # --psm 6 treats the image as a block of text; -l selects the guj and
    # eng language data together.
    tesseract_options = r'--oem 3 --psm 6 -l guj+eng'
    try:
        recognised = pytesseract.image_to_string(image_path, config=tesseract_options)
    except Exception as e:
        print(f"Error during OCR: {e}")
        return None
    return recognised

ocr_image() ??? pytesseract? ???? OCR? ?? ????? ???? ?????. --oem ? --psm? ?? ?? ????? ??? ?? ??? ???? -l guj+eng ????? ?? ??? ?????. ? PDF?? ?? ?? ???? ???? ???? guj+eng? ??????.

??? ??

OCR? ???? ???? ??? ??? ??? ???? ?? ??? ? ????. ?? pdfplumber ?? pypdf2? ?? ?? PDF ?????? ???? ?????.

# Characters accepted as digits of a record's serial number: ASCII zero
# followed by the Gujarati digits one to nine (U+0AE7..U+0AEF).
# NOTE(review): the nine non-ASCII entries were reconstructed from a copy in
# which each was the placeholder '?'; confirm against the author's list, and
# whether Gujarati zero (U+0AE6) should be accepted as well.
nums = ['0', '૧', '૨', '૩', '૪', '૫', '૬', '૭', '૮', '૯']

def process_text(page_num, e_text):
    """Parse one page of OCR text into record dicts and rewrite ``guj.json``.

    Each line is stripped of ``|``, ``[`` and ``]`` and split on spaces. Three
    kinds of line are recognised:

    * numbered line - first token is at least two characters, all in ``nums``,
      and the line has at least 9 tokens: starts a new family record;
    * member line - first token equals the surname of the last numbered line:
      starts a new record that reuses that serial number and surname;
    * continuation line - any other line, while a record is open, containing
      both parentheses or the marker literal: appended to ``extra_fields``.

    Completed records are appended to the module-level ``records`` list, and
    the whole list is then serialised to ``guj.json``.

    Args:
        page_num: Zero-based page index; stored 1-based as ``pdf_page_num``.
        e_text: OCR text of the page.
    """
    obj = None
    last_surname = None
    last_kramank = None
    print(f"processing page {page_num}..")
    for line in e_text.splitlines():
        line = line.replace('|', '').replace('[', '').replace(']', '')
        parts = [word for word in line.split(' ') if word]
        if len(parts) < 2:
            continue
        is_numbered = len(parts[0]) >= 2 and all(ch in nums for ch in parts[0])

        if is_numbered:  # numbered line
            if len(parts) < 9:
                continue
            if obj:
                records.append(obj)
            obj = {}
            last_surname = parts[1]
            obj['kramank'] = parts[0]
            last_kramank = parts[0]
            obj['full_name'] = ' '.join(parts[1:4])
            obj['surname'] = parts[1]
            obj['pdf_page_num'] = page_num + 1
            obj['registered_by'] = parts[4]
            obj['village_vatan'] = parts[5]
            obj['village_mosal'] = parts[6]
            _fill_trailing_fields(obj, parts, 7)
        elif parts[0] == last_surname:  # new member in existing family
            if len(parts) < 6:
                # Too short to hold the fixed columns; indexing them would
                # raise IndexError and abort the whole page.
                print("warning: short member line skipped")
                continue
            if obj:
                records.append(obj)
            obj = {}
            obj['kramank'] = last_kramank
            obj['surname'] = last_surname
            obj['full_name'] = ' '.join(parts[0:3])
            obj['pdf_page_num'] = page_num + 1
            obj['registered_by'] = parts[3]
            obj['village_vatan'] = parts[4]
            obj['village_mosal'] = parts[5]
            if len(parts) <= 6:
                # Only the fixed columns are present: keep the partial record.
                continue
            _fill_trailing_fields(obj, parts, 6)
        elif obj:  # continuation lines
            # NOTE(review): the "??.?" literal looks like non-ASCII text that
            # was replaced by '?' in transit - confirm the intended marker.
            if ("(" in line and ")" in line) or "??.?" in line:
                # A partial record has no 'extra_fields' yet, hence .get().
                obj['extra_fields'] = obj.get('extra_fields', '') + ' ' + '::'.join(parts[0:])
    if obj:
        records.append(obj)
    jstr = json.dumps(records, indent=4)
    with open("guj.json", 'w', encoding='utf-8') as json_file:
        json_file.write(jstr)
    print(f"written page {page_num} to json..")


def _fill_trailing_fields(obj, parts, date_idx):
    """Set dob, marital_status, extra_fields and blood_group on ``obj``.

    ``date_idx`` is the token position where a date of birth is expected. The
    date is either a token followed by the age-marker literal, or an
    8-character token with ``-`` as its third character; if neither is found
    the date column is treated as absent and ``dob`` is not set.
    """
    # NOTE(review): the '????' literal looks like non-ASCII text that was
    # replaced by '?' in transit - confirm the intended marker word.
    if len(parts) > date_idx + 1 and parts[date_idx + 1] == '????':
        obj['dob'] = parts[date_idx] + ' ????'
        idx = date_idx + 1
    elif len(parts) > date_idx and len(parts[date_idx]) == 8 and parts[date_idx][2] == '-':
        idx = date_idx
        obj['dob'] = parts[idx]
    else:
        print("warning: no date")
        idx = date_idx - 1
    # The marker can be the final token, leaving no marital-status column.
    obj['marital_status'] = parts[idx+1] if idx + 1 < len(parts) else ''
    obj['extra_fields'] = '::'.join(parts[idx+2:-2])
    obj['blood_group'] = parts[-1]

?? PDF?? ???? ? ??? ???? ????. ? ?? ? ?? ??? ? ????(?: 0? ?? 0?)? ?? ??(?)? ??? ? ? ??? ?????.

pytesseract? IT ??? ??? ??? ???? ?????. ? 10? ?? ?? ???? ??? PC? ????? ??? ?? ??? OCR? ???? PDF ???? ??? ?? ???? ?? ?? ???????. ??? ??? ?????! ??? ?? ??? ????. ?? ??? ??? ????? ?????.

????

  • Windows? Tesseract ??
  • pytesseract OCR? ???? ????? ??? ??
  • Windows 10?? ??? ?? ??? ?? ??? ??? ????? pytesseract? ???? ??? ??????

? ??? ??? ?? PDF?? ??? ?? ??: pytesseract OCR ????? ?? ?????. ??? ??? PHP ??? ????? ?? ?? ??? ?????!

? ????? ??
? ?? ??? ????? ???? ??? ??????, ???? ?????? ????. ? ???? ?? ???? ?? ??? ?? ????. ???? ??? ???? ???? ??? ?? admin@php.cn?? ?????.

? AI ??

Undresser.AI Undress

Undresser.AI Undress

???? ?? ??? ??? ?? AI ?? ?

AI Clothes Remover

AI Clothes Remover

???? ?? ???? ??? AI ?????.

Video Face Swap

Video Face Swap

??? ??? AI ?? ?? ??? ???? ?? ???? ??? ?? ????!

???

??? ??

???++7.3.1

???++7.3.1

???? ?? ?? ?? ???

SublimeText3 ??? ??

SublimeText3 ??? ??

??? ??, ???? ?? ????.

???? 13.0.1 ???

???? 13.0.1 ???

??? PHP ?? ?? ??

???? CS6

???? CS6

??? ? ?? ??

SublimeText3 Mac ??

SublimeText3 Mac ??

? ??? ?? ?? ?????(SublimeText3)

???

??? ??

??? ????
1600
29
PHP ????
1500
276
???
????? API ??? ???? ?? ????? API ??? ???? ?? Jul 13, 2025 am 02:22 AM

API ??? ??? ??? ?? ??? ???? ???? ???? ????. 1. Apikey? ?? ??? ?? ????, ????? ?? ?? ?? URL ?? ??? ?????. 2. Basicauth? ?? ???? ??? Base64 ??? ??? ??? ??? ????? ?????. 3. OAUTH2? ?? Client_ID ? Client_Secret? ?? ??? ?? ?? ?? ??? BearEtroken? ???????. 4. ?? ??? ???? ?? ?? ?? ???? ????? ???? ?? ?? ? ????. ???, ??? ?? ??? ??? ???? ?? ??? ???? ???? ?? ?????.

??? ??? ??????. ??? ??? ??????. Jul 07, 2025 am 12:14 AM

Assert? ????? ???? ???? ?? ? ???? ??? ???? ??? ?? ?? ????. ??? ??? ??? ?? ??? ?????, ?? ?? ?? ??, ?? ?? ?? ?? ?? ?? ??? ????? ?? ?? ??? ?? ???? ??? ? ??? ??? ??? ??? ?? ???????. ?? ??? ???? ?? ?? ???? ?? ????? ??? ? ????.

??? ?? ??? ?????? ??? ?? ??? ?????? Jul 07, 2025 am 02:55 AM

Type hints in Python solve the problem of ambiguity and potential bugs in dynamically typed code by allowing developers to specify expected types. They enhance readability, enable early bug detection, and improve tooling support. Type hints are added using a colon (:) for variables and parameters...

? ?? ? ??? ???? ?? Python ? ?? ? ??? ???? ?? Python Jul 09, 2025 am 01:13 AM

????? ??? ? ??? ??? ?? ??? ???? ??? zip () ??? ???? ????.? ??? ?? ??? ???? ?? ??? ?? ????. ?? ??? ???? ?? ?? itertools.zip_longest ()? ???? ?? ?? ? ??? ?? ? ????. enumerate ()? ???? ??? ???? ?? ? ????. 1.zip ()? ???? ????? ?? ??? ??? ??? ?????. 2.zip_longest ()? ???? ?? ??? ?? ? ? ???? ?? ? ????. 3. Enumental (Zip ())? ??? ??? ????? ??? ???? ???? ?? ???? ?? ? ????.

??? ???? ?????? ??? ???? ?????? Jul 08, 2025 am 02:56 AM

In Python, iterators are objects that allow looping through collections by implementing __iter__() and __next__()

Python Fastapi ???? Python Fastapi ???? Jul 12, 2025 am 02:42 AM

Python? ???? ????? ???? API? ???? Fastapi? ?????. ?? ??? ?? ????? ?????? ??? ??? ??? ???? ?? ? ? ????. Fastapi ? Asgi Server Uvicorn? ?? ? ? ????? ??? ??? ? ????. ??? ??, ?? ?? ?? ? ???? ?????? API? ???? ?? ? ? ????. Fastapi? ??? HTTP ??? ???? ?? ?? ? Swaggerui ? Redoc Documentation Systems? ?????. ?? ??? ?? URL ?? ??? ?? ? ??? ??, ?? ?? ??? ???? ???? ?? ?? ??? ??? ? ????. Pydantic ??? ???? ??? ?? ???? ???? ????? ? ??? ? ? ????.

??? ?? ?? ?? ? ?? ??? ?? ?? ?? ? ?? Jul 06, 2025 am 02:56 AM

?? ??? ?? ????? ???? ?? ? ? ??????. Python? ?? venv ??? ???? ??? ??? python -m venv env???. ??? ?? : Windows? env\Scripts\activate? ?????. macOS/Linux? source env/bin/activate? ?????. ?? ???? pip install? ???? pip freeze > ?? ??? ???? ?? ?? ??? ???? pip install -r requirements.txt? ???? ??? ?????. ?? ???? Git? ???? ?? ? ???? ?? ??? ? ????? IDE?? ?? ?? ? ???? ??? ? ????.

????? API? ????? ?? ????? API? ????? ?? Jul 12, 2025 am 02:47 AM

API? ?????? Python? ?? ?????? ???????. ??? ?????? ????, ??? ???, ??? ????, ?? ??? ???? ? ???? ????. ?? PipinstallRequests? ?? ?????? ??????. ?? ?? requests.get () ?? requests.post () ? ?? ???? ???? ?? ?? ?? ??? ?????. ?? ?? response.status_code ? response.json ()? ???? ?? ??? ???? ????? ??????. ?????, ?? ?? ?? ??? ???? ?? ?? ??? ???? ? ?? ?????? ???? ?? ???? ???? ???? ??????.

See all articles