PDF?? ???? ???? ??? ????? PDF? ??? ?? ?? ??? ???? ?? ?? ?????. ??? ??? ??? ???? pdfminer ?? pdfplumber? ?? ?? Python ?????? ???? ?? ??????. ????? ????? ??PDF?? ???? ???? ??, ??, ?? ?? ??? ??? JSON ???? ???? ??? ?????.
PDF ??? ??? ??? ?? ??? ??-????? ???? ??? pdfplumber? ???? ?? ? ?? ?? ???? ?????. ??? ? PDF ???? ???? ??? ?? ???? ??? ?? ?? pytesseract ?????? ???? OCR? ???? ???? "??"?? ????. ? ??????? ?? ???? ??? ?????.
??? ?
- pdfplumber(Python ?????)
- pdf2image(Python ?????)
- pytesseract(Python ?????)
- ????-ocr
??? ?? pip ??? ???? Python ?????? ??? ? ????. Tesseract-OCR? ?? ?? ????? ?????? ?????? ?????. pytesseract? tesseract ?????? ??? ??? ????.
pip install pdfplumber
pip install pdf2image
pip install pytesseract
PDF ???? ???? ??
? ?? ??? PDF ???? ???? ???? ????. ? extract_text_from_pdf() ??? PDF ??? page_num(??? 0)? ????? ???? ?? ??? ?? ??? ?????. ???? ?? ?? ???? ???? ???? ??? ?? ?? ?????.
# NOTE: this snippet expects `pdfplumber`, `pytesseract`, pdf2image's
# `convert_from_path` and `process_text` to be available in the module scope.


# Extract text from a specific page of a PDF
def extract_text_from_pdf(pdf_path, page_num):
    """OCR one page of a PDF and hand the recognised text to process_text().

    The page is rendered to an image, reduced to pure black and white, run
    through Tesseract, and the raw OCR text is written to ``out.txt``.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Zero-based page index (pdf2image is called with
            ``page_num + 1`` because its page numbers start at 1).

    Raises:
        IndexError: If ``page_num`` is outside the PDF's page range.
    """
    # pdfplumber is only used to validate the page index; the context
    # manager makes sure the underlying file handle is closed again.
    with pdfplumber.open(pdf_path) as pdf:
        print(f"extracting page {page_num}..")
        _ = pdf.pages[page_num]

    images = convert_from_path(
        pdf_path, first_page=page_num + 1, last_page=page_num + 1
    )
    image = images[0]

    # Convert to black and white
    bw_image = convert_to_bw(image)
    # Uncomment to save the B&W image for debugging:
    # bw_image.save("bw_page.png")

    # Perform OCR on the B&W image
    e_text = ocr_image(bw_image)
    if e_text is None:
        # ocr_image() already reported the failure; there is nothing to
        # write or parse, and write(None) would raise TypeError.
        return

    with open('out.txt', 'w', encoding='utf-8') as out_file:
        out_file.write(e_text)

    # Parsing is best-effort: a malformed page must not abort the run.
    try:
        process_text(page_num, e_text)
    except Exception as e:
        print("Error occurred:", e)
    print("done..")


# Convert image to black and white
def convert_to_bw(image):
    """Return a 1-bit copy of ``image`` using a fixed threshold of 128.

    ``image`` must provide ``convert()`` and the result ``point()``, as the
    images returned by ``convert_from_path`` do.
    """
    # Convert to grayscale
    gray = image.convert('L')
    # Apply threshold to convert to pure black and white
    bw = gray.point(lambda x: 0 if x < 128 else 255, '1')
    return bw


# Perform OCR using Tesseract on a given image
def ocr_image(image_path):
    """Run Tesseract on ``image_path`` and return the recognised text.

    Despite the parameter name, the caller in this file passes an image
    object rather than a path; the value is forwarded unchanged to
    ``pytesseract.image_to_string``.

    Returns:
        The OCR text, or ``None`` if anything went wrong (the error is
        printed instead of raised).
    """
    try:
        # --psm 6 treats the image as a single block of text
        custom_config = r'--oem 3 --psm 6 -l guj+eng'
        text = pytesseract.image_to_string(image_path, config=custom_config)
        return text
    except Exception as e:
        print(f"Error during OCR: {e}")
        return None
ocr_image() ??? pytesseract? ???? OCR? ?? ????? ???? ?????. --oem ? --psm? ?? ?? ????? ??? ?? ??? ???? -l guj+eng ????? ?? ??? ?????. ? PDF?? ?? ?? ???? ???? ???? guj+eng? ??????.
??? ??
OCR? ???? ???? ??? ??? ??? ???? ?? ??? ? ????. ?? pdfplumber ?? pypdf2? ?? ?? PDF ?????? ???? ?????.
# Characters accepted as digits of a record number ("kramank").
# NOTE(review): every '?' in the string literals of this snippet looks like a
# placeholder for a non-ASCII character lost in an encoding conversion
# (presumably Gujarati, given the guj+eng OCR setting) - restore the original
# literals before relying on this code.
nums = ['0', '?', '?', '?', '?', '?', '?', '?', '?', '?']


def _part(parts, index):
    """Return parts[index], or '' when the line has too few columns."""
    return parts[index] if index < len(parts) else ''


def _fill_details(obj, parts, offset):
    """Fill dob, marital status, extra fields and blood group into obj.

    ``offset`` is 1 for numbered lines (column 0 holds the record number)
    and 0 for family-member lines; the date column sits at ``offset + 6``.
    The caller guarantees that ``parts[offset + 6]`` exists.
    """
    date_col = offset + 6
    if _part(parts, date_col + 1) == '????':
        # The date is followed by a separate marker token.
        obj['dob'] = parts[date_col] + ' ????'
        idx = date_col + 1
    elif len(parts[date_col]) == 8 and parts[date_col][2] == '-':
        # Eight characters with '-' in third position, e.g. dd-mm-yy.
        obj['dob'] = parts[date_col]
        idx = date_col
    else:
        print("warning: no date")
        idx = date_col - 1
    # _part() avoids an IndexError when the line ends right after the date.
    obj['marital_status'] = _part(parts, idx + 1)
    obj['extra_fields'] = '::'.join(parts[idx + 2:-2])
    obj['blood_group'] = parts[-1]


def process_text(page_num, e_text):
    """Parse the OCR text of one page into records and dump them to JSON.

    Three kinds of lines are recognised:
      * numbered lines (first token made only of ``nums`` characters, at
        least two of them, and at least nine tokens) start a new record;
      * lines whose first token equals the last seen surname start a new
        member of the same family and inherit its record number;
      * other lines containing parentheses or the "??.?" marker are appended
        to the current record's ``extra_fields``.

    Finished records are appended to the module-level ``records`` list
    (defined outside this snippet) and the whole list is written to
    ``guj.json``.

    Args:
        page_num: Zero-based page index; stored as ``page_num + 1``.
        e_text: OCR text of the page.
    """
    obj = None
    last_surname = None
    last_kramank = None
    print(f"processing page {page_num}..")
    for line in e_text.splitlines():
        # Drop table-border artefacts produced by the OCR.
        line = line.replace('|', '').replace('[', '').replace(']', '')
        parts = [word for word in line.split(' ') if word]
        if len(parts) < 2:
            continue

        first = parts[0]
        is_numbered = len(first) >= 2 and all(char in nums for char in first)

        if is_numbered:
            # numbered line
            if len(parts) < 9:
                continue
            if obj:
                records.append(obj)
            last_surname = parts[1]
            last_kramank = first
            obj = {
                'kramank': first,
                'full_name': ' '.join(parts[1:4]),
                'surname': parts[1],
                'pdf_page_num': page_num + 1,
                'registered_by': parts[4],
                'village_vatan': parts[5],
                'village_mosal': parts[6],
            }
            _fill_details(obj, parts, 1)
        elif first == last_surname:
            # new member in existing family
            if len(parts) < 6:
                # Too short to hold the mandatory columns; skipping avoids
                # an IndexError that would abort the rest of the page.
                continue
            if obj:
                records.append(obj)
            obj = {
                'kramank': last_kramank,
                'surname': last_surname,
                'full_name': ' '.join(parts[0:3]),
                'pdf_page_num': page_num + 1,
                'registered_by': parts[3],
                'village_vatan': parts[4],
                'village_mosal': parts[5],
            }
            if len(parts) <= 6:
                # No further columns: keep the partial record as it is.
                continue
            _fill_details(obj, parts, 0)
        elif obj:
            # continuation lines
            if ("(" in line and ")" in line) or "??.?" in line:
                # .get() covers partial records that never got extra_fields.
                obj['extra_fields'] = (
                    obj.get('extra_fields', '') + ' ' + '::'.join(parts)
                )

    if obj:
        records.append(obj)

    # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file;
    # the parsed JSON data is the same either way.
    jstr = json.dumps(records, indent=4, ensure_ascii=False)
    with open("guj.json", 'w', encoding='utf-8') as out_file:
        out_file.write(jstr)
    print(f"written page {page_num} to json..")
?? PDF?? ???? ? ??? ???? ????. ? ?? ? ?? ??? ? ????(?: 0? ?? 0?)? ?? ??(?)? ??? ? ? ??? ?????.
pytesseract? IT ??? ??? ??? ???? ?????. ? 10? ?? ?? ???? ??? PC? ????? ??? ?? ??? OCR? ???? PDF ???? ??? ?? ???? ?? ?? ???????. ??? ??? ?????! ??? ?? ??? ????. ?? ??? ??? ????? ?????.
????
- Windows? Tesseract ??
- pytesseract OCR? ???? ????? ??? ??
- Windows 10?? ??? ?? ??? ?? ??? ??? ????? pytesseract? ???? ??? ??????
? ??? ??? ?? PDF?? ??? ?? ??: pytesseract OCR ????? ?? ?????. ??? ??? PHP ??? ????? ?? ?? ??? ?????!

? AI ??

Undress AI Tool
??? ???? ??

Undresser.AI Undress
???? ?? ??? ??? ?? AI ?? ?

AI Clothes Remover
???? ?? ???? ??? AI ?????.

Clothoff.io
AI ? ???

Video Face Swap
??? ??? AI ?? ?? ??? ???? ?? ???? ??? ?? ????!

?? ??

??? ??

???++7.3.1
???? ?? ?? ?? ???

SublimeText3 ??? ??
??? ??, ???? ?? ????.

???? 13.0.1 ???
??? PHP ?? ?? ??

???? CS6
??? ? ?? ??

SublimeText3 Mac ??
? ??? ?? ?? ?????(SublimeText3)

API ??? ??? ??? ?? ??? ???? ???? ???? ????. 1. Apikey? ?? ??? ?? ????, ????? ?? ?? ?? URL ?? ??? ?????. 2. Basicauth? ?? ???? ??? Base64 ??? ??? ??? ??? ????? ?????. 3. OAUTH2? ?? Client_ID ? Client_Secret? ?? ??? ?? ?? ?? ??? BearEtroken? ???????. 4. ?? ??? ???? ?? ?? ?? ???? ????? ???? ?? ?? ? ????. ???, ??? ?? ??? ??? ???? ?? ??? ???? ???? ?? ?????.

Assert? ????? ???? ???? ?? ? ???? ??? ???? ??? ?? ?? ????. ??? ??? ??? ?? ??? ?????, ?? ?? ?? ??, ?? ?? ?? ?? ?? ?? ??? ????? ?? ?? ??? ?? ???? ??? ? ??? ??? ??? ??? ?? ???????. ?? ??? ???? ?? ?? ???? ?? ????? ??? ? ????.

Type hints in Python solve the problem of ambiguity and potential bugs in dynamically typed code by allowing developers to specify expected types. They enhance readability, enable early bug detection, and improve tooling support. Type hints are added using a colon (:) for variables and parameters

????? ??? ? ??? ??? ?? ??? ???? ??? zip () ??? ???? ????.? ??? ?? ??? ???? ?? ??? ?? ????. ?? ??? ???? ?? ?? itertools.zip_longest ()? ???? ?? ?? ? ??? ?? ? ????. enumerate ()? ???? ??? ???? ?? ? ????. 1.zip ()? ???? ????? ?? ??? ??? ??? ?????. 2.zip_longest ()? ???? ?? ??? ?? ? ? ???? ?? ? ????. 3. Enumental (Zip ())? ??? ??? ????? ??? ???? ???? ?? ???? ?? ? ????.

In Python, iterators are objects that allow looping through collections by implementing __iter__() and __next__()

Python? ???? ????? ???? API? ???? Fastapi? ?????. ?? ??? ?? ????? ?????? ??? ??? ??? ???? ?? ? ? ????. Fastapi ? Asgi Server Uvicorn? ?? ? ? ????? ??? ??? ? ????. ??? ??, ?? ?? ?? ? ???? ?????? API? ???? ?? ? ? ????. Fastapi? ??? HTTP ??? ???? ?? ?? ? Swaggerui ? Redoc Documentation Systems? ?????. ?? ??? ?? URL ?? ??? ?? ? ??? ??, ?? ?? ??? ???? ???? ?? ?? ??? ??? ? ????. Pydantic ??? ???? ??? ?? ???? ???? ????? ? ??? ? ? ????.

?? ??? ?? ????? ???? ?? ? ? ??????. Python? ?? Venv ??? ???? ??? ??? Python-Mvenvenv???. ??? ?? : Windows? Env \ Scripts \ Activate? ?????. MacOS/Linux? Sourceenv/bin/activate? ?????. ?? ???? PipinStall? ???? PipFreeze> ?? ??? ???? ?? ?? ??? ???? PipinStall-Rrequirements.txt? ???? ??? ?????. ?? ???? GIT? ???? ?? ? ???? ?? ??? ? ????? IDE?? ?? ?? ? ???? ??? ? ????.

API? ?????? Python? ?? ?????? ???????. ??? ?????? ????, ??? ???, ??? ????, ?? ??? ???? ? ???? ????. ?? PipinstallRequests? ?? ?????? ??????. ?? ?? requests.get () ?? requests.post () ? ?? ???? ???? ?? ?? ?? ??? ?????. ?? ?? response.status_code ? response.json ()? ???? ?? ??? ???? ????? ??????. ?????, ?? ?? ?? ??? ???? ?? ?? ??? ???? ? ?? ?????? ???? ?? ???? ???? ???? ??????.
