"""Split a scanned PDF into pages and regroup the pages by parsed area.

The pipeline has four steps, originally four separate scripts:

1. single_page_pdf_to_multiple.py  - split the input PDF into one-page PDFs.
2. pdf_address_Parser.py           - extract each page's text with
   ``pdftotext``, parse an area from it and rename the page to
   ``<area>-<file_id>.pdf``.
3. area_wise_json_file_generator.py - index the renamed pages by area.
4. area_wise_pdf_generator.py      - merge the pages of each area.

All paths are relative to the working directory at import time; the
directories ``input_file``, ``generated_pdf``, ``TEXT``,
``area_wise_pdf/Not_parsed`` and ``area_wise_pdf/Area_Wise`` are expected to
exist already.
"""
# NOTE(review): convert_path, convert and pd appear unused in this file.
# distutils was removed in Python 3.12 and lib2to3 in Python 3.13, so these
# two imports fail on newer interpreters; they are kept only because other
# code may rely on them being importable from here.
from distutils.util import convert_path
from lib2to3.pytree import convert
import csv
import json
import os
import re
import subprocess

import pandas as pd
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter

import area_subarea_parser

script_path = os.getcwd()
print(script_path)

# File id as it appears on line 9 of the extracted text: "R" + 10-20 digits.
_FILE_ID_PATTERN = re.compile(r'(R\d{10,20})')


def pdf_to_file_converter(input_file):
    """Run the whole pipeline for ``input_file``.

    NOTE(review): the original file had lost its indentation, so it is an
    assumption that steps 2-4 belong to this function rather than to module
    level - confirm against the callers.

    Args:
        input_file: File name of the PDF inside the ``input_file`` directory.
    """
    _split_into_pages(input_file)
    _rename_pages_by_area()
    _write_area_index()
    _merge_pages_by_area()


def _split_into_pages(input_file):
    """Write every page of the input PDF to ``generated_pdf`` as its own PDF."""
    # First Run single_page_pdf_to_multiple.py file
    # The reader pulls pages from the stream on demand, so the input must
    # stay open until the last page has been written.
    with open("input_file/" + input_file, "rb") as source_stream:
        reader = PdfFileReader(source_stream)
        for page_index in range(reader.numPages):
            page_writer = PdfFileWriter()
            page_writer.addPage(reader.getPage(page_index))
            page_path = "generated_pdf/document-page%s.pdf" % page_index
            with open(page_path, "wb") as output_stream:
                page_writer.write(output_stream)


def _rename_pages_by_area():
    """Rename each generated page to ``<area>-<file_id>.pdf``.

    Addresses for which the parser returns no area are logged to
    ``area_wise_pdf/Not_parsed/not_parsed.csv`` and the page is renamed with
    the area ``Not_Parsed``. A failure on one page is printed and does not
    stop the remaining pages.
    """
    # Then Run pdf_address_Parser.py file
    pdf_dir = script_path + "/generated_pdf"
    with open('area_wise_pdf/Not_parsed/not_parsed.csv', 'w', newline='') as csv_file:
        not_parsed_writer = csv.writer(csv_file)
        not_parsed_writer.writerow(["Non Parsed Address"])
        filenames = os.listdir(pdf_dir)
        for filename in filenames:
            # Best-effort boundary: report the problem and move on.
            try:
                _rename_one_page(filename, not_parsed_writer)
            except Exception as exc:
                print(exc)
        print(len(filenames))


def _rename_one_page(filename, not_parsed_writer):
    """Extract, parse and rename a single page from ``generated_pdf``."""
    output_name = filename.replace('.pdf', '')
    pdf_path = script_path + '/generated_pdf/' + filename
    text_path = script_path + '/TEXT/' + output_name + '.txt'

    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters from breaking or altering the command.
    subprocess.run(["pdftotext", "-layout", pdf_path, text_path], check=True)

    with open(text_path) as text_file:
        data = text_file.readlines()

    # Raises IndexError for pages with fewer than nine text lines; the caller
    # reports it and skips the page.
    id_match = _FILE_ID_PATTERN.search(data[8].strip())
    file_id = id_match.group(1) if id_match else None

    # First space-delimited field of every line after the first one.
    # NOTE(review): whitespace in the original source was collapsed, so the
    # separator may have been wider than one space - confirm.
    temp_add = ''.join(line.split(' ')[0] + ' ' for line in data[1:])

    area = area_subarea_parser.ASU_Parser(temp_add)[0]['area']
    if area is None:
        area = "Not_Parsed"
        not_parsed_writer.writerow([temp_add.strip()])

    if file_id is None:
        # Without an id the target name cannot be built; leave the page as is.
        print("No file id found in " + filename + "; not renamed")
        return

    new_path = script_path + "/generated_pdf/" + area + '-' + file_id + ".pdf"
    try:
        os.rename(pdf_path, new_path)
    except OSError as exc:
        print(exc)


def _write_area_index():
    """Write ``area_wise_json_file.json`` mapping each area to its page files."""
    # Then Run area_wise_json_file_generator.py file
    filenames = os.listdir(script_path + "/generated_pdf")
    area_file = {}
    for filename in filenames:
        # NOTE(review): pages that were never renamed ("document-pageN.pdf")
        # end up under the key "document" - confirm this is intended.
        area_name = filename.split('-')[0]
        if area_name != 'Not_Parsed':
            area_file.setdefault(area_name, []).append(filename)
    print(len(filenames))
    with open("area_wise_json_file.json", "w") as index_file:
        json.dump(area_file, index_file, indent=4)


def _merge_pages_by_area():
    """Merge the pages of every indexed area into one file per area."""
    # Then Run area_wise_pdf_generator.py file
    with open('area_wise_json_file.json') as index_file:
        area_wise_json = json.load(index_file)

    # Known limitation (from the original author): on some operating systems
    # and RAM configurations only about 3000 pages can be merged at once. If
    # an area listed in the JSON file has no merged output, merge its pages
    # manually in batches of 3000 and then combine the batches into one PDF.
    for area_name, page_files in area_wise_json.items():
        merger = PdfFileMerger()
        try:
            for page_file in page_files:
                merger.append('generated_pdf/' + page_file)
            # NOTE(review): the output name has no ".pdf" suffix - confirm.
            merger.write("area_wise_pdf/Area_Wise/" + area_name)
        finally:
            # Always release the merger, even when a page fails to load.
            merger.close()