# 2022-03-20 19:12:28
# First Run single_page_pdf_to_multiple.py file 
from distutils.util import convert_path
from lib2to3.pytree import convert
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
import re
import area_subarea_parser
import json 
import csv
import pandas as pd
import area_subarea_parser 
 
from PyPDF2 import PdfFileMerger
# Working directory at import time (not necessarily the directory this file
# lives in); later code builds absolute paths from it.
script_path = os.getcwd()
print(script_path)
 
# File ids look like "R" followed by 10-20 digits.
_FILE_ID_PATTERN = re.compile(r'(R\d{10,20})')


def _split_pdf_into_pages(input_file):
    """Write every page of ``input_file/<input_file>`` to its own PDF.

    Page ``i`` (zero-based) is written to ``generated_pdf/document-page<i>.pdf``.
    """
    # The reader pulls page data from the stream on demand, so the source
    # file must stay open until the last page has been written.
    with open("input_file/" + input_file, "rb") as source:
        inputpdf = PdfFileReader(source)
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            with open("generated_pdf/document-page%s.pdf" % i, "wb") as outputStream:
                output.write(outputStream)


def _extract_page_text(pdf_path, text_path):
    """Run ``pdftotext -layout`` on *pdf_path* and return the lines of *text_path*.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits with an error.
        OSError: if ``pdftotext`` cannot be started or the text file cannot
            be read.
    """
    # Local import so this block does not depend on a file-level import.
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; check=True stops us from reading a stale text
    # file left over from an earlier run when the conversion fails.
    subprocess.run(["pdftotext", "-layout", pdf_path, text_path], check=True)
    with open(text_path) as text_file:
        return text_file.readlines()


def _rename_pages_by_area():
    """Rename each PDF in ``generated_pdf`` to ``<area>-<file id>.pdf``.

    The text of every page is extracted with ``pdftotext``; the file id is
    searched for in the ninth text line and the area is taken from
    ``area_subarea_parser.ASU_Parser``. Pages whose area is ``None`` are named
    with the area ``Not_Parsed`` and their address text is appended to
    ``area_wise_pdf/Not_parsed/not_parsed.csv``. A failure on one page is
    printed and does not stop the remaining pages.
    """
    pages_dir = script_path + "/generated_pdf"
    with open('area_wise_pdf/Not_parsed/not_parsed.csv', 'w', newline='') as report:
        writer = csv.writer(report)
        writer.writerow(["Non Parsed Address"])
        # os.listdir returns a complete list up front, so renaming files
        # inside the loop does not disturb the iteration.
        for cnt, filename in enumerate(os.listdir(pages_dir), start=1):
            try:
                output_name = filename.replace('.pdf', '')
                data = _extract_page_text(
                    pages_dir + '/' + filename,
                    script_path + '/TEXT/' + output_name + '.txt',
                )

                # Raises IndexError (handled below) for pages with fewer
                # than nine text lines.
                match = _FILE_ID_PATTERN.search(data[8].strip())
                file_id = match.group(1) if match else None

                # Keep only the text before the first run of six spaces on
                # each line after the first (presumably the left-hand column
                # of the layout output - TODO confirm).
                temp_add = ''.join(
                    line.split('      ')[0] + ' ' for line in data[1:]
                )

                area = area_subarea_parser.ASU_Parser(temp_add)[0]['area']
                if area is None:
                    area = "Not_Parsed"
                    writer.writerow([temp_add.strip()])

                if file_id is None:
                    # Without an id there is no target name. The page keeps
                    # its "document-page<N>.pdf" name, so the grouping stage
                    # will file it under the area "document".
                    print("No file id found in " + filename + "; file not renamed")
                else:
                    try:
                        os.rename(
                            pages_dir + "/" + filename,
                            pages_dir + "/" + area + '-' + file_id + ".pdf",
                        )
                    except OSError as e:
                        print(e)

            except Exception as e:
                # Best effort: report the problem and carry on with the
                # next page.
                print(e)

            print(cnt)


def _group_pages_by_area():
    """Group the files in ``generated_pdf`` by the area prefix of their name.

    The mapping ``{area: [filename, ...]}`` is written to
    ``area_wise_json_file.json`` and returned. Files whose prefix is
    ``Not_Parsed`` are left out.
    """
    area_file = {}
    for cnt, filename in enumerate(os.listdir(script_path + "/generated_pdf"), start=1):
        # Names are built as "<area>-<file id>.pdf" and the id contains no
        # hyphen, so splitting on the LAST hyphen recovers the area even
        # when the area itself contains one.
        temp_file = filename.rsplit('-', 1)[0]
        if temp_file != 'Not_Parsed':
            area_file.setdefault(temp_file, []).append(filename)
        print(cnt)

    with open("area_wise_json_file.json", "w") as f:
        json.dump(area_file, f, indent=4)
    return area_file


def _merge_area_pdfs(area_wise_json):
    """Merge the page files of every area into ``area_wise_pdf/Area_Wise/<area>``.

    Limitation noted by the original author: on some operating systems and
    RAM configurations only about 3000 pages can be merged at a time. If an
    area from the JSON keys cannot be produced here, merge its pages manually
    in batches of about 3000 and then combine the batches into one PDF.
    """
    for key, value in area_wise_json.items():
        merger = PdfFileMerger()
        try:
            for page_file in value:
                merger.append('generated_pdf/' + page_file)
            # NOTE(review): the output name has no ".pdf" extension - confirm
            # whether that is intended before changing it.
            merger.write("area_wise_pdf/Area_Wise/" + key)
        finally:
            # Release the merger's file handles even when a merge fails.
            merger.close()


def pdf_to_file_converter(input_file):
    """Split a PDF into pages, name each page by area, and merge pages per area.

    Stages, in order:
      1. Every page of ``input_file/<input_file>`` is written to
         ``generated_pdf/document-page<i>.pdf``.
      2. Each page is converted to text with ``pdftotext`` and renamed to
         ``<area>-<file id>.pdf``; addresses whose area could not be parsed
         are listed in ``area_wise_pdf/Not_parsed/not_parsed.csv``.
      3. The files are grouped by area and the grouping is written to
         ``area_wise_json_file.json``.
      4. The pages of each area are merged into one file under
         ``area_wise_pdf/Area_Wise/``.

    The directories used above are not created here and must already exist.

    Args:
        input_file: File name of the source PDF inside the ``input_file``
            directory.
    """
    _split_pdf_into_pages(input_file)
    _rename_pages_by_area()
    # The grouping is used directly; it is identical to what a reload of the
    # JSON file just written would return.
    _merge_area_pdfs(_group_pages_by_area())
# Invalid Email or Password  (stray text, not part of the script)