'''
Created on November 02, 2011
@author: Gaurav Sood
Scrape Text off Wisconsin Ads pdfs
'''

import sys, os, re, pyPdf, codecs, csv, string

def convertPdf2String(pdfFile):
    """Return the text of every page of a PDF as one whitespace-normalised string.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order. Non-breaking spaces
        (u"\\xa0") are turned into ordinary spaces and every run of whitespace
        is collapsed to a single space, so consecutive pages end up separated
        by exactly one space. If anything fails while opening or reading the
        file, the string "Unable to open file: <path> with error: <message>"
        is returned instead of an exception being raised.
    """
    pages = []
    # Failures are deliberately reported through the return value: callers
    # treat the result as plain text and simply find no fields in it.
    try:
        # Keep the file open for the whole extraction (the reader is handed
        # the open stream) and close it deterministically afterwards.
        with open(pdfFile, "rb") as pdf_stream:
            pdf = pyPdf.PdfFileReader(pdf_stream)
            for page_number in range(pdf.getNumPages()):
                pages.append(pdf.getPage(page_number).extractText())
    except Exception as e:
        return "Unable to open file: %s with error: %s" % (pdfFile, str(e))
    # Collapse whitespace once, after all pages are joined. Doing it inside
    # the page loop stripped the page separator and glued the last word of a
    # page to the first word of the next one.
    joined = u" \n".join(pages)
    return u" ".join(joined.replace(u"\xa0", u" ").split())

def writer(path, out):
    """Parse every file in a folder of ad PDFs and write one CSV row per ad.

    Each file's text is obtained with convertPdf2String, encoded to ASCII
    (non-ASCII characters become XML character references), echoed to stdout
    and then cut into fields using the literal markers 'Brand:', 'Parent:',
    'Aired:', 'Creative Id:', '[' and '[PFB'. Files whose text does not
    contain 'Brand' after the first character are skipped.

    Args:
        path: Folder containing the PDF files.
        out: Object with a writerow(sequence) method (e.g. a csv.writer).
            Each row is the tuple (creative, creative_id, date, race, title,
            brand, parent, sponsor, text).

    NOTE: the string handling assumes Python 2 semantics, where the encoded
    row is a byte `str`.
    """
    # Boilerplate removed from the storyboard text. These are handed to
    # re.sub as patterns, unchanged.
    boilerplate_patterns = (
        "Copyright 2003 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
        "Copyright 2004 TNS Media Intelligence/CMAG www.PoliticsOnTV.com 1-866-559-CMAG",
        "Storyboard",
    )
    for fname in os.listdir(path):
        # os.path.join works whether or not `path` ends with a separator.
        pdf_path = os.path.join(path, fname)
        row = convertPdf2String(pdf_path).encode("ascii", "xmlcharrefreplace")
        print(row)
        # Nothing to parse without a 'Brand' marker somewhere after the start
        # (this also skips the error string returned for unreadable files).
        if row.find('Brand') <= 0:
            continue

        race = row.split(' ')[0]
        # Title: everything after the first space-delimited token, up to 'Brand:'.
        title = ' '.join(row.split(' ')[1:]).partition('Brand:')[0]
        creative = row.partition('Brand:')[0]
        brand = row.partition('Brand:')[2].partition('Parent')[0]
        parent = row.partition('Parent:')[2].partition('Aired:')[0]
        date = row.partition('Aired:')[2].partition('Creative Id:')[0].strip()
        after_creative_id = row.partition('Creative Id:')[2]
        creative_id = after_creative_id.partition('[')[0]

        # str.partition already copes with a missing separator, so no guard is
        # needed. The old `if row.find('[PFB'):` guards were truthy for "not
        # found" (-1) and falsy only for a match at index 0, which left `text`
        # unbound or stale from the previous file.
        text = '[' + after_creative_id.partition('[')[2].partition('[PFB')[0]
        sponsor = row.partition('[PFB')[2]

        # Keep what follows the first ':' when there is one, else the whole tail.
        sponsor_parts = sponsor.split(':')
        if len(sponsor_parts) == 1:
            sponsor = sponsor_parts[0].rstrip()
        else:
            sponsor = sponsor_parts[1].rstrip()
        # Cut a trailing copyright notice and the closing bracket, including
        # when either one is the very first character.
        sponsor = sponsor.partition('Copyright')[0].lstrip()
        sponsor = sponsor.partition(']')[0]

        # Clean if you want to
        for pattern in boilerplate_patterns:
            text = re.sub(pattern, "", text)

        record = (creative, creative_id, date, race, title, brand, parent, sponsor, text)
        out.writerow(record)

# Header row: column names, in the same order as the record tuple that
# writer() passes to writerow().
header = ('creative', 'creative_id','date.aired', 'race', 'title', 'brand', 'parent', 'sponsor', 'text')

# NOTE(review): 'outpath' and path.to.ads.folder look like unfilled
# placeholders for the output CSV path and the folder of ad PDFs -- replace
# them with real values before running (path.to.ads.folder is not a defined
# name as written).
# The `with` block makes sure the CSV file is flushed and closed even if
# writer() raises part-way through.
with open('outpath', 'wb') as out_file:
    ads = csv.writer(out_file)
    ads.writerow(header) # Header Row
    writer(path.to.ads.folder, ads)

