#!/usr/bin/python

import os,re,sys


##############################################################################
### CONFIGURATION SECTION ####################################################
##############################################################################
# The BIBDIRS and PDFDIRS are lists in which you can specify which directories
# will be indexed. PDFTOTEXT has the location of the pdftotext tool.
##############################################################################
# Directories that are scanned for bibtex files.
BIBDIRS = ["BIB", "BIB-other"]
# Directories that are scanned for pdf files.
PDFDIRS = ["PDF"]
# Location of the pdftotext executable.
PDFTOTEXT = "/usr/bin/pdftotext"


##############################################################################
### BODY OF THE PROGRAM, DO NOT TOUCH! #######################################
##############################################################################

# Build a lookup table that maps the basename of every pdf file (the file name
# without its trailing ".pdf") to the path of that file. Entries that do not
# end in ".pdf" are skipped, and only the trailing suffix is removed so that a
# ".pdf" in the middle of a file name is left alone.
pdffiles = {}
for pdfdir in PDFDIRS:
  for entry in os.listdir(pdfdir):
    if not entry.endswith(".pdf"):
      continue
    # os.path.join also copes with absolute or slash-terminated directories
    pdffiles[entry[:-len(".pdf")]] = os.path.join(os.curdir, pdfdir, entry)



# Loop over all bibtex directories and write one document per ".bib" file to
# stdout: a "Path-Name:" / "Content-Length:" header followed by an xml-like
# record that holds the bibtex fields and, when a pdf file with the same
# basename is known in pdffiles, the text extracted from that pdf.
# NOTE: field values and pdf text are inserted verbatim (no xml escaping) and
# the pdf path is handed to the shell unquoted.
for bibdir in BIBDIRS:
  for entry in os.listdir(bibdir):

    # check if we're really dealing with a bibfile; anything else is skipped
    if not entry.endswith(".bib"):
      continue

    # strip off only the trailing .bib to get the basename and add the full
    # path info to get the bibname
    basename = entry[:-len(".bib")]
    bibname = os.path.join(os.curdir, bibdir, entry)

    # first process the bibtex data; the pieces are joined once at the end
    parts = ["<bibfile>\n", "  <bibtex>\n"]

    # loop over the lines in the bibfile and check for key=value pairs. only
    # those have to be xml-ified. Strip newlines first!
    with open(bibname, "r") as bibfile:
      for line in bibfile:
        line = line.rstrip("\n")
        if "=" not in line:
          continue
        # split on the first "=" only, a value (e.g. a url) may contain more
        k, v = line.split("=", 1)
        # the key becomes a tag name, so drop all whitespace (spaces and tabs)
        k = "".join(k.split())
        parts.append("    <%s>%s</%s>\n" % (k, v, k))

    parts.append("  </bibtex>\n")
    parts.append("  <content>\n")

    # check if there is a pdf file and, if so, add its text to the document
    if basename in pdffiles:
      command = "%s %s - 2>/dev/null " % (PDFTOTEXT, pdffiles[basename])
      child = os.popen(command)
      data = child.read()
      # close() gives None on success and the exit status otherwise
      err = child.close()
      if err:
        raise RuntimeError('%s failed w/ exit code %d' % (command, err))
      parts.append(data)

    # stop processing
    parts.append("  </content>\n")
    parts.append("</bibfile> \n\n\n")
    doc = "".join(parts)

    # output info on current doc
    sys.stdout.write("Path-Name: " + bibname + " \n")
    sys.stdout.write("Content-Length: " + str(len(doc)) + " \n\n")
    sys.stdout.write(doc)


