#!/usr/bin/env python

"""
Bibliofolks
"""

__author__ = "Matteo Baldoni"
__version__ = "$Revision: 0.1 $"
__date__ = "$Date: 2008/03/25 15:30:00 $"
__copyright__ = "Copyright (c) 2001 Matteo Baldoni"
__license__ = "GPL"


from xml.dom import minidom
from re import *
from os import system

def true():
    """Return the integer flag 1 (truthy)."""
    return 1


def false():
    """Return the integer flag 0 (falsy)."""
    return 0

# Positions of the fields inside the tuples built by BiblioFolks.getTuple:
# (entry id, matched authors, title words, keyword tags).
t_id, t_authors, t_title, t_keywords = 0, 1, 2, 3

# Splits a text on every run of non-word characters.
p_split = compile(r'\W+')

# Title tokens that are ignored when word frequencies are counted.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python ('3' '2007' would become '32007'), which
# would leave both tokens unfiltered.
stopWords = [
    'in', 'from', 'of', 'to', 'and', 'an', 'with', 'that', 'a', 'on',
    'the', '', 'lo', 'di', 'dell\'', 'e', 'into', 'E\'', 'The',
    'by', 'for', 'about', 'as', '', 'A', '007', 'E', 'II', 'Using',
    'based', 'driven', 'guided', 'means', 'priori', 'About', 'An',
    '0', 'I', 'On', 'Proc', 'agli', 'like', 's', 'valued', '1', '3',
    '2007', 'dell',
]

# Keyword tokens dropped by the stop filter (only the empty token).
stopTags = ['']

# Lowercase variant of the author list, kept for reference:
#authors = ['baldoni', 'baroglio', 'patti', 'martelli', 'pozzato',
#           'gliozzi', 'schifanella', 'sapino', 'olivetti', 'giordano']

# Surnames searched (ignoring case) in each entry's author/editor field;
# they are also the keys of the per-author dictionaries.
authors = [
    'Baldoni', 'Baroglio', 'Patti', 'Martelli', 'Pozzato',
    'Gliozzi', 'Schifanella', 'Sapino', 'Olivetti', 'Giordano',
]

# Pseudo-author key under which the statistics over all entries are stored.
all_auths = 'all'

def compare(val1, val2):
    """Return whether val1 is strictly greater than val2.

    BiblioFolks.doEdges calls this with (stored weight, new tag weight) and
    replaces an edge's label when the result is true, so with this ordering
    an edge keeps the tag with the lowest weight.
    """
    # To keep the tag with the highest weight instead, test val1 < val2.
    return val2 < val1

# Edge frequency threshold used to select the edges that are written out.
edgeWeight = 3

# Minimum frequency a title word must reach to be reported: one limit shared
# by every single author, and a separate one for the whole collection.
wordAuthorFreqLimit = 2
wordFreqLimit = {}
for author in authors:
    wordFreqLimit.setdefault(author, wordAuthorFreqLimit)
wordFreqLimit[all_auths] = 3

# Switches telling whether entry ids are recorded per tag / per word.
collectAuthorTagId = true()
collectAuthorWordId = false()

class BiblioFolks:
    """Collect word and tag statistics from an XML bibliography.

    The document is searched for ``bibtex:entry`` elements carrying an
    ``id`` attribute; for each of them the ``bibtex:author`` (or
    ``bibtex:editor``), ``bibtex:title`` and ``bibtex:keywords`` children
    are read.

    Attributes:
        tuples         list of (id, matched authors, title words, tags),
                       one per entry.
        wordTitleFreq  author -> {title word -> occurrences}.
        tagFreq        author -> {tag -> occurrences}.
        wordId         author -> {title word -> [entry ids]}; filled only
                       when collectAuthorWordId is set.
        tagId          author -> {tag -> [entry ids]}; filled only when
                       collectAuthorTagId is set.
        edges          (id, id) -> (count, tag, weight); rebuilt by toDot.

    Every per-author dictionary also has an entry under all_auths holding
    the figures for the whole collection.
    """

    def __init__(self, pathfile):
        """Parse the XML file at pathfile and extract the entry tuples."""
        self.pathfile = pathfile
        self.xmldoc = minidom.parse(pathfile)
        self.initAuthorsPattern()
        self.tuples = self.getListTuples()
        self.initDictionaries()

    def initDictionaries(self):
        """Create empty statistics for all_auths and for every author."""
        self.wordTitleFreq = {}
        self.tagFreq = {}
        self.wordId = {}
        self.tagId = {}
        for author in [all_auths] + list(authors):
            self.wordTitleFreq[author] = {}
            self.tagFreq[author] = {}
            self.wordId[author] = {}
            self.tagId[author] = {}

    def initAuthorsPattern(self):
        """Compile one case-insensitive search pattern per author name."""
        self.p_authors = {}
        for author in authors:
            self.p_authors[author] = compile(author, IGNORECASE)

    def getTuples(self):
        """Return the list of entry tuples."""
        return self.tuples

    def getWordTitleFreq(self, author = all_auths):
        """Return the {title word -> occurrences} dictionary of author."""
        return self.wordTitleFreq[author]

    def getWordTagFreq(self, author = all_auths):
        """Return the {tag -> occurrences} dictionary of author."""
        return self.tagFreq[author]

    def getWordId(self, author = all_auths):
        """Return the {title word -> [entry ids]} dictionary of author."""
        return self.wordId[author]

    def getTagId(self, author = all_auths):
        """Return the {tag -> [entry ids]} dictionary of author."""
        return self.tagId[author]

    def setFreqTitle(self, word, id_tupla, author = all_auths):
        """Count one occurrence of a title word for author.

        When collectAuthorWordId is set, id_tupla is also appended to the
        list of entry ids recorded for the word.
        """
        freq = self.wordTitleFreq[author]
        freq[word] = freq.get(word, 0) + 1
        if collectAuthorWordId:
            self.wordId[author].setdefault(word, []).append(id_tupla)

    def setFreqTag(self, tag, id_tupla, author = all_auths):
        """Count one occurrence of a tag for author.

        When collectAuthorTagId is set, id_tupla is also appended to the
        list of entry ids recorded for the tag.
        """
        freq = self.tagFreq[author]
        freq[tag] = freq.get(tag, 0) + 1
        if collectAuthorTagId:
            self.tagId[author].setdefault(tag, []).append(id_tupla)

    def setFreqTitlePerAuthors(self, word, id_tupla, authors_tupla):
        """Count a title word once for each author in authors_tupla."""
        for author in authors_tupla:
            self.setFreqTitle(word, id_tupla, author)

    def setFreqTagPerAuthors(self, tag, id_tupla, authors_tupla):
        """Count a tag once for each author in authors_tupla."""
        for author in authors_tupla:
            self.setFreqTag(tag, id_tupla, author)

    def computeFreq(self):
        """Fill the frequency (and id) dictionaries from all entry tuples.

        Every word and tag is counted under all_auths and under each author
        matched for the entry.
        """
        for tupla in self.tuples:
            entryId = tupla[t_id]
            entryAuthors = tupla[t_authors]
            for word in tupla[t_title]:
                self.setFreqTitle(word, entryId)
                self.setFreqTitlePerAuthors(word, entryId, entryAuthors)
            for tag in tupla[t_keywords]:
                self.setFreqTag(tag, entryId)
                self.setFreqTagPerAuthors(tag, entryId, entryAuthors)

    def getXmlData(self, xmlEntry, tagName):
        """Return the data of the first child of the first tagName element.

        An empty unicode string is returned when xmlEntry has no such
        element, or when that element has no child node (e.g. an empty
        ``<bibtex:title/>``).
        """
        xmlList = xmlEntry.getElementsByTagName(tagName)
        if not xmlList:
            return u""
        firstChild = xmlList[0].firstChild
        if firstChild is None:
            # Empty element: there is no text node to read.
            return u""
        return firstChild.data

    def splitWords(self, data):
        """Split data on runs of non-word characters."""
        return p_split.split(data)

    def stopFilter(self, listWords, stopList):
        """Return the words of listWords that are not in stopList."""
        return [word for word in listWords if word not in stopList]

    def setAuthors(self, xmlEntry):
        """Return the known authors found in the entry.

        The author field is searched; the editor field is used instead when
        the author field is missing or empty.
        """
        data = self.getXmlData(xmlEntry, 'bibtex:author')
        if not data:
            data = self.getXmlData(xmlEntry, 'bibtex:editor')
        return [author for author in authors
                if self.p_authors[author].search(data)]

    def getTuple(self, xmlEntry):
        """Build the (id, authors, title words, tags) tuple of an entry."""
        title = self.getXmlData(xmlEntry, 'bibtex:title')
        keywords = self.getXmlData(xmlEntry, 'bibtex:keywords')
        return (xmlEntry.attributes["id"].value,
                self.setAuthors(xmlEntry),
                self.stopFilter(self.splitWords(title), stopWords),
                self.stopFilter(self.splitWords(keywords), stopTags))

    def getListTuples(self):
        """Return one tuple (see getTuple) per bibtex:entry element."""
        xmlEntries = self.xmldoc.getElementsByTagName('bibtex:entry')
        return [self.getTuple(xmlEntry) for xmlEntry in xmlEntries]

    def printTupleAuthorWordFreq(self, author = all_auths):
        """Print (author, word, count) for words reaching the limit."""
        for word, freq in self.wordTitleFreq[author].items():
            if freq >= wordFreqLimit[author]:
                print("( %s ,  %s ,  %s )" % (author, word, freq))

    def printTupleAuthorTagFreq(self, author = all_auths):
        """Print (author, tag, count) for every tag of author."""
        for tag, freq in self.tagFreq[author].items():
            print("( %s ,  %s ,  %s )" % (author, tag, freq))

    def phpArrayAuthorWordFreq(self, author = all_auths):
        """Print PHP array items 'word' => count for words at the limit."""
        for word, freq in self.wordTitleFreq[author].items():
            if freq >= wordFreqLimit[author]:
                print("        '" + word + "' => " + str(freq) + ",")

    def phpArrayAuthorTagFreq(self, author = all_auths):
        """Print PHP array items 'tag' => count for every tag of author."""
        for tag, freq in self.tagFreq[author].items():
            print("        '" + tag + "' => " + str(freq) + ",")

    def dbAuthorWordFreq(self, author = all_auths):
        """Print SQL value rows (author, word, count) for words at the limit."""
        for word, freq in self.wordTitleFreq[author].items():
            if freq >= wordFreqLimit[author]:
                print("('" + author + "', '" + word + "', " + str(freq) + "),")

    def dbAuthorTagFreq(self, author = all_auths):
        """Print SQL value rows (author, tag, count) for every tag."""
        for tag, freq in self.tagFreq[author].items():
            print("('" + author + "', '" + tag + "', " + str(freq) + "),")

    def printTupleAuthorWordId(self, author = all_auths):
        """Print (author, word, entry id) for every recorded word id."""
        for word, entryIds in self.wordId[author].items():
            for entryId in entryIds:
                print("( %s ,  %s ,  %s )" % (author, word, entryId))

    def printTupleAuthorTagId(self, author = all_auths):
        """Print (author, tag, entry id) for every recorded tag id."""
        for tag, entryIds in self.tagId[author].items():
            for entryId in entryIds:
                print("( %s ,  %s ,  %s )" % (author, tag, entryId))

    def dbAuthorWordId(self, author = all_auths):
        """Print SQL value rows (author, word, entry id)."""
        for word, entryIds in self.wordId[author].items():
            for entryId in entryIds:
                print("('" + author + "', '" + word + "', '" + entryId + "'),")

    def dbAuthorTagId(self, author = all_auths):
        """Print SQL value rows (author, tag, entry id)."""
        for tag, entryIds in self.tagId[author].items():
            for entryId in entryIds:
                print("('" + author + "', '" + tag + "', '" + entryId + "'),")

    def doEdges(self, listID, tag, tagWeight):
        """Add to self.edges the pairs of entries sharing tag.

        Each pair of positions in listID yields one undirected edge, stored
        under whichever orientation was inserted first.  A new edge starts
        as (1, tag, tagWeight); an existing one gets its count incremented
        and its (tag, weight) label replaced when compare(stored weight,
        tagWeight) is true.
        """
        edges = self.edges
        for i, idi in enumerate(listID):
            for idj in listID[i + 1:]:
                forward = (idi, idj)
                backward = (idj, idi)
                hasForward = forward in edges
                hasBackward = backward in edges
                if hasForward and hasBackward:
                    # With edges created only by this method this happens
                    # just for idi == idj (both keys coincide); such an
                    # edge is deliberately left untouched.
                    continue
                if hasForward:
                    key = forward
                elif hasBackward:
                    key = backward
                else:
                    edges[forward] = (1, tag, tagWeight)
                    continue
                count, oldTag, oldWeight = edges[key]
                if compare(oldWeight, tagWeight):
                    edges[key] = (count + 1, tag, tagWeight)
                else:
                    edges[key] = (count + 1, oldTag, oldWeight)

    def printEdge(self, fp):
        """Write to fp the edges whose count reaches edgeWeight.

        Returns the number of edges written.
        """
        count = 0
        for edge, (freq, tag, weight) in self.edges.items():
            if freq >= edgeWeight:
                fp.write('    "' + edge[0] + '" -> "' + edge[1] +
                         '" [ label = "' + tag + ', ' + str(weight) +
                         '" ];\n')
                count = count + 1
        return count

    def toDot(self, author = all_auths):
        """Write the graph of author to '<author>_biblio_graph.dot'.

        self.edges is rebuilt from the tag ids recorded for author, then
        the selected edges are written in Graphviz dot syntax.
        """
        fp = open(str(author) + '_biblio_graph.dot', 'w')
        # Close explicitly so the file is complete on disk as soon as this
        # method returns (the caller runs an external tool on it).
        try:
            self.edges = {}
            fp.write('digraph ' + author + '_biblio_graph {\n' +
                     '    node [color=lightblue2, style=filled];\n' +
                     '    center = "";\n' +
                     '    edge [dir=none];\n')
            for tag, listID in self.tagId[author].items():
                self.doEdges(listID, tag, self.tagFreq[author][tag])
            self.printEdge(fp)
            fp.write('}')
        finally:
            fp.close()

    def toDB(self, author = all_auths):
        """Print a MySQL dump of the tagID, tags, wordID and words tables.

        The author parameter is accepted for interface compatibility but is
        not used: rows are always printed for all_auths and for every known
        author.  Each VALUES list is terminated by a dummy 'all' row because
        the row printers end every row with a comma.
        """
        print("""-- phpMyAdmin SQL Dump
-- version 2.11.5
-- http://www.phpmyadmin.net
--
-- Host: localhost
-- Generation Time: Mar 22, 2008 at 09:59 PM
-- Server version: 4.1.21
-- PHP Version: 5.2.4

SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO";

--
-- Database: `baldoni`
--

-- --------------------------------------------------------

--
-- Table structure for table `tagID`
--

DROP TABLE IF EXISTS `tagID`;
CREATE TABLE IF NOT EXISTS `tagID` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `tag` varchar(50) NOT NULL default '',
  `id` varchar(50) NOT NULL default ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `tagID`
--

INSERT INTO `tagID` (`author`, `tag`, `id`) VALUES""")
        self.dbAuthorTagId()
        if collectAuthorTagId:
            for name in authors:
                self.dbAuthorTagId(name)
        print("('all','','');")
        print("""

-- --------------------------------------------------------

--
-- Table structure for table `tags`
--

DROP TABLE IF EXISTS `tags`;
CREATE TABLE IF NOT EXISTS `tags` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `tag` varchar(50) character set latin1 collate latin1_bin NOT NULL default '',
  `count` int(11) NOT NULL default '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `tags`
--

INSERT INTO `tags` (`author`, `tag`, `count`) VALUES""")
        self.dbAuthorTagFreq()
        if collectAuthorTagId:
            for name in authors:
                self.dbAuthorTagFreq(name)
        print("('all','',0);")
        print("""

-- --------------------------------------------------------

--
-- Table structure for table `wordID`
--

DROP TABLE IF EXISTS `wordID`;
CREATE TABLE IF NOT EXISTS `wordID` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `word` varchar(50) NOT NULL default '',
  `id` varchar(50) NOT NULL default ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `wordID`
--
        """)
        if collectAuthorWordId:
            print("""INSERT INTO `wordID` (`author`, `word`, `id`) VALUES""")
            self.dbAuthorWordId()
            for name in authors:
                self.dbAuthorWordId(name)
            print("('all','','');")
        print("""

-- --------------------------------------------------------

--
-- Table structure for table `words`
--

DROP TABLE IF EXISTS `words`;
CREATE TABLE IF NOT EXISTS `words` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `word` varchar(50) NOT NULL default '',
  `count` int(11) NOT NULL default '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `words`
--

INSERT INTO `words` (`author`, `word`, `count`) VALUES""")
        self.dbAuthorWordFreq()
        for name in authors:
            self.dbAuthorWordFreq(name)
        print("('all','',0);")


def main():
    """Run the whole pipeline on bibliography.xml.

    Computes the frequencies, writes one .dot graph per author and renders
    it to JPEG with the external ``dot`` command, then prints the SQL dump.
    """
    folks = BiblioFolks("bibliography.xml")
    folks.computeFreq()
    for name in authors:
        folks.toDot(name)
        graph = name + '_biblio_graph'
        system('dot -Tjpg ' + graph + '.dot -o ' + graph + '.jpg')
    folks.toDB()


if __name__ == '__main__':
    main()