#!/usr/bin/env python
"""
Bibliofolks

Parses an XML bibliography whose records are ``bibtex:entry`` elements,
counts how often each title word and each keyword ("tag") occurs, both
globally and for every author listed in ``authors``, and then emits:

* one Graphviz ``.dot`` file per author (rendered to JPEG with ``dot``),
  in which entries sharing enough tags are connected;
* an SQL dump of the collected frequencies and ids on standard output.

The module only uses single-argument ``print(...)`` calls, so it runs
unchanged on both Python 2 (2.6+) and Python 3.
"""
__author__ = "Matteo Baldoni"
__version__ = "$Revision: 0.1 $"
__date__ = "$Date: 2008/03/25 15:30:00 $"
__copyright__ = "Copyright (c) 2001 Matteo Baldoni"
__license__ = "GPL"

import re
from xml.dom import minidom
# NOTE: the star import is kept only for backward compatibility with code
# that may rely on the names it re-exports; this module uses ``re.`` itself.
from re import *
from os import system

# Legacy boolean helpers: they return 1 / 0 when called.
true = lambda: 1
false = lambda: 0

# Positions of the fields inside the tuples built by BiblioFolks.getTuple().
t_id = 0
t_authors = 1
t_title = 2
t_keywords = 3

# Titles and keyword strings are split on runs of non-word characters.
p_split = re.compile(r'\W+')

# Title words that are not counted.  The empty string is listed because
# splitting on \W+ yields '' when the text starts or ends with punctuation.
stopWords = ['in', 'from', 'of', 'to', 'and', 'an', 'with', 'that', 'a', 'on',
             'the', '', 'lo', 'di', "dell'", 'e', 'into', "E'", 'The',
             'by', 'for', 'about', 'as', '', 'A', '007', 'E', 'II', 'Using',
             'based', 'driven', 'guided', 'means', 'priori', 'About', 'An',
             '0', 'I', 'On', 'Proc', 'agli', 'like', 's', 'valued', '1', '3',
             '2007', 'dell']
# Keywords that are not counted (only the empty split artefact).
stopTags = ['']

# Authors for whom per-author statistics are collected.  The names are
# matched case-insensitively against the author/editor text of each entry.
authors = ['Baldoni', 'Baroglio', 'Patti', 'Martelli', 'Pozzato',
           'Gliozzi', 'Schifanella', 'Sapino', 'Olivetti', 'Giordano']
# Pseudo-author key under which the global statistics are stored.
all_auths = 'all'


def compare(val1, val2):
    """Return whether an edge label with weight *val1* should be replaced.

    *val1* is the weight stored on the edge, *val2* the weight of the new
    candidate tag.  Returning ``val1 > val2`` keeps the tag with the lower
    weight as the edge label; use ``val1 < val2`` instead to keep the tag
    with the higher weight.
    """
    return val1 > val2


def _sqlEscape(text):
    """Escape backslashes and single quotes for a quoted SQL string."""
    return text.replace('\\', '\\\\').replace("'", "''")


# Minimum number of shared tags for an edge to be written to the graph.
edgeWeight = 3

# Minimum frequency for a title word to be reported, per author key.
wordAuthorFreqLimit = 2
wordFreqLimit = dict((name, wordAuthorFreqLimit) for name in authors)
wordFreqLimit[all_auths] = 3

# Whether the ids of the entries containing each word / tag are recorded.
collectAuthorWordId = false()
collectAuthorTagId = true()


class BiblioFolks(object):
    """Word and tag statistics over an XML bibliography.

    Every statistic is a two-level dictionary: the first key is an author
    name from ``authors`` (or ``all_auths`` for the global figures), the
    second key is the word or tag.
    """

    def __init__(self, pathfile):
        """Parse *pathfile* and prepare empty statistics dictionaries."""
        self.pathfile = pathfile
        self.xmldoc = minidom.parse(pathfile)
        self.initAuthorsPattern()
        self.tuples = self.getListTuples()
        self.initDictionaries()

    def initDictionaries(self):
        """Create an empty dictionary per author for every statistic."""
        self.wordTitleFreq = {}
        self.tagFreq = {}
        self.wordId = {}
        self.tagId = {}
        for key in [all_auths] + list(authors):
            self.wordTitleFreq[key] = {}
            self.tagFreq[key] = {}
            self.wordId[key] = {}
            self.tagId[key] = {}

    def initAuthorsPattern(self):
        """Compile one case-insensitive pattern per known author name."""
        self.p_authors = {}
        for author in authors:
            self.p_authors[author] = re.compile(author, re.IGNORECASE)

    def getTuples(self):
        """Return the list of entry tuples built by getListTuples()."""
        return self.tuples

    def getWordTitleFreq(self, author=all_auths):
        """Return the ``word -> count`` dictionary of *author*."""
        return self.wordTitleFreq[author]

    def getWordTagFreq(self, author=all_auths):
        """Return the ``tag -> count`` dictionary of *author*."""
        return self.tagFreq[author]

    def getWordId(self, author=all_auths):
        """Return the ``word -> [entry ids]`` dictionary of *author*."""
        return self.wordId[author]

    def getTagId(self, author=all_auths):
        """Return the ``tag -> [entry ids]`` dictionary of *author*."""
        return self.tagId[author]

    def setFreqTitle(self, word, id_tupla, author=all_auths):
        """Count one occurrence of title *word* for *author*.

        The entry id is recorded too when ``collectAuthorWordId`` is set.
        """
        freq = self.wordTitleFreq[author]
        freq[word] = freq.get(word, 0) + 1
        if collectAuthorWordId:
            self.wordId[author].setdefault(word, []).append(id_tupla)

    def setFreqTag(self, tag, id_tupla, author=all_auths):
        """Count one occurrence of *tag* for *author*.

        The entry id is recorded too when ``collectAuthorTagId`` is set.
        """
        freq = self.tagFreq[author]
        freq[tag] = freq.get(tag, 0) + 1
        if collectAuthorTagId:
            self.tagId[author].setdefault(tag, []).append(id_tupla)

    def setFreqTitlePerAuthors(self, word, id_tupla, authors_tupla):
        """Count title *word* once for each author in *authors_tupla*."""
        for author in authors_tupla:
            self.setFreqTitle(word, id_tupla, author)

    def setFreqTagPerAuthors(self, tag, id_tupla, authors_tupla):
        """Count *tag* once for each author in *authors_tupla*."""
        for author in authors_tupla:
            self.setFreqTag(tag, id_tupla, author)

    def computeFreq(self):
        """Fill the global and per-author statistics from all entries."""
        for tupla in self.tuples:
            for word in tupla[t_title]:
                self.setFreqTitle(word, tupla[t_id])
                self.setFreqTitlePerAuthors(word, tupla[t_id],
                                            tupla[t_authors])
            for tag in tupla[t_keywords]:
                self.setFreqTag(tag, tupla[t_id])
                self.setFreqTagPerAuthors(tag, tupla[t_id],
                                          tupla[t_authors])

    def getXmlData(self, xmlEntry, tagName):
        """Return the text of the first *tagName* element below *xmlEntry*.

        An empty string is returned when there is no such element or when
        the element has no child node (e.g. ``<bibtex:title/>``).
        """
        xmlList = xmlEntry.getElementsByTagName(tagName)
        if not xmlList:
            return u""
        firstChild = xmlList[0].firstChild
        if firstChild is None:
            # Empty element: there is no text node to read the data from.
            return u""
        return firstChild.data

    def splitWords(self, data):
        """Split *data* on runs of non-word characters."""
        return p_split.split(data)

    def stopFilter(self, listWords, stopList):
        """Return the words of *listWords* that are not in *stopList*."""
        return [word for word in listWords if word not in stopList]

    def setAuthors(self, xmlEntry):
        """Return the known authors mentioned in *xmlEntry*.

        The ``bibtex:author`` text is searched; when it is missing or
        empty the ``bibtex:editor`` text is used instead.
        """
        data = (self.getXmlData(xmlEntry, 'bibtex:author')
                or self.getXmlData(xmlEntry, 'bibtex:editor'))
        return [author for author in authors
                if self.p_authors[author].search(data)]

    def getTuple(self, xmlEntry):
        """Build the ``(id, authors, title words, tags)`` tuple of an entry."""
        title = self.getXmlData(xmlEntry, 'bibtex:title')
        keywords = self.getXmlData(xmlEntry, 'bibtex:keywords')
        return (xmlEntry.attributes["id"].value,
                self.setAuthors(xmlEntry),
                self.stopFilter(self.splitWords(title), stopWords),
                self.stopFilter(self.splitWords(keywords), stopTags))

    def getListTuples(self):
        """Return one tuple per ``bibtex:entry`` element of the document."""
        xmlEntries = self.xmldoc.getElementsByTagName('bibtex:entry')
        return [self.getTuple(xmlEntry) for xmlEntry in xmlEntries]

    def printTupleAuthorWordFreq(self, author=all_auths):
        """Print ``( author , word , count )`` for the frequent words."""
        for word, count in self.wordTitleFreq[author].items():
            if count >= wordFreqLimit[author]:
                print("( %s ,  %s ,  %s )" % (author, word, count))

    def printTupleAuthorTagFreq(self, author=all_auths):
        """Print ``( author , tag , count )`` for every tag."""
        for tag, count in self.tagFreq[author].items():
            print("( %s ,  %s ,  %s )" % (author, tag, count))

    def phpArrayAuthorWordFreq(self, author=all_auths):
        """Print the frequent words as PHP array items ``'word' => n,``."""
        for word, count in self.wordTitleFreq[author].items():
            if count >= wordFreqLimit[author]:
                print(" '" + word + "' => " + str(count) + ",")

    def phpArrayAuthorTagFreq(self, author=all_auths):
        """Print every tag as a PHP array item ``'tag' => n,``."""
        for tag, count in self.tagFreq[author].items():
            print(" '" + tag + "' => " + str(count) + ",")

    def dbAuthorWordFreq(self, author=all_auths):
        """Print the frequent words as SQL value rows."""
        for word, count in self.wordTitleFreq[author].items():
            if count >= wordFreqLimit[author]:
                print("('" + author + "', '" + _sqlEscape(word) + "', "
                      + str(count) + "),")

    def dbAuthorTagFreq(self, author=all_auths):
        """Print every tag with its count as an SQL value row."""
        for tag, count in self.tagFreq[author].items():
            print("('" + author + "', '" + _sqlEscape(tag) + "', "
                  + str(count) + "),")

    def printTupleAuthorWordId(self, author=all_auths):
        """Print ``( author , word , id )`` for every recorded word id."""
        for word, entryIds in self.wordId[author].items():
            for entryId in entryIds:
                print("( %s ,  %s ,  %s )" % (author, word, entryId))

    def printTupleAuthorTagId(self, author=all_auths):
        """Print ``( author , tag , id )`` for every recorded tag id."""
        for tag, entryIds in self.tagId[author].items():
            for entryId in entryIds:
                print("( %s ,  %s ,  %s )" % (author, tag, entryId))

    def dbAuthorWordId(self, author=all_auths):
        """Print every recorded (word, entry id) pair as an SQL value row."""
        for word, entryIds in self.wordId[author].items():
            for entryId in entryIds:
                print("('" + author + "', '" + _sqlEscape(word) + "', '"
                      + _sqlEscape(entryId) + "'),")

    def dbAuthorTagId(self, author=all_auths):
        """Print every recorded (tag, entry id) pair as an SQL value row."""
        for tag, entryIds in self.tagId[author].items():
            for entryId in entryIds:
                print("('" + author + "', '" + _sqlEscape(tag) + "', '"
                      + _sqlEscape(entryId) + "'),")

    def doEdges(self, listID, tag, tagWeight):
        """Add to ``self.edges`` one edge per pair of ids in *listID*.

        ``self.edges`` maps an id pair to ``(count, tag, weight)``.  A pair
        is stored under the orientation in which it was first seen; later
        sightings in either orientation increment ``count`` and replace
        the label with ``(tag, tagWeight)`` when ``compare`` says so.
        """
        edges = self.edges
        for i, idi in enumerate(listID):
            for idj in listID[i + 1:]:
                forward = (idi, idj)
                backward = (idj, idi)
                hasForward = forward in edges
                hasBackward = backward in edges
                if not hasForward and not hasBackward:
                    edges[forward] = (1, tag, tagWeight)
                elif hasForward != hasBackward:
                    key = forward if hasForward else backward
                    count, oldTag, oldWeight = edges[key]
                    if compare(oldWeight, tagWeight):
                        edges[key] = (count + 1, tag, tagWeight)
                    else:
                        edges[key] = (count + 1, oldTag, oldWeight)
                # Both orientations present only happens when idi == idj
                # (a repeated id); such a pair is left untouched.

    def printEdge(self, fp):
        """Write the edges seen at least ``edgeWeight`` times to *fp*.

        Returns the number of edges written.
        """
        count = 0
        for edge, (seen, label, weight) in self.edges.items():
            if seen >= edgeWeight:
                fp.write(' "' + edge[0] + '" -> "' + edge[1] +
                         '" [ label = "' + label + ', ' +
                         str(weight) + '" ];\n')
                count = count + 1
        return count

    def toDot(self, author=all_auths):
        """Write the tag graph of *author* to ``<author>_biblio_graph.dot``.

        ``self.edges`` is rebuilt from the tag ids of *author*.  The file
        is closed before returning so that it is complete on disk when an
        external tool reads it.
        """
        self.edges = {}
        with open(str(author) + '_biblio_graph.dot', 'w') as fp:
            fp.write('digraph ' + author + '_biblio_graph {\n' +
                     ' node [color=lightblue2, style=filled];\n' +
                     ' center = "";\n' +
                     ' edge [dir=none];\n')
            for tag in self.tagId[author]:
                self.doEdges(self.tagId[author][tag], tag,
                             self.tagFreq[author][tag])
            self.printEdge(fp)
            fp.write('}')

    def toDB(self, author=all_auths):
        """Print an SQL dump of all statistics on standard output.

        The *author* parameter is not used: the dump always contains the
        global figures followed by those of every author in ``authors``.
        Each INSERT statement is closed by a dummy ``'all'`` row, because
        every data row is printed with a trailing comma.
        """
        print("""-- phpMyAdmin SQL Dump
-- version 2.11.5
-- http://www.phpmyadmin.net
--
-- Host: localhost
-- Generation Time: Mar 22, 2008 at 09:59 PM
-- Server version: 4.1.21
-- PHP Version: 5.2.4

SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO";

--
-- Database: `baldoni`
--

-- --------------------------------------------------------

--
-- Table structure for table `tagID`
--

DROP TABLE IF EXISTS `tagID`;
CREATE TABLE IF NOT EXISTS `tagID` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `tag` varchar(50) NOT NULL default '',
  `id` varchar(50) NOT NULL default ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `tagID`
--

INSERT INTO `tagID` (`author`, `tag`, `id`) VALUES""")
        self.dbAuthorTagId()
        if collectAuthorTagId:
            for name in authors:
                self.dbAuthorTagId(name)
        print("('all','','');")
        print("""
-- --------------------------------------------------------

--
-- Table structure for table `tags`
--

DROP TABLE IF EXISTS `tags`;
CREATE TABLE IF NOT EXISTS `tags` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `tag` varchar(50) character set latin1 collate latin1_bin NOT NULL default '',
  `count` int(11) NOT NULL default '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `tags`
--

INSERT INTO `tags` (`author`, `tag`, `count`) VALUES""")
        self.dbAuthorTagFreq()
        if collectAuthorTagId:
            for name in authors:
                self.dbAuthorTagFreq(name)
        print("('all','',0);")
        print("""
-- --------------------------------------------------------

--
-- Table structure for table `wordID`
--

DROP TABLE IF EXISTS `wordID`;
CREATE TABLE IF NOT EXISTS `wordID` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `word` varchar(50) NOT NULL default '',
  `id` varchar(50) NOT NULL default ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `wordID`
--
""")
        if collectAuthorWordId:
            print("""INSERT INTO `wordID` (`author`, `word`, `id`) VALUES""")
            self.dbAuthorWordId()
            for name in authors:
                self.dbAuthorWordId(name)
            print("('all','','');")
        print("""
-- --------------------------------------------------------

--
-- Table structure for table `words`
--

DROP TABLE IF EXISTS `words`;
CREATE TABLE IF NOT EXISTS `words` (
  `author` enum('all', 'baldoni','baroglio','patti','martelli','pozzato','gliozzi','schifanella','sapino','olivetti','giordano') NOT NULL default 'all',
  `word` varchar(50) NOT NULL default '',
  `count` int(11) NOT NULL default '0'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;

--
-- Dumping data for table `words`
--

INSERT INTO `words` (`author`, `word`, `count`) VALUES""")
        self.dbAuthorWordFreq()
        for name in authors:
            self.dbAuthorWordFreq(name)
        print("('all','',0);")


def main():
    """Process ``bibliography.xml``: render the graphs, print the SQL dump."""
    bibliofolks = BiblioFolks("bibliography.xml")
    bibliofolks.computeFreq()
    for author in authors:
        bibliofolks.toDot(author)
        # The author names are the fixed entries of ``authors``, so the
        # command line is not built from external input.
        system('dot -Tjpg ' + author + '_biblio_graph.dot -o ' +
               author + '_biblio_graph.jpg')
    bibliofolks.toDB()


if __name__ == '__main__':
    main()