"""Count words and word phrases in text and compare phrase usage of two pages.

Run as a script, this downloads two web pages, collects every two-word
phrase in each, and prints the phrases that occur in both, most frequent
first.  The code runs unchanged on Python 2.7 and Python 3.
"""

from collections import Counter
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

HOMEPAGE_URL = 'http://www.cs.virginia.edu/evans/index.html'
DECLARATION_URL = 'http://www.cs.virginia.edu/cs1120/readings/declaration.html'


def histogram(text):
    """Return a dict mapping each whitespace-separated word to its count.

    Words are compared exactly as they appear (case-sensitive).
    """
    return dict(Counter(text.split()))


def phrase_collector(text, plen):
    """Return a dict mapping each phrase of ``plen`` words to its count.

    ``text`` is split on whitespace and lower-cased.  A phrase is a tuple of
    ``plen`` consecutive words; every window is counted, including the one
    that ends on the last word.  If ``text`` has fewer than ``plen`` words
    the result is empty.  ``plen`` is expected to be a positive integer.
    """
    words = [word.lower() for word in text.split()]
    # "+ 1" so that the final window (the one ending at the last word) is
    # included; without it the last phrase of the text is silently dropped.
    window_count = len(words) - plen + 1
    phrases = (tuple(words[start:start + plen])
               for start in range(window_count))
    return dict(Counter(phrases))


def common_phrases(d1, d2):
    """Return the keys present in both dicts, mapped to ``(d1[k], d2[k])``."""
    return {key: (d1[key], d2[key]) for key in d1 if key in d2}


def show_histogram(d):
    """Print ``key: count`` lines for ``d``, highest count first.

    Keys with equal counts keep the order in which ``d`` iterates them
    (``sorted`` is stable, also with ``reverse=True``).
    """
    for key in sorted(d, key=lambda k: d[k], reverse=True):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(str(key) + ": " + str(d[key]))


def show_phrases(d):
    """Print ``key: (count1, count2)`` lines, highest combined count first.

    ``d`` maps keys to pairs of counts, as produced by ``common_phrases``.
    Keys with equal combined counts keep the order in which ``d`` iterates
    them.
    """
    for key in sorted(d, key=lambda k: d[k][0] + d[k][1], reverse=True):
        print(str(key) + ": " + str(d[key]))


def _fetch_text(url):
    """Download ``url`` and return its body as a native ``str``."""
    # closing() releases the connection on both Python 2 and 3 (the Python 2
    # response object is not a context manager by itself).
    with closing(urlopen(url)) as response:
        data = response.read()
    if not isinstance(data, str):
        # Python 3 returns bytes; decode so phrases print as plain text.
        # On Python 2 the data is already a str and is left untouched.
        data = data.decode("utf-8", "replace")
    return data


def main():
    """Print the two-word phrases shared by the two configured pages."""
    myhomepage = _fetch_text(HOMEPAGE_URL)
    declaration = _fetch_text(DECLARATION_URL)
    pde = phrase_collector(myhomepage, 2)
    ptj = phrase_collector(declaration, 2)
    show_phrases(common_phrases(ptj, pde))


# Guarded so that importing this module does not hit the network or print.
if __name__ == "__main__":
    main()