# RubiayatScraper.py
# by Brian Stanley
#
# This program was used to extract the four versions of Edward FitzGerald's
# Rubaiyat of Omar Khayyam from an HTML table at
# http://www.therubaiyat.com/first.html (downloaded locally) and encode the
# stanzas in TEI.
#
# Each <table> on the page is treated as one stanza: its first row holds one
# heading cell per version and its second row holds the matching stanza text,
# one <font face=...> element per line. One XML file is written per version.

import re
from pathlib import Path

from lxml import html, etree

# Raw string so that backslashes are taken literally. Written as a normal
# string, "\r" in "...\rubiayat-source.html" is a carriage return and the
# remaining "\D", "\B", "\M", ... pairs are invalid escape sequences.
PROJECT_DIR = Path(
    r"C:\Documents and Settings\Brian\My Documents\My Documents"
    r"\Librarianship\GSLIS\LIS452\Final Project"
)
SOURCE_PATH = PROJECT_DIR / "rubiayat-source.html"

VERSION_COUNT = 4  # columns per table, one per version of the poem
LINES_PER_STANZA = 4  # <font face=...> elements read from each stanza cell

# NOTE(review): the pattern here used to be a single plain space, which strips
# every space from the markup; that turns "<font face=...>" into
# "<fontface=...>", so the font[@face] lookup below could never match. The
# apparent intent is to blank out non-breaking-space filler so that unused
# cells come out empty -- confirm against the downloaded page.
_NBSP_RE = re.compile("&nbsp;|\u00a0")

# The previous pattern "--|---" always matched the two-hyphen branch first, so
# "---" became an em dash followed by a stray hyphen. A three-hyphen match is
# now tried first, but only when it finishes the run, so that any run of two
# or more hyphens is consumed completely and even-length runs are converted
# exactly as before.
_DASH_RE = re.compile(r"---(?!-)|--")


def _clean_markup(markup: str) -> str:
    """Return *markup* with non-breaking spaces removed and dashes normalised.

    Runs on the raw page text before it is parsed, so the substitutions also
    apply to hyphen runs that appear outside of the poem text.
    """
    markup = _NBSP_RE.sub("", markup)
    return _DASH_RE.sub("—", markup)


def _extract_stanzas(page, roots) -> None:
    """Append one <lg> per table and version to the matching element in *roots*.

    *page* is the parsed HTML document and *roots* a sequence of
    ``VERSION_COUNT`` elements, indexed by version column. A version whose
    heading cell is empty after stripping gets no stanza for that table.
    The stanza's ``n`` attribute is the 1-based position of its table on the
    page, so numbering stays aligned across versions.

    Raises IndexError if a table has fewer than two rows, a row has too few
    cells, or a stanza cell has fewer than ``LINES_PER_STANZA`` lines.
    """
    for stanza_number, table in enumerate(page.xpath("//table"), start=1):
        rows = table.xpath("tr")
        heading_cells = rows[0].xpath("td")
        stanza_cells = rows[1].xpath("td")

        for version, root in enumerate(roots):
            heading = heading_cells[version].text_content().strip()
            if not heading:
                continue

            stanza = etree.SubElement(root, "lg", n=str(stanza_number))
            etree.SubElement(stanza, "head").text = heading

            line_fonts = stanza_cells[version].xpath("font[@face]")
            for index in range(LINES_PER_STANZA):
                # Index explicitly rather than iterating so that a short
                # stanza fails loudly instead of being silently truncated.
                text = line_fonts[index].text_content()
                line = etree.SubElement(stanza, "l", n=str(index + 1))
                line.text = text


def _write_version(root, path) -> None:
    """Serialise *root* as an XML document, with declaration, to *path*."""
    # lxml serialises to bytes, so the target must be opened in binary mode.
    with open(path, "wb") as out:
        etree.ElementTree(root).write(out, xml_declaration=True)


def main() -> None:
    """Read the downloaded page and write version1.xml ... version4.xml."""
    # Decoded with the platform default encoding, as before.
    with open(SOURCE_PATH, "r") as source:
        markup = source.read()

    page = html.fromstring(_clean_markup(markup))

    roots = [etree.Element("root") for _ in range(VERSION_COUNT)]
    _extract_stanzas(page, roots)

    for number, root in enumerate(roots, start=1):
        _write_version(root, PROJECT_DIR / f"version{number}.xml")


if __name__ == "__main__":
    main()