# Python script that reads an .xml file and converts it into SPSS .sav files.
# Reference: Hetland p. 427 (Beginning Python: From Novice to Professional)
# Other resource: Chap. 24 of Python in a Nutshell (Alex Martelli)
# Raynald Levesque, August 2008
import io
from xml.sax.handler import ContentHandler
from xml.sax import parse

import spss

# Change the next lines to match your requirements
fpath = r'C:/Test2'
fname = r'sample.xml'  # the enclosed sample data file
xmlFile = '%s/%s' % (fpath, fname)


class DataHandler(ContentHandler):
    """SAX handler that turns selected XML elements into tab separated records.

    Every element whose name is in ``wantList`` produces one text record,
    built from the element's attributes, that is appended to ``extract``:

    * ``action_log``    -> record type 1
    * ``action``        -> record type 2, prefixed with the id of the most
      recently seen ``action_log``
    * ``Deletedaction`` -> record type 3, prefixed the same way

    The first record emitted is preceded (inside the same list item) by a
    line of log variable names; the first ``action``/``Deletedaction``
    record emitted afterwards is preceded by a line of action variable names.
    The handler therefore assumes an ``action_log`` element is encountered
    before any ``action``/``Deletedaction`` element.
    """

    in_extract = False

    def __init__(self, extract, wantList):
        """Store the output list and the names of the elements to extract.

        extract  -- list that receives one text record per wanted element
        wantList -- collection of element names to extract
        """
        ContentHandler.__init__(self)
        self.extract = extract
        self.data = []
        self.wantList = wantList
        # next 2 strings were copy/pasted from the XML document then 'cleaned'
        self.action_logAtt = (
            'id team1 team1_name team2 team2_name league league_id '
            'date matchday season season_code start1 start2'
        ).split()
        self.actionAtt = (
            'aid action_code activitytype result id minute second '
            'field_position receiver team_id x y z pace last_modified'
        ).split()
        self.DeletedactionAtt = self.actionAtt
        # next string was found by inspection of the XML file; these
        # attributes are optional on <action> elements
        self.actionOpt = 'subtype c1 c2 c3'.split()
        self.first_Action = True
        self.first_Log = True

    def startElement(self, name, attrs):
        """Build the record for a wanted element and append it to extract.

        Raises KeyError (from ``attrs.getValue``) when a required attribute
        is missing from the element.
        """
        self.in_extract = name in self.wantList
        if not self.in_extract:
            return

        if name == 'action_log':
            self.data = ["1\t"]  # record type 1
            self.data.extend(
                ["%s\t" % attrs.getValue(att) for att in self.action_logAtt])
            # will be added at the beginning of each child record
            self.action_logID = attrs.getValue('id')
        elif name == 'action':
            self.data = ["2\t", self.action_logID + '\t']  # record type 2
            # An absent optional attribute still occupies its (empty) column.
            # ``in`` replaces has_key(), which does not exist in Python 3.
            self.dataOpt = [
                attrs.getValue(opt) + '\t' if opt in attrs else '\t'
                for opt in self.actionOpt
            ]
            self.data.extend(
                ["%s\t" % attrs.getValue(att) for att in self.actionAtt])
            self.data.extend(self.dataOpt)
        elif name == 'Deletedaction':
            self.data = ["3\t", self.action_logID + '\t']  # record type 3
            self.data.extend(
                ["%s\t" % attrs.getValue(att)
                 for att in self.DeletedactionAtt])

        # Insert var names in the tab delimited output
        if self.first_Log:
            # extract[0] will contain both the var names and the attributes
            # of the 1st extracted XML element
            vnames = ['recType'] + self.action_logAtt
            self.data = ["%s\t" % v for v in vnames] + ['\n'] + self.data
            self.first_Log = False
        elif self.first_Action and name in ('action', 'Deletedaction'):
            # extract[1] will contain both the var names and the attributes
            # of the 2nd extracted XML element
            vnames = ['recType', 'logID'] + self.actionAtt + self.actionOpt
            self.data = ["%s\t" % v for v in vnames] + ['\n'] + self.data
            self.first_Action = False  # we won't come back through this branch

        self.extract.append(''.join(self.data) + '\n')

    def endElement(self, name):
        """Reset the per-element state when a wanted element closes."""
        if name in self.wantList:
            self.data = []
            self.in_extract = False

    def characters(self, string):
        """Collect character data seen while inside a wanted element.

        NOTE: records are emitted in startElement, and self.data is rebuilt
        there for the three handled element types, so this text does not end
        up in their records.
        """
        if self.in_extract:
            self.data.append(string)


# Use the class to create the tab separated data
# Note: If we were dealing with a very large file, it would be preferable to
# create the 2 text files within the DataHandler class
extract = []
wantList = ['action_log', 'action', 'Deletedaction']  # Elements to extract
parse(xmlFile, DataHandler(extract, wantList))
# extract now contains the data

# sample.xml --> sample (text before the first dot; the whole name when the
# file name has no dot at all)
nameroot = fname.partition('.')[0]
fLogName = '%s/%sLog' % (fpath, nameroot)        # --> path/sampleLog
fActionName = '%s/%sAction' % (fpath, nameroot)  # --> path/sampleAction

# The SPSS syntax below reads "<name>.txt", so the text files must be written
# with that extension. io.open encodes the text as iso8859-1 on write and
# behaves the same on Python 2.6+ and Python 3.
with io.open(fLogName + '.txt', 'w', encoding='iso8859-1') as fLog:
    with io.open(fActionName + '.txt', 'w', encoding='iso8859-1') as fAction:
        for i, s in enumerate(extract):
            if not s:
                continue
            if isinstance(s, bytes):  # Python 2 only: pure-ASCII byte string
                s = s.decode('ascii')
            if s[0] in ('2', '3') or i == 1:
                # 2nd item starts with the var names of the action records
                fAction.write(s)
            elif s[0] == '1' or i == 0:
                # 1st item starts with the var names of the log records
                fLog.write(s)

# The 2 tab delimited text files were then read using SPSS and the syntax was
# pasted below
cmd = r"""
SET PRINTBACK=YES /MPRINT=YES.
DATASET CLOSE ALL.
GET DATA /TYPE = TXT
 /FILE = "%(fLogName)s.txt"
 /DELCASE = LINE
 /DELIMITERS = "\t"
 /ARRANGEMENT = DELIMITED
 /FIRSTCASE = 2
 /IMPORTCASE = ALL
 /VARIABLES =
 recType F1.0
 id F6.0
 team1 F3.0
 team1_name A19
 team2 F3.0
 team2_name A16
 league A14
 league_id F1.0
 date A22
 matchday F2.0
 season A9
 season_code F2.0
 start1 A22
 start2 A22.
CACHE.
SAVE OUTFILE= "%(fLogName)s.sav".
GET DATA /TYPE = TXT
 /FILE = "%(fActionName)s.txt"
 /DELCASE = LINE
 /DELIMITERS = "\t"
 /ARRANGEMENT = DELIMITED
 /FIRSTCASE = 2
 /IMPORTCASE = ALL
 /VARIABLES =
 recType F1.0
 logID F6.0
 aid F7.0
 action_code A4
 activitytype F2.0
 result F1.0
 id F2.0
 minute F2.0
 second F2.0
 field_position F2.0
 receiver F5.0
 team_id F3.0
 x F5.3
 y F5.3
 z F5.3
 pace F5.3
 last_modified A22
 subtype F2.0
 c1 F1.0
 c2 F1.0
 c3 F1.0
 .
CACHE.
SAVE OUTFILE="%(fActionName)s.sav".
""" % {'fLogName': fLogName, 'fActionName': fActionName}
spss.Submit(cmd)