ReadSampleXMLfile.py: a module that reads an XML file and converts it into SPSS .sav files
This is a sample XML file to be read by this script: Read Sample XML file
# Python script that reads and converts .xml file into SPSS .sav file
# Reference Hetland p. 427 (Beginning Python From Novice to Professional)
# Other resource: Chap 24 of Python in a Nutshell (Alex Martelli)
# Raynald Levesque August 2008
#
# Overview of what the script does:
#   1. Parses the XML file with SAX and turns the attributes of every
#      <action_log>, <action> and <Deletedaction> element into one
#      tab-separated text row.
#   2. Writes the rows to two tab-delimited text files
#      (<name>Log.txt and <name>Action.txt).
#   3. Submits SPSS syntax that reads both text files and saves them as
#      <name>Log.sav and <name>Action.sav.

import io
import os
from xml.sax import parse
from xml.sax.handler import ContentHandler

import spss

# Change next lines to match your requirements
fpath = r'C:/Test2'
fname = r'sample.xml'   # the enclosed sample data file
xmlFile = r'%(fpath)s/%(fname)s' % vars()


class DataHandler(ContentHandler):
    """SAX handler that converts selected XML elements into tab-separated rows.

    Each element whose name is in ``wantList`` produces one string that is
    appended to ``extract``.  Every row starts with a record type:

    * ``1`` -- ``action_log`` element
    * ``2`` -- ``action`` element (prefixed with the id of the current
      ``action_log``, followed by the optional attributes)
    * ``3`` -- ``Deletedaction`` element (prefixed with the id of the
      current ``action_log``)

    The first row appended to ``extract`` is preceded by a header line with
    the ``action_log`` variable names; the first ``action`` /
    ``Deletedaction`` row is preceded by a header line with the action
    variable names.

    Parameters:
        extract:  list that receives the generated rows (mutated in place).
        wantList: names of the XML elements to extract.

    Raises:
        KeyError: from ``attrs.getValue`` when an element lacks one of the
            required attributes listed in ``action_logAtt`` / ``actionAtt``.
    """

    in_extract = False

    def __init__(self, extract, wantList):
        ContentHandler.__init__(self)
        self.extract = extract
        self.data = []
        self.wantList = wantList
        # next 2 strings were copy/pasted from the XML document then 'cleaned'
        self.action_logAtt = (
            'id team1 team1_name team2 team2_name league league_id '
            'date matchday season season_code start1 start2'
        ).split()
        self.actionAtt = (
            'aid action_code activitytype result id minute second '
            'field_position receiver team_id x y z pace last_modified'
        ).split()
        # Deleted actions carry the same required attributes as actions.
        self.DeletedactionAtt = self.actionAtt
        # next string was found by inspection of the XML file
        self.actionOpt = 'subtype c1 c2 c3'.split()
        self.first_Action = True
        self.first_Log = True

    def startElement(self, name, attrs):
        """Build the row for a wanted element and append it to ``extract``."""
        self.in_extract = name in self.wantList
        if not self.in_extract:
            return

        if name == 'action_log':
            self.data = ["1\t"]   # record type 1
            self.data.extend(
                ["%s\t" % attrs.getValue(att) for att in self.action_logAtt])
            # will be added at the beginning of each child record
            self.action_logID = attrs.getValue('id')
        elif name == 'action':
            self.data = ["2\t", self.action_logID + '\t']   # record type 2
            # Optional attributes default to an empty (tab only) column.
            self.dataOpt = ['\t'] * len(self.actionOpt)
            for idx, attrib in enumerate(self.actionOpt):
                # Attributes.get() returns None when the attribute is absent;
                # it replaces the Python 2 only has_key().
                value = attrs.get(attrib)
                if value is not None:
                    self.dataOpt[idx] = value + '\t'
            self.data.extend(
                ["%s\t" % attrs.getValue(att) for att in self.actionAtt])
            self.data.extend(self.dataOpt)
        elif name == 'Deletedaction':
            self.data = ["3\t", self.action_logID + '\t']   # record type 3
            self.data.extend(
                ["%s\t" % attrs.getValue(att) for att in self.DeletedactionAtt])

        # Insert var names in tab delimited file
        if self.first_Log:
            # extract[0] will contain both the var names and the attributes
            # of the 1st XML element
            vnames = ['recType']
            vnames.extend(self.action_logAtt)
            vnames = ["%s\t" % v for v in vnames]
            vnames.append('\n')
            vnames.extend(self.data)
            self.data = vnames
            self.first_Log = False
        elif self.first_Action and name in ['action', 'Deletedaction']:
            # extract[1] will contain both the var names and the attributes
            # of the 2nd XML element
            vnames = ['recType', 'logID']
            vnames.extend(self.actionAtt)
            vnames.extend(self.actionOpt)
            vnames = ["%s\t" % v for v in vnames]
            vnames.append('\n')
            vnames.extend(self.data)
            self.data = vnames
            self.first_Action = False   # we won't come back through this branch

        text = ''.join(self.data) + '\n'
        self.extract.append(text)

    def endElement(self, name):
        """Reset the per-element state when a wanted element closes."""
        if name in self.wantList:
            self.data = []
            self.in_extract = False

    def characters(self, string):
        """Collect character data while inside a wanted element."""
        if self.in_extract:
            self.data.append(string)


# Use the class to create the tab separated data files
# Note: If we were dealing with a very large file, it would be preferable to
# create the 2 text files within the DataHandler class
extract = []
wantList = ['action_log', 'action', 'Deletedaction']   # Elements to extract
parse(xmlFile, DataHandler(extract, wantList))
# extract now contains the data

# splitext keeps the whole name when there is no extension
# (fname.find('.') returned -1 in that case and dropped the last character).
nameroot = os.path.splitext(fname)[0]                         # sample.xml --> sample
fLogName = r'%(fpath)s/%(nameroot)sLog' % vars()              # --> path/sampleLog
fActionName = r'%(fpath)s/%(nameroot)sAction' % vars()        # --> path/sampleAction

# The SPSS syntax below reads "<name>.txt", so the text files must be written
# with that extension.  io.open encodes the text on write and behaves the same
# way under Python 2 and Python 3.
with io.open(fLogName + '.txt', 'w', encoding='iso8859-1') as fLog, \
        io.open(fActionName + '.txt', 'w', encoding='iso8859-1') as fAction:
    for i, s in enumerate(extract):
        if not s:
            continue
        if isinstance(s, bytes):
            # Only possible under Python 2 (plain str row); io files need text.
            s = s.decode('iso8859-1')
        if s[0] in ('2', '3') or i == 1:    # 2nd row holds the vnames of action
            fAction.write(s)
        elif s[0] == '1' or i == 0:         # 1st row holds the vnames of log
            fLog.write(s)

# The 2 tab delimited text files were then read using SPSS and the syntax was
# pasted below.  The string is raw so that "\t" reaches SPSS unchanged.
cmd = r"""
SET PRINTBACK=YES /MPRINT=YES.
DATASET CLOSE ALL.
GET DATA /TYPE = TXT
 /FILE = "%(fLogName)s.txt"
 /DELCASE = LINE
 /DELIMITERS = "\t"
 /ARRANGEMENT = DELIMITED
 /FIRSTCASE = 2
 /IMPORTCASE = ALL
 /VARIABLES =
 recType F1.0
 id F6.0
 team1 F3.0
 team1_name A19
 team2 F3.0
 team2_name A16
 league A14
 league_id F1.0
 date A22
 matchday F2.0
 season A9
 season_code F2.0
 start1 A22
 start2 A22.
CACHE.
SAVE OUTFILE= "%(fLogName)s.sav".

GET DATA /TYPE = TXT
 /FILE = "%(fActionName)s.txt"
 /DELCASE = LINE
 /DELIMITERS = "\t"
 /ARRANGEMENT = DELIMITED
 /FIRSTCASE = 2
 /IMPORTCASE = ALL
 /VARIABLES =
 recType F1.0
 logID F6.0
 aid F7.0
 action_code A4
 activitytype F2.0
 result F1.0
 id F2.0
 minute F2.0
 second F2.0
 field_position F2.0
 receiver F5.0
 team_id F3.0
 x F5.3
 y F5.3
 z F5.3
 pace F5.3
 last_modified A22
 subtype F2.0
 c1 F1.0
 c2 F1.0
 c3 F1.0
 .
CACHE.
SAVE OUTFILE="%(fActionName)s.sav".
""" % vars()
spss.Submit(cmd)
Related pages
...
Navigate from here