import csv
import os


class CSV(list):
    """
    Reader/writer for .csv files. The contents are stored as a list of dicts.

    :param path: location of the file, ``~`` and ``$VAR`` are expanded on use
    :param delimiter: csv field divider
    :param prefix: string start of comment lines to be ignored, default #Table
    :param descmarker: characters used to identify the field description line
    :param synth: when defined, add extra field with this name to hold the
                  index of the source line (see :class:`Source`)
    :param fieldnames: optional list fixing the fields (and their order)
                       used by :meth:`write`

    Read usage example::

        src = CSV("$DBWRITERROOT/share/DYB_MC_AD1.txt", delimiter="\\t" )
        src.read()
        for d in src:
            print(d)

        len(src)
        src[0]
        src[-1]
        src.fieldnames

    On reading an invalid CSV an exception, with error report, is raised::

        src = CSV("$DBWRITERROOT/share/DYB_SAB_AD1.txt", delimiter="\\t" )
        src.read()

    Handling of common csv incorrectnesses is made:

    #. description line fixed up to conform to the delimiter
    #. description line extraneous characters removed (other than fieldnames and delimiters)
    #. removes comments

    Write usage example, field names are obtained from the dict keys::

        out = CSV("/tmp/demo.csv", delimiter="\\t" )
        for d in list_of_dict_datasource:
            out.append(d)
        out.write()

    """
    defaults = dict(delimiter="\t", prefix="#Table", descmarker="#[]", synth="_srcline")

    def __init__(self, path, **kwargs):
        """
        :param path: file to read from / write to
        :param kwargs: options overriding :attr:`defaults`, plus ``fieldnames``
        """
        super(CSV, self).__init__()
        self._path = path
        self.kwargs = kwargs
        self.stat = dict(hasblank=0)
        self.src = None  # Source instance, set by read()

    def _option(self, name):
        """
        Return the value of option `name`: the keyword argument when one was
        given, otherwise the class level default.
        """
        return self.kwargs.get(name, self.defaults.get(name, None))

    def _fieldnames(self):
        """
        If fieldnames keyword argument is supplied return that otherwise
        return the names of the keys in the first contained dict. In order to
        control the order of fields, the argument has to be specified.
        """
        fns = self.kwargs.get('fieldnames', None)
        if fns:
            return fns
        if len(self) > 0:
            # list() so the result is a real sequence on Python 3 as well
            return list(self[0].keys())
        return None

    @property
    def path(self):
        """Path with ``~`` and environment variables expanded."""
        return os.path.expanduser(os.path.expandvars(self._path))

    @property
    def delimiter(self):
        """Field divider in use."""
        return self._option('delimiter')

    fieldnames = property(_fieldnames, doc=_fieldnames.__doc__)

    @property
    def descline(self):
        """Description line written by :meth:`write`: ``#`` + joined fieldnames."""
        return "#" + self.delimiter.join(self.fieldnames)

    def read(self):
        """
        Parse the file at :attr:`path`, appending one :class:`Entry` per
        payload row. Every entry gets a ``_hasblank`` flag, and the number of
        flagged rows is accumulated in ``self.stat['hasblank']``.

        :raises Exception: when any row holds an empty field; a summary is
                           printed first
        """
        # Options are looked up, not popped: popping them made a custom
        # delimiter vanish after the first read, so a later write() silently
        # fell back to the default.
        delimiter = self._option('delimiter')
        prefix = self._option('prefix')
        descmarker = self._option('descmarker')
        synth = self._option('synth')

        # The reader loop consumes the whole file, so it can be closed as soon
        # as the loop ends; Source keeps the text of every line it has seen.
        with open(self.path, "r") as f:
            src = Source(f, delimiter=delimiter, prefix=prefix, descmarker=descmarker, synth=synth)
            self.src = src
            for r in csv.DictReader(src, delimiter=delimiter):
                row = Entry(r)
                hasblank = row.hasblank
                if hasblank:
                    self.stat['hasblank'] += 1
                row.update(_hasblank=hasblank)
                self.append(row)

        if not self.is_valid():
            self.smry()
            raise Exception("CSV.read INVALID .csv : %r " % (self,))

    def is_valid(self):
        """True when no contained row is flagged as having a blank field."""
        return len(self.blanks()) == 0

    def blanks(self):
        """
        Return the list of rows flagged with ``_hasblank``. Rows without the
        flag (eg plain dicts appended for writing) count as not blank.
        """
        return [row for row in self if row.get('_hasblank', False) is True]

    def smry(self):
        """Print a report of the irregular rows and their source lines."""
        print("source csv... %s" % (self.src,))
        print("CSV ...  %s" % (self,))
        print("irregularities in csv...")
        flagged = self.blanks()
        for blk in flagged:
            print(blk)
        # The source text can only be looked up when the index field was
        # synthesized; its name is configurable so it must not be hard-coded.
        synth = self._option('synth')
        if synth and self.src is not None:
            for blk in flagged:
                index = blk.get(synth)
                if index is not None:
                    print(self.src[int(index)])

    def __repr__(self):
        return "CSV %s %s rows %s stat %r " % (
            self._path, ["NOT VALID", "valid"][self.is_valid()], len(self), self.stat)

    def write(self):
        """
        Write a ``#Table`` comment line, the description line and then one
        csv row per contained dict to :attr:`path`.

        NOTE(review): the description line written here is ``#`` followed by
        the joined fieldnames, without ``[`` ``]``; with the default
        ``descmarker`` it is therefore not recognized as a description line
        by :meth:`Source.is_descline` unless the names contain those
        characters.
        """
        # Evaluate everything that can fail before opening, so a CSV with
        # neither rows nor fieldnames does not truncate an existing file.
        fieldnames = self.fieldnames
        descline = self.descline
        delimiter = self.delimiter
        with open(self.path, "w") as out:
            out.write("#Table CSV.write \n")
            out.write(descline + "\n")
            writer = csv.DictWriter(out, fieldnames, delimiter=delimiter)
            for d in self:
                writer.writerow(d)


class Entry(dict):
    """A csv row; :attr:`hasblank` tells whether any value is an empty string."""

    @property
    def hasblank(self):
        return any(value == "" for value in self.values())


class Source(list):
    """
    Behaves like a file and holds the original text of the CSV.
    Applies some fixes to make readable as CSV::

    #. removes comments
    #. normalize the description line to conform to the delimiter

    Every line fetched from the underlying file is appended (right stripped)
    to this list, so the index synthesized into payload rows can be used to
    look up the original text, eg ``source[int(row['_srcline'])]``.
    """
    def __init__(self, f, delimiter="\t", prefix="#Table", descmarker="#[]", synth="srcline"):
        """
        :param f: iterator over the lines of the csv, typically an open file
        :param delimiter: csv field divider
        :param prefix: string start of lines to be ignored
        :param descmarker: strings used to identify the field description line
        :param synth: when defined, add extra field with this name to hold the csv source line number
        """
        super(Source, self).__init__()
        self.f = f
        self.prefix = prefix
        self.delimiter = delimiter
        self.descmarker = descmarker
        self.synth = synth

        self.stat = dict(total=0, prefix=0, payload=0, descline=0)
        self.cols = None

    def is_descline(self, line):
        """
        Checks if line contains all of the description markers
        """
        return all(marker in line for marker in self.descmarker)

    def descline(self, line):
        """Return `line` with every description marker character removed."""
        for c in self.descmarker:
            line = line.replace(c, "")
        return line

    def _fetch(self):
        """
        Pull the next raw line from the file, recording its text and counting
        it in ``stat['total']``. StopIteration from the file ends iteration.
        """
        line = next(self.f)
        self.append(line.rstrip())
        self.stat['total'] += 1
        return line

    def __next__(self):
        """
        Return the next line to hand to the csv reader: comment lines are
        skipped, the description line is cleaned up and payload lines are
        passed through, each optionally extended with the synth field.
        """
        line = self._fetch()
        # Skipped comment lines are recorded too (via _fetch), otherwise the
        # index synthesized below points at the comment rather than at the
        # payload line following it. An empty prefix matches every line, so
        # it is treated as "no comment lines".
        while self.prefix and line.startswith(self.prefix):
            self.stat['prefix'] += 1
            line = self._fetch()

        if self.is_descline(line):
            self.stat['descline'] += 1
            # markers are removed whether or not a synth field is requested
            line = self.descline(line)
            if self.synth:
                line = self.synth + self.delimiter + line
            self.cols = line.rstrip().split(self.delimiter)
        else:
            self.stat['payload'] += 1
            if self.synth:
                # the line just fetched is the last element of this list
                line = "%d" % (len(self) - 1) + self.delimiter + line
        return line

    next = __next__  # Python 2 spelling of the iterator protocol

    def __iter__(self):
        return self

    def __repr__(self):
        return "Source stat:%r \n cols:%r " % (self.stat, self.cols)


if __name__ == '__main__':
    paths = ("$DBWRITERROOT/share/DYB_SAB_AD1.txt", "$DBWRITERROOT/share/DYB_MC_AD1.txt")

    for path in paths:
        csv_ = CSV(path)
        csv_.read()
        print(csv_)