00001 import os, csv
00002
class CSV(list):
    """
    Reader/writer for .csv files. The contents are stored as a list of dicts.

    :param path: location of the csv file, ``~`` and environment variables are expanded
    :param delimiter: csv field divider, default tab
    :param prefix: string start of comment lines to be ignored, default ``#Table``
    :param descmarker: characters used to identify the field description line, default ``#[]``
    :param synth: when defined, add extra field with this name to hold the csv source line number
    :param fieldnames: optional sequence fixing the names and order of the fields when writing


    Read usage example::

        src = CSV("$DBWRITERROOT/share/DYB_MC_AD1.txt", delimiter="\\t" )
        src.read()
        for d in src:
            print(d)

        len(src)
        src[0]
        src[-1]
        src.fieldnames


    On reading an invalid CSV (one with blank fields) a summary is printed
    and an exception is raised::

        src = CSV("$DBWRITERROOT/share/DYB_SAB_AD1.txt", delimiter="\\t" )
        src.read()


    Some common csv irregularities are handled while reading:

    #. description line fixed up to conform to the delimiter
    #. description line extraneous characters removed (other than fieldnames and delimiters)
    #. comments removed

    Write usage example, field names are obtained from the dict keys::

        out = CSV("/tmp/demo.csv", delimiter="\\t" )
        for d in list_of_dict_datasource:
            out.append(d)
        out.write()

    """
    defaults = dict(delimiter="\t", prefix="#Table", descmarker="#[]", synth="_srcline")

    def __init__(self, path, **kwargs):
        """
        :param path: csv file path, expanded lazily by the ``path`` property
        :param kwargs: options overriding the entries of ``defaults``, plus ``fieldnames``
        """
        list.__init__(self)
        self._path = path
        self.kwargs = kwargs
        self.stat = dict(hasblank=0)

    def _option(self, name):
        """
        Return the keyword argument `name`, falling back to the class default.
        Never modifies ``self.kwargs``.
        """
        return self.kwargs.get(name, self.defaults.get(name, None))

    def _fieldnames(self):
        """
        If fieldnames keyword argument is supplied return that otherwise
        return the names of the keys in the first contained dict, or None
        when there are no rows. In order to control the order of fields,
        the argument has to be specified.
        """
        fns = self.kwargs.get('fieldnames', None)
        if fns:
            return fns
        if len(self) > 0:
            # list() so that the result is a real sequence on Python 3 too
            return list(self[0].keys())
        return None

    path = property(lambda self: os.path.expanduser(os.path.expandvars(self._path)))
    delimiter = property(lambda self: self._option('delimiter'))
    fieldnames = property(_fieldnames, doc=_fieldnames.__doc__)
    descline = property(lambda self: "#" + self.delimiter.join(self.fieldnames))

    def read(self):
        """
        Parse the file at ``path`` and append one ``Entry`` per payload line.

        Every row gets a ``_hasblank`` flag, rows with blank fields are
        counted in ``stat['hasblank']``. The ``Source`` that wrapped the file
        is kept as ``self.src``.

        :raises Exception: after printing a summary, when any row has a blank field
        """
        # Options are looked up, not popped: popping would silently reset
        # the ``delimiter`` property (and hence ``write``) to the default.
        delimiter = self._option('delimiter')
        prefix = self._option('prefix')
        descmarker = self._option('descmarker')
        synth = self._option('synth')

        with open(self.path, "r") as handle:
            src = Source(handle, delimiter=delimiter, prefix=prefix,
                         descmarker=descmarker, synth=synth)
            for record in csv.DictReader(src, delimiter=delimiter):
                row = Entry(record)
                hasblank = row.hasblank
                if hasblank:
                    self.stat['hasblank'] += 1
                row.update(_hasblank=hasblank)
                self.append(row)
        self.src = src

        if not self.is_valid():
            self.smry()
            raise Exception("CSV.read INVALID .csv : %r " % (self,))

    def is_valid(self):
        """
        Return True when no row is flagged as having a blank field.
        """
        return len(self.blanks()) == 0

    def blanks(self):
        """
        Return the list of rows flagged with a true ``_hasblank``.

        Rows without the flag (for example dicts appended by hand before
        ``write``) are not treated as blank.
        """
        return [row for row in self if row.get('_hasblank', False)]

    def smry(self):
        """
        Print a report: the source, this object, every flagged row and,
        when line numbers were synthesized, the stored source text of each
        flagged row.
        """
        src = getattr(self, 'src', None)
        print("source csv... %s" % (src,))
        print("%s %s" % ("CSV ... ", self))
        print("irregularities in csv...")
        flagged = self.blanks()
        for blk in flagged:
            print(blk)
        # the line number column is named by the source, not hard-coded
        synth = getattr(src, 'synth', None)
        if synth:
            for blk in flagged:
                lineno = blk.get(synth)
                if lineno is not None:
                    print(src[int(lineno)])

    def __repr__(self):
        if self.is_valid():
            validity = "valid"
        else:
            validity = "NOT VALID"
        return "CSV %s %s rows %s stat %r " % (self._path, validity, len(self), self.stat)

    def write(self):
        """
        Write a ``#Table`` comment line, the description line and then one
        delimited line per contained dict to ``path``.

        The header is computed before the file is opened so that a failure
        to determine the field names does not truncate an existing file.
        """
        fieldnames = self.fieldnames
        descline = self.descline
        with open(self.path, "w") as out:
            out.write("#Table CSV.write \n")
            out.write(descline + "\n")
            writer = csv.DictWriter(out, fieldnames, delimiter=self.delimiter)
            for d in self:
                writer.writerow(d)
00124
00125
00126
class Entry(dict):
    """
    A csv row held as a dict of field name to field value.
    """

    @property
    def hasblank(self):
        """
        True when at least one value is the empty string.
        """
        # any() stops at the first blank and, unlike len(filter(...)),
        # also works where filter returns an iterator (Python 3)
        return any(value == "" for value in self.values())
00129
00130
00131
class Source(list):
    """
    Behaves like a file (an iterator of lines) and holds the original text
    of the CSV, one list item per line read, without line terminators.
    Applies some fixes to make readable as CSV::

         #. removes comments
         #. normalize the description line to conform to the delimiter

    """
    def __init__(self, f, delimiter="\t", prefix="#Table", descmarker="#[]", synth="srcline"):
        """
        :param f: open file, or any other iterable of text lines
        :param delimiter: csv field divider
        :param prefix: string start of lines to be ignored
        :param descmarker: strings used to identify the field description line
        :param synth: when defined, add extra field with this name to hold the csv source line number
        """
        list.__init__(self)
        self.f = f
        self._lines = iter(f)
        self.prefix = prefix
        self.delimiter = delimiter
        self.descmarker = descmarker
        self.synth = synth

        self.stat = dict(total=0, prefix=0, payload=0, descline=0)
        self.cols = None

    def is_descline(self, line):
        """
        Checks if line contains all of the description markers
        """
        return all(marker in line for marker in self.descmarker)

    def descline(self, line):
        """
        Return line with every description marker character removed.
        """
        for c in self.descmarker:
            line = line.replace(c, "")
        return line

    def _readline(self):
        """
        Fetch the next raw line, record its text and count it.
        StopIteration from the underlying iterator ends the iteration.
        """
        line = next(self._lines)
        # strip only the line terminator: trailing delimiters are blank
        # fields and must stay visible in the stored text
        self.append(line.rstrip("\r\n"))
        self.stat['total'] += 1
        return line

    def __next__(self):
        """
        Return the next line to be parsed as csv.

        Lines starting with the prefix are skipped. The description line has
        its markers removed and, when synth is set, the synth name prepended
        as an extra column. Every other line is payload and, when synth is
        set, gets its index into this list prepended as an extra field.
        """
        line = self._readline()
        # every line read is recorded, so len(self) - 1 is always the
        # index of the current line in the stored text
        while self.prefix and line.startswith(self.prefix):
            self.stat['prefix'] += 1
            line = self._readline()

        if self.is_descline(line):
            self.stat['descline'] += 1
            # markers are removed whether or not a synth column is added
            line = self.descline(line)
            if self.synth:
                line = self.synth + self.delimiter + line
            self.cols = line.rstrip().split(self.delimiter)
        else:
            self.stat['payload'] += 1
            if self.synth:
                line = "%d" % (len(self) - 1) + self.delimiter + line
        return line

    # Python 2 spelling of the iterator protocol
    next = __next__

    def __iter__(self):
        return self

    def __repr__(self):
        return "Source stat:%r \n cols:%r " % (self.stat, self.cols)
00191
00192
if __name__ == '__main__':
    # Demo run: read each sample table and print its one-line summary.
    # An exception raised by read() is not caught and ends the run.
    demo_paths = (
        "$DBWRITERROOT/share/DYB_SAB_AD1.txt",
        "$DBWRITERROOT/share/DYB_MC_AD1.txt",
    )

    for demo_path in demo_paths:
        table = CSV(demo_path)
        table.read()
        print(table)
00200
00201
00202
00203