DybDbi: /home/dayabay/installs/trunk_2011_04_11_opt/NuWa-trunk/dybgaudi/Database/DybDbi/python/DybDbi/csvrw.py Source File

00001 import os, csv
00002 
class CSV(list):
    """
    Reader/writer for .csv files. The contents are stored as a list of dicts.

    :param path: file path, environment variables and ``~`` are expanded on use
    :param delimiter: csv field divider, default tab
    :param prefix:  string start of comment lines to be ignored, default #Table
    :param descmarker: characters used to identify the field description line, default #[]
    :param synth: when defined, add extra field with this name to hold the csv source line index, default _srcline
    :param fieldnames: optional ordered list of field names, used when writing


    Read usage example::

        src = CSV("$DBWRITERROOT/share/DYB_MC_AD1.txt", delimiter="\\t" )
        src.read()
        for d in src:
            print(d)

        len(src)
        src[0]
        src[-1]
        src.fieldnames


    On reading an invalid CSV (one with rows holding blank fields) an
    exception, with error report, is raised::

        src = CSV("$DBWRITERROOT/share/DYB_SAB_AD1.txt", delimiter="\\t" )
        src.read()


    Handling of common csv incorrectnesses is delegated to ``Source``:

    #. description line marker characters are removed
    #. comment lines (those starting with the prefix) are skipped

    Write usage example, field names are obtained from the dict keys::

         out = CSV("/tmp/demo.csv", delimiter="\\t" )
         for d in list_of_dict_datasource:
             out.append(d)
         out.write()

    """
    defaults = dict( delimiter="\t", prefix="#Table", descmarker="#[]", synth="_srcline" )

    def __init__(self, path, **kwargs ):
        """
        Record the path and options, no file access is made here.
        """
        super(CSV, self).__init__()
        self._path = path
        self.kwargs = kwargs
        self.stat = dict( hasblank=0 )
        self.src = None     # set to the Source instance by read()

    def _option(self, name):
        """
        Return the keyword argument `name`, falling back to the class default.
        """
        return self.kwargs.get(name, self.defaults.get(name, None))

    def _fieldnames(self):
        """
        If fieldnames keyword argument is supplied return that otherwise
        return the names of the keys in the first contained dict.  In order to
        control the order of fields, the argument has to be specified.
        Returns None when neither is available.
        """
        fns = self.kwargs.get('fieldnames', None)
        if fns:
            return fns
        if len(self) > 0:
            # list() so the result is a real sequence on Python 3 as well
            return list(self[0].keys())
        return None

    path = property(lambda self: os.path.expanduser(os.path.expandvars(self._path)))
    delimiter = property(lambda self: self._option('delimiter'))
    fieldnames = property(_fieldnames, doc=_fieldnames.__doc__)
    descline = property(lambda self: "#" + self.delimiter.join(self.fieldnames))

    def read(self):
        """
        Parse the file at `path`, appending one ``Entry`` per csv row.

        Each row gains a ``_hasblank`` key recording whether any of its
        fields is an empty string.  The ``Source`` used for reading is kept
        in ``self.src``.

        :raises Exception: when any row has a blank field, after printing a summary
        """
        # Options are looked up without removing them from self.kwargs, so
        # that the delimiter property and repeated reads keep honouring them.
        delimiter = self._option('delimiter')
        prefix = self._option('prefix')
        descmarker = self._option('descmarker')
        synth = self._option('synth')

        with open(self.path, "r") as f:
            src = Source(f, delimiter=delimiter, prefix=prefix, descmarker=descmarker, synth=synth)
            for r in csv.DictReader(src, delimiter=delimiter):
                row = Entry(r)
                hasblank = row.hasblank
                if hasblank:
                    self.stat['hasblank'] += 1
                row.update(_hasblank=hasblank)
                self.append(row)
        self.src = src

        if not self.is_valid():
            self.smry()
            raise Exception("CSV.read INVALID .csv :  %r " % self )

    def is_valid(self):
        """
        True when no contained row is flagged as having a blank field.
        """
        return not self.blanks()

    def blanks(self):
        """
        Return the list of rows flagged with a true ``_hasblank``.  Rows
        without the flag (eg dicts appended for writing) are not counted.
        """
        return [row for row in self if row.get('_hasblank', False)]

    def smry(self):
        """
        Print the source, this instance, the rows with blank fields and,
        when the source synthesized a line index field, the corresponding
        source lines.
        """
        print("source csv... %s" % (self.src,))
        print("CSV ...  %s" % (self,))
        print("irregularities in csv...")
        blanks = self.blanks()
        for blk in blanks:
            print(blk)
        # the index field name is configurable, so ask the source for it
        synth = getattr(self.src, 'synth', None)
        if synth:
            for blk in blanks:
                if synth in blk:
                    print(self.src[int(blk[synth])])

    def __repr__(self):
        status = "valid" if self.is_valid() else "NOT VALID"
        return "CSV %s %s rows %s stat %r  " % ( self._path, status, len(self), self.stat )

    def write(self):
        """
        Write a ``#Table`` comment line, the description line and then one
        csv line per contained dict to the file at `path`.

        :raises TypeError: when no field names are available (no fieldnames argument and no rows)
        """
        fieldnames = self.fieldnames
        # evaluated before opening, so a failure here does not truncate an existing file
        descline = self.descline
        with open(self.path, "w") as out:
            out.write("#Table CSV.write \n")
            out.write(descline + "\n")
            writer = csv.DictWriter(out, fieldnames, delimiter=self.delimiter)
            for d in self:
                writer.writerow(d)
00124 
00125 
00126 
class Entry(dict):
    """
    A csv row held as a dict of field name to field value.
    """
    # any() short-circuits and, unlike len(filter(...)), works on Python 3
    hasblank = property(
        lambda self: any(v == "" for v in self.values()),
        doc="True when at least one value is the empty string",
    )
00129 
00130 
00131 
00132 class Source(list):
00133     """
00134     Behaves like a file and holds the original text of the CSV. 
00135     Applies some fixes to make readable as CSV::
00136 
00137     #. removes comments
00138     #. normalize the description line to conform to the delimiter
00139  
00140     """
00141     def __init__(self, f, delimiter="\t", prefix="#Table", descmarker="#[]", synth="srcline"):
00142         """
00143         :param delimiter: csv field divider
00144         :param prefix:  string start of lines to be ignored
00145         :param descmarker: strings used to identify the field description line 
00146         :param synth: when defined, add extra field with this name to hold the csv source line number 
00147         """
00148         self.f = f
00149         self.prefix = prefix
00150         self.delimiter = delimiter
00151         self.descmarker = descmarker
00152         self.synth = synth
00153 
00154         self.stat = dict(total=0,prefix=0, payload=0, descline=0)
00155         self.cols = None
00156 
00157     def is_descline(self, line):
00158         """
00159         Checks if line contains all of the description markers
00160         """
00161         return len(filter(lambda _:_ in line,self.descmarker)) == len(self.descmarker)
00162 
00163     def descline(self, line):
00164         for c in self.descmarker:
00165              line = line.replace(c,"")
00166         return line
00167 
00168     def next(self):
00169         line = self.f.next()
00170         self.append(line.rstrip())
00171         self.stat['total'] += 1
00172         while line.startswith(self.prefix):
00173             self.stat['prefix'] += 1
00174             line = self.f.next()
00175         else:
00176             if self.is_descline(line):
00177                 self.stat['descline'] += 1
00178                 if self.synth:
00179                     line=self.synth + self.delimiter + self.descline(line)
00180                 self.cols = line.rstrip().split(self.delimiter)
00181             else:
00182                 self.stat['payload'] += 1
00183                 if self.synth: 
00184                     line = "%d" % (len(self) - 1) + self.delimiter + line 
00185             return line
00186     def __iter__(self):
00187         return self
00188 
00189     def __repr__(self):
00190         return "Source stat:%r \n cols:%r  " % ( self.stat, self.cols ) 
00191 
00192 
if __name__ == '__main__':
    # Demo: read each example file and print its summary representation.
    # NOTE(review): CSV.read raises on an invalid file, which ends the loop;
    # the CSV docstring presents the first path as the invalid example, so
    # the second path may never be reached -- confirm this is intended.
    paths = "$DBWRITERROOT/share/DYB_SAB_AD1.txt", "$DBWRITERROOT/share/DYB_MC_AD1.txt"

    for path in paths:
        csv_ = CSV(path)
        csv_.read()
        # single-argument call form prints identically on Python 2 and 3
        print(csv_)
00200 
00201 
00202 
00203
In This Package:

csvrw.py