dbf.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. # -*- coding: utf-8 -*-
  2. # copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
  3. # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
  4. #
  5. # This file is part of logilab-common.
  6. #
  7. # logilab-common is free software: you can redistribute it and/or modify it under
  8. # the terms of the GNU Lesser General Public License as published by the Free
  9. # Software Foundation, either version 2.1 of the License, or (at your option) any
  10. # later version.
  11. #
  12. # logilab-common is distributed in the hope that it will be useful, but WITHOUT
  13. # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  14. # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  15. # details.
  16. #
  17. # You should have received a copy of the GNU Lesser General Public License along
  18. # with logilab-common. If not, see <http://www.gnu.org/licenses/>.
"""This is a DBF reader which reads the Visual FoxPro DBF format, including
memo fields.

Usage:

>>> rec = readDbf('test.dbf')
>>> for line in rec:
...     print(line['name'])

:date: 13/07/2007

http://www.physics.ox.ac.uk/users/santoso/Software.Repository.html
That page says the code is "available as is without any warranty or support".
"""
  28. import struct
  29. import os, os.path
  30. import sys
  31. import csv
  32. import tempfile
  33. import ConfigParser
  34. class Dbase:
  35. def __init__(self):
  36. self.fdb = None
  37. self.fmemo = None
  38. self.db_data = None
  39. self.memo_data = None
  40. self.fields = None
  41. self.num_records = 0
  42. self.header = None
  43. self.memo_file = ''
  44. self.memo_header = None
  45. self.memo_block_size = 0
  46. self.memo_header_len = 0
  47. def _drop_after_NULL(self, txt):
  48. for i in range(0, len(txt)):
  49. if ord(struct.unpack('c', txt[i])[0])==0:
  50. return txt[:i]
  51. return txt
  52. def _reverse_endian(self, num):
  53. if not len(num):
  54. return 0
  55. val = struct.unpack('<L', num)
  56. val = struct.pack('>L', val[0])
  57. val = struct.unpack('>L', val)
  58. return val[0]
  59. def _assign_ids(self, lst, ids):
  60. result = {}
  61. idx = 0
  62. for item in lst:
  63. id = ids[idx]
  64. result[id] = item
  65. idx += 1
  66. return result
  67. def open(self, db_name):
  68. filesize = os.path.getsize(db_name)
  69. if filesize <= 68:
  70. raise IOError, 'The file is not large enough to be a dbf file'
  71. self.fdb = open(db_name, 'rb')
  72. self.memo_file = ''
  73. if os.path.isfile(db_name[0:-1] + 't'):
  74. self.memo_file = db_name[0:-1] + 't'
  75. elif os.path.isfile(db_name[0:-3] + 'fpt'):
  76. self.memo_file = db_name[0:-3] + 'fpt'
  77. if self.memo_file:
  78. #Read memo file
  79. self.fmemo = open(self.memo_file, 'rb')
  80. self.memo_data = self.fmemo.read()
  81. self.memo_header = self._assign_ids(struct.unpack('>6x1H', self.memo_data[:8]), ['Block size'])
  82. block_size = self.memo_header['Block size']
  83. if not block_size:
  84. block_size = 512
  85. self.memo_block_size = block_size
  86. self.memo_header_len = block_size
  87. memo_size = os.path.getsize(self.memo_file)
  88. #Start reading data file
  89. data = self.fdb.read(32)
  90. self.header = self._assign_ids(struct.unpack('<B 3B L 2H 20x', data), ['id', 'Year', 'Month', 'Day', '# of Records', 'Header Size', 'Record Size'])
  91. self.header['id'] = hex(self.header['id'])
  92. self.num_records = self.header['# of Records']
  93. data = self.fdb.read(self.header['Header Size']-34)
  94. self.fields = {}
  95. x = 0
  96. header_pattern = '<11s c 4x B B 14x'
  97. ids = ['Field Name', 'Field Type', 'Field Length', 'Field Precision']
  98. pattern_len = 32
  99. for offset in range(0, len(data), 32):
  100. if ord(data[offset])==0x0d:
  101. break
  102. x += 1
  103. data_subset = data[offset: offset+pattern_len]
  104. if len(data_subset) < pattern_len:
  105. data_subset += ' '*(pattern_len-len(data_subset))
  106. self.fields[x] = self._assign_ids(struct.unpack(header_pattern, data_subset), ids)
  107. self.fields[x]['Field Name'] = self._drop_after_NULL(self.fields[x]['Field Name'])
  108. self.fdb.read(3)
  109. if self.header['# of Records']:
  110. data_size = (self.header['# of Records'] * self.header['Record Size']) - 1
  111. self.db_data = self.fdb.read(data_size)
  112. else:
  113. self.db_data = ''
  114. self.row_format = '<'
  115. self.row_ids = []
  116. self.row_len = 0
  117. for key in self.fields:
  118. field = self.fields[key]
  119. self.row_format += '%ds ' % (field['Field Length'])
  120. self.row_ids.append(field['Field Name'])
  121. self.row_len += field['Field Length']
  122. def close(self):
  123. if self.fdb:
  124. self.fdb.close()
  125. if self.fmemo:
  126. self.fmemo.close()
  127. def get_numrecords(self):
  128. return self.num_records
  129. def get_record_with_names(self, rec_no):
  130. """
  131. This function accept record number from 0 to N-1
  132. """
  133. if rec_no < 0 or rec_no > self.num_records:
  134. raise Exception, 'Unable to extract data outside the range'
  135. offset = self.header['Record Size'] * rec_no
  136. data = self.db_data[offset:offset+self.row_len]
  137. record = self._assign_ids(struct.unpack(self.row_format, data), self.row_ids)
  138. if self.memo_file:
  139. for key in self.fields:
  140. field = self.fields[key]
  141. f_type = field['Field Type']
  142. f_name = field['Field Name']
  143. c_data = record[f_name]
  144. if f_type=='M' or f_type=='G' or f_type=='B' or f_type=='P':
  145. c_data = self._reverse_endian(c_data)
  146. if c_data:
  147. record[f_name] = self.read_memo(c_data-1).strip()
  148. else:
  149. record[f_name] = c_data.strip()
  150. return record
  151. def read_memo_record(self, num, in_length):
  152. """
  153. Read the record of given number. The second parameter is the length of
  154. the record to read. It can be undefined, meaning read the whole record,
  155. and it can be negative, meaning at most the length
  156. """
  157. if in_length < 0:
  158. in_length = -self.memo_block_size
  159. offset = self.memo_header_len + num * self.memo_block_size
  160. self.fmemo.seek(offset)
  161. if in_length<0:
  162. in_length = -in_length
  163. if in_length==0:
  164. return ''
  165. return self.fmemo.read(in_length)
  166. def read_memo(self, num):
  167. result = ''
  168. buffer = self.read_memo_record(num, -1)
  169. if len(buffer)<=0:
  170. return ''
  171. length = struct.unpack('>L', buffer[4:4+4])[0] + 8
  172. block_size = self.memo_block_size
  173. if length < block_size:
  174. return buffer[8:length]
  175. rest_length = length - block_size
  176. rest_data = self.read_memo_record(num+1, rest_length)
  177. if len(rest_data)<=0:
  178. return ''
  179. return buffer[8:] + rest_data
  180. def readDbf(filename):
  181. """
  182. Read the DBF file specified by the filename and
  183. return the records as a list of dictionary.
  184. :param: filename File name of the DBF
  185. :return: List of rows
  186. """
  187. db = Dbase()
  188. db.open(filename)
  189. num = db.get_numrecords()
  190. rec = []
  191. for i in range(0, num):
  192. record = db.get_record_with_names(i)
  193. rec.append(record)
  194. db.close()
  195. return rec
  196. if __name__=='__main__':
  197. rec = readDbf('dbf/sptable.dbf')
  198. for line in rec:
  199. print '%s %s' % (line['GENUS'].strip(), line['SPECIES'].strip())