# pylint: disable=W0622
# Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE).
# http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17. """a similarities / code duplication command line tool and pylint checker
  18. """
  19. from __future__ import generators
  20. import sys
  21. from itertools import izip
  22. from logilab.common.ureports import Table
  23. from pylint.interfaces import IRawChecker
  24. from pylint.checkers import BaseChecker, table_lines_from_stats
  25. class Similar:
  26. """finds copy-pasted lines of code in a project"""
  27. def __init__(self, min_lines=4, ignore_comments=False,
  28. ignore_docstrings=False):
  29. self.min_lines = min_lines
  30. self.ignore_comments = ignore_comments
  31. self.ignore_docstrings = ignore_docstrings
  32. self.linesets = []
  33. def append_stream(self, streamid, stream):
  34. """append a file to search for similarities"""
  35. stream.seek(0) # XXX may be removed with astng > 0.23
  36. self.linesets.append(LineSet(streamid,
  37. stream.readlines(),
  38. self.ignore_comments,
  39. self.ignore_docstrings))
  40. def run(self):
  41. """start looking for similarities and display results on stdout"""
  42. self._display_sims(self._compute_sims())
  43. def _compute_sims(self):
  44. """compute similarities in appended files"""
  45. no_duplicates = {}
  46. for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
  47. duplicate = no_duplicates.setdefault(num, [])
  48. for couples in duplicate:
  49. if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
  50. couples.add( (lineset1, idx1) )
  51. couples.add( (lineset2, idx2) )
  52. break
  53. else:
  54. duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) )
  55. sims = []
  56. for num, ensembles in no_duplicates.iteritems():
  57. for couples in ensembles:
  58. sims.append( (num, couples) )
  59. sims.sort()
  60. sims.reverse()
  61. return sims
  62. def _display_sims(self, sims):
  63. """display computed similarities on stdout"""
  64. nb_lignes_dupliquees = 0
  65. for num, couples in sims:
  66. print
  67. print num, "similar lines in", len(couples), "files"
  68. couples = sorted(couples)
  69. for lineset, idx in couples:
  70. print "==%s:%s" % (lineset.name, idx)
  71. # pylint: disable=W0631
  72. for line in lineset._real_lines[idx:idx+num]:
  73. print " ", line,
  74. nb_lignes_dupliquees += num * (len(couples)-1)
  75. nb_total_lignes = sum([len(lineset) for lineset in self.linesets])
  76. print "TOTAL lines=%s duplicates=%s percent=%.2f" \
  77. % (nb_total_lignes, nb_lignes_dupliquees,
  78. nb_lignes_dupliquees*100. / nb_total_lignes)
  79. def _find_common(self, lineset1, lineset2):
  80. """find similarities in the two given linesets"""
  81. lines1 = lineset1.enumerate_stripped
  82. lines2 = lineset2.enumerate_stripped
  83. find = lineset2.find
  84. index1 = 0
  85. min_lines = self.min_lines
  86. while index1 < len(lineset1):
  87. skip = 1
  88. num = 0
  89. for index2 in find( lineset1[index1] ):
  90. non_blank = 0
  91. for num, ((_, line1), (_, line2)) in enumerate(
  92. izip(lines1(index1), lines2(index2))):
  93. if line1 != line2:
  94. if non_blank > min_lines:
  95. yield num, lineset1, index1, lineset2, index2
  96. skip = max(skip, num)
  97. break
  98. if line1:
  99. non_blank += 1
  100. else:
  101. # we may have reach the end
  102. num += 1
  103. if non_blank > min_lines:
  104. yield num, lineset1, index1, lineset2, index2
  105. skip = max(skip, num)
  106. index1 += skip
  107. def _iter_sims(self):
  108. """iterate on similarities among all files, by making a cartesian
  109. product
  110. """
  111. for idx, lineset in enumerate(self.linesets[:-1]):
  112. for lineset2 in self.linesets[idx+1:]:
  113. for sim in self._find_common(lineset, lineset2):
  114. yield sim
  115. def stripped_lines(lines, ignore_comments, ignore_docstrings):
  116. strippedlines = []
  117. docstring = None
  118. for line in lines:
  119. line = line.strip()
  120. if ignore_docstrings:
  121. if not docstring and \
  122. (line.startswith('"""') or line.startswith("'''")):
  123. docstring = line[:3]
  124. line = line[3:]
  125. if docstring:
  126. if line.endswith(docstring):
  127. docstring = None
  128. line = ''
  129. if ignore_comments:
  130. # XXX should use regex in checkers/format to avoid cutting
  131. # at a "#" in a string
  132. line = line.split('#', 1)[0].strip()
  133. strippedlines.append(line)
  134. return strippedlines
  135. class LineSet:
  136. """Holds and indexes all the lines of a single source file"""
  137. def __init__(self, name, lines, ignore_comments=False,
  138. ignore_docstrings=False):
  139. self.name = name
  140. self._real_lines = lines
  141. self._stripped_lines = stripped_lines(lines, ignore_comments,
  142. ignore_docstrings)
  143. self._index = self._mk_index()
  144. def __str__(self):
  145. return '<Lineset for %s>' % self.name
  146. def __len__(self):
  147. return len(self._real_lines)
  148. def __getitem__(self, index):
  149. return self._stripped_lines[index]
  150. def __lt__(self, other):
  151. return self.name < other.name
  152. def __hash__(self):
  153. return id(self)
  154. def enumerate_stripped(self, start_at=0):
  155. """return an iterator on stripped lines, starting from a given index
  156. if specified, else 0
  157. """
  158. idx = start_at
  159. if start_at:
  160. lines = self._stripped_lines[start_at:]
  161. else:
  162. lines = self._stripped_lines
  163. for line in lines:
  164. #if line:
  165. yield idx, line
  166. idx += 1
  167. def find(self, stripped_line):
  168. """return positions of the given stripped line in this set"""
  169. return self._index.get(stripped_line, ())
  170. def _mk_index(self):
  171. """create the index for this set"""
  172. index = {}
  173. for line_no, line in enumerate(self._stripped_lines):
  174. if line:
  175. index.setdefault(line, []).append( line_no )
  176. return index
  177. MSGS = {'R0801': ('Similar lines in %s files\n%s',
  178. 'Indicates that a set of similar lines has been detected \
  179. among multiple file. This usually means that the code should \
  180. be refactored to avoid this duplication.')}
  181. def report_similarities(sect, stats, old_stats):
  182. """make a layout with some stats about duplication"""
  183. lines = ['', 'now', 'previous', 'difference']
  184. lines += table_lines_from_stats(stats, old_stats,
  185. ('nb_duplicated_lines',
  186. 'percent_duplicated_lines'))
  187. sect.append(Table(children=lines, cols=4, rheaders=1, cheaders=1))
  188. # wrapper to get a pylint checker from the similar class
  189. class SimilarChecker(BaseChecker, Similar):
  190. """checks for similarities and duplicated code. This computation may be
  191. memory / CPU intensive, so you should disable it if you experiment some
  192. problems.
  193. """
  194. __implements__ = (IRawChecker,)
  195. # configuration section name
  196. name = 'similarities'
  197. # messages
  198. msgs = MSGS
  199. # configuration options
  200. # for available dict keys/values see the optik parser 'add_option' method
  201. options = (('min-similarity-lines',
  202. {'default' : 4, 'type' : "int", 'metavar' : '<int>',
  203. 'help' : 'Minimum lines number of a similarity.'}),
  204. ('ignore-comments',
  205. {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
  206. 'help': 'Ignore comments when computing similarities.'}
  207. ),
  208. ('ignore-docstrings',
  209. {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
  210. 'help': 'Ignore docstrings when computing similarities.'}
  211. ),
  212. )
  213. # reports
  214. reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message
  215. def __init__(self, linter=None):
  216. BaseChecker.__init__(self, linter)
  217. Similar.__init__(self, min_lines=4,
  218. ignore_comments=True, ignore_docstrings=True)
  219. self.stats = None
  220. def set_option(self, optname, value, action=None, optdict=None):
  221. """method called to set an option (registered in the options list)
  222. overridden to report options setting to Similar
  223. """
  224. BaseChecker.set_option(self, optname, value, action, optdict)
  225. if optname == 'min-similarity-lines':
  226. self.min_lines = self.config.min_similarity_lines
  227. elif optname == 'ignore-comments':
  228. self.ignore_comments = self.config.ignore_comments
  229. elif optname == 'ignore-docstrings':
  230. self.ignore_docstrings = self.config.ignore_docstrings
  231. def open(self):
  232. """init the checkers: reset linesets and statistics information"""
  233. self.linesets = []
  234. self.stats = self.linter.add_stats(nb_duplicated_lines=0,
  235. percent_duplicated_lines=0)
  236. def process_module(self, node):
  237. """process a module
  238. the module's content is accessible via the stream object
  239. stream must implement the readlines method
  240. """
  241. self.append_stream(self.linter.current_name, node.file_stream)
  242. def close(self):
  243. """compute and display similarities on closing (i.e. end of parsing)"""
  244. total = sum([len(lineset) for lineset in self.linesets])
  245. duplicated = 0
  246. stats = self.stats
  247. for num, couples in self._compute_sims():
  248. msg = []
  249. for lineset, idx in couples:
  250. msg.append("==%s:%s" % (lineset.name, idx))
  251. msg.sort()
  252. # pylint: disable=W0631
  253. for line in lineset._real_lines[idx:idx+num]:
  254. msg.append(line.rstrip())
  255. self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
  256. duplicated += num * (len(couples) - 1)
  257. stats['nb_duplicated_lines'] = duplicated
  258. stats['percent_duplicated_lines'] = total and duplicated * 100. / total
  259. def register(linter):
  260. """required method to auto register this checker """
  261. linter.register_checker(SimilarChecker(linter))
  262. def usage(status=0):
  263. """display command line usage information"""
  264. print "finds copy pasted blocks in a set of files"
  265. print
  266. print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \
  267. [-i|--ignore-comments] file1...'
  268. sys.exit(status)
  269. def run(argv=None):
  270. """standalone command line access point"""
  271. if argv is None:
  272. argv = sys.argv[1:]
  273. from getopt import getopt
  274. s_opts = 'hdi'
  275. l_opts = ('help', 'duplicates=', 'ignore-comments')
  276. min_lines = 4
  277. ignore_comments = False
  278. opts, args = getopt(argv, s_opts, l_opts)
  279. for opt, val in opts:
  280. if opt in ('-d', '--duplicates'):
  281. min_lines = int(val)
  282. elif opt in ('-h', '--help'):
  283. usage()
  284. elif opt in ('-i', '--ignore-comments'):
  285. ignore_comments = True
  286. if not args:
  287. usage(1)
  288. sim = Similar(min_lines, ignore_comments)
  289. for filename in args:
  290. sim.append_stream(filename, open(filename))
  291. sim.run()
  292. if __name__ == '__main__':
  293. run()