codeanalyze.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. import bisect
  2. import re
  3. import token
  4. import tokenize
  5. class ChangeCollector(object):
  6. def __init__(self, text):
  7. self.text = text
  8. self.changes = []
  9. def add_change(self, start, end, new_text=None):
  10. if new_text is None:
  11. new_text = self.text[start:end]
  12. self.changes.append((start, end, new_text))
  13. def get_changed(self):
  14. if not self.changes:
  15. return None
  16. def compare_changes(change1, change2):
  17. return cmp(change1[:2], change2[:2])
  18. self.changes.sort(compare_changes)
  19. pieces = []
  20. last_changed = 0
  21. for change in self.changes:
  22. start, end, text = change
  23. pieces.append(self.text[last_changed:start] + text)
  24. last_changed = end
  25. if last_changed < len(self.text):
  26. pieces.append(self.text[last_changed:])
  27. result = ''.join(pieces)
  28. if result != self.text:
  29. return result
  30. class SourceLinesAdapter(object):
  31. """Adapts source to Lines interface
  32. Note: The creation of this class is expensive.
  33. """
  34. def __init__(self, source_code):
  35. self.code = source_code
  36. self.starts = None
  37. self._initialize_line_starts()
  38. def _initialize_line_starts(self):
  39. self.starts = []
  40. self.starts.append(0)
  41. try:
  42. i = 0
  43. while True:
  44. i = self.code.index('\n', i) + 1
  45. self.starts.append(i)
  46. except ValueError:
  47. pass
  48. self.starts.append(len(self.code) + 1)
  49. def get_line(self, lineno):
  50. return self.code[self.starts[lineno - 1]:
  51. self.starts[lineno] - 1]
  52. def length(self):
  53. return len(self.starts) - 1
  54. def get_line_number(self, offset):
  55. return bisect.bisect(self.starts, offset)
  56. def get_line_start(self, lineno):
  57. return self.starts[lineno - 1]
  58. def get_line_end(self, lineno):
  59. return self.starts[lineno] - 1
  60. class ArrayLinesAdapter(object):
  61. def __init__(self, lines):
  62. self.lines = lines
  63. def get_line(self, line_number):
  64. return self.lines[line_number - 1]
  65. def length(self):
  66. return len(self.lines)
  67. class LinesToReadline(object):
  68. def __init__(self, lines, start):
  69. self.lines = lines
  70. self.current = start
  71. def readline(self):
  72. if self.current <= self.lines.length():
  73. self.current += 1
  74. return self.lines.get_line(self.current - 1) + '\n'
  75. return ''
  76. def __call__(self):
  77. return self.readline()
  78. class _CustomGenerator(object):
  79. def __init__(self, lines):
  80. self.lines = lines
  81. self.in_string = ''
  82. self.open_count = 0
  83. self.continuation = False
  84. def __call__(self):
  85. size = self.lines.length()
  86. result = []
  87. i = 1
  88. while i <= size:
  89. while i <= size and not self.lines.get_line(i).strip():
  90. i += 1
  91. if i <= size:
  92. start = i
  93. while True:
  94. line = self.lines.get_line(i)
  95. self._analyze_line(line)
  96. if not (self.continuation or self.open_count or
  97. self.in_string) or i == size:
  98. break
  99. i += 1
  100. result.append((start, i))
  101. i += 1
  102. return result
  103. _main_chars = re.compile(r'[\'|"|#|\\|\[|\]|\{|\}|\(|\)]')
  104. def _analyze_line(self, line):
  105. char = None
  106. for match in self._main_chars.finditer(line):
  107. char = match.group()
  108. i = match.start()
  109. if char in '\'"':
  110. if not self.in_string:
  111. self.in_string = char
  112. if char * 3 == line[i:i + 3]:
  113. self.in_string = char * 3
  114. elif self.in_string == line[i:i + len(self.in_string)] and \
  115. not (i > 0 and line[i - 1] == '\\' and
  116. not (i > 1 and line[i - 2] == '\\')):
  117. self.in_string = ''
  118. if self.in_string:
  119. continue
  120. if char == '#':
  121. break
  122. if char in '([{':
  123. self.open_count += 1
  124. elif char in ')]}':
  125. self.open_count -= 1
  126. if line and char != '#' and line.endswith('\\'):
  127. self.continuation = True
  128. else:
  129. self.continuation = False
  130. def custom_generator(lines):
  131. return _CustomGenerator(lines)()
  132. class LogicalLineFinder(object):
  133. def __init__(self, lines):
  134. self.lines = lines
  135. def logical_line_in(self, line_number):
  136. indents = count_line_indents(self.lines.get_line(line_number))
  137. tries = 0
  138. while True:
  139. block_start = get_block_start(self.lines, line_number, indents)
  140. try:
  141. return self._block_logical_line(block_start, line_number)
  142. except IndentationError, e:
  143. tries += 1
  144. if tries == 5:
  145. raise e
  146. lineno = e.lineno + block_start - 1
  147. indents = count_line_indents(self.lines.get_line(lineno))
  148. def generate_starts(self, start_line=1, end_line=None):
  149. for start, end in self.generate_regions(start_line, end_line):
  150. yield start
  151. def generate_regions(self, start_line=1, end_line=None):
  152. # XXX: `block_start` should be at a better position!
  153. block_start = 1
  154. readline = LinesToReadline(self.lines, block_start)
  155. shifted = start_line - block_start + 1
  156. try:
  157. for start, end in self._logical_lines(readline):
  158. real_start = start + block_start - 1
  159. real_start = self._first_non_blank(real_start)
  160. if end_line is not None and real_start >= end_line:
  161. break
  162. real_end = end + block_start - 1
  163. if real_start >= start_line:
  164. yield (real_start, real_end)
  165. except tokenize.TokenError, e:
  166. pass
  167. def _block_logical_line(self, block_start, line_number):
  168. readline = LinesToReadline(self.lines, block_start)
  169. shifted = line_number - block_start + 1
  170. region = self._calculate_logical(readline, shifted)
  171. start = self._first_non_blank(region[0] + block_start - 1)
  172. if region[1] is None:
  173. end = self.lines.length()
  174. else:
  175. end = region[1] + block_start - 1
  176. return start, end
  177. def _calculate_logical(self, readline, line_number):
  178. last_end = 1
  179. try:
  180. for start, end in self._logical_lines(readline):
  181. if line_number <= end:
  182. return (start, end)
  183. last_end = end + 1
  184. except tokenize.TokenError, e:
  185. current = e.args[1][0]
  186. return (last_end, max(last_end, current - 1))
  187. return (last_end, None)
  188. def _logical_lines(self, readline):
  189. last_end = 1
  190. for current_token in tokenize.generate_tokens(readline):
  191. current = current_token[2][0]
  192. if current_token[0] == token.NEWLINE:
  193. yield (last_end, current)
  194. last_end = current + 1
  195. def _first_non_blank(self, line_number):
  196. current = line_number
  197. while current < self.lines.length():
  198. line = self.lines.get_line(current).strip()
  199. if line and not line.startswith('#'):
  200. return current
  201. current += 1
  202. return current
  203. def tokenizer_generator(lines):
  204. return LogicalLineFinder(lines).generate_regions()
  205. class CachingLogicalLineFinder(object):
  206. def __init__(self, lines, generate=custom_generator):
  207. self.lines = lines
  208. self._generate = generate
  209. _starts = None
  210. @property
  211. def starts(self):
  212. if self._starts is None:
  213. self._init_logicals()
  214. return self._starts
  215. _ends = None
  216. @property
  217. def ends(self):
  218. if self._ends is None:
  219. self._init_logicals()
  220. return self._ends
  221. def _init_logicals(self):
  222. """Should initialize _starts and _ends attributes"""
  223. size = self.lines.length() + 1
  224. self._starts = [None] * size
  225. self._ends = [None] * size
  226. for start, end in self._generate(self.lines):
  227. self._starts[start] = True
  228. self._ends[end] = True
  229. def logical_line_in(self, line_number):
  230. start = line_number
  231. while start > 0 and not self.starts[start]:
  232. start -= 1
  233. if start == 0:
  234. try:
  235. start = self.starts.index(True, line_number)
  236. except ValueError:
  237. return (line_number, line_number)
  238. return (start, self.ends.index(True, start))
  239. def generate_starts(self, start_line=1, end_line=None):
  240. if end_line is None:
  241. end_line = self.lines.length()
  242. for index in range(start_line, end_line):
  243. if self.starts[index]:
  244. yield index
  245. def get_block_start(lines, lineno, maximum_indents=80):
  246. """Approximate block start"""
  247. pattern = get_block_start_patterns()
  248. for i in range(lineno, 0, -1):
  249. match = pattern.search(lines.get_line(i))
  250. if match is not None and \
  251. count_line_indents(lines.get_line(i)) <= maximum_indents:
  252. striped = match.string.lstrip()
  253. # Maybe we're in a list comprehension or generator expression
  254. if i > 1 and striped.startswith('if') or striped.startswith('for'):
  255. bracs = 0
  256. for j in range(i, min(i + 5, lines.length() + 1)):
  257. for c in lines.get_line(j):
  258. if c == '#':
  259. break
  260. if c in '[(':
  261. bracs += 1
  262. if c in ')]':
  263. bracs -= 1
  264. if bracs < 0:
  265. break
  266. if bracs < 0:
  267. break
  268. if bracs < 0:
  269. continue
  270. return i
  271. return 1
  272. _block_start_pattern = None
  273. def get_block_start_patterns():
  274. global _block_start_pattern
  275. if not _block_start_pattern:
  276. pattern = '^\\s*(((def|class|if|elif|except|for|while|with)\\s)|'\
  277. '((try|else|finally|except)\\s*:))'
  278. _block_start_pattern = re.compile(pattern, re.M)
  279. return _block_start_pattern
  280. def count_line_indents(line):
  281. indents = 0
  282. for char in line:
  283. if char == ' ':
  284. indents += 1
  285. elif char == '\t':
  286. indents += 8
  287. else:
  288. return indents
  289. return 0
  290. def get_string_pattern():
  291. start = r'(\b[uU]?[rR]?)?'
  292. longstr = r'%s"""(\\.|"(?!"")|\\\n|[^"\\])*"""' % start
  293. shortstr = r'%s"(\\.|[^"\\\n])*"' % start
  294. return '|'.join([longstr, longstr.replace('"', "'"),
  295. shortstr, shortstr.replace('"', "'")])
  296. def get_comment_pattern():
  297. return r'#[^\n]*'