textutils.py

# copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-common.
#
# logilab-common is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option) any
# later version.
#
# logilab-common is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-common. If not, see <http://www.gnu.org/licenses/>.
  18. """Some text manipulation utility functions.
  19. :group text formatting: normalize_text, normalize_paragraph, pretty_match,\
  20. unquote, colorize_ansi
  21. :group text manipulation: searchall, splitstrip
  22. :sort: text formatting, text manipulation
  23. :type ANSI_STYLES: dict(str)
  24. :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code
  25. :type ANSI_COLORS: dict(str)
  26. :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code
  27. :type ANSI_PREFIX: str
  28. :var ANSI_PREFIX:
  29. ANSI terminal code notifying the start of an ANSI escape sequence
  30. :type ANSI_END: str
  31. :var ANSI_END:
  32. ANSI terminal code notifying the end of an ANSI escape sequence
  33. :type ANSI_RESET: str
  34. :var ANSI_RESET:
  35. ANSI terminal code resetting format defined by a previous ANSI escape sequence
  36. """
__docformat__ = "restructuredtext en"

import sys
import re
import os.path as osp
from warnings import warn
from unicodedata import normalize as _uninormalize
try:
    from os import linesep
except ImportError:
    linesep = '\n' # gae

from logilab.common.deprecation import deprecated
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
    u'\u2044': u'/',  # FRACTION SLASH
    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
    u'\xa9': u'(c)',  # COPYRIGHT SIGN
    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
    u'\xae': u'(r)',  # REGISTERED SIGN
    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
}
def unormalize(ustring, ignorenonascii=None, substitute=None):
    """replace diacritical characters with their corresponding ascii characters

    Convert the unicode string to its long normalized form (a unicode character
    will be transformed into several characters) and keep the first one only.
    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
    replace all compatibility characters with their equivalents.

    :type substitute: str
    :param substitute: replacement character to use if decomposition fails

    :see: Another project about ASCII transliterations of Unicode text
          http://pypi.python.org/pypi/Unidecode
    """
    # backward compatibility, ignorenonascii was a boolean
    if ignorenonascii is not None:
        warn("ignorenonascii is deprecated, use substitute named parameter instead",
             DeprecationWarning, stacklevel=2)
        if ignorenonascii:
            substitute = ''
    res = []
    for letter in ustring[:]:
        try:
            replacement = MANUAL_UNICODE_MAP[letter]
        except KeyError:
            replacement = _uninormalize('NFKD', letter)[0]
            if ord(replacement) >= 2 ** 7:
                if substitute is None:
                    raise ValueError("can't deal with non-ascii based characters")
                replacement = substitute
        res.append(replacement)
    return u''.join(res)
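
# Illustrative usage (a sketch, not part of the original module): the manual
# map handles ligatures and symbols, NFKD decomposition handles plain
# diacritics, and `substitute` covers characters that cannot be decomposed.
#   >>> unormalize(u'\xe9l\xe9phant \u0153uf')    # u'éléphant œuf'
#   u'elephant oeuf'
#   >>> unormalize(u'\u2014', substitute='-')     # EM DASH does not decompose
#   u'-'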
def unquote(string):
    """remove optional quotes (single or double) from the string

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    if string[-1] in '"\'':
        string = string[:-1]
    return string
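
# Illustrative usage (sketch): one quote character is stripped from each end
# independently, so mismatched quotes are removed as well.
#   >>> unquote('"hello"')
#   'hello'
#   >>> unquote("'mismatched\"")
#   'mismatched'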
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized but blank
    lines are kept. The indentation string may be used to insert a
    comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line length, defaults to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest: if true, use ReST-aware wrapping (see `normalize_rest_paragraph`)

    :rtype: str or unicode
    :return:
        the input text normalized to fit on lines with a maximum size
        inferior to `line_len`, and optionally prefixed by an
        indentation string
    """
    if rest:
        normp = normalize_rest_paragraph
    else:
        normp = normalize_paragraph
    result = []
    for text in _BLANKLINES_RGX.split(text):
        result.append(normp(text, line_len, indent))
    return ('%s%s%s' % (linesep, indent, linesep)).join(result)
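
# Illustrative usage (sketch, assuming a POSIX '\n' linesep): wrap a sentence
# at 20 columns and prefix every output line with a comment mark.
#   >>> print(normalize_text('a rather long sentence to wrap', line_len=20, indent='# '))
#   # a rather long
#   # sentence to wrap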
def normalize_paragraph(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line length, defaults to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
        the input text normalized to fit on lines with a maximum size
        inferior to `line_len`, and optionally prefixed by an
        indentation string
    """
    text = _NORM_SPACES_RGX.sub(' ', text)
    line_len = line_len - len(indent)
    lines = []
    while text:
        aline, text = splittext(text.strip(), line_len)
        lines.append(indent + aline)
    return linesep.join(lines)
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line length, defaults to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
        the input text normalized to fit on lines with a maximum size
        inferior to `line_len`, and optionally prefixed by an
        indentation string
    """
    toreport = ''
    lines = []
    line_len = line_len - len(indent)
    for line in text.splitlines():
        line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
        toreport = ''
        while len(line) > line_len:
            # too long line, need split
            line, toreport = splittext(line, line_len)
            lines.append(indent + line)
            if toreport:
                line = toreport + ' '
                toreport = ''
            else:
                line = ''
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    return a 2-tuple:
    * a line <= line_len if possible
    * the rest of the text which has to be reported on another line
    """
    if len(text) <= line_len:
        return text, ''
    pos = min(len(text) - 1, line_len)
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos+1:].strip()
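
# Illustrative usage (sketch): the split prefers the last space before
# `line_len`, and falls back to the next space when the first word is longer
# than the limit.
#   >>> splittext('hello world foo', 10)
#   ('hello', 'world foo')
#   >>> splittext('antidisestablishmentarianism now', 10)
#   ('antidisestablishmentarianism', 'now')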
def splitstrip(string, sep=','):
    """return a list of stripped strings by splitting the string given as
    argument on `sep` (',' by default). Empty strings are discarded.

    >>> splitstrip('a, b, c , 4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, defaults to the comma (',')

    :rtype: list of str or unicode
    :return: the list of stripped, non-empty fields
    """
    return [word.strip() for word in string.split(sep) if word.strip()]

get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
def split_url_or_path(url_or_path):
    """split a string containing either a URL of the form <scheme>://<path>
    or a local file system path, returning the leading part and the last
    component
    """
    if '://' in url_or_path:
        return url_or_path.rstrip('/').rsplit('/', 1)
    return osp.split(url_or_path.rstrip(osp.sep))
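
# Illustrative usage (sketch): the URL branch returns the list produced by
# rsplit, the path branch returns the tuple produced by os.path.split.
#   >>> split_url_or_path('http://example.com/a/b/')
#   ['http://example.com/a', 'b']
#   >>> split_url_or_path('/tmp/foo/bar')     # on a POSIX file system
#   ('/tmp/foo', 'bar')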
def text_to_dict(text):
    """parse multi-line text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple times,
    the value is turned into a list containing all values.

    >>> d = text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    >>> d['single']
    '3'
    >>> d['multiple']
    ['1', '2']
    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            key, value = [w.strip() for w in line.split('=', 1)]
            if key in res:
                try:
                    res[key].append(value)
                except AttributeError:
                    res[key] = [res[key], value]
            else:
                res[key] = value
    return res
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?' % (__VALUE_URE, __UNITS_URE))
_VALIDATION_RE = re.compile(r'^((%s)(%s))*(%s)?$' % (__VALUE_URE, __UNITS_URE,
                                                     __VALUE_URE))

BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

TIME_UNITS = {
    "ms": 0.001,  # a millisecond is one thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90.0).

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter: used to parse every intermediate value (must support
        multiplication and addition)

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit in the string
    """
    if inter is None:
        inter = final
    fstring = blank_reg.sub('', string)
    if not (fstring and _VALIDATION_RE.match(fstring)):
        raise ValueError("Invalid unit string: %r." % string)
    values = []
    for match in value_reg.finditer(fstring):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))
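
# Illustrative usage (sketch) with the unit tables defined above; each
# value/unit pair is converted and the results are summed.
#   >>> apply_units('1.5h', TIME_UNITS)
#   5400.0
#   >>> apply_units('2 kb', BYTE_UNITS, final=int)
#   2048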
_LINE_RGX = re.compile('\r\n|\r+|\n')

def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
        the string on which the regular expression has been applied to
        obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
        character to use to underline the matched section, defaults to the
        caret '^'

    :rtype: str or unicode
    :return:
        the original string with an inserted line to underline the match
        location
    """
    start = match.start()
    end = match.end()
    string = _LINE_RGX.sub(linesep, string)
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        start_line_pos = 0
        result = []
    else:
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        string = string[start_line_pos:]
        result.append(string)
        result.append(underline)
    else:
        end = string[end_line_pos + len(linesep):]
        string = string[start_line_pos:end_line_pos]
        result.append(string)
        result.append(underline)
        result.append(end)
    return linesep.join(result).rstrip()
# Ansi colorization ###########################################################

ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
def _get_ansi_code(color=None, style=None):
    """return ansi escape code corresponding to color and style

    :type color: str or None
    :param color:
        the color name (see `ANSI_COLORS` for available values)
        or the color number when 256 colors are available

    :type style: str or None
    :param style:
        style string (see `ANSI_STYLES` for available values). To get
        several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str
    :return: the built escape code
    """
    ansi_code = []
    if style:
        style_attrs = splitstrip(style)
        for effect in style_attrs:
            ansi_code.append(ANSI_STYLES[effect])
    if color:
        if color.isdigit():
            ansi_code.extend(['38', '5'])
            ansi_code.append(color)
        else:
            ansi_code.append(ANSI_COLORS[color])
    if ansi_code:
        return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
    return ''
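
# Illustrative values (sketch): the helper joins style and color codes into a
# single SGR escape sequence.
#   >>> _get_ansi_code('red', 'bold')
#   '\x1b[1;31m'
#   >>> _get_ansi_code('196')           # 256-color palette index
#   '\x1b[38;5;196m'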
def colorize_ansi(msg, color=None, style=None):
    """colorize message by wrapping it with ansi escape codes

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
        the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
        style string (see `ANSI_STYLES` for available values). To get
        several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str or unicode
    :return: the ansi escaped string
    """
    # If both color and style are not defined, then leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # If invalid (or unknown) color, don't wrap msg with ansi codes
    if escape_code:
        return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg
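
# Illustrative usage (sketch): the message is wrapped between the escape code
# and ANSI_RESET.
#   >>> colorize_ansi('error', color='red', style='bold, underline')
#   '\x1b[1;4;31merror\x1b[0m'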
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}

def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """write the given diff `lines` to `out`, colorizing file separators,
    removed lines and added lines according to `style`"""
    for line in lines:
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        elif line.startswith('-'):
            out.write(colorize_ansi(line, style['remove']))
        elif line.startswith('+'):
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)
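
# Illustrative usage (sketch): colorize a unified diff, for instance one
# produced by difflib.
#   >>> import difflib
#   >>> diff = difflib.unified_diff(['spam\n', 'eggs\n'], ['spam\n', 'ham\n'],
#   ...                             fromfile='a.txt', tofile='b.txt')
#   >>> diff_colorize_ansi(diff)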