sjjaffe
/
abnf-parser


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
							#include <cassert>
#include <cctype>
#include <charconv>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <variant>

#include <abnf/forward.h>
#include <abnf/grammar.h>

namespace abnf {
// Convenience overload for temporary streams: binds the rvalue stream to a
// named reference and hands it to the lvalue-reference overload of parse().
grammar parse(std::istream && in) {
  std::istream & stream = in;
  return parse(stream);
}
// Convenience overload for temporary streams: binds the rvalue stream to a
// named reference and hands it to the lvalue-reference overload of
// parse_group().
grammar_group parse_group(std::istream && in) {
  std::istream & stream = in;
  return parse_group(stream);
}

// Adds `part` to `rule`, either as the next element of its sequence or as a
// further alternative to the element added last.
//
//   - is_one_of == false: the part is appended to rule.rules.
//   - is_one_of == true, rule.rules empty: one_of(part) is appended.
//   - is_one_of == true, last element holds a one_of: the part is appended to
//     that one_of's rules.
//   - is_one_of == true otherwise: the last element is replaced by a one_of
//     built from the previous last element and the part.
static void append(rule & rule, rule_part const & part, bool is_one_of) {
  auto & parts = rule.rules;

  if (not is_one_of) {
    parts.push_back(part);
    return;
  }

  if (parts.empty()) {
    parts.push_back(rule_part(one_of(part)));
    return;
  }

  if (auto * alternatives = std::get_if<one_of>(&parts.back())) {
    alternatives->rules.push_back(part);
    return;
  }

  parts.back() = one_of{parts.back(), part};
}

// Strips the repetition prefix from the front of `token` and returns it as a
// `repeated`; `token` is left holding whatever followed the prefix.
//
// Recognised prefixes:
//   "["          -> min 0, max 1
//   "("          -> min 1, max 1
//   "<n>"        -> min n, max n
//   "<n>*<m>"    -> min n, max m; either number may be omitted, in which case
//                   the corresponding field keeps the value a
//                   default-constructed `repeated` gives it
// For the numeric forms, a '(' directly after the prefix is consumed as well.
//
// std::stoull reports a malformed number by throwing (std::invalid_argument /
// std::out_of_range).
static repeated parse_repeated(std::string & token) {
  switch (token[0]) {
  case '[':
    token.erase(0, 1);
    return {.min = 0, .max = 1};
  case '(':
    // TODO: Can I just inline this when is_one_of is false?
    token.erase(0, 1);
    return {.min = 1, .max = 1};
  default:
    break;
  }

  repeated result;
  size_t consumed = 0;

  bool const has_explicit_min = not token.starts_with('*');
  if (has_explicit_min) {
    result.min = std::stoull(token, &consumed);
    token.erase(0, consumed);
  }

  if (token.starts_with('*')) {
    token.erase(0, 1);
    // The emptiness check matters: strchr() would otherwise match the
    // string's terminating NUL.
    bool const has_explicit_max =
        not token.empty() && std::strchr("123456789", token[0]) != nullptr;
    if (has_explicit_max) {
      result.max = std::stoull(token, &consumed);
      token.erase(0, consumed);
    }
  } else {
    // No '*': an exact repeat count.
    result.max = result.min;
  }

  if (token.starts_with('(')) { token.erase(0, 1); }
  return result;
}

// Maps the radix letter of an ABNF numeric terminal ("%b...", "%d...",
// "%x...") to the numeric base used to parse its digits.
//
// The letter is matched case-insensitively, so "%X41" is accepted just like
// "%x41".
//
// Parameters:
//   c - the character following '%' in the terminal.
//
// Returns: 2 for 'b', 10 for 'd', 16 for 'x'.
//
// Throws std::invalid_argument for any other character; the message quotes
// the character exactly as it was passed in.
static int base(char c) {
  // The cast keeps std::tolower well-defined for chars with a negative value.
  switch (std::tolower(static_cast<unsigned char>(c))) {
  case 'b':
    return 2;
  case 'd':
    return 10;
  case 'x':
    return 16;
  default:
    break;
  }
  throw std::invalid_argument(
      std::string("Unknown Base for Character Range: '") + c + "'");
}

// Parses an ABNF numeric terminal such as "%x41" or "%d48-57" into a
// char_range.
//
// token[1] selects the radix (see base()). The digits that follow give the
// first value; when a '-' comes directly after them, the digits after the '-'
// give the last value, otherwise the range covers the single first value.
// Characters after the parsed value(s) are ignored.
//
// Throws std::invalid_argument when the token is too short to carry a radix
// letter, when base() rejects the radix letter, or when std::from_chars
// reports an error for either value.
char_range parse_char_range(std::string_view token) {
  std::string_view const original = token;
  auto const malformed = [original] {
    return std::invalid_argument("Malformed Character Range: '" +
                                 std::string(original) + "'");
  };

  // Need at least the leading marker and the radix letter before indexing.
  if (token.size() < 2) { throw malformed(); }

  char_range rval;
  int const radix = abnf::base(token[1]);
  token.remove_prefix(2);

  // Build the end pointer from data()/size(): string_view iterators are not
  // guaranteed to be raw pointers.
  char const * const first = token.data();
  char const * const last = first + token.size();

  auto const [end, ec] = std::from_chars(first, last, rval.first.value_, radix);
  if (ec != std::errc{}) { throw malformed(); }

  // Only look at *end when it still lies inside the token.
  if (end != last && *end == '-') {
    auto const upper = std::from_chars(end + 1, last, rval.last.value_, radix);
    if (upper.ec != std::errc{}) { throw malformed(); }
  } else {
    rval.last = rval.first;
  }

  return rval;
}

// Reads whitespace-separated tokens from `in` and appends the elements they
// describe to `rule` until the construct being parsed ends.
//
// Parameters:
//   in        - token source; nested calls may be given a temporary
//               stringstream holding the remainder of a single token.
//   name      - name of the rule being parsed; only forwarded to nested calls.
//   rule      - receives the parsed elements through append().
//   is_one_of - whether the first element read is an alternative to what
//               `rule` already ends with (see append()).
//
// Returns:
//   - the token starting with ']' or ')' that closed a sub-expression, or
//   - the token taken to be the name of the next rule: a '<'-prefixed or
//     alphabetic token that is the first token after a line break, or
//   - "" once the stream is exhausted.
//
// Token handling:
//   - digits, '[', '(' or '*' start a repetition/group (parse_repeated());
//   - '%' starts a numeric terminal (parse_char_range());
//   - '"' starts a literal;
//   - "/" marks the next element as an alternative;
//   - ";" starts a comment that runs to the end of the line.
static std::string parse_rule(std::istream & in, std::string const & name,
                              rule & rule, bool is_one_of = false) {
  bool expecting_return = false;
  std::string token;

  while (in >> token) {
    if (std::strchr("])", token[0])) {
      return token; // End Sub-Expression
    }

    bool discarded_comment = false;

    static constexpr char const s_repeated_chars[] = "0123456789[(*";
    if (std::strchr(s_repeated_chars, token[0])) {
      repeated tmp = parse_repeated(token);
      if (not token.empty()) {
        // The repeated element is glued to its prefix (e.g. "1*DIGIT"):
        // parse what is left of the token on its own.
        std::stringstream ss(token);
        parse_rule(ss, name, tmp.rule);
      } else {
        // The prefix stood alone (e.g. "*(" or "["): the body follows in the
        // main stream up to the closing token.
        parse_rule(in, name, tmp.rule);
      }
      append(rule, tmp, is_one_of);
    } else if (token.starts_with("%")) {
      append(rule, parse_char_range(token), is_one_of);
    } else if (token.starts_with('"')) {
      append(rule, literal{token}, is_one_of);
    } else if (token == "/") {
      // Nothing to append; is_one_of is updated below.
    } else if (token == ";") {
      std::getline(in, token); // Discard the comment
      discarded_comment = true;
    } else if (token.starts_with('<') ||
               std::isalpha(static_cast<unsigned char>(token[0]))) {
      // A name opening a line starts the next rule - unless a '/' is still
      // waiting for its alternative, in which case the line is a
      // continuation of the current rule.
      if (expecting_return && not is_one_of) { return token; }
      append(rule, reference{token}, is_one_of);
    }

    if (discarded_comment) {
      // getline() already consumed the line break, so peek() can no longer
      // see it: the next token is the first one on its line. A comment does
      // not cancel a pending '/', so is_one_of is left untouched.
      expecting_return = true;
      continue;
    }

    is_one_of = (token == "/");
    // Compare the peeked value directly: strchr() would also match a NUL.
    int const next = in.peek();
    expecting_return = (next == '\r' || next == '\n');
  }

  return "";
}

static auto parse_impl(std::istream & in) {
  std::string first_rule;
  grammar_base::rule_store store;

  std::string name;
  rule rule;

  auto push_rule = [&]() {
    if (name.empty() || not std::isalpha(name[0])) { return; }
    if (first_rule.empty()) { first_rule = name; }

    store.insert_or_assign(std::move(name), std::move(rule));
  };

  bool is_one_of = false;
  std::string token;
  in >> name;
  in >> token; // =
  while (not(token = parse_rule(in, name, rule, is_one_of)).empty()) {
    if (token != name) { push_rule(); }
    name = token;
    in >> token; // = OR /=
    if ((is_one_of = (token == "/=" || token == "=/"))) { rule = store[name]; }
  }
  push_rule();

  return std::make_pair(first_rule, std::move(store));
}

// Parses a complete grammar from `in`.
//
// The first rule stored by parse_impl() becomes the grammar's start rule: it
// is removed from the rule store and passed to the grammar constructor
// together with the remaining rules.
//
// Throws std::invalid_argument when the input yields no rule at all (there is
// then no start rule to extract).
grammar parse(std::istream & in) {
  auto [name, rules] = parse_impl(in);

  // Extract in its own statement: inside the constructor call the extraction
  // would not be sequenced relative to the evaluation of the `rules`
  // argument.
  auto start = rules.extract(name);
  if (start.empty()) {
    throw std::invalid_argument("No Rules Found in Grammar");
  }

  return grammar(name, start.mapped(), rules);
}

// Parses every rule in `in` and wraps the resulting rule store in a
// grammar_group; the name of the first rule reported by parse_impl() is not
// used.
grammar_group parse_group(std::istream & in) {
  auto parsed = parse_impl(in);
  return grammar_group(std::move(parsed.second));
}
}