Browse Source

refactor: fix vocabulary usage to be propagated instead of locked to version tags

Sam Jaffe 3 tháng trước cách đây
mục cha
commit
435b083a88

+ 11 - 13
include/jvalidate/detail/reference_manager.h

@@ -24,9 +24,6 @@
 
 namespace jvalidate::detail {
 template <Adapter A> class ReferenceManager {
-public:
-  using Keywords = std::unordered_map<std::string_view, std::set<schema::Wraps>>;
-
 private:
   static inline std::map<std::string_view, schema::Version> const g_schema_ids{
       {"json-schema.org/draft-03/schema", schema::Version::Draft03},
@@ -97,7 +94,7 @@ public:
     return active_dynamic_anchors_.scope(uri, dynamic_anchors_[uri]);
   }
 
-  std::optional<A> load(Reference const & ref, schema::Version version) {
+  std::optional<A> load(Reference const & ref, Vocabulary<A> const * vocab) {
     if (auto it = roots_.find(ref.root()); it != roots_.end()) {
       return ref.pointer().walk(it->second);
     }
@@ -107,9 +104,8 @@ public:
       return std::nullopt;
     }
 
-    // TODO(samjaffe): Change Versions if needed...
     references_.emplace(ref.uri());
-    prime(*external, ref, &vocab(version));
+    prime(*external, ref, vocab);
 
     // May have a sub-id that we map to
     if (auto it = roots_.find(ref.root()); it != roots_.end()) {
@@ -145,7 +141,9 @@ public:
       return base.parent() / uri;
     }();
 
-    URI const dyn_uri = ref.uri().empty() ? ref.uri() : uri;
+    // This seems unintuitive, but we generally want to avoid providing a URI
+    // when looking up dynamic references, unless they are explicitly asked for.
+    URI const dyn_uri = ref.uri().empty() ? URI() : uri;
     if (std::optional dynref = dynamic(dyn_uri, ref, dynamic_reference)) {
       return *dynref;
     }
@@ -189,23 +187,23 @@ private:
       return;
     }
 
-    canonicalize(where, vocab->version(), json);
-
     auto schema = json.as_object();
     if (schema.contains("$schema")) {
       vocab = &this->vocab(URI(schema["$schema"].as_string()));
     }
 
+    // Load ids, anchors, etc.
+    prime_roots(where, vocab->version(), json);
+
+    // Recurse through the document
     for (auto const & [key, value] : schema) {
       if (not vocab->is_keyword(key)) {
         continue;
       }
       switch (value.type()) {
       case adapter::Type::Array: {
-        size_t index = 0;
-        for (auto const & elem : value.as_array()) {
+        for (auto const & [index, elem] : detail::enumerate(value.as_array())) {
           prime(elem, where / key / index, vocab);
-          ++index;
         }
         break;
       }
@@ -223,7 +221,7 @@ private:
     }
   }
 
-  void canonicalize(Reference & where, schema::Version version, A const & json) {
+  void prime_roots(Reference & where, schema::Version version, A const & json) {
     std::string const id = version <= schema::Version::Draft04 ? "id" : "$id";
     auto const schema = json.as_object();
 

+ 81 - 29
include/jvalidate/detail/vocabulary.h

@@ -22,31 +22,44 @@ private:
   std::unordered_set<std::string> vocabularies_;
 
   // TODO(samjaffe): Migrate this back to constraintsfactory
-  std::unordered_set<std::string_view> keywords_{"$defs",
-                                                 "additionalItems",
-                                                 "additionalProperties",
-                                                 "allOf",
-                                                 "anyOf",
-                                                 "definitions",
-                                                 "dependencies",
-                                                 "dependentSchemas",
-                                                 "else",
-                                                 "extends",
-                                                 "if",
-                                                 "items",
-                                                 "not",
-                                                 "oneOf",
-                                                 "patternProperties",
-                                                 "prefixItems",
-                                                 "properties",
-                                                 "then",
-                                                 "unevaluatedItems",
-                                                 "unevaluatedProperties"};
-  std::unordered_set<std::string_view> property_keywords_{
+  // A list of keywords that participate in scans for "$id" and "$anchor" tokens
+  // etc. We need to track this because it is possible (though an anti-pattern),
+  // to embed an $id token in a "const", or in a bogus keyword.
+  inline static const std::unordered_set<std::string_view> s_keywords{
+      // Special tokens - we need to scan definitions for sub-ids, the keys of
+      // these objects are arbitrary, so we need to skip past them in scanning.
+      "$defs", "definitions",
+      // Draft03 only - extends allows us to specify an arbitrary number of
+      // parent schemas that we use on top of the current schema. Equivalent to
+      // allOf.[*].$ref in Draft04+.
+      "extends",
+      // Algorithmic/Conditional Schema types - for annoying reasons, we need
+      // to be able to scan the subschemas of these for $id tokens and whatnot,
+      // despite it never being a sensible decision to embed $ids like that.
+      "allOf", "anyOf", "not", "oneOf", "if", "then", "else",
+      // Next are the four array-specific schema keywords
+      "items", "prefixItems", "additionalItems", "unevaluatedItems",
+      // And the six object-specific schema keywords. With the exception of
+      // additionalProperties and unevaluatedProperties, all of these represent
+      // objects mapping "arbitrary" keys onto schemas, so we need to signal
+      // that...
+      "dependencies", "dependentSchemas", "patternProperties", "properties", "additionalProperties",
+      "unevaluatedProperties"};
+  // ...using this s_property_keywords object, we list those six keywords that
+  // are represented as an object of arbitrary keys onto schemas that may
+  // contain $id/$anchor fields.
+  inline static const std::unordered_set<std::string_view> s_property_keywords{
       "$defs",     "definitions", "dependencies", "dependentSchemas", "patternProperties",
       "properties"};
-  std::unordered_set<std::string_view> post_constraints_{"unevaluatedItems",
-                                                         "unevaluatedProperties"};
+
+  // Special rules must be applied for post constraints, of which there are
+  // currently two. Current discussion says that "constraints" and
+  // "post-constraints" SHOULD be run as two separate phases (since posts need
+  // to know which items/properties would be processed, and perhaps other things
+  // in the future), but that there is no rule on order-of-operations within
+  // a given phase, nor is there any intention to introduce some kind of Phase 3
+  inline static const std::unordered_set<std::string_view> s_post_constraints{
+      "unevaluatedItems", "unevaluatedProperties"};
 
 public:
   Vocabulary() = default;
@@ -57,11 +70,23 @@ public:
     }
   }
 
+  /**
+   * @brief Reset the list of keywords that Vocabulary actually respects
+   *
+   * @param permitted_keywords The selection of keywords to allow for
+   * searches/constraint building. Note that a constraint might be
+   * registered to a null function for compatibility with this.
+   *
+   * @param vocabularies An optional selection of vocabulary schemas, used
+   * as metadata, and deducing {@see is_format_assertion}.
+   */
   void restrict(std::unordered_set<std::string> const & permitted_keywords,
-                std::unordered_set<std::string> const & vocabularies) & {
+                std::unordered_set<std::string> const & vocabularies = {}) & {
     permitted_.clear();
     vocabularies_ = vocabularies;
     for (auto const & [keyword, _] : make_) {
+      // We only file permitted_keywords into this Vocabulary if we have defined
+      // bindings for that keyword
       if (permitted_keywords.contains(std::string(keyword))) {
         permitted_.insert(keyword);
       }
@@ -84,7 +109,7 @@ public:
     // Therefore - starting in Draft 2019-09, the format keyword is an
     // annotation by default, instead of an assertion.
     if (version_ == schema::Version::Draft2019_09) {
-      return permitted_.contains("/vocab/format");
+      return vocabularies_.contains("/vocab/format");
     }
 
     // Draft 2020-12 makes this even more explicit - having separate vocabulary
@@ -96,27 +121,54 @@ public:
 
   /**
    * @brief Is the given "key"word actually a keyword? As in, would
-   * I expect to resolve a constraint out of it.
+   * I expect to resolve a constraint out of it. This is a slightly more
+   * lenient version of {@see is_constraint} - since it allows keywords that
+   * have a null factory, as long as they've been registered (e.g. then/else).
+   *
+   * @param word The "key"word being looked up (e.g. "if", "properties", ...)
    */
   bool is_keyword(std::string_view word) const {
-    return permitted_.contains(word) && make_.contains(word) && keywords_.contains(word);
+    return permitted_.contains(word) && make_.contains(word) && s_keywords.contains(word);
   }
 
   /**
    * @brief Does the given "key"word represent a property object - that is to
    * say, an object containing some number of schemas mapped by arbitrary keys
+   *
+   * @param word The "key"word being looked up (e.g. "if", "properties", ...)
    */
   bool is_property_keyword(std::string_view word) const {
-    return is_keyword(word) && property_keywords_.contains(word);
+    return is_keyword(word) && s_property_keywords.contains(word);
   }
 
+  /**
+   * @brief Is the given word a real constraint in the Vocabulary. In essence,
+   * it must be an enabled keyword AND it must have a non-null factory function.
+   *
+   * @param word The "key"word being looked up (e.g. "if", "properties", ...)
+   */
   bool is_constraint(std::string_view word) const {
     return permitted_.contains(word) && make_.contains(word) && make_.at(word);
   }
 
+  /**
+   * @brief Fabricate the given constraint if real from the current context
+   *
+   * @param word The "key"word being looked up (e.g. "if", "properties", ...)
+   *
+   * @param context The current context of schema parsing, used for re-entrancy.
+   *
+   * @returns A pair whose first element is either a pointer to a constraint
+   * (if word represents a supported constraint AND the constraint resolves to
+   * something meaningful), else null.
+   *
+   * The second element is a boolean indicating if the constraint needs to be
+   * evaluated after other constraints to use their tracking/annotations.
+   * See the above comments on s_post_constraints for more info.
+   */
   auto constraint(std::string_view word, ParserContext<A> const & context) const {
     return std::make_pair(is_constraint(word) ? make_.at(word)(context) : nullptr,
-                          post_constraints_.contains(word));
+                          s_post_constraints.contains(word));
   }
 };
 }

+ 1 - 1
include/jvalidate/schema.h

@@ -185,7 +185,7 @@ private:
       return *cached;
     }
 
-    if (std::optional root = context.ref.load(lexical, context.vocab->version())) {
+    if (std::optional root = context.ref.load(lexical, context.vocab)) {
       return fetch_schema(context.rebind(*root, lexical, dynamic));
     }