diff --git a/common/extract-examples.rb b/common/extract-examples.rb index 5035f657..2f9d2bfd 100755 --- a/common/extract-examples.rb +++ b/common/extract-examples.rb @@ -15,6 +15,7 @@ require 'fileutils' require 'colorize' require 'yaml' +require 'cgi' PREFIXES = { dc: "http://purl.org/dc/terms/", @@ -49,8 +50,8 @@ # Remove highlighting and commented out sections def justify(str) str = str. - sub(/^\s*\s*$/, ''). + gsub(/^\s*\s*$/, ''). gsub('****', ''). gsub(/####([^#]*)####/, '') @@ -222,7 +223,7 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:) examples[title] = { title: title, filename: fn, - content: content, + content: content.to_s.gsub(/^\s*< !\s*-\s*-/, ''), content_type: element.attr('data-content-type'), number: example_number, ext: ext, @@ -302,6 +303,7 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:) # Perform example syntactic validation based on extension case ex[:ext] when 'json', 'jsonld', 'jsonldf' + content = CGI.unescapeHTML(content) begin ::JSON.parse(content) rescue JSON::ParserError => exception @@ -325,22 +327,16 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:) ex[:base] = html_base.to_s if html_base script_content = doc.at_xpath(xpath) - if script_content - # Remove (faked) XML comments and unescape sequences - content = script_content - .inner_html - .sub(/^\s*< !\s*-\s*-/, '') - .sub(/-\s*- >\s*$/, '') - .gsub(/</, '<') - end - + + # Remove (faked) XML comments and unescape sequences + content = CGI.unescapeHTML(script_content.inner_html) if script_content rescue Nokogiri::XML::SyntaxError => exception errors << "Example #{ex[:number]} at line #{ex[:line]} parse error: #{exception.message}" $stdout.write "F".colorize(:red) next end when 'table' - # already in parsed form + content = Nokogiri::HTML.parse(content) when 'ttl', 'trig' begin reader_errors = [] @@ -443,10 +439,7 @@ def save_example(examples:, element:, title:, example_number:, error:, 
warn:) # Set argument to referenced content to be parsed args[0] = if examples[ex[:result_for]][:ext] == 'html' && method == :expand # If we are expanding, and the reference is HTML, find the first script element. - doc = Nokogiri::HTML.parse( - examples[ex[:result_for]][:content] - .sub(/^\s*< !\s*-\s*-/, '') - .sub(/-\s*- >\s*$/, '')) + doc = Nokogiri::HTML.parse(examples[ex[:result_for]][:content]) # Get base from document, if present html_base = doc.at_xpath('/html/head/base/@href') @@ -458,15 +451,10 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:) $stdout.write "F".colorize(:red) next end - StringIO.new(script_content - .inner_html - .gsub(/</, '<')) + StringIO.new(CGI.unescapeHTML(script_content.inner_html)) elsif examples[ex[:result_for]][:ext] == 'html' && ex[:target] # Only use the targeted script - doc = Nokogiri::HTML.parse( - examples[ex[:result_for]][:content] - .sub(/^\s*< !\s*-\s*-/, '') - .sub(/-\s*- >\s*$/, '')) + doc = Nokogiri::HTML.parse(examples[ex[:result_for]][:content]) script_content = doc.at_xpath(xpath) unless script_content errors << "Example #{ex[:number]} at line #{ex[:line]} references example #{ex[:result_for].inspect} with no JSON-LD script element" @@ -565,7 +553,7 @@ def save_example(examples:, element:, title:, example_number:, error:, warn:) $stderr.puts "expected:\n" + expected.to_trig if verbose when 'table' expected = begin - table_to_dataset(content) + table_to_dataset(content.xpath('/html/body/table')) rescue errors << "Example #{ex[:number]} at line #{ex[:line]} raised error reading table: #{$!}" RDF::Dataset.new diff --git a/index.html b/index.html index 8d0d8064..8375adc1 100644 --- a/index.html +++ b/index.html @@ -4671,40 +4671,15 @@

Extract Script Content Algorithm

Overview

-

As a data block - may be inside a comment, and may be escaped, the algorithm extracts the JSON from any comment, - removes REVERSE SOLIDUS escapes, - and reverses HTML Character references. +

The algorithm reverses HTML Character references.

Algorithm

The algorithm takes a single required input variable: source, the textContent of an HTML script element.

-

For the purpose of this algorithm, the following tokens are defined in [[ABNF]]:

- -
-  space-character = %20 ; SPACE
-                  / %09 ; CHARACTER TABULATION (tab)
-                  / %0A ; LINE FEED (LF)
-                  / %0C ; FORM FEED (FF)
-                  / %0D ; CARRIAGE RETURN (CR)
-  comment-open    = *space-character "<!--" *space-character
-  comment-close   = *space-character "-->" *space-character
-      
    -
  1. If source begins with comment-open and ends with comment-close, - remove those sequences from source.
  2. -
  3. If source contains comment-open or comment-close, - an invalid script element has been detected, and processing is aborted.
  4. -
  5. For all occurances of the any of the character sequences - <\script, - <\/script, - <\!--, - or --\> - in source using a case-insenstive match, - replace the sequence with the equivalent sequence excluding the REVERSE SOLIDUS (\).
  6. For all occurrences of a HTML Character reference in source, replace the sequence with the equivalent Unicode character as defined in Named character references in [[HTML52]].
  7. @@ -4866,6 +4841,14 @@

    The JsonLdProcessor Interface

    a string representing the IRI of a remote document, extract the content of the JSON-LD script element(s) into original input:
      +
    1. Set base IRI to the Document Base URL + of original input, as defined in [[HTML52]], + using the existing base IRI as the document's URL. +
      + The use of the Document Base URL + from [[HTML52]] for setting the base IRI of the enclosed JSON-LD + is an experimental feature, which may be changed in a future version of this specification. +
    2. If the original passed input parameter contains a fragment identifier, set source to the textContent diff --git a/tests/expand-manifest.jsonld b/tests/expand-manifest.jsonld index dde9c18d..5facfbd8 100644 --- a/tests/expand-manifest.jsonld +++ b/tests/expand-manifest.jsonld @@ -1353,22 +1353,6 @@ "input": "expand/h007-in.html", "expect": "expand/h007-out.jsonld", "option": {"specVersion": "json-ld-1.1", "extractAllScripts": true} - }, { - "@id": "#th008", - "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], - "name": "Expands embedded JSON-LD script element with comments", - "purpose": "Tests embedded JSON-LD in HTML with comments", - "input": "expand/h008-in.html", - "expect": "expand/h008-out.jsonld", - "option": {"specVersion": "json-ld-1.1"} - }, { - "@id": "#th009", - "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], - "name": "Expands embedded JSON-LD script element with escaped tokens", - "purpose": "Tests embedded JSON-LD in HTML with escapes", - "input": "expand/h009-in.html", - "expect": "expand/h009-out.jsonld", - "option": {"specVersion": "json-ld-1.1"} }, { "@id": "#th010", "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], @@ -1433,6 +1417,46 @@ "input": "expand/h017-in.html", "expect": "invalid script element", "option": {"specVersion": "json-ld-1.1"} + }, { + "@id": "#th018", + "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], + "name": "Expands embedded JSON-LD script element relative to document base", + "purpose": "Tests embedded JSON-LD in HTML", + "input": "expand/h018-in.html", + "expect": "expand/h018-out.jsonld", + "option": {"specVersion": "json-ld-1.1"} + }, { + "@id": "#th019", + "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], + "name": "Expands embedded JSON-LD script element relative to base option", + "purpose": "Tests embedded JSON-LD in HTML", + "input": "expand/h019-in.html", + "expect": "expand/h019-out.jsonld", + "option": {"specVersion": "json-ld-1.1", "base": 
"http://a.example.com/doc"} + }, { + "@id": "#th020", + "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], + "name": "Expands embedded JSON-LD script element relative to HTML base", + "purpose": "Tests embedded JSON-LD in HTML", + "input": "expand/h020-in.html", + "expect": "expand/h020-out.jsonld", + "option": {"specVersion": "json-ld-1.1", "base": "http://a.example.com/doc"} + }, { + "@id": "#th021", + "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], + "name": "Expands embedded JSON-LD script element relative to relative HTML base", + "purpose": "Tests embedded JSON-LD in HTML", + "input": "expand/h021-in.html", + "expect": "expand/h021-out.jsonld", + "option": {"specVersion": "json-ld-1.1", "base": "http://a.example.com/doc"} + }, { + "@id": "#th022", + "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], + "name": "Expands targeted JSON-LD script element with fragment and HTML base", + "purpose": "Tests embedded JSON-LD in HTML with fragment identifier", + "input": "expand/h022-in.html#second", + "expect": "expand/h022-out.jsonld", + "option": {"specVersion": "json-ld-1.1"} }, { "@id": "#tm001", "@type": ["jld:PositiveEvaluationTest", "jld:ExpandTest"], diff --git a/tests/expand/h018-in.html b/tests/expand/h018-in.html new file mode 100644 index 00000000..8a644238 --- /dev/null +++ b/tests/expand/h018-in.html @@ -0,0 +1,13 @@ + + + + + \ No newline at end of file diff --git a/tests/expand/h018-out.jsonld b/tests/expand/h018-out.jsonld new file mode 100644 index 00000000..7bf2e224 --- /dev/null +++ b/tests/expand/h018-out.jsonld @@ -0,0 +1,4 @@ +[{ + "@id": "https://w3c.github.io/json-ld-api/tests/expand/h018-in.html", + "http://example.com/foo": [{"@value": "bar"}] +}] \ No newline at end of file diff --git a/tests/expand/h019-in.html b/tests/expand/h019-in.html new file mode 100644 index 00000000..8a644238 --- /dev/null +++ b/tests/expand/h019-in.html @@ -0,0 +1,13 @@ + + + + + \ No newline at end of file diff --git 
a/tests/expand/h019-out.jsonld b/tests/expand/h019-out.jsonld new file mode 100644 index 00000000..1617d64f --- /dev/null +++ b/tests/expand/h019-out.jsonld @@ -0,0 +1,4 @@ +[{ + "@id": "http://a.example.com/doc", + "http://example.com/foo": [{"@value": "bar"}] +}] \ No newline at end of file diff --git a/tests/expand/h020-in.html b/tests/expand/h020-in.html new file mode 100644 index 00000000..c7fdfb1b --- /dev/null +++ b/tests/expand/h020-in.html @@ -0,0 +1,14 @@ + + + + + + \ No newline at end of file diff --git a/tests/expand/h020-out.jsonld b/tests/expand/h020-out.jsonld new file mode 100644 index 00000000..f309cd84 --- /dev/null +++ b/tests/expand/h020-out.jsonld @@ -0,0 +1,4 @@ +[{ + "@id": "http://a.example.com/base", + "http://example.com/foo": [{"@value": "bar"}] +}] \ No newline at end of file diff --git a/tests/expand/h021-in.html b/tests/expand/h021-in.html new file mode 100644 index 00000000..25b5e3cf --- /dev/null +++ b/tests/expand/h021-in.html @@ -0,0 +1,14 @@ + + + + + + \ No newline at end of file diff --git a/tests/expand/h021-out.jsonld b/tests/expand/h021-out.jsonld new file mode 100644 index 00000000..f309cd84 --- /dev/null +++ b/tests/expand/h021-out.jsonld @@ -0,0 +1,4 @@ +[{ + "@id": "http://a.example.com/base", + "http://example.com/foo": [{"@value": "bar"}] +}] \ No newline at end of file diff --git a/tests/expand/h022-in.html b/tests/expand/h022-in.html new file mode 100644 index 00000000..0d18ab9d --- /dev/null +++ b/tests/expand/h022-in.html @@ -0,0 +1,20 @@ + + + + + + + \ No newline at end of file diff --git a/tests/expand/h022-out.jsonld b/tests/expand/h022-out.jsonld new file mode 100644 index 00000000..bbfd82eb --- /dev/null +++ b/tests/expand/h022-out.jsonld @@ -0,0 +1,4 @@ +[{ + "@id": "http://a.example.com/base", + "http://example.com/bar": [{"@value": "foo"}] +}] \ No newline at end of file