Skip to content

Commit 8abc2ba

Browse files
committed
minimal extra support for xml
1 parent 27c790c commit 8abc2ba

15 files changed

+197
-12
lines changed

html2rdf-cli/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ publish.workspace = true
99
readme.workspace = true
1010
repository.workspace = true
1111

12+
1213
[[bin]]
1314
name = "html2rdf"
1415
path = "src/main.rs"

html2rdf/src/lib.rs

Lines changed: 80 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -788,14 +788,6 @@ impl HostLanguage for &HTMLHost {
788788
None // TODO
789789
}
790790

791-
// “The current language can be set using either the @lang
792-
// or @xml:lang attributes. When the @lang attribute and
793-
// the @xml:lang attribute are specified on the same element,
794-
// the @xml:lang attribute takes precedence. When both @lang
795-
// and @xml:lang are specified on the same element, they MUST
796-
// have the same value. Further details related to setting the
797-
// current language can be found in section 3.3 Specifying the
798-
// Language for a Literal.
799791
/// Returns the language assumed when the document itself declares none.
///
/// This implementation always returns `None`, i.e. it supplies no default
/// language of its own.
fn default_language(&self) -> Option<LanguageIdentifier> {
800792
None
801793
}
@@ -807,6 +799,18 @@ impl HostLanguage for &HTMLHost {
807799
// NB: note that the "additional initial context" is currently empty.
808800
}
809801

802+
/// Field-less marker type selecting the XHTML host language.
///
/// NOTE(review): presumably the XML/XHTML counterpart of `HTMLHost`; nothing in
/// the visible code constructs it yet — confirm where it is meant to be used.
struct XHTMLHost {}
803+
804+
// Implemented on the reference type, like the `&HTMLHost` implementation.
// Only the two methods below are provided here; any other `HostLanguage`
// methods are presumably covered by trait defaults — verify against the trait.
impl HostLanguage for &XHTMLHost {
805+
/// Default language for the document; currently a stub that always yields `None`.
fn default_language(&self) -> Option<LanguageIdentifier> {
806+
None // TODO
807+
}
808+
809+
/// Default vocabulary for the document; currently a stub that always yields `None`.
fn default_vocabulary(&self) -> Option<oxrdf::NamedNode> {
810+
None // TODO
811+
}
812+
}
813+
810814
fn emit_processor(pg: &mut Graph, pg_type: PGType, msg: &str) {
811815
let warning_subj: oxrdf::NamedOrBlankNode = oxrdf::BlankNode::default().into();
812816
let pg_type: oxrdf::NamedNodeRef = pg_type.into();
@@ -852,7 +856,7 @@ impl<'o, 'p> RDFaProcessor<'o, 'p> {
852856
S::Child(element, base_ctx, parent_span) => {
853857
let _span = parent_span.entered();
854858

855-
let new_ctx = Rc::new(self.process_element(&base_ctx, element, &host)?);
859+
let new_ctx = Rc::new(self.process_element(&base_ctx, &html, element, &host)?);
856860
if element.has_children() {
857861
stack.push(S::OutputList(
858862
new_ctx.parent_subject.clone(),
@@ -912,6 +916,7 @@ impl<'o, 'p> RDFaProcessor<'o, 'p> {
912916
fn process_element(
913917
&mut self,
914918
eval_context: &EvaluationContext,
919+
html: &scraper::Html,
915920
element: scraper::ElementRef,
916921
host: impl HostLanguage,
917922
) -> Result<EvaluationContext, Error> {
@@ -1023,11 +1028,31 @@ impl<'o, 'p> RDFaProcessor<'o, 'p> {
10231028
// 3.
10241029
// “Next, the current element is examined for IRI mappings and these are added to the local list of IRI mappings.
10251030
// Note that an IRI mapping will simply overwrite any current mapping in the list that has the same name;
1031+
//
1032+
// [HTML-RDFA]
1033+
// “Extracting URI Mappings declared via @xmlns: while operating from within a DOM Level 2 based RDFa processor
1034+
// can be achieved using the following algorithm:
1035+
// “While processing each DOM2 [Element] as described in [rdfa-core], Section 7.5: Sequence, Step #2:
1036+
// “1. For each [Attr] in the [Node.attributes] list that has a [namespace prefix] value of @xmlns,
1037+
// create an [IRI mapping] by storing the [local name] as the value to be mapped, and the
1038+
// [Node.nodeValue] as the value to map.
1039+
// (Note: this is not done because html5ever/scraper never reports namespace prefixes…)
1040+
// “2. For each [Attr] in the [Node.attributes] list that has a [namespace prefix] value of null
1041+
// and a [local name] that starts with @xmlns:, create an [IRI mapping] by storing the [local name]
1042+
// part with the @xmlns: characters removed as the value to be mapped, and the [Node.nodeValue] as
1043+
// the value to map.
1044+
// (Note: this is what is implemented below…)
10261045
let xmlns_prefixes = el
10271046
.attrs
10281047
.iter()
1029-
.filter(|(qn, _)| qn.prefix.as_deref() == Some("xmlns"))
1030-
.map(|(qn, val)| (qn.local.as_ref(), val.as_ref()))
1048+
.filter_map(|(qn, value)| -> Option<_> {
1049+
if qn.prefix.is_none() {
1050+
let prefix = qn.local.strip_prefix("xmlns:")?;
1051+
Some((prefix, value))
1052+
} else {
1053+
None
1054+
}
1055+
})
10311056
.collect::<Vec<_>>();
10321057

10331058
let prefixes = el
@@ -1069,6 +1094,16 @@ impl<'o, 'p> RDFaProcessor<'o, 'p> {
10691094
// 4. Language
10701095
// “The current element is also parsed for any language information,
10711096
// and if present, current language is set accordingly;
1097+
//
1098+
// [HTML-RDFA] 3.1
1099+
// “The current language can be set using either the @lang
1100+
// or @xml:lang attributes. When the @lang attribute and
1101+
// the @xml:lang attribute are specified on the same element,
1102+
// the @xml:lang attribute takes precedence. When both @lang
1103+
// and @xml:lang are specified on the same element, they MUST
1104+
// have the same value. Further details related to setting the
1105+
// current language can be found in section 3.3 Specifying the
1106+
// Language for a Literal.
10721107
if let Some(lang) = el.attr("xml:lang").or(el.attr("lang")) {
10731108
if lang.is_empty() {
10741109
local.current_language = None;
@@ -1621,6 +1656,40 @@ impl<'o, 'p> RDFaProcessor<'o, 'p> {
16211656
// the element itself, and giving it a datatype of XMLLiteral in the vocabulary
16221657
// http://www.w3.org/1999/02/22-rdf-syntax-ns#. The format of the resulting
16231658
// serialized content is as defined in Exclusive XML Canonicalization Version 1.0 [XML-EXC-C14N].
1659+
//
1660+
// [HTML-RDFA]
1661+
// “When generating literals of type XMLLiteral, the processor MUST ensure that the
1662+
// output XMLLiteral is a namespace well-formed XML fragment. A namespace well-formed XML
1663+
// fragment has the following properties:
1664+
// “- The XML fragment, when placed inside of a single root element, MUST validate as well-formed
1665+
// XML. The normative language that describes a well-formed XML document is specified in
1666+
// Section 2.1 "Well-Formed XML Documents" of the XML specification.
1667+
// “- The XML fragment, when placed inside of a single root element, MUST retain all active
1668+
// namespace information. The currently active attributes declared using @xmlns and @xmlns:
1669+
// that are stored in the RDFa processor's current evaluation context in the IRI mappings
1670+
// MUST be preserved in the generated XMLLiteral. The PREFIX value for @xmlns:PREFIX MUST
1671+
// be entirely transformed into lower-case characters when preserving the value in the
1672+
// XMLLiteral. All active namespaces declared via @xmlns, @xmlns:, and @prefix MUST be
1673+
// placed in each top-level element in the generated XMLLiteral, taking care to not overwrite
1674+
// pre-existing namespace values.
1675+
// (TODO: the above is not yet implemented, since I can't figure out how to work with
1676+
// the scraper API effectively here...)
1677+
/*
1678+
let mut output = String::new();
1679+
for child in element.children() {
1680+
if let Some(el) = child.value().as_element() {
1681+
let mut el = el.clone();
1682+
for (prefix, iri) in local.iri_mappings.mappings() {
1683+
let name = html5ever::QualName::new(
1684+
None,
1685+
html5ever::ns!(xmlns),
1686+
html5ever::LocalName::from(prefix.as_str()),
1687+
);
1688+
el.attrs.push((name, iri.to_string().into()));
1689+
}
1690+
} else {
1691+
}
1692+
} */
16241693
serialized = element.inner_html();
16251694
oxrdf::LiteralRef::new_typed_literal(&serialized, datatype).into()
16261695
// TODO: incorrect, needs to be c14n'd
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1-lite/xhtml5/0140.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "@property cannot refer to a bnode: [_:invalid]" .
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1-lite/xhtml5/0312.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "Invalid IRI: <nofollow> (No scheme found in an absolute IRI)" .
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1/xhtml5/0112.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "Invalid IRI: <> (No scheme found in an absolute IRI)" .
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1/xhtml5/0140.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "@property cannot refer to a bnode: [_:invalid]" .
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1/xhtml5/0253.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "Invalid IRI: <> (No scheme found in an absolute IRI)" .
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1/xhtml5/0254.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "Invalid IRI: <> (No scheme found in an absolute IRI)" .
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1/xhtml5/0290.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "Invalid IRI: <> (No scheme found in an absolute IRI)" .
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: html2rdf/tests/test-cases.rs
3+
expression: "utils::serialize_graph(processor_graph, base.as_str(),)"
4+
---
5+
@base <http://rdfa.info/test-suite/test-cases/rdfa1.1/xhtml5/0312.xhtml> .
6+
@prefix rdf: <//www.w3.org/1999/02/22-rdf-syntax-ns#> .
7+
@prefix rdfa: <//www.w3.org/ns/rdfa#> .
8+
@prefix dc: <//purl.org/dc/terms/> .
9+
_:c14n0 a rdfa:Warning ;
10+
dc:description "Invalid IRI: <nofollow> (No scheme found in an absolute IRI)" .

0 commit comments

Comments
 (0)