Skip to content

Commit faa15a4

Browse files
committed
refactor the world
big simplification/XHTML support
1 parent 94964ef commit faa15a4

File tree

317 files changed

+10883
-2420
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

317 files changed

+10883
-2420
lines changed

Cargo.lock

Lines changed: 714 additions & 234 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[workspace]
22
resolver = "2"
3-
members = [ "html2rdf" , "html2rdf-cli"]
3+
members = [ "html2rdf", "html2rdf-cli", "librdfa-wrapper" ]
44

55
[workspace.package]
66
authors = ["George Pollard <porges@porg.es>"]
@@ -13,10 +13,22 @@ repository = "https://github.com/Porges/html2rdf"
1313
rust-version = "1.88.0"
1414

1515
[workspace.dependencies]
16+
derive_more = { version = "^2.1.1", features = ["display", "error"] }
1617
html2rdf = { path = "html2rdf" }
18+
icu_locale = "2.1.1"
19+
jiff = "^0.2.23"
20+
librdfa-wrapper = { path = "librdfa-wrapper" }
1721
oxiri = "0.2.11"
1822
oxrdf = "0.3.3"
1923
oxttl = "0.2.3"
24+
reqwest = { version = "^0.13.2", features = ["blocking", "charset", "http2", "native-tls", "system-proxy"] }
25+
26+
[profile.release]
27+
debug = 1
28+
29+
[profile.profiling]
30+
inherits = "release"
31+
debug = true
2032

2133
[profile.fuzz]
2234
inherits = "dev"

html2rdf-cli/Cargo.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@ path = "src/main.rs"
1616

1717
[dependencies]
1818
clap = { version = "4.5.59", features = ["cargo", "derive", "wrap_help"] }
19+
derive_more = { workspace = true, features = ["from", "from_str"] } #unified
20+
dunce = "1.0.5"
1921
html2rdf = { workspace = true }
20-
jiff = "0.2.20"
22+
jiff = { workspace = true } #unified
2123
oxiri = { workspace = true }
2224
oxrdf = { workspace = true, features = ["rdfc-10"] }
2325
oxttl = { workspace = true }
24-
reqwest = { version = "0.13.2", default-features = false, features = ["charset", "http2", "native-tls", "system-proxy"] }
26+
reqwest = { workspace = true, features = ["blocking"] } #unified
2527
tokio = { version = "1.49.0", features = ["macros", "rt"] }
2628
tracing-subscriber = { version = "0.3.22" }
2729
url = "2.5.8"

html2rdf-cli/src/main.rs

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,99 @@
11
use std::{io::Write, process::ExitCode};
22

33
use clap::Parser;
4-
use oxrdf::graph::{CanonicalizationAlgorithm, CanonicalizationHashAlgorithm};
4+
use html2rdf::{Options, algorithms::OnlineVocabularyResolver, host_language::Html5};
5+
use oxrdf::{
6+
Graph,
7+
graph::{CanonicalizationAlgorithm, CanonicalizationHashAlgorithm},
8+
};
59

610
#[derive(Parser)]
711
#[command(version, about)]
812
struct Args {
9-
#[arg(value_name = "URL")]
10-
target: url::Url,
13+
#[clap(flatten)]
14+
input: InputGroup,
1115

16+
/// Canonicalize the resulting graph.
1217
#[arg(long, short = 'c')]
1318
canonicalize: bool,
19+
20+
/// Perform RDFa vocabulary expansion.
21+
#[arg(long)]
22+
vocab_expansion: bool,
23+
}
24+
25+
#[derive(Debug, clap::Args)]
26+
#[group(required = true, multiple = false)]
27+
pub struct InputGroup {
28+
/// The target URL (must be absolute).
29+
#[arg(value_name = "URL")]
30+
url: Option<url::Url>,
31+
32+
/// The target path (can be relative).
33+
#[arg(long, value_name = "PATH")]
34+
path: Option<std::path::PathBuf>,
35+
}
36+
37+
#[derive(derive_more::Error, derive_more::Display, Debug)]
38+
#[display("Unsupported URL scheme `{scheme}`.")]
39+
struct UnsupportedUrlScheme {
40+
scheme: String,
1441
}
1542

1643
#[tokio::main(flavor = "current_thread")]
1744
async fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
1845
tracing_subscriber::fmt::init();
1946

2047
let args = Args::parse();
21-
let client = reqwest::Client::new();
22-
let base = args.target.to_string();
23-
let base_iri = oxiri::Iri::parse(base.clone())?;
24-
let response = client.get(args.target).send().await?.error_for_status()?;
25-
let content_type = response
26-
.headers()
27-
.get(reqwest::header::CONTENT_TYPE)
28-
.and_then(|v| v.to_str().ok());
29-
30-
if content_type.is_some_and(|ct| !ct.starts_with("text/html")) {
31-
eprintln!("Error: content type is not text/html.");
32-
return Ok(ExitCode::FAILURE);
33-
}
34-
35-
drop(client);
36-
37-
let final_url = response.url().clone();
48+
let base = if let Some(url) = args.input.url {
49+
url
50+
} else {
51+
let p = args.input.path.unwrap(); // UNWRAP: clap guarantee
52+
url::Url::from_file_path(dunce::canonicalize(p)?).unwrap()
53+
};
54+
let base_iri = oxiri::Iri::parse(base.as_str())?;
55+
56+
let mut final_url = base.clone(); // the final, resolved, URL
57+
let content: String = match base.scheme() {
58+
"http" | "https" => {
59+
let client = reqwest::Client::new();
60+
let response = client.get(base.clone()).send().await?.error_for_status()?;
61+
let content_type = response
62+
.headers()
63+
.get(reqwest::header::CONTENT_TYPE)
64+
.and_then(|v| v.to_str().ok());
65+
66+
if content_type.is_some_and(|ct| !ct.starts_with("text/html")) {
67+
eprintln!("Error: content type is not text/html.");
68+
return Ok(ExitCode::FAILURE);
69+
}
70+
71+
final_url = response.url().clone();
72+
response.text().await?
73+
}
74+
"file" => std::fs::read_to_string(base.to_file_path().unwrap())?,
75+
scheme => {
76+
return Err(UnsupportedUrlScheme {
77+
scheme: scheme.to_string(),
78+
}
79+
.into());
80+
}
81+
};
3882

39-
let content = response.text().await?;
40-
let mut output_graph = oxrdf::Graph::new();
41-
let mut processor_graph = oxrdf::Graph::new();
42-
html2rdf::process(
43-
&content,
44-
base_iri.clone(),
45-
&mut output_graph,
46-
&mut processor_graph,
47-
)?;
83+
let mut options = Options::<Html5>::default();
84+
if args.vocab_expansion {
85+
options = options.enable_vocabulary_expansion(OnlineVocabularyResolver::default());
86+
}
87+
let (mut output_graph, processor_graph) =
88+
html2rdf::doc_to_graphs::<_, Graph>(&content, base_iri.as_ref(), options)
89+
.unwrap_or_else(|inf| match inf {});
4890

4991
{
5092
// output any warnings/errors
5193
let serializer = oxttl::TurtleSerializer::new();
5294
let mut locked_err = std::io::stderr().lock();
5395
let mut writer = serializer.for_writer(&mut locked_err);
54-
for triple in processor_graph.iter() {
96+
for triple in &processor_graph {
5597
writer.serialize_triple(triple)?;
5698
}
5799

@@ -61,8 +103,8 @@ async fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
61103

62104
{
63105
// use serializer with all known prefixes
64-
let serializer = html2rdf::initial_context_prefixes().mappings().try_fold(
65-
oxttl::TurtleSerializer::new().with_base_iri(base)?,
106+
let serializer = html2rdf::initial_context::prefixes().mappings().try_fold(
107+
oxttl::TurtleSerializer::new().with_base_iri(base.as_str())?,
66108
|serializer, (prefix, value)| serializer.with_prefix(prefix, value),
67109
)?;
68110

@@ -72,8 +114,11 @@ async fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
72114
});
73115
}
74116

75-
let mut locked_out = std::io::stdout().lock();
76-
locked_out.write_all(
117+
let locked_out = std::io::stdout().lock();
118+
// yes, stdout is already buffered, but we want to just write big chunks
119+
// TTL tends to write lots of short lines which performs pretty poorly with LineWriter
120+
let mut out = std::io::BufWriter::new(locked_out);
121+
out.write_all(
77122
format!(
78123
"# generated by html2rdf {} at {:.0} from: {}\n",
79124
clap::crate_version!(),
@@ -84,16 +129,18 @@ async fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
84129
)?;
85130

86131
if args.canonicalize {
87-
locked_out.write_all(b"# (output has been canonicalized)\n")?;
132+
out.write_all(b"# (output has been canonicalized)\n")?;
88133
}
89134

90-
let mut writer = serializer.for_writer(&mut locked_out);
91-
for triple in output_graph.iter() {
92-
writer.serialize_triple(triple)?;
93-
}
135+
{
136+
let mut writer = serializer.for_writer(&mut out);
137+
for triple in &output_graph {
138+
writer.serialize_triple(triple)?;
139+
}
94140

95-
writer.finish()?;
96-
drop(output_graph);
141+
writer.finish()?;
142+
}
143+
out.flush()?;
97144
}
98145

99146
Ok(ExitCode::SUCCESS)

html2rdf/Cargo.toml

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,45 @@ exclude = [
1616
"tests/test-suite/test-cases/*/svg*/*",
1717
]
1818

19+
[features]
20+
default = ["html", "http", "xhtml"]
21+
html = ["dep:scraper"]
22+
xhtml = ["dep:uppsala"]
23+
http = ["dep:reqwest"]
24+
1925
[dependencies]
26+
bergshamra-c14n = "0.3.1"
27+
bergshamra-xml = "0.3.1"
2028
curie = "0.1.4"
21-
derive_more = { version = "2.1.1", features = [
22-
"display",
23-
"error",
24-
"from",
25-
"from_str",
26-
] }
27-
icu_locale = { version = "2.1.1" }
29+
derive_more = { workspace = true, features = ["from", "from_str"] }
2830
indexmap = "2.13.0"
2931
itertools = "0.14.0"
32+
jiff = { workspace = true }
3033
mitsein = "0.9.0"
34+
ouroboros = "0.18.5"
35+
oxilangtag = "0.1.5"
3136
oxiri = { workspace = true }
3237
oxrdf = { workspace = true }
3338
oxsdatatypes = "0.2.2"
34-
rxml_validation = "0.11.0"
35-
scraper = "0.25.0"
39+
percent-encoding = "2.3.2"
40+
reqwest = { workspace = true, optional = true, features = ["blocking"] }
41+
rxml_validation = "0.12.0"
42+
scraper = { version = "0.26.0", optional = true, default-features = false, features = ["errors"] }
43+
thiserror = "2.0.18"
3644
tracing = "0.1.44"
45+
uppsala = { version = "0.3.0", optional = true }
3746

3847
[dev-dependencies]
3948
bolero = "0.13.4"
49+
indoc = "2.0.7"
4050
insta = { version = "1.46.3", features = ["glob"] }
4151
oxrdf = { workspace = true, features = ["rdfc-10"] }
4252
oxttl = { workspace = true }
4353
pretty_assertions = "1.4.1"
44-
pyo3 = "0.28.2"
4554
rstest = { version = "0.26.1", default-features = false, features = [
4655
"crate-name",
4756
] }
48-
sha2 = "0.10.9"
57+
58+
[target.'cfg(not(target_os = "windows"))'.dev-dependencies]
59+
librdfa-wrapper = { workspace = true }
60+
pyo3 = "0.28.2"

html2rdf/src/algorithms/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//! Supplementary algorithms used in RDFa.
2+
3+
mod property_copying;
4+
mod vocabulary_expansion;
5+
6+
pub use property_copying::property_copying;
7+
pub use vocabulary_expansion::{
8+
OfflineVocabularyResolver, OnlineVocabularyResolver, VocabularyResolver, vocabulary_expansion,
9+
};

0 commit comments

Comments
 (0)