11use std:: { io:: Write , process:: ExitCode } ;
22
33use clap:: Parser ;
4- use oxrdf:: graph:: { CanonicalizationAlgorithm , CanonicalizationHashAlgorithm } ;
4+ use html2rdf:: { Options , algorithms:: OnlineVocabularyResolver , host_language:: Html5 } ;
5+ use oxrdf:: {
6+ Graph ,
7+ graph:: { CanonicalizationAlgorithm , CanonicalizationHashAlgorithm } ,
8+ } ;
59
// Command-line arguments, parsed in `main` through `Args::parse()` (clap derive).
// In this diff the single `target` URL argument (the `-` lines) is replaced by a
// flattened `InputGroup`, and the `vocab_expansion` switch is added.
610#[ derive( Parser ) ]
711#[ command( version, about) ]
812struct Args {
9- #[ arg ( value_name = "URL" ) ]
10- target : url :: Url ,
// Pulls the arguments declared on `InputGroup` into this parser rather than
// nesting them under a sub-structure.
13+ #[ clap ( flatten ) ]
14+ input : InputGroup ,
1115
16+ /// Canonicalize the resulting graph.
// Boolean switch with a long form and the short form `-c`.
1217 #[ arg( long, short = 'c' ) ]
1318 canonicalize : bool ,
19+
20+ /// Perform RDFa vocabulary expansion.
// Boolean switch, long form only. When set, `main` enables vocabulary
// expansion on the html2rdf options before converting the document.
21+ #[ arg( long) ]
22+ vocab_expansion : bool ,
23+ }
24+
// Input selection for the tool. The group is declared `required = true` and
// `multiple = false`, so exactly one of `url` / `path` is expected on the
// command line. `main` relies on that: when `url` is absent it unwraps `path`.
25+ #[ derive( Debug , clap:: Args ) ]
26+ #[ group( required = true , multiple = false ) ]
27+ pub struct InputGroup {
28+ /// The target URL (must be absolute).
// No `long`/`short` is given, which in clap derive makes this a positional argument.
29+ #[ arg( value_name = "URL" ) ]
30+ url : Option < url:: Url > ,
31+
32+ /// The target path (can be relative).
// Supplied as a long option; `main` canonicalizes the path and converts it
// to a URL before use.
33+ #[ arg( long, value_name = "PATH" ) ]
34+ path : Option < std:: path:: PathBuf > ,
35+ }
36+
// Error returned from `main` when the base URL's scheme is none of "http",
// "https" or "file". The `display` attribute supplies the message template,
// with `{scheme}` filled from the field below.
37+ #[ derive( derive_more:: Error , derive_more:: Display , Debug ) ]
38+ #[ display( "Unsupported URL scheme `{scheme}`." ) ]
39+ struct UnsupportedUrlScheme {
// The rejected scheme, copied out of the URL as an owned string.
40+ scheme : String ,
1441}
1542
1643#[ tokio:: main( flavor = "current_thread" ) ]
1744async fn main ( ) -> Result < ExitCode , Box < dyn std:: error:: Error > > {
1845 tracing_subscriber:: fmt:: init ( ) ;
1946
2047 let args = Args :: parse ( ) ;
21- let client = reqwest:: Client :: new ( ) ;
22- let base = args. target . to_string ( ) ;
23- let base_iri = oxiri:: Iri :: parse ( base. clone ( ) ) ?;
24- let response = client. get ( args. target ) . send ( ) . await ?. error_for_status ( ) ?;
25- let content_type = response
26- . headers ( )
27- . get ( reqwest:: header:: CONTENT_TYPE )
28- . and_then ( |v| v. to_str ( ) . ok ( ) ) ;
29-
30- if content_type. is_some_and ( |ct| !ct. starts_with ( "text/html" ) ) {
31- eprintln ! ( "Error: content type is not text/html." ) ;
32- return Ok ( ExitCode :: FAILURE ) ;
33- }
34-
35- drop ( client) ;
36-
37- let final_url = response. url ( ) . clone ( ) ;
48+ let base = if let Some ( url) = args. input . url {
49+ url
50+ } else {
51+ let p = args. input . path . unwrap ( ) ; // UNWRAP: clap guarantee
52+ url:: Url :: from_file_path ( dunce:: canonicalize ( p) ?) . unwrap ( )
53+ } ;
54+ let base_iri = oxiri:: Iri :: parse ( base. as_str ( ) ) ?;
55+
56+ let mut final_url = base. clone ( ) ; // the final, resolved, URL
57+ let content: String = match base. scheme ( ) {
58+ "http" | "https" => {
59+ let client = reqwest:: Client :: new ( ) ;
60+ let response = client. get ( base. clone ( ) ) . send ( ) . await ?. error_for_status ( ) ?;
61+ let content_type = response
62+ . headers ( )
63+ . get ( reqwest:: header:: CONTENT_TYPE )
64+ . and_then ( |v| v. to_str ( ) . ok ( ) ) ;
65+
66+ if content_type. is_some_and ( |ct| !ct. starts_with ( "text/html" ) ) {
67+ eprintln ! ( "Error: content type is not text/html." ) ;
68+ return Ok ( ExitCode :: FAILURE ) ;
69+ }
70+
71+ final_url = response. url ( ) . clone ( ) ;
72+ response. text ( ) . await ?
73+ }
74+ "file" => std:: fs:: read_to_string ( base. to_file_path ( ) . unwrap ( ) ) ?,
75+ scheme => {
76+ return Err ( UnsupportedUrlScheme {
77+ scheme : scheme. to_string ( ) ,
78+ }
79+ . into ( ) ) ;
80+ }
81+ } ;
3882
39- let content = response. text ( ) . await ?;
40- let mut output_graph = oxrdf:: Graph :: new ( ) ;
41- let mut processor_graph = oxrdf:: Graph :: new ( ) ;
42- html2rdf:: process (
43- & content,
44- base_iri. clone ( ) ,
45- & mut output_graph,
46- & mut processor_graph,
47- ) ?;
83+ let mut options = Options :: < Html5 > :: default ( ) ;
84+ if args. vocab_expansion {
85+ options = options. enable_vocabulary_expansion ( OnlineVocabularyResolver :: default ( ) ) ;
86+ }
87+ let ( mut output_graph, processor_graph) =
88+ html2rdf:: doc_to_graphs :: < _ , Graph > ( & content, base_iri. as_ref ( ) , options)
89+ . unwrap_or_else ( |inf| match inf { } ) ;
4890
4991 {
5092 // output any warnings/errors
5193 let serializer = oxttl:: TurtleSerializer :: new ( ) ;
5294 let mut locked_err = std:: io:: stderr ( ) . lock ( ) ;
5395 let mut writer = serializer. for_writer ( & mut locked_err) ;
54- for triple in processor_graph. iter ( ) {
96+ for triple in & processor_graph {
5597 writer. serialize_triple ( triple) ?;
5698 }
5799
@@ -61,8 +103,8 @@ async fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
61103
62104 {
63105 // use serializer with all known prefixes
64- let serializer = html2rdf:: initial_context_prefixes ( ) . mappings ( ) . try_fold (
65- oxttl:: TurtleSerializer :: new ( ) . with_base_iri ( base) ?,
106+ let serializer = html2rdf:: initial_context :: prefixes ( ) . mappings ( ) . try_fold (
107+ oxttl:: TurtleSerializer :: new ( ) . with_base_iri ( base. as_str ( ) ) ?,
66108 |serializer, ( prefix, value) | serializer. with_prefix ( prefix, value) ,
67109 ) ?;
68110
@@ -72,8 +114,11 @@ async fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
72114 } ) ;
73115 }
74116
75- let mut locked_out = std:: io:: stdout ( ) . lock ( ) ;
76- locked_out. write_all (
117+ let locked_out = std:: io:: stdout ( ) . lock ( ) ;
118+ // yes, stdout is already buffered, but we want to just write big chunks
119+ // TTL tends to write lots of short lines which performs pretty poorly with LineWriter
120+ let mut out = std:: io:: BufWriter :: new ( locked_out) ;
121+ out. write_all (
77122 format ! (
78123 "# generated by html2rdf {} at {:.0} from: {}\n " ,
79124 clap:: crate_version!( ) ,
@@ -84,16 +129,18 @@ async fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
84129 ) ?;
85130
86131 if args. canonicalize {
87- locked_out . write_all ( b"# (output has been canonicalized)\n " ) ?;
132+ out . write_all ( b"# (output has been canonicalized)\n " ) ?;
88133 }
89134
90- let mut writer = serializer. for_writer ( & mut locked_out) ;
91- for triple in output_graph. iter ( ) {
92- writer. serialize_triple ( triple) ?;
93- }
135+ {
136+ let mut writer = serializer. for_writer ( & mut out) ;
137+ for triple in & output_graph {
138+ writer. serialize_triple ( triple) ?;
139+ }
94140
95- writer. finish ( ) ?;
96- drop ( output_graph) ;
141+ writer. finish ( ) ?;
142+ }
143+ out. flush ( ) ?;
97144 }
98145
99146 Ok ( ExitCode :: SUCCESS )
0 commit comments