Browse Source

basic download functionality complete

trunk
Shanti Chellaram 2 years ago
parent
commit
87e37697b7
  1. 1428
      Cargo.lock
  2. 7
      Cargo.toml
  3. 89
      src/main.rs

1428
Cargo.lock
File diff suppressed because it is too large
View File

7
Cargo.toml

@ -6,3 +6,10 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
chrono = "0.4.26"
ego-tree = "0.6.2"
futures = "0.3.28"
hyper = { version = "0.14.27", features = ["http2", "client", "http1"] }
hyper-tls = "0.5.0"
scraper = "0.17.1"
tokio = { version = "1.29.1", features = ["rt-multi-thread"] }

89
src/main.rs

@ -1,6 +1,93 @@
fn get_page(month:
use hyper::Body;
use hyper::Request;
use hyper::Client;
use hyper::Uri;
use chrono::naive::NaiveDate;
use chrono::Datelike;
use hyper_tls::HttpsConnector;
use scraper::Html;
use scraper::Selector;
use scraper::ElementRef;
use scraper::Node;
use ego_tree::NodeRef;
use tokio::runtime::Runtime;
/// One scraped quiz item: the prompt text and its answer markup.
///
/// `Clone`, `PartialEq` and `Eq` are derived in addition to `Debug` so that
/// scraped questions can be copied and compared (for example to detect
/// duplicates) without hand-written boilerplate.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Question {
    /// Prompt text as assembled by `parse_question`: trimmed text nodes,
    /// with the text of child elements wrapped in `<em>…</em>`.
    question: String,
    /// Inner HTML of the first `.answer` element found inside the question.
    answer: String,
}
/// Holds a URI and a deck identifier.
///
/// NOTE(review): nothing visible in this file constructs or reads this struct.
/// Presumably it is meant to describe the Anki endpoint that questions will be
/// uploaded to -- confirm once the upload step is implemented.
struct Anki {
// Address of the service. NOTE(review): presumably the Anki HTTP endpoint -- confirm.
uri: Uri,
// Identifier of the target deck. NOTE(review): exact format is not shown here.
deck_id: String,
}
fn main() { fn main() {
//ensure datasets are loaded
// download pages
// parse questions
//ensure questions are in anki
println!("Hello, world!"); println!("Hello, world!");
// TODO: cache downloaded pages
// TODO: have an overall cacheing strategy
let rt = Runtime::new().unwrap();
// the window of available data is like today - 15 days or something
let for_date = NaiveDate::from_ymd_opt(2023, 7, 27).unwrap(); // TODO: replace with today
let questions = rt.block_on(scrape_questions(for_date));
println!("Questions: {:?}", questions);
rt.block_on(upload_to_anki(questions));
}
async fn scrape_questions(date: NaiveDate) -> Option<Vec<Question>> {
let uri = Uri::builder()
.scheme("https")
.authority("mainichikanji.com")
.path_and_query(format!("/{}gatu{}.html", date.month(), date.day()))
.build()
.unwrap();
let client = Client::builder().build::<_, hyper::Body>(HttpsConnector::new());
let resp = client.get(uri).await.unwrap();
println!("Status: {:?}", resp.status());
let page_contents = hyper::body::to_bytes(resp).await.unwrap();
// We have page contents, now parse out the bits we care about
let document = Html::parse_document(std::str::from_utf8(&page_contents).unwrap());
let question_selector = Selector::parse(".question").unwrap();
let results = document.select(&question_selector)
.map(parse_question)
.map(Option::unwrap)
.collect::<Vec<_>>();
Some(results)
} }
/// Extracts a [`Question`] from one `.question` element.
///
/// The question text is built from the element's direct children that come
/// before the first `div.answer-box`:
/// * text nodes contribute their trimmed text;
/// * element nodes contribute the trimmed text of their first child,
///   wrapped in `<em>…</em>`;
/// * any other node kind (e.g. comments) contributes nothing.
///
/// When the element has no `div.answer-box`, all direct children are used.
/// The answer is the inner HTML of the first `.answer` descendant.
///
/// Returns `None` when a child element's first child is missing or is not a
/// text node, or when no `.answer` element exists.
fn parse_question(element: ElementRef) -> Option<Question> {
    let question_end_selector =
        Selector::parse("div.answer-box").expect("static selector is valid");
    let answer_selector = Selector::parse(".answer").expect("static selector is valid");
    let question_end = element.select(&question_end_selector).next();

    let question = element
        .children()
        // Compare against the end marker only when one exists. Comparing the
        // two `Option`s directly would treat "not an element" (`None`) as
        // equal to "no marker found" (`None`) and stop at the first text node.
        .take_while(|node| match question_end {
            Some(end) => ElementRef::wrap(*node) != Some(end),
            None => true,
        })
        .map(|node| match node.value() {
            Node::Text(text) => Some(String::from(text.text.trim())),
            Node::Element(_) => node
                .first_child()
                .and_then(|child| child.value().as_text())
                .map(|text| format!("<em>{}</em>", text.text.trim())),
            // Comments and similar nodes carry no question text.
            _ => Some(String::new()),
        })
        // Concatenates the pieces; any `None` piece makes the whole result `None`.
        .collect::<Option<String>>()?; // TODO: remove number prefix here

    let answer = element.select(&answer_selector).next()?.inner_html();
    Some(Question { question, answer })
}
//async fn upload_to_anki(client: &Client<hyper::Body>, questions: &dyn Iterator<Item = Question>) {
// let request = Request::builder()
// .uri(Uri::from_static("http://localhost:8765"))
// .body(Body::from(""))
// .build();
// let resp = client.request(request).await.unwrap();
//}
Loading…
Cancel
Save