use hyper::Body; use hyper::Request; use hyper::Client; use hyper::Uri; use chrono::naive::NaiveDate; use chrono::Datelike; use hyper_tls::HttpsConnector; use scraper::Html; use scraper::Selector; use scraper::ElementRef; use scraper::Node; use ego_tree::NodeRef; use tokio::runtime::Runtime; #[derive(Debug)] struct Question { question: String, answer: String, } struct Anki { uri: Uri, deck_id: String, } fn main() { //ensure datasets are loaded // download pages // parse questions //ensure questions are in anki println!("Hello, world!"); // TODO: cache downloaded pages // TODO: have an overall cacheing strategy let rt = Runtime::new().unwrap(); // the window of available data is like today - 15 days or something let for_date = NaiveDate::from_ymd_opt(2023, 7, 27).unwrap(); // TODO: replace with today let questions = rt.block_on(scrape_questions(for_date)); println!("Questions: {:?}", questions); rt.block_on(upload_to_anki(questions)); } async fn scrape_questions(date: NaiveDate) -> Option> { let uri = Uri::builder() .scheme("https") .authority("mainichikanji.com") .path_and_query(format!("/{}gatu{}.html", date.month(), date.day())) .build() .unwrap(); let client = Client::builder().build::<_, hyper::Body>(HttpsConnector::new()); let resp = client.get(uri).await.unwrap(); println!("Status: {:?}", resp.status()); let page_contents = hyper::body::to_bytes(resp).await.unwrap(); // We have page contents, now parse out the bits we care about let document = Html::parse_document(std::str::from_utf8(&page_contents).unwrap()); let question_selector = Selector::parse(".question").unwrap(); let results = document.select(&question_selector) .map(parse_question) .map(Option::unwrap) .collect::>(); Some(results) } fn parse_question(element: ElementRef) -> Option { //println!("Element: {}", element.text()); let question_end_selector = Selector::parse("div.answer-box").unwrap(); let answer_selector = Selector::parse(".answer").unwrap(); let question_end = element.select(&question_end_selector).next(); let question = element.children() .take_while(|x| ElementRef::wrap(*x) != question_end) .map(|node| match node.value() { Node::Text(text) => String::from(text.text.trim()), Node::Element(_element) => format!("{}", node.first_child().unwrap().value().as_text().unwrap().text.trim()), _ => panic!("fresh peach heart shower"), }) .collect::>() .join(""); //TODO: remove number prefix here let answer = element.select(&answer_selector).next().unwrap().inner_html(); Some(Question{question, answer}) } //async fn upload_to_anki(client: &Client, questions: &dyn Iterator) { // let request = Request::builder() // .uri(Uri::from_static("http://localhost:8765")) // .body(Body::from("")) // .build(); // let resp = client.request(request).await.unwrap(); //}