[BE] Parse HTML and get users from online classroom

This commit is contained in:
Araozu 2023-09-29 17:12:58 -05:00
parent 29b52a8bc0
commit 8c1883d5dd
4 changed files with 433 additions and 12 deletions

316
backend/Cargo.lock generated
View File

@ -122,6 +122,7 @@ dependencies = [
"once_cell",
"reqwest",
"rocket",
"scraper",
"serde",
"sqlx",
]
@ -345,6 +346,29 @@ dependencies = [
"typenum",
]
[[package]]
name = "cssparser"
version = "0.31.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf 0.11.2",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.29",
]
[[package]]
name = "der"
version = "0.7.8"
@ -362,6 +386,17 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946"
[[package]]
name = "derive_more"
version = "0.99.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "devise"
version = "0.4.1"
@ -413,6 +448,27 @@ version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
[[package]]
name = "either"
version = "1.9.0"
@ -537,6 +593,16 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.28"
@ -624,6 +690,15 @@ dependencies = [
"slab",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generator"
version = "0.7.5"
@ -647,6 +722,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.10"
@ -762,6 +846,20 @@ dependencies = [
"windows-sys",
]
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "http"
version = "0.2.9"
@ -1024,6 +1122,26 @@ dependencies = [
"tracing-subscriber",
]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "matchers"
version = "0.1.0"
@ -1124,6 +1242,12 @@ dependencies = [
"tempfile",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "nom"
version = "7.1.3"
@ -1334,6 +1458,86 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_macros",
"phf_shared 0.11.2",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared 0.11.2",
"rand",
]
[[package]]
name = "phf_macros"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
dependencies = [
"phf_generator 0.11.2",
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.29",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "1.1.3"
@ -1399,6 +1603,12 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro2"
version = "1.0.66"
@ -1814,6 +2024,23 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c95a930e03325234c18c7071fd2b60118307e025d6fff3e12745ffbf63a3d29c"
dependencies = [
"ahash",
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"once_cell",
"selectors",
"smallvec",
"tendril",
]
[[package]]
name = "sct"
version = "0.7.0"
@ -1847,6 +2074,25 @@ dependencies = [
"libc",
]
[[package]]
name = "selectors"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
dependencies = [
"bitflags 2.4.0",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf 0.10.1",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "serde"
version = "1.0.188"
@ -1899,6 +2145,15 @@ dependencies = [
"serde",
]
[[package]]
name = "servo_arc"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "sha1"
version = "0.10.5"
@ -1949,6 +2204,12 @@ dependencies = [
"rand_core",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "slab"
version = "0.4.9"
@ -2231,6 +2492,12 @@ dependencies = [
"memchr",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "state"
version = "0.5.3"
@ -2240,6 +2507,32 @@ dependencies = [
"loom",
]
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "stringprep"
version = "0.1.3"
@ -2291,6 +2584,17 @@ dependencies = [
"windows-sys",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "thiserror"
version = "1.0.47"
@ -2600,6 +2904,12 @@ version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "unicode-width"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
[[package]]
name = "unicode-xid"
version = "0.2.4"
@ -2629,6 +2939,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "uuid"
version = "1.4.1"

View File

@ -13,3 +13,4 @@ once_cell = "1.18.0"
dotenvy = "0.15.7"
serde = "1.0.188"
chrono = "0.4.27"
scraper = "0.17.1"

View File

@ -23,7 +23,7 @@ pub struct Person {
/// Example: `Gomez`
pub person_maternal_surname: String,
/// Id of the online classroom user id linked to this user
pub person_classroom_id: Option<i32>
pub person_classroom_id: Option<i32>,
}
impl Person {

View File

@ -1,6 +1,15 @@
use rocket::{http::Status, serde::json::Json};
use super::{json_result::JsonResult, session::request};
use rocket::{http::Status, serde::json::Json};
use scraper::{ElementRef, Html, Selector};
use serde::{Deserialize, Serialize};
/// A user as listed in the online classroom's user list page.
///
/// Values are produced by `get_person_data` from the `td` cells of one table
/// row and are returned to the client as JSON by `get_users`.
#[derive(Debug, Serialize, Deserialize)]
pub struct ClassroomPerson {
/// Inner HTML of the link found in the row's cell at index 4.
name: String,
/// Inner HTML of the link found in the row's cell at index 3.
surname: String,
/// Inner HTML of the row's cell at index 5.
username: String,
/// Text following `user_id=` in the `href` of the surname link.
/// Kept as a string; it is not parsed into a number here.
user_id: String,
}
// Instead of requesting pages and managing session & cookies manually,
// create a wrapper that:
@ -10,16 +19,111 @@ use super::{json_result::JsonResult, session::request};
// - Returns the html string, or an error
#[get("/classroom/users/<full_name>")]
pub async fn get_users(full_name: String) -> (Status, Json<JsonResult<()>>) {
let html = request(format!("/main/admin/user_list.php?keyword={}&submit=&_qf__search_simple=", full_name)).await;
/// Searches the online classroom user list for `full_name` and returns the
/// users found.
///
/// Responds with `Status::Ok` and the parsed users on success, or with
/// `Status::InternalServerError` and the failure reason when either the page
/// request or the HTML parsing fails.
pub async fn get_users(full_name: String) -> (Status, Json<JsonResult<Vec<ClassroomPerson>>>) {
// NOTE(review): `full_name` is interpolated into the query string as-is;
// presumably it should be URL-encoded first — confirm how `request` treats the path.
let html = request(format!(
"/main/admin/user_list.php?keyword={}&submit=&_qf__search_simple=",
full_name
))
.await;
match html {
Ok(html) => {
println!("{}", html);
(Status::Ok, JsonResult::ok(()))
}
Err(reason) => {
(Status::InternalServerError, JsonResult::err(reason))
}
// The page was fetched: extract the users from its HTML.
Ok(html) => match parse_users(&html) {
Ok(users) => (Status::Ok, JsonResult::ok(users)),
Err(reason) => {
// println!("{}", html);
(Status::InternalServerError, JsonResult::err(reason))
}
},
// The page request itself failed; forward its reason unchanged.
Err(reason) => (Status::InternalServerError, JsonResult::err(reason)),
}
}
fn parse_users(file: &str) -> Result<Vec<ClassroomPerson>, String> {
// Selectors
let Ok(form_selector) = Selector::parse("form#form_users_id") else {
return Err("Error parsing form#form_users_id selector".into());
};
let Ok(tr_selector) = Selector::parse("tr:not(:first-child)") else {
return Err("Error parsing tr:not(:first-child) selector".into());
};
let Ok(td_selector) = Selector::parse("td") else {
return Err("Error parsing td selector".into());
};
let fragment = Html::parse_document(file);
let form_element = match fragment.select(&form_selector).next() {
Some(el) => el,
None => return Err("Error selecting form#form_users_id: not found".into()),
};
let mut result_vec = Vec::new();
for element in form_element.select(&tr_selector) {
let td_vec: Vec<_> = element.select(&td_selector).collect();
if td_vec.len() != 12 {
return Err(format!(
"Error parsing tr: td elements count is not 12, but {}",
td_vec.len()
));
}
result_vec.push(get_person_data(&td_vec)?);
}
Ok(result_vec)
}
/// Builds a [`ClassroomPerson`] from the `td` cells of one user-list row.
///
/// Cells are read by position: index 3 holds the surname link, index 4 the
/// name link and index 5 the username. The user id is taken from the
/// `user_id` query parameter of the surname link's `href`, e.g.
/// `.../main/admin/user_information.php?user_id=1087` yields `"1087"`.
///
/// # Errors
///
/// Returns a message describing the first problem found: fewer than 6 cells,
/// a missing `a` element in the surname or name cell, a missing `href`
/// attribute on the surname link, or an `href` without a non-empty `user_id`
/// value.
fn get_person_data(td_vec: &[ElementRef]) -> Result<ClassroomPerson, String> {
    // Checked access so a short row produces an error instead of a panic.
    let (Some(surname_ref), Some(name_ref), Some(username_ref)) =
        (td_vec.get(3), td_vec.get(4), td_vec.get(5))
    else {
        return Err(format!(
            "Expected at least 6 td elements, but found {}",
            td_vec.len()
        ));
    };

    let Ok(a_selector) = Selector::parse("a") else {
        return Err("Error parsing a selector".into());
    };

    // Surname link: provides both the surname text and the href with the user id.
    // The link is located with the selector rather than `first_child()`, so a
    // whitespace text node before the `<a>` does not break parsing.
    let Some(surname_a_element) = surname_ref.select(&a_selector).next() else {
        return Err("Expected the 3rd td element to have an `a` element".into());
    };
    let href_value = surname_a_element
        .value()
        .attr("href")
        .ok_or("Expected the 3rd td element's children to have an href attribute")?;
    // NOTE(review): `inner_html` returns markup (entities stay escaped, nested
    // tags are kept); if plain text is wanted, `text()` may be a better fit — confirm.
    let surname = surname_a_element.inner_html();

    // Name link.
    let Some(name_a_element) = name_ref.select(&a_selector).next() else {
        return Err("Expected the 4th td element to have an `a` element".into());
    };
    let name = name_a_element.inner_html();

    // Username is the raw content of its cell.
    let username = username_ref.inner_html();

    let user_id = user_id_from_href(href_value)
        .ok_or("Error parsing user_id")?
        .to_string();

    Ok(ClassroomPerson {
        name,
        surname,
        username,
        user_id,
    })
}

/// Returns the value that follows `user_id=` in `href`, stopping at the next
/// query parameter (`&`) or fragment (`#`). `None` if the key is absent or the
/// value is empty.
fn user_id_from_href(href: &str) -> Option<&str> {
    let (_, after_key) = href.split_once("user_id=")?;
    let value = after_key
        .split(|c: char| c == '&' || c == '#')
        .next()
        .unwrap_or("");
    if value.is_empty() {
        None
    } else {
        Some(value)
    }
}