mirror of
				https://github.com/dani-garcia/vaultwarden.git
				synced 2025-10-25 08:10:38 +03:00 
			
		
		
		
	Merge pull request #3751 from BlackDex/optimize-icon-fetching
Optimized Favicon downloading
This commit is contained in:
		| @@ -124,7 +124,7 @@ email_address = "0.2.4" | ||||
| handlebars = { version = "4.3.7", features = ["dir_source"] } | ||||
|  | ||||
| # HTTP client (Used for favicons, version check, DUO and HIBP API) | ||||
| reqwest = { version = "0.11.18", features = ["stream", "json", "gzip", "brotli", "socks", "cookies", "trust-dns"] } | ||||
| reqwest = { version = "0.11.18", features = ["stream", "json", "deflate", "gzip", "brotli", "socks", "cookies", "trust-dns", "native-tls-alpn"] } | ||||
|  | ||||
| # Favicon extraction libraries | ||||
| html5gum = "0.5.7" | ||||
|   | ||||
							
								
								
									
										242
									
								
								src/api/icons.rs
									
									
									
									
									
								
							
							
						
						
									
										242
									
								
								src/api/icons.rs
									
									
									
									
									
								
							| @@ -19,7 +19,7 @@ use tokio::{ | ||||
|     net::lookup_host, | ||||
| }; | ||||
|  | ||||
| use html5gum::{Emitter, EndTag, HtmlString, InfallibleTokenizer, Readable, StartTag, StringReader, Tokenizer}; | ||||
| use html5gum::{Emitter, HtmlString, InfallibleTokenizer, Readable, StringReader, Tokenizer}; | ||||
|  | ||||
| use crate::{ | ||||
|     error::Error, | ||||
| @@ -46,10 +46,15 @@ static CLIENT: Lazy<Client> = Lazy::new(|| { | ||||
|     // Generate the cookie store | ||||
|     let cookie_store = Arc::new(Jar::default()); | ||||
|  | ||||
|     let icon_download_timeout = Duration::from_secs(CONFIG.icon_download_timeout()); | ||||
|     let pool_idle_timeout = Duration::from_secs(10); | ||||
|     // Reuse the client between requests | ||||
|     let client = get_reqwest_client_builder() | ||||
|         .cookie_provider(Arc::clone(&cookie_store)) | ||||
|         .timeout(Duration::from_secs(CONFIG.icon_download_timeout())) | ||||
|         .timeout(icon_download_timeout) | ||||
|         .pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections | ||||
|         .pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds | ||||
|         .trust_dns(true) | ||||
|         .default_headers(default_headers.clone()); | ||||
|  | ||||
|     match client.build() { | ||||
| @@ -58,9 +63,11 @@ static CLIENT: Lazy<Client> = Lazy::new(|| { | ||||
|             error!("Possible trust-dns error, trying with trust-dns disabled: '{e}'"); | ||||
|             get_reqwest_client_builder() | ||||
|                 .cookie_provider(cookie_store) | ||||
|                 .timeout(Duration::from_secs(CONFIG.icon_download_timeout())) | ||||
|                 .default_headers(default_headers) | ||||
|                 .timeout(icon_download_timeout) | ||||
|                 .pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections | ||||
|                 .pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds | ||||
|                 .trust_dns(false) | ||||
|                 .default_headers(default_headers) | ||||
|                 .build() | ||||
|                 .expect("Failed to build client") | ||||
|         } | ||||
| @@ -258,7 +265,7 @@ mod tests { | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone)] | ||||
| #[derive(Clone)] | ||||
| enum DomainBlacklistReason { | ||||
|     Regex, | ||||
|     IP, | ||||
| @@ -415,38 +422,34 @@ fn get_favicons_node( | ||||
|     const TAG_LINK: &[u8] = b"link"; | ||||
|     const TAG_BASE: &[u8] = b"base"; | ||||
|     const TAG_HEAD: &[u8] = b"head"; | ||||
|     const ATTR_REL: &[u8] = b"rel"; | ||||
|     const ATTR_HREF: &[u8] = b"href"; | ||||
|     const ATTR_SIZES: &[u8] = b"sizes"; | ||||
|  | ||||
|     let mut base_url = url.clone(); | ||||
|     let mut icon_tags: Vec<StartTag> = Vec::new(); | ||||
|     let mut icon_tags: Vec<Tag> = Vec::new(); | ||||
|     for token in dom { | ||||
|         match token { | ||||
|             FaviconToken::StartTag(tag) => { | ||||
|                 if *tag.name == TAG_LINK | ||||
|                     && tag.attributes.contains_key(ATTR_REL) | ||||
|                     && tag.attributes.contains_key(ATTR_HREF) | ||||
|                 { | ||||
|                     let rel_value = std::str::from_utf8(tag.attributes.get(ATTR_REL).unwrap()) | ||||
|                         .unwrap_or_default() | ||||
|                         .to_ascii_lowercase(); | ||||
|                     if rel_value.contains("icon") && !rel_value.contains("mask-icon") { | ||||
|                         icon_tags.push(tag); | ||||
|                     } | ||||
|                 } else if *tag.name == TAG_BASE && tag.attributes.contains_key(ATTR_HREF) { | ||||
|                     let href = std::str::from_utf8(tag.attributes.get(ATTR_HREF).unwrap()).unwrap_or_default(); | ||||
|                     debug!("Found base href: {href}"); | ||||
|                     base_url = match base_url.join(href) { | ||||
|                         Ok(inner_url) => inner_url, | ||||
|                         _ => url.clone(), | ||||
|                     }; | ||||
|                 } | ||||
|         let tag_name: &[u8] = &token.tag.name; | ||||
|         match tag_name { | ||||
|             TAG_LINK => { | ||||
|                 icon_tags.push(token.tag); | ||||
|             } | ||||
|             FaviconToken::EndTag(tag) => { | ||||
|                 if *tag.name == TAG_HEAD { | ||||
|                     break; | ||||
|                 } | ||||
|             TAG_BASE => { | ||||
|                 base_url = if let Some(href) = token.tag.attributes.get(ATTR_HREF) { | ||||
|                     let href = std::str::from_utf8(href).unwrap_or_default(); | ||||
|                     debug!("Found base href: {href}"); | ||||
|                     match base_url.join(href) { | ||||
|                         Ok(inner_url) => inner_url, | ||||
|                         _ => continue, | ||||
|                     } | ||||
|                 } else { | ||||
|                     continue; | ||||
|                 }; | ||||
|             } | ||||
|             TAG_HEAD if token.closing => { | ||||
|                 break; | ||||
|             } | ||||
|             _ => { | ||||
|                 continue; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| @@ -820,43 +823,64 @@ impl reqwest::cookie::CookieStore for Jar { | ||||
| } | ||||
|  | ||||
| /// Custom FaviconEmitter for the html5gum parser. | ||||
| /// The FaviconEmitter is using an almost 1:1 copy of the DefaultEmitter with some small changes. | ||||
| /// The FaviconEmitter is using an optimized version of the DefaultEmitter. | ||||
| /// This prevents emitting tags like comments, doctype and also strings between the tags. | ||||
| /// But it will also only emit the tags we need and only if they have the correct attributes | ||||
| /// Therefor parsing the HTML content is faster. | ||||
| use std::collections::{BTreeSet, VecDeque}; | ||||
| use std::collections::BTreeMap; | ||||
|  | ||||
| #[derive(Debug)] | ||||
| enum FaviconToken { | ||||
|     StartTag(StartTag), | ||||
|     EndTag(EndTag), | ||||
| #[derive(Default)] | ||||
| pub struct Tag { | ||||
|     /// The tag's name, such as `"link"` or `"base"`. | ||||
|     pub name: HtmlString, | ||||
|  | ||||
|     /// A mapping for any HTML attributes this start tag may have. | ||||
|     /// | ||||
|     /// Duplicate attributes are ignored after the first one as per WHATWG spec. | ||||
|     pub attributes: BTreeMap<HtmlString, HtmlString>, | ||||
| } | ||||
|  | ||||
| #[derive(Default, Debug)] | ||||
| struct FaviconToken { | ||||
|     tag: Tag, | ||||
|     closing: bool, | ||||
| } | ||||
|  | ||||
| #[derive(Default)] | ||||
| struct FaviconEmitter { | ||||
|     current_token: Option<FaviconToken>, | ||||
|     last_start_tag: HtmlString, | ||||
|     current_attribute: Option<(HtmlString, HtmlString)>, | ||||
|     seen_attributes: BTreeSet<HtmlString>, | ||||
|     emitted_tokens: VecDeque<FaviconToken>, | ||||
|     emit_token: bool, | ||||
| } | ||||
|  | ||||
| impl FaviconEmitter { | ||||
|     fn emit_token(&mut self, token: FaviconToken) { | ||||
|         self.emitted_tokens.push_front(token); | ||||
|     } | ||||
|     fn flush_current_attribute(&mut self, emit_current_tag: bool) { | ||||
|         const ATTR_HREF: &[u8] = b"href"; | ||||
|         const ATTR_REL: &[u8] = b"rel"; | ||||
|         const TAG_LINK: &[u8] = b"link"; | ||||
|         const TAG_BASE: &[u8] = b"base"; | ||||
|         const TAG_HEAD: &[u8] = b"head"; | ||||
|  | ||||
|     fn flush_current_attribute(&mut self) { | ||||
|         if let Some((k, v)) = self.current_attribute.take() { | ||||
|             match self.current_token { | ||||
|                 Some(FaviconToken::StartTag(ref mut tag)) => { | ||||
|                     tag.attributes.entry(k).and_modify(|_| {}).or_insert(v); | ||||
|                 } | ||||
|                 Some(FaviconToken::EndTag(_)) => { | ||||
|                     self.seen_attributes.insert(k); | ||||
|                 } | ||||
|                 _ => { | ||||
|                     debug_assert!(false); | ||||
|         if let Some(ref mut token) = self.current_token { | ||||
|             let tag_name: &[u8] = &token.tag.name; | ||||
|  | ||||
|             if self.current_attribute.is_some() && (tag_name == TAG_BASE || tag_name == TAG_LINK) { | ||||
|                 let (k, v) = self.current_attribute.take().unwrap(); | ||||
|                 token.tag.attributes.entry(k).and_modify(|_| {}).or_insert(v); | ||||
|             } | ||||
|  | ||||
|             let tag_attr = &token.tag.attributes; | ||||
|             match tag_name { | ||||
|                 TAG_HEAD if token.closing => self.emit_token = true, | ||||
|                 TAG_BASE if tag_attr.contains_key(ATTR_HREF) => self.emit_token = true, | ||||
|                 TAG_LINK if emit_current_tag && tag_attr.contains_key(ATTR_REL) && tag_attr.contains_key(ATTR_HREF) => { | ||||
|                     let rel_value = | ||||
|                         std::str::from_utf8(token.tag.attributes.get(ATTR_REL).unwrap()).unwrap_or_default(); | ||||
|                     if rel_value.contains("icon") && !rel_value.contains("mask-icon") { | ||||
|                         self.emit_token = true | ||||
|                     } | ||||
|                 } | ||||
|                 _ => (), | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| @@ -871,87 +895,71 @@ impl Emitter for FaviconEmitter { | ||||
|     } | ||||
|  | ||||
|     fn pop_token(&mut self) -> Option<Self::Token> { | ||||
|         self.emitted_tokens.pop_back() | ||||
|     } | ||||
|  | ||||
|     fn init_start_tag(&mut self) { | ||||
|         self.current_token = Some(FaviconToken::StartTag(StartTag::default())); | ||||
|     } | ||||
|  | ||||
|     fn init_end_tag(&mut self) { | ||||
|         self.current_token = Some(FaviconToken::EndTag(EndTag::default())); | ||||
|         self.seen_attributes.clear(); | ||||
|     } | ||||
|  | ||||
|     fn emit_current_tag(&mut self) -> Option<html5gum::State> { | ||||
|         self.flush_current_attribute(); | ||||
|         let mut token = self.current_token.take().unwrap(); | ||||
|         let mut emit = false; | ||||
|         match token { | ||||
|             FaviconToken::EndTag(ref mut tag) => { | ||||
|                 // Always clean seen attributes | ||||
|                 self.seen_attributes.clear(); | ||||
|                 self.set_last_start_tag(None); | ||||
|  | ||||
|                 // Only trigger an emit for the </head> tag. | ||||
|                 // This is matched, and will break the for-loop. | ||||
|                 if *tag.name == b"head" { | ||||
|                     emit = true; | ||||
|                 } | ||||
|             } | ||||
|             FaviconToken::StartTag(ref mut tag) => { | ||||
|                 // Only trriger an emit for <link> and <base> tags. | ||||
|                 // These are the only tags we want to parse. | ||||
|                 if *tag.name == b"link" || *tag.name == b"base" { | ||||
|                     self.set_last_start_tag(Some(&tag.name)); | ||||
|                     emit = true; | ||||
|                 } else { | ||||
|                     self.set_last_start_tag(None); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Only emit the tags we want to parse. | ||||
|         if emit { | ||||
|             self.emit_token(token); | ||||
|         if self.emit_token { | ||||
|             self.emit_token = false; | ||||
|             return self.current_token.take(); | ||||
|         } | ||||
|         None | ||||
|     } | ||||
|  | ||||
|     fn init_start_tag(&mut self) { | ||||
|         self.current_token = Some(FaviconToken { | ||||
|             tag: Tag::default(), | ||||
|             closing: false, | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     fn init_end_tag(&mut self) { | ||||
|         self.current_token = Some(FaviconToken { | ||||
|             tag: Tag::default(), | ||||
|             closing: true, | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     fn emit_current_tag(&mut self) -> Option<html5gum::State> { | ||||
|         self.flush_current_attribute(true); | ||||
|         self.last_start_tag.clear(); | ||||
|         if self.current_token.is_some() && !self.current_token.as_ref().unwrap().closing { | ||||
|             self.last_start_tag.extend(&*self.current_token.as_ref().unwrap().tag.name); | ||||
|         } | ||||
|         html5gum::naive_next_state(&self.last_start_tag) | ||||
|     } | ||||
|  | ||||
|     fn push_tag_name(&mut self, s: &[u8]) { | ||||
|         match self.current_token { | ||||
|             Some( | ||||
|                 FaviconToken::StartTag(StartTag { | ||||
|                     ref mut name, | ||||
|                     .. | ||||
|                 }) | ||||
|                 | FaviconToken::EndTag(EndTag { | ||||
|                     ref mut name, | ||||
|                     .. | ||||
|                 }), | ||||
|             ) => { | ||||
|                 name.extend(s); | ||||
|             } | ||||
|             _ => debug_assert!(false), | ||||
|         if let Some(ref mut token) = self.current_token { | ||||
|             token.tag.name.extend(s); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn init_attribute(&mut self) { | ||||
|         self.flush_current_attribute(); | ||||
|         self.current_attribute = Some(Default::default()); | ||||
|         self.flush_current_attribute(false); | ||||
|         self.current_attribute = match &self.current_token { | ||||
|             Some(token) => { | ||||
|                 let tag_name: &[u8] = &token.tag.name; | ||||
|                 match tag_name { | ||||
|                     b"link" | b"head" | b"base" => Some(Default::default()), | ||||
|                     _ => None, | ||||
|                 } | ||||
|             } | ||||
|             _ => None, | ||||
|         }; | ||||
|     } | ||||
|  | ||||
|     fn push_attribute_name(&mut self, s: &[u8]) { | ||||
|         self.current_attribute.as_mut().unwrap().0.extend(s); | ||||
|         if let Some(attr) = &mut self.current_attribute { | ||||
|             attr.0.extend(s) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn push_attribute_value(&mut self, s: &[u8]) { | ||||
|         self.current_attribute.as_mut().unwrap().1.extend(s); | ||||
|         if let Some(attr) = &mut self.current_attribute { | ||||
|             attr.1.extend(s) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn current_is_appropriate_end_tag_token(&mut self) -> bool { | ||||
|         match self.current_token { | ||||
|             Some(FaviconToken::EndTag(ref tag)) => !self.last_start_tag.is_empty() && self.last_start_tag == tag.name, | ||||
|         match &self.current_token { | ||||
|             Some(token) if token.closing => !self.last_start_tag.is_empty() && self.last_start_tag == token.tag.name, | ||||
|             _ => false, | ||||
|         } | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user