- use crate::{
- config::SearchTokenizerConfig, db_conn::DbPool, instance::Instance, posts::Post, schema::posts,
- search::query::PlumeQuery, tags::Tag, Error, Result, CONFIG,
- };
- use chrono::{Datelike, Utc};
- use diesel::{ExpressionMethods, QueryDsl, RunQueryDsl};
- use itertools::Itertools;
- use std::{
- cmp,
- fs::create_dir_all,
- io,
- path::Path,
- sync::{Arc, Mutex},
- };
- use tantivy::{
- collector::TopDocs, directory::MmapDirectory, schema::*, Index, IndexReader, IndexWriter,
- ReloadPolicy, TantivyError, Term,
- };
- use whatlang::{detect as detect_lang, Lang};
-
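- /// Errors that can occur while creating, opening, or writing to the search index.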
- #[derive(Debug)]
- pub enum SearcherError {
- IndexCreationError,
- WriteLockAcquisitionError,
- IndexOpeningError,
- IndexEditionError,
- InvalidIndexDataError,
- }
-
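- /// Wrapper around the Tantivy index used to search Plume posts, together with
- /// its reader, its (optional) writer, and a database connection pool.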
- pub struct Searcher {
- index: Index,
- reader: IndexReader,
- writer: Mutex<Option<IndexWriter>>,
- dbpool: DbPool,
- }
-
- impl Searcher {
- /// Initializes a new `Searcher`, ready to be used by
- /// Plume.
- ///
- /// This function tries everything it can to return a valid `Searcher`:
- ///
- /// - first it tries to open the search index normally (using the options from `CONFIG`)
- /// - if that fails, it backs up the index files, deletes the originals, and
- /// recreates the whole index. The backup is removed only if the re-creation
- /// succeeds.
- ///
- /// # Panics
- ///
- /// This function panics if it needs to create a backup and it can't, or if it fails
- /// to recreate the search index.
- ///
- /// It also panics if any error remains after this recovery attempt.
- ///
- /// The panic messages are normally explicit enough for a human to
- /// understand how to fix the issue when they see it.
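- ///
- /// # Example
- ///
- /// A minimal usage sketch; `db_pool` is assumed to have been built elsewhere
- /// from Plume's database configuration:
- ///
- /// ```rust,ignore
- /// let searcher = Searcher::new(db_pool.clone());
- /// searcher.fill().expect("failed to index existing posts");
- /// searcher.commit();
- /// ```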
- pub fn new(db_pool: DbPool) -> Arc<Self> {
- // First, try to open the index normally
- let searcher = match Self::open(
- &CONFIG.search_index,
- db_pool.clone(),
- &CONFIG.search_tokenizers,
- ) {
- // The index may be corrupted, missing, or stored in an older format.
- // In those cases we can recover by deleting and re-creating it.
- Err(Error::Search(SearcherError::InvalidIndexDataError)) => {
- if Self::create(
- &CONFIG.search_index,
- db_pool.clone(),
- &CONFIG.search_tokenizers,
- )
- .is_err()
- {
- let current_path = Path::new(&CONFIG.search_index);
- let backup_path =
- format!("{}.{}", ¤t_path.display(), Utc::now().timestamp());
- let backup_path = Path::new(&backup_path);
- std::fs::rename(current_path, backup_path)
- .expect("Error while backing up search index directory for re-creation");
- if Self::create(
- &CONFIG.search_index,
- db_pool.clone(),
- &CONFIG.search_tokenizers,
- )
- .is_ok()
- {
- if std::fs::remove_dir_all(backup_path).is_err() {
- eprintln!(
- "error on removing backup directory: {}. it remains",
- backup_path.display()
- );
- }
- } else {
- panic!("Error while re-creating search index in new index format. Remove search index and run `plm search init` manually.");
- }
- }
- Self::open(&CONFIG.search_index, db_pool, &CONFIG.search_tokenizers)
- }
- // If it opened successfully or if it was another kind of
- // error (that we don't know how to handle), don't do anything more
- other => other,
- };
-
- // At this point, if there are still errors, we just panic
- #[allow(clippy::match_wild_err_arm)]
- match searcher {
- Err(Error::Search(e)) => match e {
- SearcherError::WriteLockAcquisitionError => panic!(
- r#"
- Your search index is locked. Plume can't start. To fix this issue,
- make sure no other Plume instance is running, and run:
-
- plm search unlock
-
- Then try to restart Plume.
- "#
- ),
- SearcherError::IndexOpeningError => panic!(
- r#"
- Plume was unable to open the search index. If you created the index
- before, make sure to run Plume in the same directory it was created in, or
- to set SEARCH_INDEX accordingly. If you did not yet create the search
- index, run this command:
-
- plm search init
-
- Then try to restart Plume.
- "#
- ),
- e => Err(e).unwrap(),
- },
- Err(_) => panic!("Unexpected error while opening search index"),
- Ok(s) => Arc::new(s),
- }
- }
-
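- /// Builds the Tantivy schema for the search index: a stored and indexed
- /// `post_id`, an indexed `creation_date`, tag-tokenized `instance`, `author`,
- /// and `tag` fields, content-tokenized `blog`, `content`, `subtitle`, and
- /// `title` fields, and property-tokenized `lang` and `license` fields.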
- pub fn schema() -> Schema {
- let tag_indexing = TextOptions::default().set_indexing_options(
- TextFieldIndexing::default()
- .set_tokenizer("tag_tokenizer")
- .set_index_option(IndexRecordOption::Basic),
- );
-
- let content_indexing = TextOptions::default().set_indexing_options(
- TextFieldIndexing::default()
- .set_tokenizer("content_tokenizer")
- .set_index_option(IndexRecordOption::WithFreqsAndPositions),
- );
-
- let property_indexing = TextOptions::default().set_indexing_options(
- TextFieldIndexing::default()
- .set_tokenizer("property_tokenizer")
- .set_index_option(IndexRecordOption::WithFreqsAndPositions),
- );
-
- let mut schema_builder = SchemaBuilder::default();
-
- schema_builder.add_i64_field("post_id", STORED | INDEXED);
- schema_builder.add_i64_field("creation_date", INDEXED);
-
- schema_builder.add_text_field("instance", tag_indexing.clone());
- schema_builder.add_text_field("author", tag_indexing.clone());
- schema_builder.add_text_field("tag", tag_indexing);
-
- schema_builder.add_text_field("blog", content_indexing.clone());
- schema_builder.add_text_field("content", content_indexing.clone());
- schema_builder.add_text_field("subtitle", content_indexing.clone());
- schema_builder.add_text_field("title", content_indexing);
-
- schema_builder.add_text_field("lang", property_indexing.clone());
- schema_builder.add_text_field("license", property_indexing);
-
- schema_builder.build()
- }
-
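- /// Creates a new index at `path`, registers the configured tokenizers on it,
- /// and returns a `Searcher` ready to index and query posts.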
- pub fn create(
- path: &dyn AsRef<Path>,
- dbpool: DbPool,
- tokenizers: &SearchTokenizerConfig,
- ) -> Result<Self> {
- let schema = Self::schema();
-
- create_dir_all(path).map_err(|_| SearcherError::IndexCreationError)?;
- let index = Index::create(
- MmapDirectory::open(path).map_err(|_| SearcherError::IndexCreationError)?,
- schema,
- )
- .map_err(|_| SearcherError::IndexCreationError)?;
-
- {
- let tokenizer_manager = index.tokenizers();
- tokenizer_manager.register("tag_tokenizer", tokenizers.tag_tokenizer);
- tokenizer_manager.register("content_tokenizer", tokenizers.content_tokenizer);
- tokenizer_manager.register("property_tokenizer", tokenizers.property_tokenizer);
- } // end the borrow of `index` here, to please the borrow checker
- Ok(Self {
- writer: Mutex::new(Some(
- index
- .writer(50_000_000)
- .map_err(|_| SearcherError::WriteLockAcquisitionError)?,
- )),
- reader: index
- .reader_builder()
- .reload_policy(ReloadPolicy::Manual)
- .try_into()
- .map_err(|_| SearcherError::IndexCreationError)?,
- index,
- dbpool,
- })
- }
-
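- /// Opens an existing index at `path`, registers the configured tokenizers,
- /// garbage-collects stale segment files, and returns a `Searcher` for it.
- ///
- /// Returns `InvalidIndexDataError` when the on-disk index uses an older,
- /// incompatible Tantivy format.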
- pub fn open(
- path: &dyn AsRef<Path>,
- dbpool: DbPool,
- tokenizers: &SearchTokenizerConfig,
- ) -> Result<Self> {
- let mut index =
- Index::open(MmapDirectory::open(path).map_err(|_| SearcherError::IndexOpeningError)?)
- .map_err(|_| SearcherError::IndexOpeningError)?;
-
- {
- let tokenizer_manager = index.tokenizers();
- tokenizer_manager.register("tag_tokenizer", tokenizers.tag_tokenizer);
- tokenizer_manager.register("content_tokenizer", tokenizers.content_tokenizer);
- tokenizer_manager.register("property_tokenizer", tokenizers.property_tokenizer);
- } // end the borrow of `index` here, to please the borrow checker
- let writer = index
- .writer(50_000_000)
- .map_err(|_| SearcherError::WriteLockAcquisitionError)?;
-
- // Since Tantivy v0.12.0, IndexWriter::garbage_collect_files() returns a Future.
- // To avoid introducing async into Plume for now, we garbage-collect manually here.
- // Once Plume becomes async, garbage_collect_files() can be used again.
- // The algorithm is borrowed from Tantivy's SegmentUpdater::list_files().
- use std::collections::HashSet;
- use std::path::PathBuf;
- let mut files: HashSet<PathBuf> = index
- .list_all_segment_metas()
- .into_iter()
- .flat_map(|segment_meta| segment_meta.list_files())
- .collect();
- files.insert(Path::new("meta.json").to_path_buf());
- index
- .directory_mut()
- .garbage_collect(|| files)
- .map_err(|_| SearcherError::IndexEditionError)?;
-
- Ok(Self {
- writer: Mutex::new(Some(writer)),
- reader: index
- .reader_builder()
- .reload_policy(ReloadPolicy::Manual)
- .try_into()
- .map_err(|e| {
- if let TantivyError::IOError(err) = e {
- let err: io::Error = err.into();
- if err.kind() == io::ErrorKind::InvalidData {
- // Search index was created in older Tantivy format.
- SearcherError::InvalidIndexDataError
- } else {
- SearcherError::IndexCreationError
- }
- } else {
- SearcherError::IndexCreationError
- }
- })?,
- index,
- dbpool,
- })
- }
-
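- /// Adds a post to the index. Unpublished posts are skipped silently.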
- pub fn add_document(&self, post: &Post) -> Result<()> {
- if !post.published {
- return Ok(());
- }
-
- let schema = self.index.schema();
-
- let post_id = schema.get_field("post_id").unwrap();
- let creation_date = schema.get_field("creation_date").unwrap();
-
- let instance = schema.get_field("instance").unwrap();
- let author = schema.get_field("author").unwrap();
- let tag = schema.get_field("tag").unwrap();
-
- let blog_name = schema.get_field("blog").unwrap();
- let content = schema.get_field("content").unwrap();
- let subtitle = schema.get_field("subtitle").unwrap();
- let title = schema.get_field("title").unwrap();
-
- let lang = schema.get_field("lang").unwrap();
- let license = schema.get_field("license").unwrap();
-
- let conn = match self.dbpool.get() {
- Ok(c) => c,
- Err(_) => return Err(Error::DbPool),
- };
- let mut writer = self.writer.lock().unwrap();
- let writer = writer.as_mut().unwrap();
- writer.add_document(doc!(
- post_id => i64::from(post.id),
- author => post.get_authors(&conn)?.into_iter().map(|u| u.fqn).join(" "),
- creation_date => i64::from(post.creation_date.num_days_from_ce()),
- instance => Instance::get(&conn, post.get_blog(&conn)?.instance_id)?.public_domain,
- tag => Tag::for_post(&conn, post.id)?.into_iter().map(|t| t.tag).join(" "),
- blog_name => post.get_blog(&conn)?.title,
- content => post.content.get().clone(),
- subtitle => post.subtitle.clone(),
- title => post.title.clone(),
- lang => detect_lang(post.content.get()).and_then(|i| if i.is_reliable() { Some(i.lang()) } else {None} ).unwrap_or(Lang::Eng).name(),
- license => post.license.clone(),
- ));
- Ok(())
- }
-
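- /// Removes the document matching this post's `post_id` from the index.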
- pub fn delete_document(&self, post: &Post) {
- let schema = self.index.schema();
- let post_id = schema.get_field("post_id").unwrap();
-
- let doc_id = Term::from_field_i64(post_id, i64::from(post.id));
- let mut writer = self.writer.lock().unwrap();
- let writer = writer.as_mut().unwrap();
- writer.delete_term(doc_id);
- }
-
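- /// Re-indexes a post by deleting its document and adding it again.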
- pub fn update_document(&self, post: &Post) -> Result<()> {
- self.delete_document(post);
- self.add_document(post)
- }
-
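- /// Runs `query` against the index and returns the matching posts between the
- /// `min` and `max` result offsets, loaded back from the database.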
- pub fn search_document(&self, query: PlumeQuery, (min, max): (i32, i32)) -> Vec<Post> {
- let schema = self.index.schema();
- let post_id = schema.get_field("post_id").unwrap();
-
- let collector = TopDocs::with_limit(cmp::max(1, max) as usize);
-
- let searcher = self.reader.searcher();
- let res = searcher.search(&query.into_query(), &collector).unwrap();
-
- let conn = match self.dbpool.get() {
- Ok(c) => c,
- Err(_) => return Vec::new(),
- };
-
- res.get(min as usize..)
- .unwrap_or(&[])
- .iter()
- .filter_map(|(_, doc_add)| {
- let doc = searcher.doc(*doc_add).ok()?;
- let id = doc.get_first(post_id)?;
- Post::get(&conn, id.i64_value() as i32).ok()
- // the borrow checker doesn't let us use filter_map or and_then here
- })
- .collect()
- }
-
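- /// Indexes every published post from the database, typically right after the
- /// index has been created or rebuilt.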
- pub fn fill(&self) -> Result<()> {
- let conn = match self.dbpool.get() {
- Ok(c) => c,
- Err(_) => return Err(Error::DbPool),
- };
- for post in posts::table
- .filter(posts::published.eq(true))
- .load::<Post>(&conn)?
- {
- self.update_document(&post)?
- }
- Ok(())
- }
-
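- /// Commits pending changes and reloads the reader so that new documents
- /// become visible to searches.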
- pub fn commit(&self) {
- let mut writer = self.writer.lock().unwrap();
- writer.as_mut().unwrap().commit().unwrap();
- self.reader.reload().unwrap();
- }
-
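- /// Drops the index writer, releasing Tantivy's write lock on the index.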
- pub fn drop_writer(&self) {
- self.writer.lock().unwrap().take();
- }
- }