1use anyhow::{Context, Result, anyhow, bail};
2use cj_path_util::{path_util::AppendToPath, unix::polyfill::add_extension};
3use clap::Parser;
4use evobench_tools::{
5 config_file::backend_from_path,
6 ctx,
7 git::GitHash,
8 io_utils::div::create_dir_if_not_exists,
9 run::{
10 benchmarking_job::{BenchmarkingJob, BenchmarkingJobPublic, BenchmarkingJobState},
11 config::{BenchmarkingCommand, PreExecLevel2},
12 custom_parameter::{CustomParameterType, CustomParameterValue},
13 env_vars::AllowableCustomEnvVar,
14 key::RunParameters,
15 },
16 serde_types::{allowed_env_var::AllowedEnvVar, priority::Priority},
17 serde_util::CanonicalJson,
18 silo::query::Query,
19 utillib::get_terminal_width::get_terminal_width,
20 utillib::{
21 arc::CloneArc,
22 conslist::{List, cons},
23 integers::rounding_integer_division,
24 logging::{LogLevelOpts, set_log_level},
25 },
26 warn,
27};
28use itertools::Itertools;
29use kstring::KString;
30use lazy_static::lazy_static;
31use linfa::traits::Transformer;
32use linfa_clustering::Dbscan;
33use ndarray::{Array2, ArrayBase, Dim, ViewRepr};
34use noisy_float::types::R64;
35use num_traits::Float;
36use regex::Regex;
37use serde_json::{Number, Value};
38
39use std::{
40 collections::{BTreeMap, BTreeSet, btree_map::Entry},
41 fmt::Display,
42 fs::File,
43 io::{BufWriter, Write},
44 marker::PhantomData,
45 ops::Range,
46 os::unix::fs::symlink,
47 path::{Path, PathBuf},
48 str::FromStr,
49 sync::Arc,
50};
51
/// Scalar type for feature weights; `R64` is a NaN-forbidding f64 wrapper,
/// which keeps the clustering distance computations well-ordered.
type Weight = R64;

/// Extra multiplier for `type` features found beneath an "action" key,
/// so action types dominate the clustering distance (see `_add_vectors`).
const ACTION_TYPE_WEIGHT: f64 = 5.;
57
/// Per-feature-key weight and observed numeric value range, indexed by
/// `KeyId` (same order as `Queries::vector_values` iteration).
#[derive(Debug)]
struct KeyWeightsAndRanges<'t>(Vec<(Weight, Range<Weight>)>, PhantomData<&'t ()>);
60
/// Index of a feature key: its position in `Queries::vectors` iteration order.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct KeyId(usize);
64
65impl<'t> KeyWeightsAndRanges<'t> {
66 fn key_weight_and_range(&self, key_id: KeyId) -> &(Weight, Range<Weight>) {
68 &self.0[key_id.0]
69 }
70
71 fn len(&self) -> usize {
72 self.0.len()
73 }
74}
75
/// The dense, key-weighted feature vector computed for a single query.
#[derive(Debug)]
struct QueryWeights<'t> {
    #[allow(unused)]
    id: QueryId,
    // One entry per feature key, in `KeyId` order.
    weights: Vec<Weight>,
    _phantom: PhantomData<&'t ()>,
}
83
84impl<'t> QueryWeights<'t> {
85 fn copy_to_array<F: Float + Display>(
86 &self,
87 mut array: ArrayBase<ViewRepr<&mut F>, Dim<[usize; 1]>>,
88 ) {
89 for (i, w) in self.weights.iter().enumerate() {
90 let w = F::from(*w).expect("Weight type is convertible to Float types");
91 if w.is_nan() {
93 panic!("why? w = {w}");
94 }
95 array[i] = w;
96 }
97 }
98}
99
// Command-line interface of the tool. (Plain `//` comments on purpose:
// `///` doc comments would be picked up by clap derive as user-visible
// help text and change the program's --help output.)
#[derive(clap::Parser, Debug)]
#[clap(next_line_help = true)]
#[clap(term_width = get_terminal_width(4))]
struct Opts {
    // Shared logging verbosity flags.
    #[clap(flatten)]
    log_level: LogLevelOpts,

    // The action to run; see `SubCommand`.
    #[clap(subcommand)]
    subcommand: SubCommand,
}
113
// Reusable positional argument: path to an ndjson queries file.
// (`//` comments on purpose — `///` would become clap help text.)
#[derive(Debug, clap::Args)]
struct ParseFile {
    path: PathBuf,
}
119
// Subcommands of the tool. (`//` comments on purpose: `///` would be
// picked up by clap derive as user-visible help text.)
#[derive(clap::Subcommand, Debug)]
enum SubCommand {
    // Re-serialize each query line in canonical JSON form.
    Canonicalize {
        // Optimize each query before canonicalizing.
        #[clap(long)]
        optimize: bool,

        // Input ndjson file.
        input: PathBuf,

        // Output ndjson file (created/overwritten).
        output: PathBuf,
    },

    // Build feature vectors from the queries and cluster them with DBSCAN.
    Vectorize {
        // Also print the per-query leaf values for every feature key.
        #[clap(short, long)]
        show_values: bool,

        // Optimize each query before vectorizing.
        #[clap(long)]
        optimize: bool,

        // The input queries file.
        #[clap(flatten)]
        parse_file: ParseFile,

        // Where to write the clusters; when absent, the clustering is only
        // dumped via dbg! (see `main`).
        #[clap(subcommand)]
        queries_output: Option<QueriesOutput>,
    },

    // Merge the queries.ndjson files of several cluster folders into one
    // newly-created folder (including symlink and job template).
    MergeClustersTo {
        // Target folder to create.
        new_queries_dir: PathBuf,

        // Source folders, each containing a queries.ndjson.
        queries_dirs: Vec<PathBuf>,
    },
}
169
// Destination for clustered queries. (`//` comments on purpose: `///`
// would become clap help text.)
#[derive(clap::Subcommand, Debug)]
enum QueriesOutput {
    // One ndjson file per cluster: base path + ".<cluster>.ndjson".
    ToFiles {
        file_base_path: PathBuf,
    },
    // One folder per cluster, each receiving queries.ndjson, a shared
    // symlink, and a benchmarking job template (see `write_queries`).
    ToFolders {
        // Reason string for the job template; derived from the folder
        // base name ("t_<name>") when absent.
        #[clap(long)]
        reason: Option<String>,

        folder_base_path: PathBuf,
    },
}
201
202fn write_queries_file<'a>(
203 queries_file_path: &Path,
204 queries: &mut dyn Iterator<Item = &'a Arc<str>>,
205) -> Result<()> {
206 (|| -> Result<()> {
207 let mut out = BufWriter::new(File::create(queries_file_path)?);
208 for query in queries {
209 out.write_all(query.trim_end().as_bytes())?;
210 out.write_all(b"\n")?;
211 }
212 out.flush()?;
213 Ok(())
214 })()
215 .with_context(|| anyhow!("writing query file {queries_file_path:?}"))
216}
217
218fn extension_for_cluster_id(cluster_id: Option<ClusterId>) -> String {
220 if let Some(cluster_id) = cluster_id {
221 format!("{cluster_id:03}")
222 } else {
223 format!("unclustered")
224 }
225}
226
impl QueriesOutput {
    /// Write one set of queries to the configured destination.
    ///
    /// `extension` (e.g. "003" or "unclustered"), when given, is appended
    /// to the base path before writing. For `ToFolders`, additionally
    /// creates the folder, a symlink to the shared checksum-ignore regex,
    /// and a benchmarking job template.
    fn write_queries<'a>(
        &self,
        extension: Option<String>,
        queries: &mut dyn ExactSizeIterator<Item = &'a Arc<str>>,
    ) -> Result<()> {
        // Taken up front: `queries` is consumed by the write below.
        let queries_len = queries.len();

        match self {
            QueriesOutput::ToFiles { file_base_path } => {
                let queries_file_path = if let Some(extension) = extension {
                    let extension = format!("{extension}.ndjson");

                    &add_extension(file_base_path, extension).ok_or_else(|| {
                        anyhow!(
                            "to-files requires a path to which a file \
                             extension can be added"
                        )
                    })?
                } else {
                    file_base_path
                };
                write_queries_file(queries_file_path, queries)
            }
            QueriesOutput::ToFolders {
                folder_base_path,
                reason,
            } => {
                // One folder per cluster: base path plus the extension, if any.
                let folder_path = if let Some(extension) = extension {
                    &add_extension(folder_base_path, extension).ok_or_else(|| {
                        anyhow!(
                            "to-folders requires a path to which a file \
                             extension can be added"
                        )
                    })?
                } else {
                    folder_base_path
                };

                create_dir_if_not_exists(folder_path, "folder for queries file")?;

                {
                    // The queries themselves, one JSON document per line.
                    let queries_file_path = folder_path.append("queries.ndjson");
                    write_queries_file(&queries_file_path, queries)?;
                }

                {
                    // Share the checksum-ignore regex via a relative symlink;
                    // a link that already exists is fine and left untouched.
                    let symlink_path = folder_path.append("ignore_queries_for_checksum_regex.txt");
                    match symlink(
                        "../queries/ignore_queries_for_checksum_regex.txt",
                        &symlink_path,
                    ) {
                        Ok(()) => (),
                        Err(e) => match e.kind() {
                            std::io::ErrorKind::AlreadyExists => (),
                            _ => Err(e).map_err(ctx!("creating symlink at {symlink_path:?}"))?,
                        },
                    }
                }

                {
                    // Emit a benchmarking job template next to the queries.
                    let template_path = folder_path.append("job-template.json5");
                    let reason: &str = if let Some(reason) = reason {
                        reason
                    } else {
                        // Default reason: "t_" + last path element of the base.
                        let folder_base_name = (&folder_base_path)
                            .file_name()
                            .ok_or_else(||anyhow!(
                                "expect a to-folder base path from which the last element can be taken"))?
                            .to_string_lossy();

                        &format!("t_{folder_base_name}")
                    };

                    let repeat = {
                        // NOTE(review): hard-coded size of the original query
                        // set; REPEAT is scaled so total benchmark work stays
                        // comparable — confirm this number stays current.
                        let original_len = 33126;

                        if queries_len > original_len {
                            bail!(
                                "queries_len {queries_len} > hard-coded original_len {original_len}"
                            );
                        }

                        rounding_integer_division(original_len, queries_len)
                    };

                    let custom_parameters: Vec<(
                        AllowedEnvVar<AllowableCustomEnvVar>,
                        CustomParameterValue,
                    )> = {
                        let folder_name = folder_path
                            .file_name()
                            .expect("can get back file name of path to which a suffix was added")
                            .to_string_lossy();

                        // Helper: parse the env var name and type-check the value.
                        let var = |k: &str, t: CustomParameterType, v: &str| -> Result<_> {
                            Ok((
                                k.parse()?,
                                CustomParameterValue::checked_from(t, &KString::from_ref(v))?,
                            ))
                        };

                        use CustomParameterType::*;
                        vec![
                            var("CONCURRENCY", NonZeroU32, "120")?,
                            var("DATASET", Dirname, "SC2open")?,
                            var("QUERIES", Filename, &folder_name)?,
                            var("RANDOMIZED", Bool, "1")?,
                            var("REPEAT", NonZeroU32, &repeat.to_string())?,
                            var("SORTED", Bool, "0")?,
                        ]
                    };
                    let custom_parameters: BTreeMap<
                        AllowedEnvVar<AllowableCustomEnvVar>,
                        CustomParameterValue,
                    > = BTreeMap::from_iter(custom_parameters.into_iter());

                    let template = BenchmarkingJob::new(
                        BenchmarkingJobPublic {
                            reason: Some((*reason).to_owned()),
                            run_parameters: Arc::new(RunParameters {
                                // NOTE(review): pinned commit for the job
                                // template — verify it is kept up to date.
                                commit_id: GitHash::from_str(
                                    "a71209b88a91d6ac3fcdb5b9c41062d06a170376",
                                )
                                .expect("hash"),
                                custom_parameters: Arc::new(custom_parameters.into()),
                            }),
                            command: Arc::new(BenchmarkingCommand {
                                target_name: "api".parse().expect("ok"),
                                subdir: "benchmarking".into(),
                                command: "make".into(),
                                arguments: vec!["api".into()],
                                pre_exec_bash_code: PreExecLevel2::new(None),
                            }),
                        },
                        BenchmarkingJobState {
                            remaining_count: 8,
                            remaining_error_budget: 2,
                            last_working_directory: None,
                        },
                        Priority::LOW,
                        Priority::NORMAL,
                    );
                    let backend = backend_from_path(&template_path)?;
                    backend.save_config_file(&template_path, &template)?;
                }

                Ok(())
            }
        }
    }
}
399
/// A JSON leaf as recorded in the feature vectors.
#[derive(Debug)]
enum LeafValue {
    Bool(bool),
    Number(Number),
    // Value intentionally discarded (strings, and numeric keys that are
    // only tracked for presence — see `_add_vectors`).
    None,
}
413
/// All parsed queries plus the sparse feature vectors extracted from them.
#[derive(Debug)]
struct Queries {
    // Canonicalized query strings, indexed by `QueryId.0`.
    query_strings: Vec<Arc<str>>,
    // Reverse map used for deduplication: query string -> index.
    query_strings_index: BTreeMap<Arc<str>, usize>,

    // Feature key string -> (key weight, per-query leaf values).
    vectors: BTreeMap<String, (Weight, BTreeMap<QueryId, LeafValue>)>,
}
426
/// One step of the reversed path ("uplist") from a JSON leaf back to the
/// root; rendered by `uplist_to_string` to form feature key names.
#[derive(Debug, PartialEq, Eq)]
enum UplistEntry<'t> {
    // Object that carried a string "type" field (value stored owned).
    MapType(String),
    MapKey(&'t str),
    Array,
    // A literal string value kept verbatim in the key.
    String(&'t str),
    // Classified leaf kind, e.g. "Bool", "Number", "DateDayString".
    LeafType(&'static str),
}
440
441fn _uplist_to_string(uplist: &List<UplistEntry>, out: &mut String) {
442 match uplist {
443 List::Pair(val, list) => {
444 _uplist_to_string(list, out);
445 match val {
446 UplistEntry::MapKey(s) => {
447 out.push_str(".");
448 out.push_str(s);
449 }
450 UplistEntry::String(s) => {
451 use std::fmt::Write;
452 let _ = write!(out, "={s:?}");
453 }
454 UplistEntry::Array => {
455 out.push_str("[*]");
456 }
457 UplistEntry::LeafType(s) => {
470 out.push(':');
471 out.push_str(s);
472 }
473 UplistEntry::MapType(s) => {
474 out.push_str("{type:");
475 out.push_str(s);
477 out.push_str("}");
478 }
479 }
480 }
481 List::Null => (),
482 }
483}
484
485fn uplist_to_string(uplist: &List<UplistEntry>) -> String {
486 let mut out = String::new();
487 _uplist_to_string(uplist, &mut out);
488 out
489}
490
/// DBSCAN cluster index: zero-based internally, displayed one-based.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct ClusterId(usize);
493
494impl Display for ClusterId {
495 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
496 (self.0 + 1).fmt(f)
497 }
498}
499
/// Index of a query in `Queries::query_strings`; zero-based internally,
/// displayed one-based.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct QueryId(usize);
502
503impl Display for QueryId {
504 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
505 (self.0 + 1).fmt(f)
506 }
507}
508
impl Queries {
    fn new() -> Queries {
        Queries {
            query_strings: Default::default(),
            query_strings_index: Default::default(),
            vectors: Default::default(),
        }
    }

    /// The canonical query string for `id`; panics if `id` is out of range.
    fn query_by_id(&self, id: QueryId) -> &Arc<str> {
        &self.query_strings[id.0]
    }

    /// Register a query; returns a fresh id, or `None` when the identical
    /// string was already inserted (duplicates are skipped by callers).
    fn insert_query(&mut self, query: Arc<str>) -> Option<QueryId> {
        match self.query_strings_index.entry(query.clone_arc()) {
            Entry::Occupied(_occupied_entry) => None,
            Entry::Vacant(vacant_entry) => {
                let i = self.query_strings.len();
                vacant_entry.insert(i);
                self.query_strings.push(query);
                Some(QueryId(i))
            }
        }
    }

    /// Record leaf value `val` for query `id` under feature `key`.
    /// Asserts that the same key is never given two different weights.
    fn add_vector_entry(&mut self, id: QueryId, key: String, weight: Weight, val: LeafValue) {
        match self.vectors.entry(key) {
            Entry::Occupied(mut query_ids) => {
                let r = query_ids.get_mut();
                assert_eq!(r.0, weight);
                r.1.insert(id, val);
            }
            Entry::Vacant(vacant_entry) => {
                let mut hs = BTreeMap::new();
                hs.insert(id, val);
                vacant_entry.insert((weight, hs));
            }
        }
    }

    /// For every feature key (in `vectors` iteration order) collect its
    /// weight and the min..max range of its numeric values; non-numeric
    /// leaves are skipped, and keys without any number get the range 0..1.
    fn key_weights_and_ranges(&self) -> KeyWeightsAndRanges<'_> {
        KeyWeightsAndRanges(
            self.vectors
                .iter()
                .map(|(_k, vals)| -> (Weight, Range<Weight>) {
                    let (min, max) = vals
                        .1
                        .values()
                        .filter_map(|leafvalue| match leafvalue {
                            LeafValue::Bool(_) => None,
                            LeafValue::Number(number) => {
                                if let Some(x) = number.as_f64() {
                                    Some(x)
                                } else {
                                    warn!(
                                        "got a Number value that cannot be represented \
                                         as f64: {number}"
                                    );
                                    None
                                }
                            }
                            LeafValue::None => None,
                        })
                        .minmax()
                        .into_option()
                        .unwrap_or_else(|| (0., 1.));
                    let range = Weight::from_f64(min)..Weight::from_f64(max);
                    (vals.0, range)
                })
                .collect(),
            PhantomData,
        )
    }

    /// All query ids, `0..count`.
    fn query_ids(&self) -> impl Iterator<Item = QueryId> {
        (0..self.query_strings.len()).map(|i| QueryId(i))
    }

    fn query_ids_count(&self) -> usize {
        self.query_strings.len()
    }

    /// The `vectors` values paired with their positional `KeyId`.
    fn vector_values(
        &self,
    ) -> impl Iterator<Item = (KeyId, &(Weight, BTreeMap<QueryId, LeafValue>))> {
        self.vectors
            .values()
            .enumerate()
            .map(|(i, v)| (KeyId(i), v))
    }

    /// Build the dense weight vector for one query: each feature maps to
    /// [0, 1] (bools as 0/1, numbers min-max normalized, `None` leaves as
    /// 0.5, missing keys as 0) and is then scaled by the key's weight.
    fn weights_for_query(
        &self,
        id: QueryId,
        key_weights_and_ranges: &KeyWeightsAndRanges,
    ) -> QueryWeights<'_> {
        QueryWeights {
            id,
            weights: self
                .vector_values()
                .map(|(key_id, v)| -> Weight {
                    let (key_weight, range) = key_weights_and_ranges.key_weight_and_range(key_id);
                    let x = if let Some(val) = v.1.get(&id) {
                        match val {
                            LeafValue::Bool(b) => {
                                if *b {
                                    Weight::from_f64(1.)
                                } else {
                                    Weight::from_f64(0.)
                                }
                            }
                            LeafValue::Number(n) => {
                                // Min-max normalize; a degenerate range
                                // (all values identical) maps to 1.
                                let range_len = range.end - range.start;
                                if range_len == 0. {
                                    Weight::from_f64(1.)
                                } else {
                                    let x = n.as_f64().expect(
                                        "really need the number to be representable as f64, XX",
                                    );
                                    let x =
                                        Weight::try_from(x).expect("user does not feed NaN or inf");
                                    let d = x - range.start;
                                    let d01 = d / range_len;
                                    d01
                                }
                            }
                            LeafValue::None => Weight::from_f64(0.5),
                        }
                    } else {
                        Weight::from_f64(0.)
                    };
                    x * *key_weight
                })
                .collect(),
            _phantom: PhantomData,
        }
    }

    /// Recursive worker for `add_vectors`: walk the JSON `value`, keeping
    /// the reversed path in `uplist`, and record one feature per leaf.
    fn _add_vectors(&mut self, id: QueryId, value: &Value, uplist: &List<UplistEntry>) {
        let (uplist, leaf_value) = match value {
            Value::Null => {
                // Nulls contribute no feature at all.
                return;
            }
            Value::Bool(b) => (
                &cons(UplistEntry::LeafType("Bool"), uplist),
                LeafValue::Bool(*b),
            ),
            Value::Number(x) => {
                let prev = uplist.first();
                // "position" / "numberOfMatchers" values are deliberately
                // discarded: only the key's presence matters for clustering.
                let val = if prev == Some(&UplistEntry::MapKey("position")) {
                    LeafValue::None
                } else if prev == Some(&UplistEntry::MapKey("numberOfMatchers")) {
                    LeafValue::None
                } else {
                    LeafValue::Number(x.clone())
                };
                (&cons(UplistEntry::LeafType("Number"), uplist), val)
            }
            Value::String(s) => {
                lazy_static! {
                    static ref DATE_DAY: Regex = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
                    static ref SINGLE_WORD: Regex = Regex::new(r"^[a-zA-Z]+$").unwrap();
                    static ref SPACED_WORDS: Regex =
                        Regex::new(r"^[a-zA-Z]+(?: [a-zA-Z]+)+$").unwrap();
                    static ref LINEAGE: Regex = Regex::new(r"^[a-zA-Z]+(?:\.\d+)+$").unwrap();
                    static ref WORDNUM: Regex = Regex::new(r"^[a-zA-Z]+\d+$").unwrap();
                    static ref NUMWORD: Regex = Regex::new(r"^\d+[a-zA-Z]+$").unwrap();
                    static ref WORDNUMWORD: Regex = Regex::new(r"^[a-zA-Z]+\d+[a-zA-Z]+$").unwrap();
                }

                // Classify string leaves into coarse shape categories so
                // distinct literal values still map onto the same feature;
                // which classification applies depends on the parent key.
                let prev = uplist.first();
                let entry = if prev == Some(&UplistEntry::MapKey("symbol")) {
                    UplistEntry::LeafType("String")
                } else if prev == Some(&UplistEntry::MapKey("from"))
                    || prev == Some(&UplistEntry::MapKey("to"))
                {
                    if DATE_DAY.is_match(s) {
                        UplistEntry::LeafType("DateDayString")
                    } else {
                        UplistEntry::String(s)
                    }
                } else if prev == Some(&UplistEntry::MapKey("value"))
                    || prev == Some(&UplistEntry::MapKey("sequenceName"))
                {
                    if SINGLE_WORD.is_match(s) {
                        UplistEntry::LeafType("SingleWordString")
                    } else if WORDNUM.is_match(s) {
                        UplistEntry::LeafType("WordnumString")
                    } else if NUMWORD.is_match(s) {
                        UplistEntry::LeafType("NumwordString")
                    } else if WORDNUMWORD.is_match(s) {
                        UplistEntry::LeafType("WordnumwordString")
                    } else if SPACED_WORDS.is_match(s) {
                        UplistEntry::LeafType("SpacedWordsString")
                    } else if LINEAGE.is_match(s) {
                        UplistEntry::LeafType("LineageString")
                    } else if s.contains('*') {
                        UplistEntry::LeafType("StringContainingStar")
                    } else if s.contains('∗') {
                        UplistEntry::LeafType("StringContainingSpecialStar")
                    } else {
                        // NOTE(review): debug output left in — prints every
                        // unclassified string to stderr.
                        dbg!(s);
                        UplistEntry::String(s)
                    }
                } else {
                    UplistEntry::String(s)
                };
                (&cons(entry, uplist), LeafValue::None)
            }

            Value::Array(values) => {
                // Arrays are transparent: all elements share the "[*]" step.
                let uplist = cons(UplistEntry::Array, uplist);
                for value in values {
                    self._add_vectors(id, value, &uplist);
                }
                return;
            }
            Value::Object(map) => {
                // Objects carrying a string "type" field become a MapType
                // step with a presence feature; subtrees below an "action"
                // key get the boosted ACTION_TYPE_WEIGHT.
                let (uplist, ignore_type, weight) = if let Some(val) = map.get("type") {
                    match val {
                        Value::String(s) => {
                            let weight = if uplist.any(|entry| match entry {
                                UplistEntry::MapKey(n) => *n == "action",
                                _ => false,
                            }) {
                                ACTION_TYPE_WEIGHT
                            } else {
                                1.
                            };
                            (
                                &cons(UplistEntry::MapType(s.to_owned()), uplist),
                                true,
                                weight,
                            )
                        }
                        _ => {
                            warn!("expecting val for 'type' key to be a string, but got: {val}");
                            (uplist, false, 1.)
                        }
                    }
                } else {
                    (uplist, false, 1.)
                };

                if ignore_type {
                    let s = uplist_to_string(uplist);
                    self.add_vector_entry(
                        id,
                        s,
                        Weight::from_f64(weight),
                        LeafValue::Bool(true),
                    );
                }

                for (k, v) in map {
                    // The "type" key was already folded into the MapType step.
                    if ignore_type && k == "type" {
                        continue;
                    }
                    let uplist = cons(UplistEntry::MapKey(k), uplist);
                    self._add_vectors(id, v, &uplist);
                }
                return;
            }
        };
        self.add_vector_entry(
            id,
            uplist_to_string(uplist),
            Weight::from_f64(1.),
            leaf_value,
        );
    }

    /// Extract all features of one parsed query into `self.vectors`.
    fn add_vectors(&mut self, id: QueryId, value: &Value) {
        self._add_vectors(id, value, &List::Null)
    }
}
817
/// Parse one ndjson line, optionally optimize the query, and return the
/// parsed JSON value together with its canonical string rendering.
///
/// `line0` is the zero-based line index (reported one-based in errors);
/// `path` is only used for error messages.
fn canonicalize(optimize: bool, line: &str, line0: usize, path: &Path) -> Result<(Value, String)> {
    // Either the re-serialized optimized query, or the original line.
    // (The `&` on the if-branch relies on temporary lifetime extension.)
    let possibly_optimized = if optimize {
        let query: Query = serde_json::from_str(line).map_err(ctx!(
            "parsing query line at {}:{}",
            path.to_string_lossy(),
            line0 + 1
        ))?;
        let optimized = query.optimize();
        &serde_json::to_string(&optimized)?
    } else {
        line
    };

    let value: Value = serde_json::from_str(possibly_optimized).map_err(ctx!(
        "parsing line at {}:{}",
        path.to_string_lossy(),
        line0 + 1
    ))?;
    let canonical_query = CanonicalJson(&value).to_string();
    Ok((value, canonical_query))
}
839
840fn read_queries_file_step_1(input: &Path) -> Result<String> {
841 std::fs::read_to_string(&input).map_err(ctx!("reading ndjson file {input:?}"))
842}
843
/// Split file contents into lines, yielding `(zero_based_index, line)`.
/// Trailing whitespace at the end of the file is dropped first; a plain
/// '\n' split is used (not `lines()`), so interior '\r' bytes survive.
fn read_queries_file_step_2(s: &str) -> impl Iterator<Item = (usize, &str)> {
    s.trim_end().split('\n').enumerate()
}
847
/// Entry point: parse CLI options, set the log level, dispatch subcommand.
fn main() -> Result<()> {
    let Opts {
        log_level,
        subcommand,
    } = Opts::parse();

    set_log_level(log_level.try_into()?);

    match subcommand {
        // Rewrite each query line in canonical JSON form.
        SubCommand::Canonicalize {
            optimize,
            input,
            output,
        } => {
            let s = read_queries_file_step_1(&input)?;
            let mut out = BufWriter::new(
                File::create(&output).map_err(ctx!("opening output file {output:?}"))?,
            );
            for (i, line) in read_queries_file_step_2(&s) {
                let (_value, canonical_query) = canonicalize(optimize, line, i, &input)?;
                (|| -> Result<()> {
                    out.write_all(canonical_query.as_bytes())?;
                    out.write_all(b"\n")?;
                    Ok(())
                })()
                .with_context(|| anyhow!("writing to output file {output:?}"))?;
            }
            out.flush()
                .with_context(|| anyhow!("writing to output file {output:?}"))?;
        }
        SubCommand::Vectorize {
            parse_file: ParseFile { path },
            show_values,
            optimize,
            queries_output,
        } => {
            // Parse and deduplicate queries, extracting feature vectors.
            let mut queries = Queries::new();
            let s = read_queries_file_step_1(&path)?;
            for (i, line) in read_queries_file_step_2(&s) {
                let (value, canonical_query) = canonicalize(optimize, line, i, &path)?;
                if let Some(id) = queries.insert_query(canonical_query.into()) {
                    queries.add_vectors(id, &value);
                }
            }
            // Report all feature keys (and optionally their values).
            for (key, (weight, idvals)) in &queries.vectors {
                println!("{weight}\t{key}");
                if show_values {
                    for (id, val) in idvals {
                        match val {
                            LeafValue::Bool(b) => println!(" {id} => {b:?}"),
                            LeafValue::Number(number) => println!(" {id} => {number:?}"),
                            LeafValue::None => (),
                        }
                    }
                }
            }

            // Per-key weights and min/max ranges, for normalization.
            let key_weights_and_ranges = queries.key_weights_and_ranges();
            dbg!(&key_weights_and_ranges);
            let n_queries = queries.query_ids_count();
            let n_weights = key_weights_and_ranges.len();
            // Dense matrix: one row per query, one column per feature key.
            let mut records = Array2::<f64>::zeros((n_queries, n_weights));

            for query_id in queries.query_ids() {
                let qw = queries.weights_for_query(query_id, &key_weights_and_ranges);
                qw.copy_to_array(records.row_mut(query_id.0));
            }
            dbg!(records.row(0));
            dbg!(records.row(1));

            // Cluster the rows with DBSCAN; `None` marks noise points.
            let min_points = 2;
            let clusters = Dbscan::params(min_points)
                .tolerance(1.9)
                .transform(&records)?;
            assert_eq!(clusters.len(), n_queries);

            // Group query strings by cluster id; noise is kept separately.
            let mut easy_clusters: BTreeMap<ClusterId, BTreeSet<Arc<str>>> = BTreeMap::new();
            let mut non_clustered: Vec<Arc<str>> = Vec::new();

            for (query_id, cluster) in clusters.iter().enumerate() {
                let query_id = QueryId(query_id);
                let query = queries.query_by_id(query_id).clone_arc();
                if let Some(cluster_id) = cluster {
                    let cluster_id = ClusterId(*cluster_id);
                    match easy_clusters.entry(cluster_id) {
                        Entry::Vacant(vacant_entry) => {
                            let mut set = BTreeSet::new();
                            set.insert(query);
                            vacant_entry.insert(set);
                        }
                        Entry::Occupied(mut occupied_entry) => {
                            occupied_entry.get_mut().insert(query);
                        }
                    }
                } else {
                    non_clustered.push(query);
                }
            }

            if let Some(queries_output) = queries_output {
                for (cluster_id, cluster) in &easy_clusters {
                    queries_output.write_queries(
                        Some(extension_for_cluster_id(Some(*cluster_id))),
                        &mut cluster.iter(),
                    )?;
                }

                if !non_clustered.is_empty() {
                    queries_output.write_queries(
                        Some(extension_for_cluster_id(None)),
                        &mut non_clustered.iter(),
                    )?;
                }
            } else {
                // No output target given: just dump the clustering.
                dbg!(easy_clusters);
                dbg!(non_clustered);
            }
        }

        SubCommand::MergeClustersTo {
            new_queries_dir,
            queries_dirs,
        } => {
            let mut queries = Vec::new();

            for queries_dir in queries_dirs {
                let queries_file_path = queries_dir.append("queries.ndjson");
                let s = read_queries_file_step_1(&queries_file_path)?;
                for (_i, line) in read_queries_file_step_2(&s) {
                    queries.push(Arc::from(line.to_owned()));
                }
            }

            // Reuse the folder writer to create the merged queries dir
            // (including symlink and job template).
            let queries_output = QueriesOutput::ToFolders {
                reason: None,
                folder_base_path: new_queries_dir,
            };
            queries_output.write_queries(None, &mut queries.iter())?;
        }
    }

    Ok(())
}