// evobench_ndjson/evobench-ndjson.rs

1use anyhow::{Context, Result, anyhow, bail};
2use cj_path_util::{path_util::AppendToPath, unix::polyfill::add_extension};
3use clap::Parser;
4use evobench_tools::{
5    config_file::backend_from_path,
6    ctx,
7    git::GitHash,
8    io_utils::div::create_dir_if_not_exists,
9    run::{
10        benchmarking_job::{BenchmarkingJob, BenchmarkingJobPublic, BenchmarkingJobState},
11        config::{BenchmarkingCommand, PreExecLevel2},
12        custom_parameter::{CustomParameterType, CustomParameterValue},
13        env_vars::AllowableCustomEnvVar,
14        key::RunParameters,
15    },
16    serde_types::{allowed_env_var::AllowedEnvVar, priority::Priority},
17    serde_util::CanonicalJson,
18    silo::query::Query,
19    utillib::get_terminal_width::get_terminal_width,
20    utillib::{
21        arc::CloneArc,
22        conslist::{List, cons},
23        integers::rounding_integer_division,
24        logging::{LogLevelOpts, set_log_level},
25    },
26    warn,
27};
28use itertools::Itertools;
29use kstring::KString;
30use lazy_static::lazy_static;
31use linfa::traits::Transformer;
32use linfa_clustering::Dbscan;
33use ndarray::{Array2, ArrayBase, Dim, ViewRepr};
34use noisy_float::types::R64;
35use num_traits::Float;
36use regex::Regex;
37use serde_json::{Number, Value};
38
39use std::{
40    collections::{BTreeMap, BTreeSet, btree_map::Entry},
41    fmt::Display,
42    fs::File,
43    io::{BufWriter, Write},
44    marker::PhantomData,
45    ops::Range,
46    os::unix::fs::symlink,
47    path::{Path, PathBuf},
48    str::FromStr,
49    sync::Arc,
50};
51
/// The weight type used for query vector components (`R64` from
/// `noisy_float`, i.e. an f64 checked at construction time).
type Weight = R64;

// The action type should be more important than other attributes, by
// this factor
const ACTION_TYPE_WEIGHT: f64 = 5.;

/// Per-key `(weight multiplier, observed value range)` pairs, one
/// entry per key in `Queries::vectors`, indexed by `KeyId` (see
/// `Queries::key_weights_and_ranges`).
#[derive(Debug)]
struct KeyWeightsAndRanges<'t>(Vec<(Weight, Range<Weight>)>, PhantomData<&'t ()>);

/// A reference for a key path in `Queries`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct KeyId(usize);
64
65impl<'t> KeyWeightsAndRanges<'t> {
66    /// Panics for invalid `KeyId`s
67    fn key_weight_and_range(&self, key_id: KeyId) -> &(Weight, Range<Weight>) {
68        &self.0[key_id.0]
69    }
70
71    fn len(&self) -> usize {
72        self.0.len()
73    }
74}
75
/// The weight vector computed for a single query, one value per key
/// in `Queries::vectors` (built by `Queries::weights_for_query`).
#[derive(Debug)]
struct QueryWeights<'t> {
    #[allow(unused)]
    id: QueryId,
    // One entry per key, in the iteration order of `Queries::vectors`.
    weights: Vec<Weight>,
    _phantom: PhantomData<&'t ()>,
}
83
84impl<'t> QueryWeights<'t> {
85    fn copy_to_array<F: Float + Display>(
86        &self,
87        mut array: ArrayBase<ViewRepr<&mut F>, Dim<[usize; 1]>>,
88    ) {
89        for (i, w) in self.weights.iter().enumerate() {
90            let w = F::from(*w).expect("Weight type is convertible to Float types");
91            // / F::from(1.).expect("Weight::MAX is convertible to Float types");
92            if w.is_nan() {
93                panic!("why? w = {w}");
94            }
95            array[i] = w;
96        }
97    }
98}
99
100#[derive(clap::Parser, Debug)]
101#[clap(next_line_help = true)]
102#[clap(term_width = get_terminal_width(4))]
103/// Schedule and query benchmarking jobs.
104struct Opts {
105    #[clap(flatten)]
106    log_level: LogLevelOpts,
107
108    /// The subcommand to run. Use `--help` after the sub-command to
109    /// get a list of the allowed options there.
110    #[clap(subcommand)]
111    subcommand: SubCommand,
112}
113
/// Shared argument group for subcommands that read an ndjson file.
#[derive(Debug, clap::Args)]
struct ParseFile {
    /// The ndjson file to parse
    path: PathBuf,
}
119
/// The subcommands offered by this tool.
#[derive(clap::Subcommand, Debug)]
enum SubCommand {
    /// Bring a queries file into canonical representation.
    Canonicalize {
        /// Whether to optimize filter expressions
        #[clap(long)]
        optimize: bool,

        /// Path to ndjson file with queries
        input: PathBuf,

        /// Path to where the same queries should be written, in the
        /// same order, but canonicalized (whitespace standardized,
        /// key order sorted alphabetically)
        output: PathBuf,
    },

    /// Build queries clusters out of a queries file (includes
    /// bringing queries into their canonical representation).
    Vectorize {
        /// Show values (with query id where it appears) with each key
        #[clap(short, long)]
        show_values: bool,

        /// Whether to optimize filter expressions
        #[clap(long)]
        optimize: bool,

        #[clap(flatten)]
        parse_file: ParseFile,

        /// Whether to write query (and job template) files
        #[clap(subcommand)]
        queries_output: Option<QueriesOutput>,
    },

    /// Copy contents of multiple queries dirs into a single new one
    /// (does *not* run query canonicalization); creates a new
    /// template file from scratch.
    MergeClustersTo {
        /// Path to new dir (no extension is added here!)
        new_queries_dir: PathBuf,

        /// Paths to existing queries dirs
        queries_dirs: Vec<PathBuf>,
        // Not adding `reason` override here; just rely on the
        // default, OK?
    },
}
169
/// Where and how the produced query clusters should be written.
#[derive(clap::Subcommand, Debug)]
enum QueriesOutput {
    /// Write each cluster as a single ndjson file.
    ToFiles {
        /// Write queries files with the given path, with `.$n.ndjson`
        /// appended, where `$n` is the number of the cluster. There
        /// is also one file with `.unclustered.ndjson` appended, that
        /// contains queries that did not cluster with any other
        /// query.
        file_base_path: PathBuf,
    },
    /// Write each cluster as a folder containing the queries file
    /// plus a ready-to-use benchmarking job template.
    ToFolders {
        /// The reason put into the benchmarking job template. By
        /// default, takes the name of the folder in which the
        /// template is created (i.e. of `folder_base_path` after
        /// appending the cluster id).
        #[clap(long)]
        reason: Option<String>,

        /// Write queries files one to newly created folders each,
        /// with `.$n` appended to the path, where `$n` is the number
        /// of the cluster, as `queries.ndjson` files. Also creates a
        /// symlink `ignore_queries_for_checksum_regex.txt ->
        /// ../queries/ignore_queries_for_checksum_regex.txt`, and a
        /// file `job-template.json5` that can be used with
        /// `evobench insert-file` to run a job with that queries
        /// file.  There is also one dir with `.unclustered` appended,
        /// that contains queries that did not cluster with any other
        /// query (only if that set is not empty).
        folder_base_path: PathBuf,
    },
}
201
202fn write_queries_file<'a>(
203    queries_file_path: &Path,
204    queries: &mut dyn Iterator<Item = &'a Arc<str>>,
205) -> Result<()> {
206    (|| -> Result<()> {
207        let mut out = BufWriter::new(File::create(queries_file_path)?);
208        for query in queries {
209            out.write_all(query.trim_end().as_bytes())?;
210            out.write_all(b"\n")?;
211        }
212        out.flush()?;
213        Ok(())
214    })()
215    .with_context(|| anyhow!("writing query file {queries_file_path:?}"))
216}
217
218// *Without* the .ndjson extension, that is added by the backend itself
219fn extension_for_cluster_id(cluster_id: Option<ClusterId>) -> String {
220    if let Some(cluster_id) = cluster_id {
221        format!("{cluster_id:03}")
222    } else {
223        format!("unclustered")
224    }
225}
226
impl QueriesOutput {
    /// Write the given queries to the configured target.
    ///
    /// `extension` is the part *without* the .ndjson extension (that
    /// is added by the backend itself if targetting a file); None
    /// means nothing is added to the "base" paths in `self`.
    /// `queries` is/can be strings without the end of line character
    /// (any whitespace at the end is stripped before re-adding a
    /// newline).
    ///
    /// For `ToFolders`, also creates the folder, a symlink to the
    /// shared checksum-ignore regex file, and a `job-template.json5`
    /// benchmarking job template (with several hard-coded values,
    /// see the `XX` notes below).
    fn write_queries<'a>(
        &self,
        extension: Option<String>,
        queries: &mut dyn ExactSizeIterator<Item = &'a Arc<str>>,
    ) -> Result<()> {
        // Get this before mutating `queries`!
        let queries_len = queries.len();

        match self {
            QueriesOutput::ToFiles { file_base_path } => {
                let queries_file_path = if let Some(extension) = extension {
                    let extension = format!("{extension}.ndjson");

                    &add_extension(file_base_path, extension).ok_or_else(|| {
                        anyhow!(
                            "to-files requires a path to which a file \
                             extension can be added"
                        )
                    })?
                } else {
                    file_base_path
                };
                write_queries_file(queries_file_path, queries)
            }
            QueriesOutput::ToFolders {
                folder_base_path,
                reason,
            } => {
                let folder_path = if let Some(extension) = extension {
                    &add_extension(folder_base_path, extension).ok_or_else(|| {
                        anyhow!(
                            "to-folders requires a path to which a file \
                         extension can be added"
                        )
                    })?
                } else {
                    folder_base_path
                };

                create_dir_if_not_exists(folder_path, "folder for queries file")?;

                // File with queries
                {
                    let queries_file_path = folder_path.append("queries.ndjson");
                    write_queries_file(&queries_file_path, queries)?;
                }

                // Symlink to the shared checksum-ignore regex file;
                // an already-existing link is fine (re-runs).
                {
                    let symlink_path = folder_path.append("ignore_queries_for_checksum_regex.txt");
                    match symlink(
                        "../queries/ignore_queries_for_checksum_regex.txt",
                        &symlink_path,
                    ) {
                        Ok(()) => (),
                        Err(e) => match e.kind() {
                            std::io::ErrorKind::AlreadyExists => (),
                            _ => Err(e).map_err(ctx!("creating symlink at {symlink_path:?}"))?,
                        },
                    }
                }

                // Job template file
                {
                    let template_path = folder_path.append("job-template.json5");
                    // Default reason: derived from the base folder name.
                    let reason: &str = if let Some(reason) = reason {
                        reason
                    } else {
                        let folder_base_name = (&folder_base_path)
                        .file_name()
                            .ok_or_else(||anyhow!(
                                "expect a to-folder base path from which the last element can be taken"))?
                        .to_string_lossy();

                        &format!("t_{folder_base_name}")
                    };

                    let repeat = {
                        // XX hack hard-coded; can't just take the
                        // total over all clusters, because the ndjson
                        // input file may already be a subset of the
                        // original count!
                        // wc -l silo-benchmark-datasets/SC2open/v0.9.0/queries/queries.ndjson
                        let original_len = 33126;

                        if queries_len > original_len {
                            bail!(
                                "queries_len {queries_len} > hard-coded original_len {original_len}"
                            );
                        }

                        rounding_integer_division(original_len, queries_len)
                    };

                    let custom_parameters: Vec<(
                        AllowedEnvVar<AllowableCustomEnvVar>,
                        CustomParameterValue,
                    )> = {
                        let folder_name = folder_path
                            .file_name()
                            .expect("can get back file name of path to which a suffix was added")
                            .to_string_lossy();

                        // Helper: parse the env var name and type-check
                        // the value for one custom parameter.
                        let var = |k: &str, t: CustomParameterType, v: &str| -> Result<_> {
                            Ok((
                                k.parse()?,
                                CustomParameterValue::checked_from(t, &KString::from_ref(v))?,
                            ))
                        };

                        use CustomParameterType::*;
                        // XX hack hard-coded
                        vec![
                            var("CONCURRENCY", NonZeroU32, "120")?,
                            var("DATASET", Dirname, "SC2open")?,
                            // Heh, still using the Filename type here!
                            // But then, now using a suffix, thus actually
                            // appropriate?
                            var("QUERIES", Filename, &folder_name)?,
                            var("RANDOMIZED", Bool, "1")?,
                            var("REPEAT", NonZeroU32, &repeat.to_string())?,
                            var("SORTED", Bool, "0")?,
                        ]
                    };
                    let custom_parameters: BTreeMap<
                        AllowedEnvVar<AllowableCustomEnvVar>,
                        CustomParameterValue,
                    > = BTreeMap::from_iter(custom_parameters.into_iter());

                    let template = BenchmarkingJob::new(
                        BenchmarkingJobPublic {
                            reason: Some((*reason).to_owned()),
                            run_parameters: Arc::new(RunParameters {
                                // NOTE(review): hard-coded commit id --
                                // presumably a placeholder; confirm
                                // before reusing this template elsewhere.
                                commit_id: GitHash::from_str(
                                    "a71209b88a91d6ac3fcdb5b9c41062d06a170376",
                                )
                                .expect("hash"),
                                custom_parameters: Arc::new(custom_parameters.into()),
                            }),
                            // XX all hard-coded; what HACK is this?
                            command: Arc::new(BenchmarkingCommand {
                                target_name: "api".parse().expect("ok"),
                                subdir: "benchmarking".into(),
                                command: "make".into(),
                                arguments: vec!["api".into()],
                                pre_exec_bash_code: PreExecLevel2::new(None),
                            }),
                        },
                        BenchmarkingJobState {
                            remaining_count: 8,
                            remaining_error_budget: 2,
                            last_working_directory: None,
                        },
                        Priority::LOW,
                        Priority::NORMAL,
                    );
                    let backend = backend_from_path(&template_path)?;
                    backend.save_config_file(&template_path, &template)?;
                }

                Ok(())
            }
        }
    }
}
399
/// Original value from the query json; will be converted to use the
/// u32 range once all values are known and min and max can be
/// derived.
#[derive(Debug)]
enum LeafValue {
    // `Null` would only be useful if the key does not contain the
    // type declaration `:Null`, but if it does, we just drop the
    // value, thus None is used instead.
    /// A boolean leaf value from the query json
    Bool(bool),
    /// A numeric leaf value from the query json
    Number(Number),
    /// When the value does not appear relevant, e.g. for position
    None,
}
413
/// The de-duplicated set of parsed queries together with the sparse
/// feature vectors extracted from them (used for clustering).
#[derive(Debug)]
struct Queries {
    // The canonical query strings, in insertion order; `QueryId` is
    // an index into this Vec.
    query_strings: Vec<Arc<str>>,
    // id of the above entries
    query_strings_index: BTreeMap<Arc<str>, usize>,

    // Which queries have that vector; the key string here is the
    // vector string (the json path alike), not the query string. The
    // `Weight` is a multiplier, can give higher or lower weight for
    // this dimension.
    vectors: BTreeMap<String, (Weight, BTreeMap<QueryId, LeafValue>)>,
}
426
/// One element of the (innermost-first) path from a json value up to
/// the document root, used to build the vector key strings.
#[derive(Debug, PartialEq, Eq)]
enum UplistEntry<'t> {
    /// in the middle of paths, to signal the presence of a map key
    /// .type="foo"
    MapType(String),
    /// A map key traversed on the way down
    MapKey(&'t str),
    /// An array was traversed (individual indices are not tracked)
    Array,
    /// A literal string leaf value
    String(&'t str),
    // Null,
    // Bool(bool),
    // Number(Number),
    /// The type classification of a leaf value (e.g. "Bool",
    /// "Number", or one of the string shape names)
    LeafType(&'static str),
}
440
441fn _uplist_to_string(uplist: &List<UplistEntry>, out: &mut String) {
442    match uplist {
443        List::Pair(val, list) => {
444            _uplist_to_string(list, out);
445            match val {
446                UplistEntry::MapKey(s) => {
447                    out.push_str(".");
448                    out.push_str(s);
449                }
450                UplistEntry::String(s) => {
451                    use std::fmt::Write;
452                    let _ = write!(out, "={s:?}");
453                }
454                UplistEntry::Array => {
455                    out.push_str("[*]");
456                }
457                // UplistEntry::Null => {
458                //     out.push_str("=");
459                //     out.push_str("null");
460                // }
461                // UplistEntry::Bool(b) => {
462                //     out.push_str("=");
463                //     out.push_str(if *b { "true" } else { "false" });
464                // }
465                // UplistEntry::Number(number) => {
466                //     use std::fmt::Write;
467                //     let _ = write!(out, "={number}");
468                // }
469                UplistEntry::LeafType(s) => {
470                    out.push(':');
471                    out.push_str(s);
472                }
473                UplistEntry::MapType(s) => {
474                    out.push_str("{type:");
475                    // missing escaping, like in some other places, insecure
476                    out.push_str(s);
477                    out.push_str("}");
478                }
479            }
480        }
481        List::Null => (),
482    }
483}
484
485fn uplist_to_string(uplist: &List<UplistEntry>) -> String {
486    let mut out = String::new();
487    _uplist_to_string(uplist, &mut out);
488    out
489}
490
/// Cluster number (0-based internally, displayed 1-based).
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct ClusterId(usize);

impl Display for ClusterId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate to the integer's `fmt` so formatting flags are
        // forwarded (e.g. the `{:03}` zero-padding used in
        // `extension_for_cluster_id`).
        (self.0 + 1).fmt(f)
    }
}
499
/// Query number (0-based index into `Queries::query_strings`,
/// displayed 1-based).
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct QueryId(usize);

impl Display for QueryId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate to the integer's `fmt` so formatting flags are
        // forwarded to the displayed 1-based number.
        (self.0 + 1).fmt(f)
    }
}
508
509impl Queries {
510    fn new() -> Queries {
511        Queries {
512            query_strings: Default::default(),
513            query_strings_index: Default::default(),
514            vectors: Default::default(),
515        }
516    }
517
518    /// panics for invalid ids
519    fn query_by_id(&self, id: QueryId) -> &Arc<str> {
520        &self.query_strings[id.0]
521    }
522
523    /// Returns Null if query is already in it
524    fn insert_query(&mut self, query: Arc<str>) -> Option<QueryId> {
525        match self.query_strings_index.entry(query.clone_arc()) {
526            Entry::Occupied(_occupied_entry) => None,
527            Entry::Vacant(vacant_entry) => {
528                let i = self.query_strings.len();
529                vacant_entry.insert(i);
530                self.query_strings.push(query);
531                Some(QueryId(i))
532            }
533        }
534    }
535
536    /// `weight` is only used when this is the first entry of `key`
537    fn add_vector_entry(&mut self, id: QueryId, key: String, weight: Weight, val: LeafValue) {
538        match self.vectors.entry(key) {
539            Entry::Occupied(mut query_ids) => {
540                let r = query_ids.get_mut();
541                assert_eq!(r.0, weight);
542                r.1.insert(id, val);
543            }
544            Entry::Vacant(vacant_entry) => {
545                let mut hs = BTreeMap::new();
546                hs.insert(id, val);
547                vacant_entry.insert((weight, hs));
548            }
549        }
550    }
551
552    /// How much the distance at each key should contribute
553    fn key_weights_and_ranges(&self) -> KeyWeightsAndRanges<'_> {
554        KeyWeightsAndRanges(
555            self.vectors
556                .iter()
557                .map(|(_k, vals)| -> (Weight, Range<Weight>) {
558                    // ^ originally wanted to use k.len() for
559                    // weighting; not anymore, getting it in f64^ now.
560                    let (min, max) = vals
561                        .1
562                        .values()
563                        .filter_map(|leafvalue| match leafvalue {
564                            LeafValue::Bool(_) => None,
565                            LeafValue::Number(number) => {
566                                if let Some(x) = number.as_f64() {
567                                    Some(x)
568                                } else {
569                                    warn!(
570                                        "got a Number value that cannot be represented \
571                                         as f64: {number}"
572                                    );
573                                    None
574                                }
575                            }
576                            LeafValue::None => None,
577                        })
578                        .minmax()
579                        .into_option()
580                        // .expect("always have some values if we have the
581                        // key"); wrong, it happens, of course, if we
582                        // only have None items. XX What to use?
583                        .unwrap_or_else(|| (0., 1.));
584                    let range = Weight::from_f64(min)..Weight::from_f64(max);
585                    (vals.0, range)
586                })
587                .collect(),
588            PhantomData,
589        )
590    }
591
592    fn query_ids(&self) -> impl Iterator<Item = QueryId> {
593        (0..self.query_strings.len()).map(|i| QueryId(i))
594    }
595
596    fn query_ids_count(&self) -> usize {
597        self.query_strings.len()
598    }
599
600    fn vector_values(
601        &self,
602    ) -> impl Iterator<Item = (KeyId, &(Weight, BTreeMap<QueryId, LeafValue>))> {
603        self.vectors
604            .values()
605            .enumerate()
606            .map(|(i, v)| (KeyId(i), v))
607    }
608
609    fn weights_for_query(
610        &self,
611        id: QueryId,
612        key_weights_and_ranges: &KeyWeightsAndRanges,
613    ) -> QueryWeights<'_> {
614        QueryWeights {
615            id,
616            weights: self
617                .vector_values()
618                .map(|(key_id, v)| -> Weight {
619                    let (key_weight, range) = key_weights_and_ranges.key_weight_and_range(key_id);
620                    let x = if let Some(val) = v.1.get(&id) {
621                        match val {
622                            LeafValue::Bool(b) => {
623                                if *b {
624                                    Weight::from_f64(1.)
625                                } else {
626                                    Weight::from_f64(0.)
627                                }
628                            }
629                            LeafValue::Number(n) => {
630                                let range_len = range.end - range.start;
631                                if range_len == 0. {
632                                    // Only should get here if there
633                                    // is only 1 distinct value. ('No
634                                    // values' should get us to 0.0
635                                    // .. 1.0 instead.)  -- XX ? still not the 20.0 place.
636                                    Weight::from_f64(1.)
637                                } else {
638                                    let x = n.as_f64().expect(
639                                        "really need the number to be representable as f64, XX",
640                                    );
641                                    let x =
642                                        Weight::try_from(x).expect("user does not feed NaN or inf");
643                                    // ^ XX sigh. user-exposed panic!
644                                    let d = x - range.start;
645                                    let d01 = d / range_len;
646                                    d01
647                                }
648                            }
649                            // presence should be the median 'normal
650                            // value', vs. MIN for absence, OK?
651                            LeafValue::None => Weight::from_f64(0.5),
652                        }
653                    } else {
654                        Weight::from_f64(0.)
655                    };
656                    x * *key_weight
657                })
658                .collect(),
659            _phantom: PhantomData,
660        }
661    }
662
663    fn _add_vectors(&mut self, id: QueryId, value: &Value, uplist: &List<UplistEntry>) {
664        let (uplist, leaf_value) = match value {
665            // Leaf positions
666            Value::Null => {
667                // ignore entry: null is for an Option::None, and we
668                // don't know the type, it's best to just omit the
669                // path. That will yield a default value (0.)
670                // for that field later on.
671                return;
672            }
673            Value::Bool(b) => (
674                &cons(UplistEntry::LeafType("Bool"), uplist),
675                LeafValue::Bool(*b),
676            ),
677            Value::Number(x) => {
678                let prev = uplist.first();
679                let val = if prev == Some(&UplistEntry::MapKey("position")) {
680                    LeafValue::None
681                } else if prev == Some(&UplistEntry::MapKey("numberOfMatchers")) {
682                    LeafValue::None
683                } else {
684                    LeafValue::Number(x.clone())
685                };
686                (&cons(UplistEntry::LeafType("Number"), uplist), val)
687            }
688            Value::String(s) => {
689                lazy_static! {
690                    static ref DATE_DAY: Regex = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
691                    static ref SINGLE_WORD: Regex = Regex::new(r"^[a-zA-Z]+$").unwrap();
692                    static ref SPACED_WORDS: Regex =
693                        Regex::new(r"^[a-zA-Z]+(?: [a-zA-Z]+)+$").unwrap();
694                    static ref LINEAGE: Regex = Regex::new(r"^[a-zA-Z]+(?:\.\d+)+$").unwrap();
695                    static ref WORDNUM: Regex = Regex::new(r"^[a-zA-Z]+\d+$").unwrap();
696                    static ref NUMWORD: Regex = Regex::new(r"^\d+[a-zA-Z]+$").unwrap();
697                    static ref WORDNUMWORD: Regex = Regex::new(r"^[a-zA-Z]+\d+[a-zA-Z]+$").unwrap();
698                }
699
700                let prev = uplist.first();
701                let entry = if prev == Some(&UplistEntry::MapKey("symbol")) {
702                    UplistEntry::LeafType("String")
703                } else if prev == Some(&UplistEntry::MapKey("from"))
704                    || prev == Some(&UplistEntry::MapKey("to"))
705                {
706                    if DATE_DAY.is_match(s) {
707                        UplistEntry::LeafType("DateDayString")
708                    } else {
709                        UplistEntry::String(s)
710                    }
711                } else if prev == Some(&UplistEntry::MapKey("value"))
712                    || prev == Some(&UplistEntry::MapKey("sequenceName"))
713                {
714                    if SINGLE_WORD.is_match(s) {
715                        UplistEntry::LeafType("SingleWordString")
716                    } else if WORDNUM.is_match(s) {
717                        UplistEntry::LeafType("WordnumString")
718                    } else if NUMWORD.is_match(s) {
719                        UplistEntry::LeafType("NumwordString")
720                    } else if WORDNUMWORD.is_match(s) {
721                        UplistEntry::LeafType("WordnumwordString")
722                    } else if SPACED_WORDS.is_match(s) {
723                        UplistEntry::LeafType("SpacedWordsString")
724                    } else if LINEAGE.is_match(s) {
725                        UplistEntry::LeafType("LineageString")
726                    } else if s.contains('*') {
727                        UplistEntry::LeafType("StringContainingStar")
728                    } else if s.contains('∗') {
729                        UplistEntry::LeafType("StringContainingSpecialStar")
730                    } else {
731                        dbg!(s);
732                        UplistEntry::String(s)
733                    }
734                } else {
735                    UplistEntry::String(s)
736                };
737                (&cons(entry, uplist), LeafValue::None)
738            }
739
            // Non-leaf positions
            Value::Array(values) => {
                // Arrays: push an `Array` segment onto the path and
                // recurse into every element.
                let uplist = cons(UplistEntry::Array, uplist);
                for value in values {
                    self._add_vectors(id, value, &uplist);
                }
                return;
            }
            Value::Object(map) => {
                // Objects: if a string-valued "type" field exists,
                // fold it into the path as `MapType`, boost the
                // weight when the object sits under an "action" key,
                // and remember to skip the field itself below
                // (`ignore_type`).
                let (uplist, ignore_type, weight) = if let Some(val) = map.get("type") {
                    match val {
                        Value::String(s) => {
                            let weight = if uplist.any(|entry| match entry {
                                // We don't have `"type": "Action"`;
                                // we have `"action":{... "type": x}`;
                                // i.e. x might be a *subtype* of
                                // Action; but if we are under an
                                // "action" field key, then it *must*
                                // be an Action subtype. Saves us
                                // checking for all subtypes.
                                UplistEntry::MapKey(n) => *n == "action",
                                _ => false,
                            }) {
                                ACTION_TYPE_WEIGHT
                            } else {
                                1.
                            };
                            (
                                &cons(UplistEntry::MapType(s.to_owned()), uplist),
                                true,
                                weight,
                            )
                        }
                        _ => {
                            // Malformed "type" value: warn and treat
                            // the object as typeless.
                            warn!("expecting val for 'type' key to be a string, but got: {val}");
                            (uplist, false, 1.)
                        }
                    }
                } else {
                    (uplist, false, 1.)
                };

                // If we have a `type` entry, add the path on its own,
                // for the weight if it is an Action type, but should
                // be in there anyway, right?
                if ignore_type {
                    let s = uplist_to_string(uplist);
                    // dbg!((&s, weight)); -- this is OK
                    self.add_vector_entry(
                        id,
                        s,
                        Weight::from_f64(weight),
                        LeafValue::Bool(true), // XXX grusig
                    );
                }

                // Recurse into the remaining fields, skipping the
                // "type" field handled above.
                for (k, v) in map {
                    if ignore_type && k == "type" {
                        continue;
                    }
                    let uplist = cons(UplistEntry::MapKey(k), uplist);
                    self._add_vectors(id, v, &uplist);
                }
                return;
            }
        };
        // Leaf position: record one vector entry for the accumulated
        // path, with the default weight of 1.
        self.add_vector_entry(
            id,
            uplist_to_string(uplist),
            Weight::from_f64(1.),
            leaf_value,
        );
    }
813    fn add_vectors(&mut self, id: QueryId, value: &Value) {
814        self._add_vectors(id, value, &List::Null)
815    }
816}
817
818fn canonicalize(optimize: bool, line: &str, line0: usize, path: &Path) -> Result<(Value, String)> {
819    let possibly_optimized = if optimize {
820        let query: Query = serde_json::from_str(line).map_err(ctx!(
821            "parsing query line at {}:{}",
822            path.to_string_lossy(),
823            line0 + 1
824        ))?;
825        let optimized = query.optimize();
826        &serde_json::to_string(&optimized)?
827    } else {
828        line
829    };
830
831    let value: Value = serde_json::from_str(possibly_optimized).map_err(ctx!(
832        "parsing line at {}:{}",
833        path.to_string_lossy(),
834        line0 + 1
835    ))?;
836    let canonical_query = CanonicalJson(&value).to_string();
837    Ok((value, canonical_query))
838}
839
840fn read_queries_file_step_1(input: &Path) -> Result<String> {
841    std::fs::read_to_string(&input).map_err(ctx!("reading ndjson file {input:?}"))
842}
843
/// Split file contents into `(0-based line index, line)` pairs.
///
/// Trailing whitespace (including the final newline) is trimmed first,
/// so a conventional trailing newline does not produce an empty last
/// line; interior empty lines are preserved.
fn read_queries_file_step_2(s: &str) -> impl Iterator<Item = (usize, &str)> {
    // A `char` pattern is the idiomatic (and cheaper) form for a
    // single-character separator (clippy::single_char_pattern).
    s.trim_end().split('\n').enumerate()
}
847
/// CLI entry point: parse options, set the log level, then dispatch on
/// the subcommand.
fn main() -> Result<()> {
    let Opts {
        log_level,
        subcommand,
    } = Opts::parse();

    set_log_level(log_level.try_into()?);

    match subcommand {
        // Re-emit each input query line in canonical JSON form, one
        // line per query, optionally optimizing each query first.
        SubCommand::Canonicalize {
            optimize,
            input,
            output,
        } => {
            // ~copy paste from SubCommand::Vectorize
            let s = read_queries_file_step_1(&input)?;
            let mut out = BufWriter::new(
                File::create(&output).map_err(ctx!("opening output file {output:?}"))?,
            );
            for (i, line) in read_queries_file_step_2(&s) {
                let (_value, canonical_query) = canonicalize(optimize, line, i, &input)?;
                // Immediately-invoked closure groups the two writes
                // under a single error context.
                (|| -> Result<()> {
                    out.write_all(canonical_query.as_bytes())?;
                    out.write_all(b"\n")?;
                    Ok(())
                })()
                .with_context(|| anyhow!("writing to output file {output:?}"))?;
            }
            // Flush explicitly: Drop would silently swallow a write
            // error.
            out.flush()
                .with_context(|| anyhow!("writing to output file {output:?}"))?;
        }
        // Turn queries into feature vectors, cluster them with DBSCAN,
        // and print or write out the resulting clusters.
        SubCommand::Vectorize {
            parse_file: ParseFile { path },
            show_values,
            optimize,
            queries_output,
        } => {
            let mut queries = Queries::new();
            let s = read_queries_file_step_1(&path)?;
            for (i, line) in read_queries_file_step_2(&s) {
                let (value, canonical_query) = canonicalize(optimize, line, i, &path)?;
                // Presumably `insert_query` yields an id only for the
                // first occurrence of a canonical query, so duplicates
                // are vectorized once — TODO confirm against Queries.
                if let Some(id) = queries.insert_query(canonical_query.into()) {
                    queries.add_vectors(id, &value);
                }
            }
            // dbg!(queries);
            // Report every feature key and its weight (plus, on
            // request, the per-query leaf values).
            for (key, (weight, idvals)) in &queries.vectors {
                println!("{weight}\t{key}");
                if show_values {
                    for (id, val) in idvals {
                        match val {
                            LeafValue::Bool(b) => println!("    {id} => {b:?}"),
                            LeafValue::Number(number) => println!("    {id} => {number:?}"),
                            LeafValue::None => (),
                        }
                    }
                }
            }

            // Build the n_queries x n_weights feature matrix, one row
            // per query.
            let key_weights_and_ranges = queries.key_weights_and_ranges();
            dbg!(&key_weights_and_ranges); // NOTE(review): leftover debug output?
            let n_queries = queries.query_ids_count();
            let n_weights = key_weights_and_ranges.len();
            let mut records = Array2::<f64>::zeros((n_queries, n_weights));

            for query_id in queries.query_ids() {
                let qw = queries.weights_for_query(query_id, &key_weights_and_ranges);
                // dbg!(&qw);
                qw.copy_to_array(records.row_mut(query_id.0));
            }
            dbg!(records.row(0)); // NOTE(review): leftover debug output?
            dbg!(records.row(1)); // NOTE(review): panics when there are <2 queries — confirm intended

            // DBSCAN parameters are hard-coded; tune here if the
            // clustering is too coarse or too fine.
            let min_points = 2;
            let clusters = Dbscan::params(min_points)
                .tolerance(1.9)
                .transform(&records)?;
            // dbg!(clusters);

            // One cluster label (None = noise) per query row.
            assert_eq!(clusters.len(), n_queries);

            // (XX is there actually no reason to use Set vs Vec here?
            // Ordering enforced to be by id in either case?)
            let mut easy_clusters: BTreeMap<ClusterId, BTreeSet<Arc<str>>> = BTreeMap::new();
            let mut non_clustered: Vec<Arc<str>> = Vec::new();

            // Group query strings by cluster id; unlabelled (noise)
            // points go into `non_clustered`.
            for (query_id, cluster) in clusters.iter().enumerate() {
                let query_id = QueryId(query_id);
                let query = queries.query_by_id(query_id).clone_arc();
                if let Some(cluster_id) = cluster {
                    let cluster_id = ClusterId(*cluster_id);
                    match easy_clusters.entry(cluster_id) {
                        Entry::Vacant(vacant_entry) => {
                            let mut set = BTreeSet::new();
                            set.insert(query);
                            vacant_entry.insert(set);
                        }
                        Entry::Occupied(mut occupied_entry) => {
                            occupied_entry.get_mut().insert(query);
                        }
                    }
                } else {
                    non_clustered.push(query);
                }
            }

            // Either write each cluster (and the noise set) to the
            // requested output, or just dump them for inspection.
            if let Some(queries_output) = queries_output {
                for (cluster_id, cluster) in &easy_clusters {
                    queries_output.write_queries(
                        Some(extension_for_cluster_id(Some(*cluster_id))),
                        &mut cluster.iter(),
                    )?;
                }

                if !non_clustered.is_empty() {
                    queries_output.write_queries(
                        Some(extension_for_cluster_id(None)),
                        &mut non_clustered.iter(),
                    )?;
                }
            } else {
                dbg!(easy_clusters);
                dbg!(non_clustered);
            }
        }

        // Concatenate the `queries.ndjson` files from several queries
        // dirs into a single new queries dir.
        SubCommand::MergeClustersTo {
            new_queries_dir,
            queries_dirs,
        } => {
            let mut queries = Vec::new();

            for queries_dir in queries_dirs {
                let queries_file_path = queries_dir.append("queries.ndjson");
                let s = read_queries_file_step_1(&queries_file_path)?;
                for (_i, line) in read_queries_file_step_2(&s) {
                    queries.push(Arc::from(line.to_owned()));
                }
            }

            // Re-use QueriesOutput; a bit hacky.
            let queries_output = QueriesOutput::ToFolders {
                reason: None,
                folder_base_path: new_queries_dir,
            };
            queries_output.write_queries(None, &mut queries.iter())?;
        }
    }

    Ok(())
}