commit 0de60fb7c3048bb5ae393888d7cabff007dfe74e Author: Lian Studer Date: Sat Feb 14 21:00:23 2026 +0100 first working version diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..d5c92db --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,408 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bumpalo" +version = "3.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = 
"chrono" +version = "0.4.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "js-sys" +version = "0.3.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "murmur3" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = 
"1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "reimagine" +version = "0.1.0" +dependencies = [ + "anyhow", + "bit-vec", + "chrono", + "murmur3", + "num", + "walkdir", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "syn" +version = "2.0.115" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e614ed320ac28113fa64972c4262d5dbc89deacdfd00c34a3e4cea073243c12" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..b987e66 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "reimagine" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.101" +bit-vec = "0.8.0" +chrono = "0.4.43" +murmur3 = "0.5.2" +num = "0.4.3" +walkdir = "2.5.0" diff --git a/README.md b/README.md new file mode 100644 index 0000000..0ffd556 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# Reimagine + +## Finding file count + +find . 
-type f -iname "*.jpg" | wc -l
\ No newline at end of file
diff --git a/src/bloom_filter.rs b/src/bloom_filter.rs
new file mode 100644
index 0000000..dee2d22
--- /dev/null
+++ b/src/bloom_filter.rs
@@ -0,0 +1,88 @@
+//! Fixed-size Bloom filter over strings, hashed with seeded 32-bit MurmurHash3.
+
+use std::{f64, io::Cursor};
+
+use bit_vec::BitVec;
+use murmur3::murmur3_32;
+
+/// Target false-positive probability every filter is sized for.
+const FALSE_POSITIVE_PROB: f64 = 0.05;
+
+/// Probabilistic set of strings.
+///
+/// [`BloomFilter::lookup`] never answers `false` for a value that was inserted,
+/// but it may answer `true` for a value that never was.
+pub struct BloomFilter {
+    bit_vec: BitVec,
+    /// Length of `bit_vec` in bits; at least 1 so `% bits` cannot divide by zero.
+    bits: u32,
+
+    /// Number of seeded hashes computed per value; at least 1.
+    hash_count: u32,
+}
+
+impl BloomFilter {
+    /// Creates an empty filter sized for `expected_elems` insertions at
+    /// [`FALSE_POSITIVE_PROB`].
+    ///
+    /// An estimate of 0 yields the smallest usable filter (one bit, one hash)
+    /// instead of a zero-sized one whose first use would divide by zero.
+    pub fn new(expected_elems: u32) -> BloomFilter {
+        let bits = BloomFilter::get_optimal_size(expected_elems, FALSE_POSITIVE_PROB);
+        let hash_count = BloomFilter::get_hash_count(bits, expected_elems);
+
+        BloomFilter {
+            bit_vec: BitVec::from_elem(bits as usize, false),
+            bits,
+            hash_count,
+        }
+    }
+
+    /// Marks `value` as present by setting one bit per hash seed.
+    pub fn insert(&mut self, value: &str) {
+        for seed in 0..self.hash_count {
+            let slot = self.slot(value, seed);
+            self.bit_vec.set(slot, true);
+        }
+    }
+
+    /// Returns `false` if `value` was definitely never inserted and `true` if it
+    /// was possibly inserted.
+    pub fn lookup(&self, value: &str) -> bool {
+        (0..self.hash_count).all(|seed| self.bit_vec[self.slot(value, seed)])
+    }
+
+    /// Maps `value` to a bit position using MurmurHash3 seeded with `seed`.
+    fn slot(&self, value: &str, seed: u32) -> usize {
+        let mut cursor = Cursor::new(value.as_bytes());
+        let digest = murmur3_32(&mut cursor, seed)
+            .expect("reading from an in-memory buffer cannot fail");
+
+        (digest % self.bits) as usize
+    }
+
+    /// Bit count `m = ceil(-n * ln(p) / ln(2)^2)`, never less than 1.
+    ///
+    /// `n` is the expected number of elements and `p` the accepted probability
+    /// of a false positive.
+    fn get_optimal_size(n: u32, p: f64) -> u32 {
+        let ideal = -(n as f64 * p.ln()) / f64::consts::LN_2.powi(2);
+
+        // Float-to-int `as` saturates, so an oversized estimate caps at u32::MAX.
+        ideal.ceil().max(1.0) as u32
+    }
+
+    /// Hash count `k = floor(m / n * ln(2))`, never less than 1.
+    ///
+    /// `m` is the bit vector size and `n` the expected number of elements.
+    fn get_hash_count(m: u32, n: u32) -> u32 {
+        if n == 0 {
+            // Nothing is expected, so avoid the division and use a single hash.
+            return 1;
+        }
+
+        let ideal = (m as f64 / n as f64) * f64::consts::LN_2;
+
+        (ideal as u32).max(1)
+    }
+}
\ No newline at end of file
diff --git a/src/crawler.rs b/src/crawler.rs
new file mode 100644
index 0000000..801e879
--- /dev/null
+++ b/src/crawler.rs
@@ -0,0 +1,81 @@
+//! Directory crawler that collects files into an [`Index`], skipping repeated names.
+
+use std::collections::HashSet;
+use std::path::PathBuf;
+use walkdir::WalkDir;
+
+use crate::{bloom_filter::BloomFilter, index::Index};
+
+/// Walks a directory tree and de-duplicates files by lower-cased file name.
+pub struct Crawler {
+    /// Cheap pre-check: a negative answer proves a name has not been seen yet.
+    bloom_filter: BloomFilter,
+    /// Exact record of accepted names, used to confirm Bloom-filter hits.
+    seen: HashSet<String>,
+
+    /// Number of files skipped because their name had already been indexed.
+    pub rejected: usize,
+}
+
+impl Crawler {
+    /// Creates a crawler whose Bloom filter is sized for `expected_elems` files.
+    pub fn new(expected_elems: u32) -> Crawler {
+        Crawler {
+            bloom_filter: BloomFilter::new(expected_elems),
+            seen: HashSet::new(),
+            rejected: 0,
+        }
+    }
+
+    /// Recursively walks `starting_point` and indexes every regular file whose
+    /// lower-cased name has not been indexed before.
+    ///
+    /// Repeated names are counted in `rejected` and reported on stdout. Entries
+    /// that cannot be read are reported on stderr and skipped.
+    pub fn create_index(&mut self, starting_point: &PathBuf) -> Index {
+        let mut index = Index::new();
+
+        for entry in WalkDir::new(starting_point) {
+            let entry = match entry {
+                Ok(entry) => entry,
+                Err(err) => {
+                    eprintln!("Skipped: {}", err);
+                    continue;
+                }
+            };
+
+            // Read the metadata once: it is both the file-type check and the
+            // payload handed to the index.
+            let metadata = match entry.metadata() {
+                Ok(metadata) => metadata,
+                Err(err) => {
+                    eprintln!("Skipped: {}", err);
+                    continue;
+                }
+            };
+            if !metadata.is_file() {
+                continue;
+            }
+
+            // Lossy conversion keeps a non-UTF-8 name from aborting the crawl.
+            let filename = entry.file_name().to_string_lossy().to_ascii_lowercase();
+
+            // The filter can report names it never saw, so a hit only counts as
+            // a duplicate when the exact set agrees.
+            let duplicate =
+                self.bloom_filter.lookup(&filename) && self.seen.contains(&filename);
+
+            if duplicate {
+                self.rejected += 1;
+                println!("Rejected: {}", filename);
+            } else {
+                self.bloom_filter.insert(&filename);
+                self.seen.insert(filename.clone());
+
+                index.add(filename, entry.into_path(), metadata);
+            }
+        }
+
+        index
+    }
+}
\ No newline at end of file
diff --git a/src/index.rs b/src/index.rs
new file mode 100644
index 0000000..2bf9cd6
--- /dev/null
+++ b/src/index.rs
@@ -0,0 +1,88 @@
+//! In-memory index that groups files by year and calendar date.
+
+use std::{collections::HashMap, fs::{self, DirEntry, Metadata}, hash::Hash, path::PathBuf};
+
+use chrono::{DateTime, Datelike, Local, NaiveDate, NaiveDateTime, Utc};
+
+/// All files of one year, keyed by date string.
+#[derive(Debug, Default)]
+struct Folder {
+    files: HashMap<String, Vec<ImageInfo>>,
+}
+
+/// One indexed file: its source path, the name it is copied under, and its metadata.
+#[derive(Debug)]
+struct ImageInfo {
+    path: PathBuf,
+    name: String,
+    metadata: Metadata,
+}
+
+impl ImageInfo {
+    pub fn new(path: PathBuf, name: String, metadata: Metadata) -> ImageInfo {
+        ImageInfo { path, name, metadata }
+    }
+}
+
+/// Files grouped as year -> date -> entries, mirroring the output directory layout.
+#[derive(Debug, Default)]
+pub struct Index {
+    folders: HashMap<u32, Folder>,
+}
+
+impl Index {
+    /// Creates an empty index.
+    pub fn new() -> Index {
+        Index::default()
+    }
+
+    /// Files `path` under the year and date of its timestamp.
+    ///
+    /// The creation time is preferred; where the platform or filesystem does not
+    /// report one, the modification time is used instead. A file with neither is
+    /// reported on stderr and left out of the index.
+    pub fn add(&mut self, file_name: String, path: PathBuf, meta: Metadata) {
+        let timestamp = match meta.created().or_else(|_| meta.modified()) {
+            Ok(timestamp) => timestamp,
+            Err(err) => {
+                eprintln!("Skipped: {}: {}", path.display(), err);
+                return;
+            }
+        };
+        // NOTE(review): local time is assumed here so that folders follow the
+        // wall-clock day; confirm the intended time zone.
+        let created_on: DateTime<Local> = timestamp.into();
+
+        let year = created_on.year() as u32;
+        let date = created_on.date_naive().to_string();
+
+        self.folders
+            .entry(year)
+            .or_default()
+            .files
+            .entry(date)
+            .or_default()
+            .push(ImageInfo::new(path, file_name, meta));
+    }
+
+    /// Copies every indexed file to `<output>/<year>/<date>/<name>`, creating
+    /// directories as needed.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a directory cannot be created or a file cannot be copied.
+    pub fn propagate(&self, output: PathBuf) {
+        for (year, folder) in &self.folders {
+            for (date, files) in &folder.files {
+                let dir = output.join(year.to_string()).join(date);
+
+                fs::create_dir_all(&dir).expect("Pfad konnte nicht erstellt werden");
+
+                for file in files {
+                    fs::copy(&file.path, dir.join(&file.name))
+                        .expect("Datei konnte nicht kopiert werden");
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..45fc45c
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,36 @@
+use std::path::PathBuf;
+use anyhow::Context;
+
+use crate::{crawler::Crawler};
+
+mod bloom_filter;
+mod crawler;
+mod index;
+
+/// Entry point: `reimagine <master_path> <expected_file_count>`.
+///
+/// Indexes the files below `master_path` and copies them into `./output`,
+/// grouped by year and date. Missing or invalid arguments are returned as
+/// errors instead of panicking.
+fn main() -> anyhow::Result<()> {
+    let args = std::env::args().skip(1).collect::<Vec<String>>();
+
+    let master_path = args.first().context("Kein Masterpfad angegeben")?;
+
+    let expected_elems = args
+        .get(1)
+        .context("Anzahl zu erwartender Dateien fehlt")?
+        .parse::<u32>()
+        .context("Angegebene Anzahl ist keine gültige Zahl")?;
+
+    let mut crawler = Crawler::new(expected_elems);
+
+    let master_path = PathBuf::from(master_path);
+    let index = crawler.create_index(&master_path);
+
+    index.propagate(PathBuf::from("output"));
+
+    println!("Total Rejected: {:?}", crawler.rejected);
+
+    Ok(())
+}