diff options
| author | Mitch Riedstra <mitch@riedstra.us> | 2020-09-27 21:00:21 -0400 |
|---|---|---|
| committer | Mitch Riedstra <mitch@riedstra.us> | 2020-09-27 21:00:21 -0400 |
| commit | ed8ece2154abac85660dbc25fbcf5e069f779913 (patch) | |
| tree | 5aea55e55429a4e71b03e72725ffa5e675f8a696 | |
| parent | 0b830cc8bb7fde0bbf9514b8ae607084188c5c8e (diff) | |
| download | deduplicator-ed8ece2154abac85660dbc25fbcf5e069f779913.tar.gz deduplicator-ed8ece2154abac85660dbc25fbcf5e069f779913.tar.xz | |
Mostly working as I want...
| -rw-r--r-- | dedup.go | 100 | ||||
| -rw-r--r-- | main.go | 253 |
2 files changed, 253 insertions, 100 deletions
// Command deduplicator walks a directory tree, computes the SHA-256 checksum
// of every regular file it finds and reports the files that share a checksum.
// Optionally it removes the duplicates whose name (or path) matches a regular
// expression.
//
// This file replaces the earlier single-threaded dedup.go: the checksums are
// now computed by a pool of worker goroutines fed by a scanner goroutine.
package main

import (
	"crypto/sha256"
	"flag"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"sort"
	"strings"
)

/*
func writeCache(fn string) error {
}

func readCache(fn string) error {
}
*/

// sha256Sum reads rdr until EOF and returns the SHA-256 checksum of everything
// it read, formatted as upper-case hexadecimal. A read error is returned
// together with an empty string.
func sha256Sum(rdr io.Reader) (string, error) {
	h := sha256.New()
	if _, err := io.Copy(h, rdr); err != nil {
		return "", err
	}

	return fmt.Sprintf("%X", h.Sum(nil)), nil
}

// getChecksum opens the file at fileName and returns its SHA-256 checksum as
// produced by sha256Sum, or the error that prevented opening or reading it.
func getChecksum(fileName string) (string, error) {
	fh, err := os.Open(fileName)
	if err != nil {
		return "", err
	}
	defer fh.Close()

	return sha256Sum(fh)
}

// ChecksummerResult is what a Checksummer worker emits for one file: the path
// and file info it was handed, plus either the checksum or the error that
// occurred while computing it. Checksum is empty whenever Error is non-nil.
type ChecksummerResult struct {
	Path     string
	Checksum string
	Error    error
	Info     os.FileInfo
}

// ChecksummerInput is one unit of work for a Checksummer worker: the path of a
// file together with the os.FileInfo the scanner obtained for it.
type ChecksummerInput struct {
	Path string
	Info os.FileInfo
}

// Checksummer is a worker that checksums every file received on input and
// sends one ChecksummerResult per file on results, errors included. Once input
// is closed and drained it signals completion by sending on done.
func Checksummer(done chan<- bool, input <-chan *ChecksummerInput, results chan<- *ChecksummerResult) {
	for i := range input {
		sum, err := getChecksum(i.Path)
		results <- &ChecksummerResult{
			Path:     i.Path,
			Checksum: sum,
			Error:    err,
			Info:     i.Info,
		}
	}
	done <- true
}

// Scanner walks the tree rooted at path and sends every non-directory entry
// on paths. Unless all is set, anything whose path contains "/." (hidden
// files and everything below hidden directories) is left out. When the walk
// ends Scanner closes paths, prints a walk error (if any) to stderr and then
// signals completion by sending on done.
func Scanner(done chan<- bool, all bool, path string, paths chan<- *ChecksummerInput) {
	err := filepath.Walk(path, func(p string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if !all && strings.Contains(p, "/.") {
			// Every path below a hidden directory also contains "/.", so
			// pruning the directory selects the same files without
			// visiting its contents.
			if info.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}

		if info.IsDir() {
			return nil
		}

		paths <- &ChecksummerInput{Path: p, Info: info}

		return nil
	})
	// Closing paths is what lets the Checksummer workers terminate.
	close(paths)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
	done <- true
}

// formatBytes renders a byte count for humans. Counts below 1024 are printed
// as "<n> b"; larger counts are scaled by powers of 1024 and printed with two
// decimals and a "kb", "mb", "gb" or "tb" suffix. "tb" is the largest unit, so
// bigger values are still expressed in it.
func formatBytes(b int64) string {
	if b < 1024 {
		return fmt.Sprintf("%d b", b)
	}

	s := ""

	pfxs := "kmgt"
	for i := 0; i < len(pfxs); i++ {
		pow := math.Pow(float64(1024), float64(i+1))
		// This unit is too big, keep the string built for the previous one.
		if b < int64(pow) {
			return s
		}
		s = fmt.Sprintf("%.2f %cb",
			float64(b)/(pow),
			pfxs[i])
	}

	return s
}

// addResult files r under its checksum in checksums and reports whether it
// was recorded. A result that carries an error has no checksum, so it is
// rejected: storing it under the empty checksum would make every unreadable
// file look like a duplicate of every other unreadable file.
func addResult(checksums map[string][]*ChecksummerResult, r *ChecksummerResult) bool {
	if r.Error != nil {
		return false
	}
	checksums[r.Checksum] = append(checksums[r.Checksum], r)
	return true
}

// duplicateSums returns, in ascending order, the checksums that are shared by
// more than one file.
func duplicateSums(checksums map[string][]*ChecksummerResult) []string {
	var sums []string
	for sum, list := range checksums {
		if len(list) > 1 {
			sums = append(sums, sum)
		}
	}
	sort.Strings(sums)
	return sums
}

// sortByPath orders list by path, in place. Results arrive in whatever order
// the workers finish, so this is what makes the output reproducible.
func sortByPath(list []*ChecksummerResult) {
	sort.Slice(list, func(i, j int) bool {
		return list[i].Path < list[j].Path
	})
}

// selectRemovals returns the entries of list that re matches. The match is
// made against the file name, or against the whole path when matchPath is set.
func selectRemovals(list []*ChecksummerResult, re *regexp.Regexp, matchPath bool) []*ChecksummerResult {
	var removing []*ChecksummerResult
	for _, result := range list {
		var s string
		if matchPath {
			s = result.Path
		} else {
			s = result.Info.Name()
		}

		if re.MatchString(s) {
			removing = append(removing, result)
		}
	}
	return removing
}

// removeDuplicates handles one group of identical files in -rm mode. Nothing
// is touched when re matches every file of the group (at least one copy must
// survive) or none of them. Otherwise the matching files are deleted when
// really is set, and merely listed when it is not (dry run).
func removeDuplicates(sum string, list []*ChecksummerResult, re *regexp.Regexp, matchPath, really bool) {
	removing := selectRemovals(list, re, matchPath)

	switch {
	case len(removing) >= len(list):
		fmt.Printf("%s: Not removing, matches all files\n", sum)
		return
	case len(removing) == 0:
		fmt.Printf("%s: Not removing, no matches\n", sum)
		return
	}

	for _, f := range removing {
		if !really {
			fmt.Printf("Would remove: %s\n", f.Path)
			continue
		}

		fmt.Printf("Removing: %s\n", f.Path)
		if err := os.Remove(f.Path); err != nil {
			fmt.Fprintln(os.Stderr, err)
		}
	}
}

// main parses the command line, runs the scanner and the checksum workers,
// groups the results by checksum and then prints (or removes) the duplicates.
func main() {
	fl := flag.NewFlagSet("deduplicator", flag.ExitOnError)

	path := fl.String("path", ".", "Path to deduplicate files in")
	all := fl.Bool("a", false, "Scan hidden files as well")
	script := fl.Bool("s", false, "Output format sutiable for scripts")
	procs := fl.Int("n", runtime.NumCPU(), "Number of processes to run at once")
	remove := fl.Bool("rm", false, "Remove duplicate files? ( Note requires a regex, and will not remove all versions, defaults to a dry run")
	removeRegexStr := fl.String("regex", "", "Regular expression to match duplicated files")
	removeYes := fl.Bool("yes-i-want-my-data-gone", false, "Actually remove the files")
	matchPath := fl.Bool("match-path", false, "match on the path, rather than the filename")

	// With flag.ExitOnError, Parse exits on failure instead of returning.
	_ = fl.Parse(os.Args[1:])

	// Without at least one worker nobody reads from the scanner's channel
	// and the program would deadlock.
	if *procs < 1 {
		log.Fatal("At least one worker is required (-n must be >= 1)")
	}

	if *remove && *removeRegexStr == "" {
		log.Fatal("A regular expression (-regex '<expr>') is required")
	}

	removeRegex, err := regexp.Compile(*removeRegexStr)
	if err != nil {
		log.Fatalf("Error compiling provided regular expression: %s", err)
	}

	// checksum -> files having that checksum
	checksums := make(map[string][]*ChecksummerResult)
	pths := make(chan *ChecksummerInput)
	results := make(chan *ChecksummerResult)
	done := make(chan bool)

	for j := 0; j < *procs; j++ {
		go Checksummer(done, pths, results)
	}

	go Scanner(done, *all, *path, pths)

	// Every worker and the scanner send exactly one value on done. A worker
	// only does so after its last result has been received here, so nothing
	// is lost by stopping once all of them have reported in.
	finished := 0
	for finished < *procs+1 {
		select {
		case result := <-results:
			if !addResult(checksums, result) {
				fmt.Fprintln(os.Stderr, result.Error)
			}
		case <-done:
			finished++
		}
	}

	for _, sum := range duplicateSums(checksums) {
		list := checksums[sum]
		sortByPath(list)

		switch {
		case *script:
			for _, result := range list {
				fmt.Printf("%s::%d::%s\n", sum, result.Info.Size(), result.Path)
			}
		case *remove:
			removeDuplicates(sum, list, removeRegex, *matchPath, *removeYes)
		default:
			fmt.Println(sum)
			// All files of a group are identical, so the size of the
			// first one stands for the whole group.
			fmt.Println(formatBytes(list[0].Info.Size()))
			for _, result := range list {
				fmt.Println("\t" + result.Path)
			}
			fmt.Println("")
		}
	}
}
