package main

import (
	"crypto/sha256"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
)

// sha256Sum reads rdr until EOF and returns the SHA-256 digest of everything
// read, formatted as upper-case hexadecimal. A read error is returned as-is
// together with an empty string.
func sha256Sum(rdr io.Reader) (string, error) {
	h := sha256.New()
	if _, err := io.Copy(h, rdr); err != nil {
		return "", err
	}
	return fmt.Sprintf("%X", h.Sum(nil)), nil
}

// getChecksum opens the named file and returns the sha256Sum of its contents.
// The error is non-nil when the file cannot be opened or read.
func getChecksum(fileName string) (string, error) {
	fh, err := os.Open(fileName)
	if err != nil {
		return "", err
	}
	defer fh.Close()
	return sha256Sum(fh)
}

// ChecksummerResult is what a Checksummer worker reports for one file.
type ChecksummerResult struct {
	Path     string
	Checksum string
	// Error is kept out of the JSON cache: encoding/json cannot decode a
	// JSON object back into the error interface, so a single stored failure
	// would make the whole cache file fail to load.
	Error error `json:"-"`
	Name  string
	Size  int64
}

// ChecksummerInput is one unit of work for a Checksummer worker: the path of
// a file together with the FileInfo the directory walk produced for it.
type ChecksummerInput struct {
	Path string
	Info os.FileInfo
}

// ChecksumSlice is a list of results, used for the files sharing a checksum.
type ChecksumSlice []*ChecksummerResult

// ChecksummerResultMap groups results by their checksum string.
type ChecksummerResultMap map[string]ChecksumSlice

// Save writes the map as JSON to the file fn, creating it if necessary and
// truncating any previous contents. Errors from opening, encoding and closing
// the file are all reported.
func (c *ChecksummerResultMap) Save(fn string) (err error) {
	fh, err := os.OpenFile(fn, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0666)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close on a file we just wrote can mean lost data, so it
		// must not be dropped; an earlier encode error still takes priority.
		if cerr := fh.Close(); err == nil {
			err = cerr
		}
	}()
	return json.NewEncoder(fh).Encode(c)
}

// Load decodes the JSON document stored in the file fn into the map.
func (c *ChecksummerResultMap) Load(fn string) error {
	fh, err := os.Open(fn)
	if err != nil {
		return err
	}
	defer fh.Close()
	return json.NewDecoder(fh).Decode(c)
}

// BuildRemoveList returns the entries of c that reg matches. The regular
// expression is applied to the full path when matchPath is true and to the
// file name otherwise. The returned slice is never nil.
func (c ChecksumSlice) BuildRemoveList(reg *regexp.Regexp, matchPath bool) ChecksumSlice {
	removing := ChecksumSlice{}
	for _, result := range c {
		subject := result.Name
		if matchPath {
			subject = result.Path
		}
		if reg.MatchString(subject) {
			removing = append(removing, result)
		}
	}
	return removing
}

// Checksummer is a worker: it checksums every file received on input and
// sends one result per file (including any error) on results. When input is
// closed and drained it sends a single value on done.
func Checksummer(done chan<- bool, input <-chan *ChecksummerInput, results chan<- *ChecksummerResult) {
	for in := range input {
		sum, err := getChecksum(in.Path)
		results <- &ChecksummerResult{
			Path:     in.Path,
			Checksum: sum,
			Error:    err,
			Name:     in.Info.Name(),
			Size:     in.Info.Size(),
		}
	}
	done <- true
}

// Scanner walks the tree rooted at path and sends every non-directory entry
// on paths. Unless all is true, entries below the root whose name starts with
// a dot are skipped, and hidden directories are not descended into. The root
// itself is always scanned, whatever its name. The walk stops at the first
// error, which is printed to stderr. In every case paths is closed and a
// single value is then sent on done.
func Scanner(done chan<- bool, all bool, path string, paths chan<- *ChecksummerInput) {
	err := filepath.Walk(path, func(p string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Judge hiddenness by the entry's own name rather than by the whole
		// path, so a root such as "../../dir" or "~/.cache/x" is not
		// mistaken for a tree consisting only of hidden files.
		if !all && p != path && strings.HasPrefix(info.Name(), ".") {
			if info.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if info.IsDir() {
			return nil
		}
		paths <- &ChecksummerInput{Path: p, Info: info}
		return nil
	})
	close(paths)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
	done <- true
}

// formatBytes renders a byte count for humans: values below 1024 are printed
// as "<n> b", larger ones with two decimals in the largest 1024-based unit
// (kb, mb, gb, tb) that does not exceed the value.
func formatBytes(b int64) string {
	if b < 1024 {
		return fmt.Sprintf("%d b", b)
	}
	const prefixes = "kmgt"
	s := ""
	for i := 0; i < len(prefixes); i++ {
		pow := math.Pow(1024, float64(i+1))
		// This unit is too big; the string built for the previous one wins.
		if b < int64(pow) {
			return s
		}
		s = fmt.Sprintf("%.2f %cb", float64(b)/pow, prefixes[i])
	}
	return s
}

// reportOptions carries the command-line choices that control how duplicate
// groups are printed and whether matching files are removed.
type reportOptions struct {
	script    bool
	remove    bool
	removeYes bool
	matchPath bool
	regex     *regexp.Regexp
}

// loadCache reads a result map previously written with Save. The boolean is
// true only when the cache was read successfully. A missing file is silent;
// any other failure is reported on stderr. On failure no partial data is
// returned, so the caller starts its scan from an empty map.
func loadCache(fn string) (ChecksummerResultMap, bool) {
	cached := make(ChecksummerResultMap)
	err := cached.Load(fn)
	if err == nil {
		fmt.Fprintln(os.Stderr, "Loaded cache file, using")
		return cached, true
	}
	if !os.IsNotExist(err) {
		fmt.Fprintln(os.Stderr, "Error loading cache file: ", err)
	}
	return nil, false
}

// scanTree checksums every file Scanner finds under root using procs
// concurrent Checksummer workers and returns the results grouped by
// checksum. Files that could not be read are reported on stderr and left out:
// they all carry the empty checksum and would otherwise look like duplicates
// of one another.
func scanTree(root string, all bool, procs int) ChecksummerResultMap {
	checksums := make(ChecksummerResultMap)
	inputs := make(chan *ChecksummerInput)
	results := make(chan *ChecksummerResult)
	done := make(chan bool)

	for j := 0; j < procs; j++ {
		go Checksummer(done, inputs, results)
	}
	go Scanner(done, all, root, inputs)

	// Every worker and the scanner signal done exactly once. A worker only
	// does so after its last result has been received here, so once all
	// procs+1 signals are in, no result is outstanding.
	for finished := 0; finished < procs+1; {
		select {
		case result := <-results:
			if result.Error != nil {
				fmt.Fprintln(os.Stderr, result.Error)
				continue
			}
			checksums[result.Checksum] = append(checksums[result.Checksum], result)
		case <-done:
			finished++
		}
	}
	return checksums
}

// reportRemoval handles one duplicate group when -rm is given. Nothing is
// removed when the regular expression matches no file or every file of the
// group; otherwise the matching files are listed and, with removeYes, deleted.
func reportRemoval(sum string, list ChecksumSlice, o reportOptions) {
	removing := list.BuildRemoveList(o.regex, o.matchPath)
	matchesAll := len(removing) >= len(list)
	matchesNone := len(removing) == 0

	if o.script {
		if matchesAll || matchesNone {
			for _, result := range list {
				fmt.Printf("%s::%d::%s::%s\n", sum, result.Size, result.Path, "not removing")
			}
			return
		}
		action := "would remove"
		if o.removeYes {
			action = "removing"
		}
		for _, f := range removing {
			fmt.Printf("%s::%d::%s::%s\n", sum, f.Size, f.Path, action)
			if !o.removeYes {
				continue
			}
			if err := os.Remove(f.Path); err != nil {
				fmt.Fprintln(os.Stderr, err)
			}
		}
		return
	}

	switch {
	case matchesAll:
		fmt.Printf("%s: Not removing, matches all files\n", sum)
		return
	case matchesNone:
		fmt.Printf("%s: Not removing, no matches\n", sum)
		return
	}
	for _, f := range removing {
		if !o.removeYes {
			fmt.Printf("Would remove: %s\n", f.Path)
			continue
		}
		fmt.Printf("Removing: %s\n", f.Path)
		if err := os.Remove(f.Path); err != nil {
			fmt.Fprintln(os.Stderr, err)
		}
	}
}

// report prints every group of two or more files sharing a checksum, in the
// format selected by o, and performs removals when requested.
func report(checksums ChecksummerResultMap, o reportOptions) {
	for sum, list := range checksums {
		if len(list) <= 1 {
			continue
		}
		switch {
		case o.remove:
			reportRemoval(sum, list, o)
		case o.script:
			for _, result := range list {
				fmt.Printf("%s::%d::%s\n", sum, result.Size, result.Path)
			}
		default:
			fmt.Println(sum)
			for i, result := range list {
				if i == 0 {
					fmt.Println(formatBytes(result.Size))
				}
				fmt.Println("\t" + result.Path)
			}
			fmt.Println("")
		}
	}
}

// main parses the command line, obtains the checksum map (from the cache
// file when one is given and readable, otherwise by scanning the tree and
// optionally saving the cache) and reports or removes the duplicates.
func main() {
	fl := flag.NewFlagSet("deduplicator", flag.ExitOnError)
	path := fl.String("path", ".", "Path to deduplicate files in")
	all := fl.Bool("a", false, "Scan hidden files as well")
	script := fl.Bool("s", false, "Output format sutiable for scripts")
	procs := fl.Int("n", runtime.NumCPU(), "Number of processes to run at once")
	remove := fl.Bool("rm", false, "Remove duplicate files? ( Note requires a regex, and will not remove all versions, defaults to a dry run")
	removeRegexStr := fl.String("regex", "", "Regular expression to match duplicated files")
	removeYes := fl.Bool("yes-i-want-my-data-gone", false, "Actually remove the files")
	matchPath := fl.Bool("match-path", false, "match on the path, rather than the filename")
	cacheFile := fl.String("cache", "", "If not an empty string, the data gathered on a directory will be cached to the file, allowing subsequent runs to be near instant. No care is taken to check whether or not the cache is up to date with the current state of the directroy. If in doubt, leave empty.")
	// With flag.ExitOnError, Parse exits the program itself on bad input.
	_ = fl.Parse(os.Args[1:])

	if *remove && *removeRegexStr == "" {
		log.Fatal("A regular expression (-regex '') is required")
	}
	removeRegex, err := regexp.Compile(*removeRegexStr)
	if err != nil {
		log.Fatalf("Error compiling provided regular expression: %s", err)
	}

	var checksums ChecksummerResultMap
	cached := false
	if *cacheFile != "" {
		checksums, cached = loadCache(*cacheFile)
	}
	if !cached {
		// Without at least one worker nobody drains the scanner's channel
		// and the program would deadlock on the first file found.
		if *procs < 1 {
			log.Fatal("The number of processes (-n) must be at least 1")
		}
		checksums = scanTree(*path, *all, *procs)
		if *cacheFile != "" {
			fmt.Fprintln(os.Stderr, "Saving cache file")
			if err := checksums.Save(*cacheFile); err != nil {
				fmt.Fprintln(os.Stderr, "Error saving cache file: ", err)
			}
		}
	}

	report(checksums, reportOptions{
		script:    *script,
		remove:    *remove,
		removeYes: *removeYes,
		matchPath: *matchPath,
		regex:     removeRegex,
	})
}