// Command deduplicator walks a directory tree, computes the SHA-256 checksum
// of every file it finds, and reports (or optionally removes) files whose
// contents are identical.
package main

import (
	"crypto/sha256"
	"flag"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
)

/*
func writeCache(fn string) error {
}

func readCache(fn string) error {
}
*/

// sha256Sum reads rdr until EOF and returns the SHA-256 digest of everything
// read, formatted as upper-case hexadecimal. Any read error is returned with
// an empty digest.
func sha256Sum(rdr io.Reader) (string, error) {
	h := sha256.New()
	if _, err := io.Copy(h, rdr); err != nil {
		return "", err
	}
	return fmt.Sprintf("%X", h.Sum(nil)), nil
}

// getChecksum opens fileName and returns the SHA-256 checksum of its
// contents, or an error if the file cannot be opened or read.
func getChecksum(fileName string) (string, error) {
	fh, err := os.Open(fileName)
	if err != nil {
		return "", err
	}
	defer fh.Close()
	return sha256Sum(fh)
}

// ChecksummerResult is what a Checksummer worker emits for one file.
// Checksum is empty whenever Error is non-nil.
type ChecksummerResult struct {
	Path     string
	Checksum string
	Error    error
	Info     os.FileInfo
}

// ChecksummerInput describes one file queued for checksumming.
type ChecksummerInput struct {
	Path string
	Info os.FileInfo
}

// ChecksumSlice is a list of results, normally all sharing one checksum.
type ChecksumSlice []*ChecksummerResult

// BuildRemoveList returns the entries of c that match reg. When matchPath is
// true the whole Path is matched; otherwise only the file's base name (taken
// from Info) is matched.
func (c ChecksumSlice) BuildRemoveList(reg *regexp.Regexp, matchPath bool) ChecksumSlice {
	removing := ChecksumSlice{}
	for _, result := range c {
		s := ""
		if !matchPath {
			s = result.Info.Name()
		} else {
			s = result.Path
		}
		if reg.MatchString(s) {
			removing = append(removing, result)
		}
	}
	return removing
}

// Checksummer is a worker: it checksums every file received on input, sends
// one ChecksummerResult per file on results (including failures, with Error
// set), and sends a single value on done once input has been closed and
// drained.
func Checksummer(done chan<- bool, input <-chan *ChecksummerInput, results chan<- *ChecksummerResult) {
	for i := range input {
		sum, err := getChecksum(i.Path)
		results <- &ChecksummerResult{
			Path:     i.Path,
			Checksum: sum,
			Error:    err,
			Info:     i.Info,
		}
	}
	done <- true
}

// Scanner walks the tree rooted at path and sends every non-directory entry
// on paths. It closes paths when the walk ends, prints a walk error (if any)
// to stderr, and then sends a single value on done.
//
// Unless all is true, entries whose base name starts with "." are skipped,
// and hidden directories are not descended into. The root itself is always
// scanned, even if its own name is hidden, because it was requested
// explicitly.
func Scanner(done chan<- bool, all bool, path string, paths chan<- *ChecksummerInput) {
	root := path
	err := filepath.Walk(root, func(p string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Test the entry's own name instead of searching the joined path for
		// "/.": Walk reports top-level children of "." without a leading
		// separator (".git/config"), and the separator is OS-specific.
		if !all && p != root && strings.HasPrefix(info.Name(), ".") {
			if info.IsDir() {
				// Prune the whole hidden subtree instead of visiting it.
				return filepath.SkipDir
			}
			return nil
		}
		if info.IsDir() {
			return nil
		}
		paths <- &ChecksummerInput{Path: p, Info: info}
		return nil
	})

	close(paths)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
	done <- true
}

// formatBytes renders a byte count for humans using binary (1024-based)
// units: "b" below 1024, then "kb", "mb", "gb" and "tb" with two decimals.
// Values beyond the terabyte range are still expressed in "tb".
func formatBytes(b int64) string {
	if b < 1024 {
		return fmt.Sprintf("%d b", b)
	}

	const prefixes = "kmgt"
	s := ""
	for i := 0; i < len(prefixes); i++ {
		pow := math.Pow(1024, float64(i+1))
		// This unit is too big; the string built for the previous one is
		// the right answer.
		if b < int64(pow) {
			return s
		}
		s = fmt.Sprintf("%.2f %cb", float64(b)/pow, prefixes[i])
	}
	return s
}

// collectResults receives from results until expected completion signals
// have arrived on done, and returns the successful results grouped by
// checksum.
//
// Results carrying an error are printed to stderr and left out of the map:
// they have no checksum, so grouping them would make every unreadable file
// look like a duplicate of every other one (and eligible for removal).
func collectResults(results <-chan *ChecksummerResult, done <-chan bool, expected int) map[string]ChecksumSlice {
	checksums := make(map[string]ChecksumSlice)
	for finished := 0; finished < expected; {
		select {
		case result := <-results:
			if result.Error != nil {
				fmt.Fprintln(os.Stderr, result.Error)
				continue
			}
			checksums[result.Checksum] = append(checksums[result.Checksum], result)
		case <-done:
			finished++
		}
	}
	return checksums
}

// main parses the command line, runs one Scanner and -n Checksummer workers,
// and then reports each group of files sharing a checksum. With -rm it
// removes (or, without -yes-i-want-my-data-gone, only lists) the members of
// each group that match -regex, and never removes every member of a group.
func main() {
	fl := flag.NewFlagSet("deduplicator", flag.ExitOnError)
	path := fl.String("path", ".", "Path to deduplicate files in")
	all := fl.Bool("a", false, "Scan hidden files as well")
	script := fl.Bool("s", false, "Output format sutiable for scripts")
	procs := fl.Int("n", runtime.NumCPU(), "Number of processes to run at once")
	remove := fl.Bool("rm", false, "Remove duplicate files? ( Note requires a regex, and will not remove all versions, defaults to a dry run")
	removeRegexStr := fl.String("regex", "", "Regular expression to match duplicated files")
	removeYes := fl.Bool("yes-i-want-my-data-gone", false, "Actually remove the files")
	matchPath := fl.Bool("match-path", false, "match on the path, rather than the filename")
	// With flag.ExitOnError, Parse terminates the process itself on bad
	// input, so the returned error carries nothing to handle.
	_ = fl.Parse(os.Args[1:])

	// Without at least one worker nothing ever receives from the paths
	// channel and the scanner would block forever.
	if *procs < 1 {
		log.Fatal("At least one process (-n) is required")
	}

	if *remove && *removeRegexStr == "" {
		log.Fatal("A regular expression (-regex '') is required")
	}

	removeRegex, err := regexp.Compile(*removeRegexStr)
	if err != nil {
		log.Fatalf("Error compiling provided regular expression: %s", err)
	}

	pths := make(chan *ChecksummerInput)
	results := make(chan *ChecksummerResult)
	done := make(chan bool)

	for j := 0; j < *procs; j++ {
		go Checksummer(done, pths, results)
	}
	go Scanner(done, *all, *path, pths)

	// Every worker plus the scanner signals done exactly once.
	checksums := collectResults(results, done, *procs+1)

	for sum, list := range checksums {
		// A checksum seen only once has no duplicates.
		if len(list) <= 1 {
			continue
		}

		switch {
		case *script && *remove:
			removing := list.BuildRemoveList(removeRegex, *matchPath)
			// Never remove every copy, and do nothing when nothing matches.
			if len(removing) >= len(list) || len(removing) == 0 {
				for _, result := range list {
					fmt.Printf("%s::%d::%s::%s\n", sum, result.Info.Size(), result.Path, "not removing")
				}
				continue
			}

			if !*removeYes {
				for _, f := range removing {
					fmt.Printf("%s::%d::%s::%s\n", sum, f.Info.Size(), f.Path, "would remove")
				}
				continue
			}
			for _, f := range removing {
				fmt.Printf("%s::%d::%s::%s\n", sum, f.Info.Size(), f.Path, "removing")
				if err := os.Remove(f.Path); err != nil {
					fmt.Fprintln(os.Stderr, err)
				}
			}

		case *remove:
			removing := list.BuildRemoveList(removeRegex, *matchPath)
			if len(removing) >= len(list) {
				fmt.Printf("%s: Not removing, matches all files\n", sum)
				continue
			}
			if len(removing) == 0 {
				fmt.Printf("%s: Not removing, no matches\n", sum)
				continue
			}

			if !*removeYes {
				for _, f := range removing {
					fmt.Printf("Would remove: %s\n", f.Path)
				}
				continue
			}
			for _, f := range removing {
				fmt.Printf("Removing: %s\n", f.Path)
				if err := os.Remove(f.Path); err != nil {
					fmt.Fprintln(os.Stderr, err)
				}
			}

		case *script:
			for _, result := range list {
				fmt.Printf("%s::%d::%s\n", sum, result.Info.Size(), result.Path)
			}

		default:
			fmt.Println(sum)
			for i, result := range list {
				// All members have identical content, so print the size once.
				if i == 0 {
					fmt.Println(formatBytes(result.Info.Size()))
				}
				fmt.Println("\t" + result.Path)
			}
			fmt.Println("")
		}
	}
}