aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Mitch Riedstra <mitch@riedstra.us> 2020-09-27 21:00:21 -0400
committer Mitch Riedstra <mitch@riedstra.us> 2020-09-27 21:00:21 -0400
commit ed8ece2154abac85660dbc25fbcf5e069f779913 (patch)
tree 5aea55e55429a4e71b03e72725ffa5e675f8a696
parent 0b830cc8bb7fde0bbf9514b8ae607084188c5c8e (diff)
download deduplicator-ed8ece2154abac85660dbc25fbcf5e069f779913.tar.gz
deduplicator-ed8ece2154abac85660dbc25fbcf5e069f779913.tar.xz
Mostly working as I want...
-rw-r--r-- dedup.go 100
-rw-r--r-- main.go 253
2 files changed, 253 insertions, 100 deletions
diff --git a/dedup.go b/dedup.go
deleted file mode 100644
index a6b4549..0000000
--- a/dedup.go
+++ /dev/null
@@ -1,100 +0,0 @@
-package main
-
-import (
- "crypto/sha256"
- "flag"
- "fmt"
- "io"
- "os"
- "path/filepath"
- "strings"
-)
-
-// sha256Sum Takes an io.Reader and computes the checksum returning it as a
-// formatted string and an error if any
-func sha256Sum(rdr io.Reader) (string, error) {
- h := sha256.New()
- if _, err := io.Copy(h, rdr); err != nil {
- return "", err
- }
-
- return fmt.Sprintf("%X", h.Sum(nil)), nil
-}
-
-// Takes a filepath and returns the sha256sum and an error if any
-func getChecksum(fileName string) (string, error) {
- fh, err := os.Open(fileName)
- if err != nil {
- return "", err
- }
- defer fh.Close()
-
- return sha256Sum(fh)
-}
-
-func main() {
- fl := flag.NewFlagSet("deduplicator", flag.ExitOnError)
-
- path := fl.String("path", ".", "Path to deduplicate files in")
- all := fl.Bool("a", false, "Scan hidden files as well")
- script := fl.Bool("s", false, "Output format sutiable for scripts")
-
- _ = fl.Parse(os.Args[1:])
-
- // checksum -> filepaths
- checksums := make(map[string][]string)
-
- err := filepath.Walk(*path, func(path string, info os.FileInfo, err error) error {
- if err != nil {
- return err
- }
-
- if !*all {
- if strings.Contains(path, "/.") {
- return nil
- }
- }
-
- if info.IsDir() {
- return nil
- }
-
- sum, err := getChecksum(path)
- if err != nil {
- return err
- }
-
- if _, ok := checksums[sum]; !ok {
- checksums[sum] = []string{path}
- } else {
- checksums[sum] = append(
- checksums[sum],
- path)
- }
-
- return nil
- })
-
- if err != nil {
- fmt.Fprintln(os.Stderr, err)
- }
-
- for sum, list := range checksums {
- if len(list) == 1 {
- continue
- }
-
- switch {
- case *script:
- for _, path := range list {
- fmt.Printf("%s::%s\n", sum, path)
- }
- default:
- fmt.Println(sum)
- for _, path := range list {
- fmt.Println("\t" + path)
- }
- }
- }
-
-}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..3bfa10c
--- /dev/null
+++ b/main.go
@@ -0,0 +1,253 @@
+package main
+
+import (
+ "crypto/sha256"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "math"
+ "os"
+ "path/filepath"
+ "regexp"
+ "runtime"
+ "strings"
+)
+
+/*
+func writeCache(fn string) error {
+}
+
+func readCache(fn string) error {
+}
+*/
+
+// sha256Sum streams everything readable from rdr through a SHA-256 hash and
+// returns the digest as an uppercase hexadecimal string. If copying from rdr
+// fails, it returns an empty string together with that error.
+func sha256Sum(rdr io.Reader) (string, error) {
+	hasher := sha256.New()
+
+	_, copyErr := io.Copy(hasher, rdr)
+	if copyErr != nil {
+		return "", copyErr
+	}
+
+	digest := hasher.Sum(nil)
+	return fmt.Sprintf("%X", digest), nil
+}
+
+// getChecksum opens the file at fileName and returns its checksum as computed
+// by sha256Sum. The error is non-nil when the file cannot be opened or when
+// sha256Sum reports a failure.
+func getChecksum(fileName string) (string, error) {
+	file, openErr := os.Open(fileName)
+	if openErr != nil {
+		return "", openErr
+	}
+	// Release the handle once the checksum has been computed.
+	defer file.Close()
+
+	sum, sumErr := sha256Sum(file)
+	return sum, sumErr
+}
+
+// ChecksummerResult is the value a Checksummer worker sends for each input it
+// processes.
+type ChecksummerResult struct {
+ Path string // path of the file, copied from the ChecksummerInput
+ Checksum string // checksum string returned by getChecksum for Path
+ Error error // error returned by getChecksum for Path, nil on success
+ Info os.FileInfo // file info copied from the ChecksummerInput
+}
+
+// ChecksummerInput describes one file queued for checksumming; Scanner
+// produces these and Checksummer consumes them.
+type ChecksummerInput struct {
+ Path string // path of the file as reported by filepath.Walk
+ Info os.FileInfo // file info that filepath.Walk supplied for Path
+}
+
+// Checksummer is a worker loop. For every *ChecksummerInput received on input
+// it computes the file's checksum with getChecksum and sends exactly one
+// *ChecksummerResult on results, carrying the path, the checksum, any error
+// and the FileInfo it was handed. Once input has been closed and drained it
+// sends a single true on done and returns.
+func Checksummer(done chan<- bool, input <-chan *ChecksummerInput, results chan<- *ChecksummerResult) {
+	for {
+		job, open := <-input
+		if !open {
+			break
+		}
+
+		res := &ChecksummerResult{Path: job.Path, Info: job.Info}
+		res.Checksum, res.Error = getChecksum(job.Path)
+		results <- res
+	}
+
+	done <- true
+}
+
+// Scanner walks the file tree rooted at path and sends one *ChecksummerInput
+// on paths for every entry that is not a directory. When the walk ends,
+// whether normally or because of an error, it closes paths, prints any walk
+// error to stderr and then sends a single true on done.
+//
+// Unless all is true, hidden entries (those whose own name starts with a dot)
+// are skipped, and hidden directories are not descended into. The root itself
+// is never treated as hidden, so roots such as ".", "../.." or a directory
+// located below a dot-directory are still scanned.
+func Scanner(done chan<- bool, all bool, path string, paths chan<- *ChecksummerInput) {
+	root := path
+	err := filepath.Walk(root, func(p string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+
+		// Decide hiddenness from the entry's own name. Searching the whole
+		// path for "/." misses top-level entries such as ".git" when the
+		// root is "." and wrongly matches ".." components or hidden
+		// ancestors of the root.
+		if !all && p != root && strings.HasPrefix(info.Name(), ".") {
+			if info.IsDir() {
+				// Prune the subtree instead of visiting and rejecting
+				// every entry below it.
+				return filepath.SkipDir
+			}
+			return nil
+		}
+
+		if info.IsDir() {
+			return nil
+		}
+
+		paths <- &ChecksummerInput{Path: p, Info: info}
+
+		return nil
+	})
+	// Closing paths is what lets the Checksummer workers finish.
+	close(paths)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+	}
+	done <- true
+}
+
+func formatBytes(b int64) string {
+ if b < 1024 {
+ return fmt.Sprintf("%d b", b)
+ }
+
+ s := ""
+
+ pfxs := "kmgt"
+ for i := 0; i < len(pfxs); i++ {
+ pow := math.Pow(float64(1024), float64(i+1))
+ // This one is too big, return the previous string
+ if b < int64(pow) {
+ return s
+ }
+ s = fmt.Sprintf("%.2f %cb",
+ float64(b)/(pow),
+ pfxs[i])
+ }
+
+ return s
+}
+
+func main() {
+ fl := flag.NewFlagSet("deduplicator", flag.ExitOnError)
+
+ path := fl.String("path", ".", "Path to deduplicate files in")
+ all := fl.Bool("a", false, "Scan hidden files as well")
+ script := fl.Bool("s", false, "Output format sutiable for scripts")
+ procs := fl.Int("n", runtime.NumCPU(), "Number of processes to run at once")
+ remove := fl.Bool("rm", false, "Remove duplicate files? ( Note requires a regex, and will not remove all versions, defaults to a dry run")
+ removeRegexStr := fl.String("regex", "", "Regular expression to match duplicated files")
+ removeYes := fl.Bool("yes-i-want-my-data-gone", false, "Actually remove the files")
+ matchPath := fl.Bool("match-path", false, "match on the path, rather than the filename")
+
+ _ = fl.Parse(os.Args[1:])
+
+ if *remove && *removeRegexStr == "" {
+ log.Fatal("A regular expression (-regex '<expr>') is required")
+ }
+
+ removeRegex, err := regexp.Compile(*removeRegexStr)
+ if err != nil {
+ log.Fatalf("Error compiling provided regular expression: %s", err)
+ }
+
+ checksums := make(map[string][]*ChecksummerResult)
+ pths := make(chan *ChecksummerInput)
+ results := make(chan *ChecksummerResult)
+ done := make(chan bool)
+
+ for j := 0; j < *procs; j++ {
+ go Checksummer(done, pths, results)
+ }
+
+ go Scanner(done, *all, *path, pths)
+
+ finished := 0
+wait:
+ for {
+ select {
+ case result := <-results:
+ if result.Error != nil {
+ fmt.Fprintln(os.Stderr, result.Error)
+ }
+ sum := result.Checksum
+
+ if _, ok := checksums[sum]; !ok {
+ checksums[sum] = []*ChecksummerResult{result}
+ } else {
+ checksums[sum] = append(
+ checksums[sum],
+ result)
+ }
+ case <-done:
+ // fmt.Printf("reading from done... finished: %d\n", finished)
+ finished++
+ if finished >= *procs+1 {
+ break wait
+ }
+ }
+
+ }
+
+ // fmt.Println("Actually removing: ", *removeYes)
+
+ for sum, list := range checksums {
+ if len(list) <= 1 {
+ continue
+ }
+
+ switch {
+ case *script:
+ for _, result := range list {
+ fmt.Printf("%s::%d::%s\n", sum, result.Info.Size(), result.Path)
+ }
+ case *remove:
+
+ removing := []*ChecksummerResult{}
+
+ for _, result := range list {
+ s := ""
+ if !*matchPath {
+ s = result.Info.Name()
+ } else {
+ s = result.Path
+ }
+
+ if removeRegex.MatchString(s) {
+ removing = append(removing, result)
+ }
+
+ }
+
+ if len(removing) >= len(list) {
+ fmt.Printf("%s: Not removing, matches all files\n", sum)
+ continue
+ }
+
+ if len(removing) == 0 {
+ fmt.Printf("%s: Not removing, no matches\n", sum)
+ continue
+ }
+
+ if !*removeYes {
+ for _, f := range removing {
+ fmt.Printf("Would remove: %s\n", f.Path)
+ }
+ } else {
+ for _, f := range removing {
+ fmt.Printf("Removing: %s\n", f.Path)
+ err := os.Remove(f.Path)
+ if err != nil {
+ fmt.Fprintln(os.Stderr, err)
+ }
+ }
+ }
+
+ default:
+ fmt.Println(sum)
+ for i, result := range list {
+ if i == 0 {
+ // fmt.Printf("%d kbytes\n", result.Info.Size()/1024)
+ fmt.Println(formatBytes(result.Info.Size()))
+ }
+ fmt.Println("\t" + result.Path)
+ }
+ fmt.Println("")
+ }
+ }
+
+}