-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfinddupes
More file actions
executable file
·129 lines (106 loc) · 4.68 KB
/
finddupes
File metadata and controls
executable file
·129 lines (106 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env bash
#
# finddupes — find near-duplicate files in a directory tree
#
# "Near-duplicate" means identical size, file type (libmagic), and MD5 checksum.
# Symbolic links are never followed or reported.
#
# Usage: finddupes [directory] (defaults to current directory)
set -euo pipefail

# Root of the scan: first CLI argument, falling back to the current directory.
SEARCH_ROOT="${1:-.}"

# Guard clause: refuse anything that is not an existing directory.
[[ -d "$SEARCH_ROOT" ]] || {
  echo "error: not a directory: $SEARCH_ROOT" >&2
  exit 1
}
# ---------------------------------------------------------------------------
# Phase 1 – collect one line per regular file into a temp file:
#
#     size|||magic|||md5|||filepath
#
# The ||| delimiter is chosen because it is vanishingly unlikely to appear in
# a libmagic description. Using -type f (regular files only) is sufficient to
# exclude symbolic links; no additional flag is needed.
#
# Robustness notes:
#   * find -print0 / read -d '' preserves paths with spaces and other odd
#     characters. Paths containing a newline cannot be represented in the
#     line-based record format, so they are skipped with a warning.
#   * `--` stops option parsing, so filenames beginning with `-` are safe.
#   * Each per-file command is guarded with `|| continue` so a file that
#     vanishes or becomes unreadable mid-scan is skipped instead of aborting
#     the whole run under `set -e` + `pipefail`.
#   * No pre-sort of find output is needed: the final `sort` fully orders the
#     records, which is all the grouping pass requires.
# ---------------------------------------------------------------------------
echo "Scanning: $SEARCH_ROOT" >&2
echo >&2

scratch=$(mktemp)
trap 'rm -f "$scratch"' EXIT

find "$SEARCH_ROOT" -type f -print0 | while IFS= read -r -d '' filepath; do
  # A newline inside the path would split one record across two lines.
  case "$filepath" in
    *$'\n'*)
      printf 'warning: skipping path containing newline: %q\n' "$filepath" >&2
      continue ;;
  esac
  size=$(stat -c '%s' -- "$filepath" 2>/dev/null) || continue
  # file -b may emit multiple lines; collapse them to a single line
  magic=$(file -b -- "$filepath" 2>/dev/null | tr '\n' ' ' | sed 's/[[:space:]]*$//') || continue
  md5=$(md5sum -- "$filepath" 2>/dev/null | cut -d' ' -f1) || continue
  printf '%s|||%s|||%s|||%s\n' "$size" "$magic" "$md5" "$filepath"
done | sort > "$scratch"
# ---------------------------------------------------------------------------
# Phase 2 – group by (size, magic, md5) and report each duplicate set
#
# Awk reads the sorted scratch file. Per-group metadata (size, magic, md5)
# is stored keyed by the numeric group id, so the END pass is one straight
# O(groups) scan — the previous version reverse-looked-up the composite key
# for every group, which was accidentally O(groups^2). Per-group filepaths
# live in a two-dimensional array (group_id, file_index) to avoid the
# fragility of splitting newline-delimited strings later.
# ---------------------------------------------------------------------------
awk -F'[|][|][|]' '
# ── Accumulation pass ──────────────────────────────────────────────────────
{
    key = $1 SUBSEP $2 SUBSEP $3          # composite key: size + magic + md5
    if (!(key in group_id)) {
        gid = ++total_groups
        group_id[key]    = gid
        group_size[gid]  = $1             # metadata keyed by gid, not key
        group_magic[gid] = $2
        group_md5[gid]   = $3
    }
    gid = group_id[key]
    file_count[gid]++
    filepath[gid, file_count[gid]] = $4
}
# ── Report ─────────────────────────────────────────────────────────────────
END {
    dup_groups = 0
    total_wasted = 0
    for (g = 1; g <= total_groups; g++) {
        n = file_count[g]
        if (n < 2) continue               # unique file — nothing to report
        dup_groups++
        sz = group_size[g] + 0            # coerce to numeric
        wasted = sz * (n - 1)             # one copy is "original", rest wasted
        total_wasted += wasted
        printf "────────────────────────────────────────────────────────────\n"
        printf "Group %d (%d copies)\n", dup_groups, n
        printf " Size : %s bytes\n", commify(sz)
        printf " Type : %s\n", group_magic[g]
        printf " MD5 : %s\n", group_md5[g]
        printf " Wasted : %s bytes (%d redundant cop%s)\n",
            commify(wasted), n - 1, (n - 1 == 1 ? "y" : "ies")
        printf " Files :\n"
        for (j = 1; j <= n; j++)
            printf " %s\n", filepath[g, j]
        printf "\n"
    }
    if (dup_groups == 0) {
        print "No near-duplicate files found."
    } else {
        printf "════════════════════════════════════════════════════════════\n"
        printf "Summary: %d duplicate group%s, %s bytes of redundant data\n",
            dup_groups,
            (dup_groups == 1 ? "" : "s"),
            commify(total_wasted)
    }
}
# Insert thousands separators into a non-negative integer.
# (s and result are declared as extra parameters to make them awk-local.)
function commify(n, s, result) {
    s = sprintf("%d", n)
    result = ""
    while (length(s) > 3) {
        result = "," substr(s, length(s) - 2) result
        s = substr(s, 1, length(s) - 3)
    }
    return s result
}
' "$scratch"