git.openstreetmap.org Git - chef.git/blob - cookbooks/planet/templates/default/planet-file-cleanup.erb
Copy planet dumps to S3
[chef.git] / cookbooks / planet / templates / default / planet-file-cleanup.erb
1 #!/usr/bin/ruby
2
3 require 'date'
4 require 'optparse'
5
# Files dated fewer than this many days before today are never deleted
# (four weeks).
ALWAYS_KEEP_DAYS = 7 * 4
8
# Retention bucket for a date: the first day of that date's calendar month.
# Dates in the same month therefore share a bucket.
def bucket(date)
  year, month = date.year, date.month
  Date.civil(year, month, 1)
end
13
# A file on disk paired with the date extracted from its name.
Candidate = Struct.new(:filename, :date)
15
# Collect the regular files matching +glob+ and pair each with its date.
#
# Symbolic links and anything that is not a plain file are ignored. The
# first capture group of +date_pattern+ must yield the date as YYMMDD.
#
# Returns an Array of Candidate. Raises if the path of a matching file does
# not match +date_pattern+.
def list_files(glob, date_pattern)
  regular_paths = Dir.glob(glob).reject do |path|
    File.symlink?(path) || !File.file?(path)
  end

  regular_paths.map do |path|
    match = date_pattern.match(path)
    raise "Unable to extract date string from #{path.inspect}" unless match

    Candidate.new(path, Date.strptime(match[1], "%y%m%d"))
  end
end
30
# Select the candidates that may be deleted as of +today+.
#
# Candidates dated fewer than ALWAYS_KEEP_DAYS days before +today+ are never
# selected. The remaining ones are grouped by calendar month (see +bucket+),
# and every candidate except the earliest-dated one of each month is
# returned.
#
# today      - Date the ages are measured against.
# candidates - collection of objects responding to +date+ with a Date.
#
# Returns an Array of candidates: months in order of first appearance in
# +candidates+, sorted by date within each month.
def deletion_candidates(today, candidates)
  expired = candidates.reject { |c| today - c.date < ALWAYS_KEEP_DAYS }

  expired.group_by { |c| bucket(c.date) }.flat_map do |_month, members|
    # the earliest dump of each month is retained; the rest may go
    members.sort_by(&:date).drop(1)
  end
end
47
# List the existing files that should be removed for one family of dumps.
#
# glob         - pattern locating the primary dump files.
# date_pattern - pattern whose first capture is the YYMMDD date in a path.
# today        - Date used to judge each file's age.
# expansions   - strftime templates naming each file that belongs to a dump
#                date, resolved in the primary file's directory.
#
# Returns an Array of paths; expanded names that are not present on disk
# are left out.
def deletions(glob, date_pattern, today, expansions)
  doomed = deletion_candidates(today, list_files(glob, date_pattern))

  paths = doomed.flat_map do |candidate|
    parent = File.dirname(candidate.filename)
    expansions.map { |template| "#{parent}/#{candidate.date.strftime(template)}" }
  end

  paths.select { |path| File.exist?(path) }
end
62
# Command-line flags: --dry-run reports what would be removed without
# deleting anything; --debug prints a line per file plus a summary.
dry_run = false
debug = false

OptionParser.new do |opt|
  opt.on('--dry-run') { dry_run = true }
  opt.on('--debug') { debug = true }
end.parse!

# Dump locations; the placeholders are substituted from node attributes
# when this template is rendered.
xml_directory = "<%= node[:planet][:dump][:xml_directory] %>"
xml_history_directory = "<%= node[:planet][:dump][:xml_history_directory] %>"
pbf_directory = "<%= node[:planet][:dump][:pbf_directory] %>"
pbf_history_directory = "<%= node[:planet][:dump][:pbf_history_directory] %>"

today = Date.today

# One rule per family of dump files:
#   1. the glob that finds the primary dump file,
#   2. the pattern capturing its YYMMDD date,
#   3. the strftime templates of every file removed together with it.
# The patterns are matched against the full path, so the literal dots are
# escaped and the match is anchored at the end of the path: only the file
# name itself can supply the date.
cleanup_rules = [
  ["#{xml_directory}/20??/planet-??????.osm.bz2",
   /planet-([0-9]{6})\.osm\.bz2\z/,
   ["changesets-%y%m%d.osm.bz2",
    "changesets-%y%m%d.osm.bz2.md5",
    "discussions-%y%m%d.osm.bz2",
    "discussions-%y%m%d.osm.bz2.md5",
    "planet-%y%m%d.osm.bz2",
    "planet-%y%m%d.osm.bz2.md5"]],

  ["#{xml_history_directory}/20??/history-??????.osm.bz2",
   /history-([0-9]{6})\.osm\.bz2\z/,
   ["history-%y%m%d.osm.bz2",
    "history-%y%m%d.osm.bz2.md5"]],

  ["#{pbf_directory}/planet-??????.osm.pbf",
   /planet-([0-9]{6})\.osm\.pbf\z/,
   ["planet-%y%m%d.osm.pbf",
    "planet-%y%m%d.osm.pbf.md5"]],

  ["#{pbf_history_directory}/history-??????.osm.pbf",
   /history-([0-9]{6})\.osm\.pbf\z/,
   ["history-%y%m%d.osm.pbf",
    "history-%y%m%d.osm.pbf.md5"]]
]

# Work out the complete list before touching anything on disk.
to_delete = cleanup_rules.flat_map do |glob, date_pattern, expansions|
  deletions(glob, date_pattern, today, expansions)
end

total_size = 0
num_deleted = 0
cmd = dry_run ? "Would delete" : "Deleted"
to_delete.each do |file|
  # the size has to be read while the file still exists
  size = File.stat(file).size
  File.delete(file) unless dry_run
  puts "#{cmd} #{file.inspect}, #{size / 1_000_000} MB" if debug
  total_size += size
  num_deleted += 1
end
if debug
  puts "#{cmd} files of total size #{total_size / 1_000_000_000.0} GB"
  puts "#{cmd} #{num_deleted} files"
end