// src/rpc/layout/history.rs

use std::collections::HashSet;

use garage_util::crdt::{Crdt, Lww, LwwMap};
use garage_util::data::*;
use garage_util::encode::nonversioned_encode;
use garage_util::error::*;

use super::*;
use crate::replication_mode::ReplicationMode;

impl LayoutHistory {
	pub fn new(replication_factor: usize) -> Self {
		let version = LayoutVersion::new(replication_factor);

		let staging = LayoutStaging {
			parameters: Lww::<LayoutParameters>::new(version.parameters),
			roles: LwwMap::new(),
		};

		LayoutHistory {
			versions: vec![version],
			old_versions: vec![],
			update_trackers: Default::default(),
			staging: Lww::raw(0, staging),
		}
	}

	// ------------------ who stores what now? ---------------

	pub fn current(&self) -> &LayoutVersion {
		self.versions.last().unwrap()
	}

	pub fn min_stored(&self) -> u64 {
		self.versions.first().unwrap().version
	}

	pub fn get_all_nodes(&self) -> Vec<Uuid> {
		if self.versions.len() == 1 {
			self.versions[0].all_nodes().to_vec()
		} else {
			let set = self
				.versions
				.iter()
				.flat_map(|x| x.all_nodes())
				.collect::<HashSet<_>>();
			set.into_iter().copied().collect::<Vec<_>>()
		}
	}

	pub(crate) fn get_all_nongateway_nodes(&self) -> Vec<Uuid> {
		if self.versions.len() == 1 {
			self.versions[0].nongateway_nodes().to_vec()
		} else {
			let set = self
				.versions
				.iter()
				.flat_map(|x| x.nongateway_nodes())
				.collect::<HashSet<_>>();
			set.into_iter().copied().collect::<Vec<_>>()
		}
	}

	// ---- housekeeping (all invoked by LayoutHelper) ----

	pub(crate) fn keep_current_version_only(&mut self) {
		while self.versions.len() > 1 {
			let removed = self.versions.remove(0);
			self.old_versions.push(removed);
		}
	}

	pub(crate) fn cleanup_old_versions(&mut self) {
		// If there are invalid versions before valid versions, remove them
		if self.versions.len() > 1 && self.current().check().is_ok() {
			while self.versions.len() > 1 && self.versions.first().unwrap().check().is_err() {
				let removed = self.versions.remove(0);
				info!(
					"Layout history: pruning old invalid version {}",
					removed.version
				);
			}
		}

		// If there are old versions that no one is reading from anymore,
		// remove them from `versions` (but keep them in `self.old_versions`).
		// ASSUMPTION: we only care about where nodes in the current layout
		// version are reading from, as we assume older nodes are being
		// discarded.
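		// Illustrative example (hypothetical numbers): with versions
		// [v5, v6, v7] (so min_stored() == 5), if every node of v7 has acked
		// the sync of version 6 or later, then sync_ack_map_min == 6 > 5 and
		// v5 is moved to old_versions; v6 stays, because 6 < 6 is false.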
		while self.versions.len() > 1 {
			let current_nodes = &self.current().node_id_vec;
			let min_version = self.min_stored();
			let sync_ack_map_min = self
				.update_trackers
				.sync_ack_map
				.min_among(current_nodes, min_version);
			if self.min_stored() < sync_ack_map_min {
				let removed = self.versions.remove(0);
				info!(
					"Layout history: moving version {} to old_versions",
					removed.version
				);
				self.old_versions.push(removed);
			} else {
				break;
			}
		}

		while self.old_versions.len() > OLD_VERSION_COUNT {
			let removed = self.old_versions.remove(0);
			info!("Layout history: removing old_version {}", removed.version);
		}
	}

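	/// Raise the update trackers of all the given nodes to at least
	/// `min_stored()`: layout versions below that bound are no longer
	/// stored, so no tracker should point before it.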
	pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) {
		let min_v = self.min_stored();
		for node in nodes {
			self.update_trackers.ack_map.set_max(*node, min_v);
			self.update_trackers.sync_map.set_max(*node, min_v);
			self.update_trackers.sync_ack_map.set_max(*node, min_v);
		}
	}

	pub(crate) fn calculate_sync_map_min_with_quorum(
		&self,
		replication_mode: ReplicationMode,
		all_nongateway_nodes: &[Uuid],
	) -> u64 {
		// This function calculates the minimum layout version from which
		// it is safe to read if we want to maintain read-after-write consistency.
		// In the general case the computation can be a bit expensive so
		// we try to optimize it in several ways.

		// If there is only one layout version, we know that's the one
		// we need to read from.
		if self.versions.len() == 1 {
			return self.current().version;
		}

		let quorum = replication_mode.write_quorum();

		let min_version = self.min_stored();
		let global_min = self
			.update_trackers
			.sync_map
			.min_among(all_nongateway_nodes, min_version);

		// If the write quorum is equal to the total number of nodes,
		// i.e. no write can succeed before it is written to all nodes,
		// then we must in all cases wait for all nodes to complete a sync.
		// This is represented by reading from the layout with version
		// number global_min, the smallest layout version for which all
		// nodes have completed a sync.
		if quorum == self.current().replication_factor {
			return global_min;
		}

		// In the general case, we need to look at all write sets for all
		// partitions, and find a safe layout version to read for each
		// partition. We then take the minimum value among all partitions
		// as the safe layout version to read in all cases (the layout
		// version to which all reads are directed).
		let mut current_min = self.current().version;
		let mut sets_done = HashSet::<Vec<Uuid>>::new();

		for (_, p_hash) in self.current().partitions() {
			for v in self.versions.iter() {
				if v.version == self.current().version {
					// We don't care whether nodes in the latest layout version
					// have completed a sync, because the sync is push-only:
					// by definition, nodes in the latest layout version hold
					// no data that must be pushed to other nodes of that same
					// version (any data that is already in the latest version
					// is assumed to have been written by an operation that
					// ensured a quorum of writes within that version).
				}

				// Determine set of nodes for partition p in layout version v.
				// Sort the node set to avoid duplicate computations.
				let mut set = v
					.nodes_of(&p_hash, v.replication_factor)
					.collect::<Vec<Uuid>>();
				set.sort();

				// If this set was already processed, skip it.
				if sets_done.contains(&set) {
					continue;
				}

				// Find the highest value of the sync update trackers that is
				// reached by at least a quorum of nodes in the set (i.e. the
				// highest possible minimum within a quorum).
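				// Illustrative example (hypothetical values): with quorum = 2
				// and sorted sync values [1, 1, 2], set_min = values[3 - 2] =
				// values[1] = 1: at least a quorum of nodes have completed a
				// sync up to version 1, so version 1 is safe to read here.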
				let mut sync_values = set
					.iter()
					.map(|x| self.update_trackers.sync_map.get(x, min_version))
					.collect::<Vec<_>>();
				sync_values.sort();
				let set_min = sync_values[sync_values.len() - quorum];
				if set_min < current_min {
					current_min = set_min;
				}
				// If we have already reached the global minimum, no other set
				// can make current_min any smaller, so we can stop early.
				assert!(current_min >= global_min);
				if current_min == global_min {
					return current_min;
				}

				// Add set to already processed sets
				sets_done.insert(set);
			}
		}

		current_min
	}

	pub(crate) fn calculate_trackers_hash(&self) -> Hash {
		blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..])
	}

	pub(crate) fn calculate_staging_hash(&self) -> Hash {
		blake2sum(&nonversioned_encode(&self.staging).unwrap()[..])
	}

	// ================== updates to layout, public interface ===================

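	/// Merge a remote `LayoutHistory` into this one, returning `true` if
	/// anything changed (a new version, updated trackers, or staged changes).
	///
	/// A minimal usage sketch, assuming `LayoutHistory` implements `Clone`
	/// (derived elsewhere in this crate):
	///
	/// ```ignore
	/// let mut local = LayoutHistory::new(3);
	/// let remote = local.clone();
	/// // Merging an identical history reports no change.
	/// assert!(!local.merge(&remote));
	/// ```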
	pub fn merge(&mut self, other: &LayoutHistory) -> bool {
		let mut changed = false;

		// Add any new versions to history
		for v2 in other.versions.iter() {
			if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) {
				// Version is already present, check consistency
				if v1 != v2 {
					error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version);
				}
			} else if self.versions.iter().all(|v| v.version != v2.version - 1) {
				error!(
					"Cannot receive new layout version {}, version {} is missing",
					v2.version,
					v2.version - 1
				);
			} else {
				self.versions.push(v2.clone());
				changed = true;
			}
		}

		// Merge trackers
		let c = self.update_trackers.merge(&other.update_trackers);
		changed = changed || c;

		// Merge staged layout changes
		if self.staging != other.staging {
			let prev_staging = self.staging.clone();
			self.staging.merge(&other.staging);
			changed = changed || self.staging != prev_staging;
		}

		changed
	}

	pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> {
		match version {
			None => {
				let error = r#"
Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
				"#;
				return Err(Error::Message(error.into()));
			}
			Some(v) => {
				if v != self.current().version + 1 {
					return Err(Error::Message("Invalid new layout version".into()));
				}
			}
		}
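		// For example, if the current layout version is 3, only Some(4) is
		// accepted: requiring the expected new version number ensures that
		// staged changes are not applied on top of a layout that has changed
		// since it was reviewed with `garage layout show`.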

		// Compute new version and add it to history
		let (new_version, msg) = self
			.current()
			.clone()
			.calculate_next_version(self.staging.get())?;

		self.versions.push(new_version);
		self.cleanup_old_versions();

		// Reset the staged layout changes
		self.staging.update(LayoutStaging {
			parameters: self.staging.get().parameters.clone(),
			roles: LwwMap::new(),
		});

		Ok((self, msg))
	}

	pub fn revert_staged_changes(mut self) -> Result<Self, Error> {
		self.staging.update(LayoutStaging {
			parameters: Lww::new(self.current().parameters),
			roles: LwwMap::new(),
		});

		Ok(self)
	}

	pub fn check(&self) -> Result<(), String> {
		// TODO: anything more?
		self.current().check()
	}
}
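
// A hedged test sketch: it exercises only the constructor and accessors
// defined above, and assumes `LayoutHistory` implements `Clone`
// (derived elsewhere in this crate).
#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn fresh_history_is_trivially_consistent() {
		let history = LayoutHistory::new(3);
		// A fresh history holds exactly one version, so the current
		// version is also the minimum stored version.
		assert_eq!(history.current().version, history.min_stored());
		// Merging an identical history must report no change.
		let mut local = history.clone();
		assert!(!local.merge(&history));
	}
}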