aboutsummaryrefslogtreecommitdiff
path: root/src/net/send.rs
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2024-02-15 13:58:15 +0100
committerAlex Auvolat <alex@adnab.me>2024-02-15 14:06:34 +0100
commiteb4a6ce1060a847be0b62c6a10ff3ba956e3f34d (patch)
tree84f6746ec92ca3a5344da22edadcd1ac3413aa29 /src/net/send.rs
parentcf2af186fcc0c8f581a966454b6cd4720d3821f0 (diff)
parent7be3f15e45fcfff10a45302a040c2919a3ba8ccd (diff)
downloadgarage-eb4a6ce1060a847be0b62c6a10ff3ba956e3f34d.tar.gz
garage-eb4a6ce1060a847be0b62c6a10ff3ba956e3f34d.zip
Merge branch 'main' into next-0.10
Diffstat (limited to 'src/net/send.rs')
-rw-r--r--src/net/send.rs356
1 files changed, 356 insertions, 0 deletions
diff --git a/src/net/send.rs b/src/net/send.rs
new file mode 100644
index 00000000..0db0ba77
--- /dev/null
+++ b/src/net/send.rs
@@ -0,0 +1,356 @@
+use std::collections::{HashMap, VecDeque};
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use async_trait::async_trait;
+use bytes::{BufMut, Bytes, BytesMut};
+use log::*;
+
+use futures::{AsyncWriteExt, Future};
+use kuska_handshake::async_std::BoxStreamWrite;
+use tokio::sync::mpsc;
+
+use crate::error::*;
+use crate::message::*;
+use crate::stream::*;
+
+// Messages are sent by chunks
+// Chunk format:
+// - u32 BE: request id (same for request and response)
+// - u16 BE: chunk length + flags:
+// CHUNK_FLAG_HAS_CONTINUATION when this is not the last chunk of the stream
+// CHUNK_FLAG_ERROR if this chunk denotes an error
+// (these two flags are exclusive, an error denotes the end of the stream)
+// **special value** 0xFFFF indicates a CANCEL message
+// - [u8; chunk_length], either
+// - if not error: chunk data
+// - if error:
+// - u8: error kind, encoded using error::io_errorkind_to_u8
+// - rest: error message
+// - absent for cancel messag
+
+pub(crate) type RequestID = u32;
+pub(crate) type ChunkLength = u16;
+
+pub(crate) const MAX_CHUNK_LENGTH: ChunkLength = 0x3FF0;
+pub(crate) const CHUNK_FLAG_ERROR: ChunkLength = 0x4000;
+pub(crate) const CHUNK_FLAG_HAS_CONTINUATION: ChunkLength = 0x8000;
+pub(crate) const CHUNK_LENGTH_MASK: ChunkLength = 0x3FFF;
+pub(crate) const CANCEL_REQUEST: ChunkLength = 0xFFFF;
+
+pub(crate) enum SendItem {
+ Stream(RequestID, RequestPriority, Option<OrderTag>, ByteStream),
+ Cancel(RequestID),
+}
+
+// ----
+
+struct SendQueue {
+ items: Vec<(u8, SendQueuePriority)>,
+}
+
+struct SendQueuePriority {
+ items: VecDeque<SendQueueItem>,
+ order: HashMap<u64, VecDeque<u64>>,
+}
+
+struct SendQueueItem {
+ id: RequestID,
+ prio: RequestPriority,
+ order_tag: Option<OrderTag>,
+ data: ByteStreamReader,
+ sent: usize,
+}
+
+impl SendQueue {
+ fn new() -> Self {
+ Self {
+ items: Vec::with_capacity(64),
+ }
+ }
+ fn push(&mut self, item: SendQueueItem) {
+ let prio = item.prio;
+ let pos_prio = match self.items.binary_search_by(|(p, _)| p.cmp(&prio)) {
+ Ok(i) => i,
+ Err(i) => {
+ self.items.insert(i, (prio, SendQueuePriority::new()));
+ i
+ }
+ };
+ self.items[pos_prio].1.push(item);
+ }
+ fn remove(&mut self, id: RequestID) {
+ for (_, prioq) in self.items.iter_mut() {
+ prioq.remove(id);
+ }
+ self.items.retain(|(_prio, q)| !q.is_empty());
+ }
+ fn is_empty(&self) -> bool {
+ self.items.iter().all(|(_k, v)| v.is_empty())
+ }
+
+ // this is like an async fn, but hand implemented
+ fn next_ready(&mut self) -> SendQueuePollNextReady<'_> {
+ SendQueuePollNextReady { queue: self }
+ }
+}
+
+impl SendQueuePriority {
+ fn new() -> Self {
+ Self {
+ items: VecDeque::new(),
+ order: HashMap::new(),
+ }
+ }
+ fn push(&mut self, item: SendQueueItem) {
+ if let Some(OrderTag(stream, order)) = item.order_tag {
+ let order_vec = self.order.entry(stream).or_default();
+ let i = order_vec.iter().take_while(|o2| **o2 < order).count();
+ order_vec.insert(i, order);
+ }
+ self.items.push_front(item);
+ }
+ fn remove(&mut self, id: RequestID) {
+ if let Some(i) = self.items.iter().position(|x| x.id == id) {
+ let item = self.items.remove(i).unwrap();
+ if let Some(OrderTag(stream, order)) = item.order_tag {
+ let order_vec = self.order.get_mut(&stream).unwrap();
+ let j = order_vec.iter().position(|x| *x == order).unwrap();
+ order_vec.remove(j).unwrap();
+ if order_vec.is_empty() {
+ self.order.remove(&stream);
+ }
+ }
+ }
+ }
+ fn is_empty(&self) -> bool {
+ self.items.is_empty()
+ }
+ fn poll_next_ready(&mut self, ctx: &mut Context<'_>) -> Poll<(RequestID, DataFrame)> {
+ for (j, item) in self.items.iter_mut().enumerate() {
+ if let Some(OrderTag(stream, order)) = item.order_tag {
+ if order > *self.order.get(&stream).unwrap().front().unwrap() {
+ continue;
+ }
+ }
+
+ let mut item_reader = item.data.read_exact_or_eos(MAX_CHUNK_LENGTH as usize);
+ if let Poll::Ready(bytes_or_err) = Pin::new(&mut item_reader).poll(ctx) {
+ let id = item.id;
+ let eos = item.data.eos();
+
+ let packet = bytes_or_err.map_err(|e| match e {
+ ReadExactError::Stream(err) => err,
+ _ => unreachable!(),
+ });
+
+ let is_err = packet.is_err();
+ let data_frame = DataFrame::from_packet(packet, !eos);
+ item.sent += data_frame.data().len();
+
+ if eos || is_err {
+ // If item had an order tag, remove it from the corresponding ordering list
+ if let Some(OrderTag(stream, order)) = item.order_tag {
+ let order_stream = self.order.get_mut(&stream).unwrap();
+ assert_eq!(order_stream.pop_front(), Some(order));
+ if order_stream.is_empty() {
+ self.order.remove(&stream);
+ }
+ }
+ // Remove item from sending queue
+ self.items.remove(j);
+ } else {
+ // Move item later in send queue to implement LAS scheduling
+ // (LAS = Least Attained Service)
+ for k in j..self.items.len() - 1 {
+ if self.items[k].sent >= self.items[k + 1].sent {
+ self.items.swap(k, k + 1);
+ } else {
+ break;
+ }
+ }
+ }
+
+ return Poll::Ready((id, data_frame));
+ }
+ }
+
+ Poll::Pending
+ }
+ fn dump(&self, prio: u8) -> String {
+ self.items
+ .iter()
+ .map(|i| format!("[{} {} {:?} @{}]", prio, i.id, i.order_tag, i.sent))
+ .collect::<Vec<_>>()
+ .join(" ")
+ }
+}
+
+struct SendQueuePollNextReady<'a> {
+ queue: &'a mut SendQueue,
+}
+
+impl<'a> futures::Future for SendQueuePollNextReady<'a> {
+ type Output = (RequestID, DataFrame);
+
+ fn poll(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Self::Output> {
+ for (i, (_prio, items_at_prio)) in self.queue.items.iter_mut().enumerate() {
+ if let Poll::Ready(res) = items_at_prio.poll_next_ready(ctx) {
+ if items_at_prio.is_empty() {
+ self.queue.items.remove(i);
+ }
+ return Poll::Ready(res);
+ }
+ }
+ // If the queue is empty, this futures is eternally pending.
+ // This is ok because we use it in a select with another future
+ // that can interrupt it.
+ Poll::Pending
+ }
+}
+
+enum DataFrame {
+ /// a fixed size buffer containing some data + a boolean indicating whether
+ /// there may be more data comming from this stream. Can be used for some
+ /// optimization. It's an error to set it to false if there is more data, but it is correct
+ /// (albeit sub-optimal) to set it to true if there is nothing coming after
+ Data(Bytes, bool),
+ /// An error code automatically signals the end of the stream
+ Error(Bytes),
+}
+
+impl DataFrame {
+ fn from_packet(p: Packet, has_cont: bool) -> Self {
+ match p {
+ Ok(bytes) => {
+ assert!(bytes.len() <= MAX_CHUNK_LENGTH as usize);
+ Self::Data(bytes, has_cont)
+ }
+ Err(e) => {
+ let mut buf = BytesMut::new();
+ buf.put_u8(io_errorkind_to_u8(e.kind()));
+
+ let msg = format!("{}", e).into_bytes();
+ if msg.len() > (MAX_CHUNK_LENGTH - 1) as usize {
+ buf.put(&msg[..(MAX_CHUNK_LENGTH - 1) as usize]);
+ } else {
+ buf.put(&msg[..]);
+ }
+
+ Self::Error(buf.freeze())
+ }
+ }
+ }
+
+ fn header(&self) -> [u8; 2] {
+ let header_u16 = match self {
+ DataFrame::Data(data, false) => data.len() as u16,
+ DataFrame::Data(data, true) => data.len() as u16 | CHUNK_FLAG_HAS_CONTINUATION,
+ DataFrame::Error(msg) => msg.len() as u16 | CHUNK_FLAG_ERROR,
+ };
+ ChunkLength::to_be_bytes(header_u16)
+ }
+
+ fn data(&self) -> &[u8] {
+ match self {
+ DataFrame::Data(ref data, _) => &data[..],
+ DataFrame::Error(ref msg) => &msg[..],
+ }
+ }
+}
+
+/// The SendLoop trait, which is implemented both by the client and the server
+/// connection objects (ServerConna and ClientConn) adds a method `.send_loop()`
+/// that takes a channel of messages to send and an asynchronous writer,
+/// and sends messages from the channel to the async writer, putting them in a queue
+/// before being sent and doing the round-robin sending strategy.
+///
+/// The `.send_loop()` exits when the sending end of the channel is closed,
+/// or if there is an error at any time writing to the async writer.
+#[async_trait]
+pub(crate) trait SendLoop: Sync {
+ async fn send_loop<W>(
+ self: Arc<Self>,
+ msg_recv: mpsc::UnboundedReceiver<SendItem>,
+ mut write: BoxStreamWrite<W>,
+ debug_name: String,
+ ) -> Result<(), Error>
+ where
+ W: AsyncWriteExt + Unpin + Send + Sync,
+ {
+ let mut sending = SendQueue::new();
+ let mut msg_recv = Some(msg_recv);
+ while msg_recv.is_some() || !sending.is_empty() {
+ trace!(
+ "send_loop({}): queue = {:?}",
+ debug_name,
+ sending
+ .items
+ .iter()
+ .map(|(prio, i)| i.dump(*prio))
+ .collect::<Vec<_>>()
+ .join(" ; ")
+ );
+
+ let recv_fut = async {
+ if let Some(chan) = &mut msg_recv {
+ chan.recv().await
+ } else {
+ futures::future::pending().await
+ }
+ };
+ let send_fut = sending.next_ready();
+
+ // recv_fut is cancellation-safe according to tokio doc,
+ // send_fut is cancellation-safe as implemented above?
+ tokio::select! {
+ biased; // always read incomming channel first if it has data
+ sth = recv_fut => {
+ match sth {
+ Some(SendItem::Stream(id, prio, order_tag, data)) => {
+ trace!("send_loop({}): add stream {} to send", debug_name, id);
+ sending.push(SendQueueItem {
+ id,
+ prio,
+ order_tag,
+ data: ByteStreamReader::new(data),
+ sent: 0,
+ })
+ }
+ Some(SendItem::Cancel(id)) => {
+ trace!("send_loop({}): cancelling {}", debug_name, id);
+ sending.remove(id);
+ let header_id = RequestID::to_be_bytes(id);
+ write.write_all(&header_id[..]).await?;
+ write.write_all(&ChunkLength::to_be_bytes(CANCEL_REQUEST)).await?;
+ write.flush().await?;
+ }
+ None => {
+ msg_recv = None;
+ }
+ };
+ }
+ (id, data) = send_fut => {
+ trace!(
+ "send_loop({}): id {}, send {} bytes, header_size {}",
+ debug_name,
+ id,
+ data.data().len(),
+ hex::encode(data.header())
+ );
+
+ let header_id = RequestID::to_be_bytes(id);
+ write.write_all(&header_id[..]).await?;
+
+ write.write_all(&data.header()).await?;
+ write.write_all(data.data()).await?;
+ write.flush().await?;
+ }
+ }
+ }
+
+ let _ = write.goodbye().await;
+ Ok(())
+ }
+}