diff --git a/Chapters/Background.tex b/Chapters/Background.tex index 94d4517..03e1b88 100644 --- a/Chapters/Background.tex +++ b/Chapters/Background.tex @@ -37,6 +37,83 @@ the 80\% success rate sets a baseline expectation, while the 55-second timeout informs analysis of each implementation's keep-alive behavior during source code review. +\subsection{The Babel routing protocol} +\label{sec:babel} + +Babel~\cite{chroboczek_babel_2021} is a distance-vector routing +protocol designed for both wired and wireless mesh networks. Each +node periodically sends \emph{Hello} messages to discover neighbours +and \emph{Update} messages to advertise reachable prefixes along with +a numeric cost metric. A node selects the route with the lowest +cumulative metric for each destination, subject to a +\emph{feasibility condition} that prevents routing loops. Because +Babel is distance-vector rather than link-state, nodes only know the +cost of their own best path, not the full topology. + +Two properties of Babel matter for the benchmarks in +Chapter~\ref{Results}. First, route advertisements are periodic: a +node will not learn about a new path until the next Update interval, +which can be on the order of minutes depending on the implementation's +timer settings. Second, Babel intentionally resists frequent route +changes to avoid flapping; a node may continue using a suboptimal path +until a significantly better alternative is advertised. Both +properties can cause the selected route for a given destination to +differ across consecutive benchmark runs, even when the physical +topology has not changed. + +\subsection{TCP flow control and congestion control} +\label{sec:tcp_windows} + +TCP uses two window mechanisms to regulate how much unacknowledged data +a sender may have in flight. The \emph{receive window} +(\texttt{rwnd}), also called the \emph{send window} in +\texttt{iperf3} output, is advertised by the receiver and reflects how +much buffer space it has available. 
The \emph{congestion window} +(\texttt{cwnd}) is maintained locally by the sender and tracks the +network's estimated capacity. At any point, the sender may transmit +up to $\min(\texttt{rwnd}, \texttt{cwnd})$ bytes beyond the last +acknowledged byte \cite{rfc5681}. + +The congestion window starts small (typically a few segments) and +grows during the \emph{slow-start} phase, doubling each round trip +until it reaches a threshold or triggers a loss event. After that, +\emph{congestion avoidance} takes over and the window grows linearly. +When the sender detects a loss (through duplicate ACKs or a +retransmission timeout), it treats the loss as a signal of congestion: +the window is reduced, often halved, and the sender enters a recovery +phase before resuming growth. Each retransmission therefore has a +direct mechanical cost: it shrinks the congestion window and reduces +the instantaneous sending rate. + +The \emph{bandwidth-delay product} (BDP) determines how large the +window must be to fully utilize a link. It is the product of the +link's bandwidth and the round-trip time: +\begin{equation} + \text{BDP} = \text{bandwidth} \times \text{RTT} + \label{eq:bdp} +\end{equation} +A 1\,Gbps link with a 1\,ms RTT has a BDP of 125\,KB: the sender +must keep at least 125\,KB of unacknowledged data in flight to +saturate the link. If the congestion window is smaller than the BDP, +the sender will finish transmitting its window and then wait idle for +acknowledgements, leaving bandwidth unused. High-latency paths make +this problem worse because the BDP grows linearly with RTT. A +34\,ms RTT on the same 1\,Gbps link raises the BDP to 4.25\,MB, well +beyond the default congestion window of most TCP stacks. One common +workaround is to run multiple TCP flows in parallel: each flow +maintains its own congestion window, and their aggregate in-flight +data can approach the BDP even when no single flow could. + +In VPN benchmarks these two windows appear as distinct bottlenecks. 
A +small receive window means the receiver (or the tunnel endpoint in +front of it) cannot absorb data fast enough. A small congestion +window means the path between sender and receiver is experiencing +loss, forcing TCP into repeated recovery cycles. Comparing congestion +windows across VPNs with different maximum segment sizes requires +care, because the window is measured in bytes: a VPN with jumbo +segments will report a larger byte-valued window for the same number +of in-flight segments. + \subsection{An Overview of Packet Reordering in TCP} TODO \cite{leung_overview_2007} diff --git a/Chapters/Results.tex b/Chapters/Results.tex index 9da7ac8..529c818 100644 --- a/Chapters/Results.tex +++ b/Chapters/Results.tex @@ -132,87 +132,77 @@ VpnCloud, while Hyprspace, Tinc, and Mycelium occupy the bottom tier at under 40\,\% of baseline. Figure~\ref{fig:tcp_throughput} visualizes this hierarchy. -Raw throughput alone is incomplete, however. The retransmit column -reveals that not all high-throughput VPNs get there cleanly. -ZeroTier, for instance, reaches 814\,Mbps but accumulates -1\,163~retransmits per test, over 1\,000$\times$ what WireGuard -needs. ZeroTier compensates for tunnel-internal packet loss by -repeatedly triggering TCP congestion-control recovery, whereas -WireGuard delivers data with negligible in-tunnel loss. The -bare-metal Internal reference sits at 1.7~retransmits per test, -essentially noise, and the VPNs split into three groups around -it: \emph{clean} ($<$110: WireGuard, Yggdrasil, Headscale), -\emph{stressed} (200--900: Tinc, EasyTier, Mycelium, VpnCloud), -and \emph{pathological} ($>$950: Nebula, ZeroTier, Hyprspace). +\begin{figure}[H] + \centering + \includegraphics[width=\textwidth]{{Figures/baseline/tcp/TCP + Throughput}.png} + \caption{Average single-stream TCP throughput} + \label{fig:tcp_throughput} +\end{figure} -% TODO: Is this naming scheme any good? +Raw throughput alone is incomplete. 
The retransmit rate +(Figure~\ref{fig:tcp_retransmits}) normalizes raw retransmit counts +by estimated packet count, accounting for the different segment sizes +each VPN negotiates (1\,228 to 32\,731 bytes). WireGuard and +Headscale are effectively loss-free ($<$\,0.01\,\%). Tinc, EasyTier, +Nebula, and VpnCloud form a moderate band (0.03--0.06\,\%). +Yggdrasil, ZeroTier, and Mycelium cluster between 0.09\,\% and +0.13\,\%, and Hyprspace is the clear outlier at 0.49\,\%. ZeroTier +reaches 814\,Mbps despite a 0.10\,\% retransmit rate by compensating +for tunnel-internal loss through repeated TCP congestion-control +recovery; WireGuard delivers comparable throughput with effectively +zero loss. \begin{figure}[H] \centering - \begin{subfigure}[t]{\textwidth} - \centering - \includegraphics[width=\textwidth]{{Figures/baseline/tcp/TCP - Throughput}.png} - \caption{Average single-stream TCP throughput} - \label{fig:tcp_throughput} - \end{subfigure} - - \vspace{1em} - - \begin{subfigure}[t]{\textwidth} - \centering - \includegraphics[width=\textwidth]{{Figures/baseline/tcp/TCP - Retransmit Rate}.png} - % TODO: Caption says "retransmits" (counts) but the plot axis shows - % "Retransmit Rate (\%)." Align the caption with the plot. - \caption{TCP retransmit rate (\%)} - \label{fig:tcp_retransmits} - \end{subfigure} - % TODO: This parent caption still says "retransmit count" but the - % subfigure axis and caption were corrected to "retransmit rate (%)." - % Align the parent caption terminology (counts vs rates). - \caption{TCP throughput and retransmit rate at baseline. WireGuard - leads at 864\,Mbps with 1 retransmit. Hyprspace has nearly 5000 - retransmits per test. The retransmit count does not always track - inversely with throughput: ZeroTier achieves high throughput - \emph{despite} high retransmits.} - \label{fig:tcp_results} + \includegraphics[width=\textwidth]{{Figures/baseline/tcp/TCP + Retransmit Rate}.png} + \caption{TCP retransmit rate at baseline. 
WireGuard and Headscale + are effectively loss-free ($<$\,0.01\,\%). Hyprspace is the clear + outlier at 0.49\,\%.} + \label{fig:tcp_retransmits} \end{figure} Retransmits have a direct mechanical relationship with TCP congestion control: each one triggers a reduction in the congestion window (\texttt{cwnd}) and throttles the sender. -Figure~\ref{fig:retransmit_correlations} shows the relationship: -Hyprspace, with 4965 -retransmits, maintains the smallest max congestion window in the -dataset (205\,KB), while Yggdrasil's 75 retransmits allow a 4.3\,MB -window, the largest of any VPN. At first glance this suggests a -clean inverse correlation between retransmits and congestion window -size, but the picture is misleading. Yggdrasil's outsized window is -largely an artifact of its jumbo overlay MTU (32\,731 bytes): each -segment carries far more data, so the window in bytes is inflated -relative to VPNs using a standard ${\sim}$1\,400-byte MTU. Comparing -congestion windows across different MTU sizes is not meaningful -without normalizing for segment size. The reliable conclusion is -simpler: high retransmit rates force TCP to spend more time in -congestion recovery than in steady-state transmission, and that -caps throughput regardless of available bandwidth. ZeroTier -illustrates the opposite extreme: brute-force retransmission can -still yield high throughput (814\,Mbps with 1\,163 retransmits), at -the cost of wasted bandwidth and unstable flow behavior. +Figure~\ref{fig:tcp_window} shows the raw window sizes, and +Figure~\ref{fig:retransmit_correlations} plots them against retransmit +rate. Hyprspace, with a 0.49\,\% retransmit rate, maintains the +smallest max congestion window in the dataset (200\,KB), while +Yggdrasil's 0.09\,\% rate allows a 4.2\,MB window, the largest of +any VPN. At +first glance this suggests a clean inverse correlation between +retransmit rate and congestion window size, but the picture is +misleading. 
Yggdrasil's outsized window is largely an artifact of +its jumbo overlay MTU (32\,731 bytes): each segment carries far more +data, so the window in bytes is inflated relative to VPNs using a +standard ${\sim}$1\,400-byte MTU. Comparing congestion windows +across different MTU sizes is not meaningful without normalizing for +segment size. The reliable conclusion is simpler: high retransmit +rates force TCP to spend more time in congestion recovery than in +steady-state transmission, and that caps throughput regardless of +available bandwidth. ZeroTier illustrates the opposite extreme: +brute-force retransmission can still yield high throughput +(814\,Mbps at a 0.10\,\% rate), at the cost of wasted bandwidth and +unstable flow behavior. + +\begin{figure}[H] + \centering + \includegraphics[width=\textwidth]{{Figures/baseline/tcp/Max TCP + Window Size}.png} + \caption{Maximum TCP window sizes (send and congestion) at baseline. + Yggdrasil's congestion window (4\,219\,KB) dwarfs all others but + is inflated by its 32\,KB jumbo overlay MTU. Hyprspace has the + smallest congestion window (200\,KB).} + \label{fig:tcp_window} +\end{figure} VpnCloud stands out: its sender reports 538.8\,Mbps but the receiver measures only 413.4\,Mbps, a 23\,\% gap and the largest in the dataset. This points to significant in-tunnel packet loss -or buffering at the VpnCloud layer that the retransmit count (857) -alone does not fully explain. -% TODO: Clarify whether the headline TCP table -% (Table~\ref{tab:tcp_baseline}, 539\,Mbps for VpnCloud) reports -% sender or receiver throughput. The prose here cites sender -% 538.8 vs.\ receiver 413.4 --- the 539 figure matches the sender -% column, so the table caption should say so explicitly. Same -% clarification needed for Hyprspace (368 in table vs.\ sender -% 367.9 / receiver 419.8 in the pathological-cases paragraph). +or buffering at the VpnCloud layer that the retransmit rate +(0.06\,\%) alone does not fully explain. 
Variability, whether stochastic across runs or systematic across links, also differs substantially. WireGuard's three link @@ -243,14 +233,14 @@ on every direction. \caption{Retransmits vs.\ max congestion window} \label{fig:retransmit_cwnd} \end{subfigure} - \caption{Retransmit correlations (log scale on x-axis). High - retransmits do not always mean low throughput (ZeroTier: 1\,163 - retransmits, 814\,Mbps), but extreme retransmits do (Hyprspace: - 4\,965 retransmits, 368\,Mbps). The apparent inverse correlation - between retransmits and congestion window size is dominated by + \caption{Retransmit correlations (log scale on x-axis). A high + retransmit rate does not always mean low throughput (ZeroTier: + 0.10\,\%, 814\,Mbps), but an extreme rate does (Hyprspace: + 0.49\,\%, 368\,Mbps). The apparent inverse correlation between + retransmit rate and congestion window size is dominated by - Yggdrasil's outlier (4.3\,MB \texttt{cwnd}), which is inflated + Yggdrasil's outlier (4.2\,MB \texttt{cwnd}), which is inflated - by its 32\,KB jumbo overlay MTU rather than by low retransmits - alone.} + by its 32\,KB jumbo overlay MTU rather than by a low retransmit + rate alone.} \label{fig:retransmit_correlations} \end{figure} @@ -258,29 +248,35 @@ on every direction. Sorting by latency rearranges the rankings considerably. Table~\ref{tab:latency_baseline} lists the average ping round-trip -times, which cluster into three distinct ranges. +times, which cluster into three distinct ranges. The table also +reports the average maximum RTT observed across test runs and the +resulting spike ratio (max/avg); a high ratio signals bursty tail +latency that the average alone conceals. \begin{table}[H] \centering - \caption{Average ping RTT at baseline, sorted by latency} + \caption{Ping RTT statistics at baseline, sorted by average latency. 
+ The spike ratio is max\,RTT\,/\,avg\,RTT; higher values indicate + bursty tail latency.} \label{tab:latency_baseline} - \begin{tabular}{lr} + \begin{tabular}{lrrrr} \hline - \textbf{VPN} & \textbf{Avg RTT (ms)} \\ + \textbf{VPN} & \textbf{Avg RTT (ms)} & \textbf{Max RTT (ms)} + & \textbf{Spike Ratio} & \textbf{Jitter (ms)} \\ \hline - Internal & 0.60 \\ - VpnCloud & 1.13 \\ - Tinc & 1.19 \\ - WireGuard & 1.20 \\ - Nebula & 1.25 \\ - ZeroTier & 1.28 \\ - EasyTier & 1.33 \\ + Internal & 0.60 & 0.65 & 1.1$\times$ & 0.04 \\ + VpnCloud & 1.13 & 3.14 & 2.8$\times$ & 0.25 \\ + Tinc & 1.19 & 1.31 & 1.1$\times$ & 0.07 \\ + WireGuard & 1.20 & 1.81 & 1.5$\times$ & 0.13 \\ + Nebula & 1.25 & 1.53 & 1.2$\times$ & 0.10 \\ + ZeroTier & 1.28 & 3.00 & 2.3$\times$ & 0.25 \\ + EasyTier & 1.33 & 1.55 & 1.2$\times$ & 0.10 \\ \hline - Headscale & 1.64 \\ - Hyprspace & 1.79 \\ - Yggdrasil & 2.20 \\ + Headscale & 1.64 & 1.81 & 1.1$\times$ & 0.09 \\ + Hyprspace & 1.79 & 2.21 & 1.2$\times$ & 0.13 \\ + Yggdrasil & 2.20 & 3.13 & 1.4$\times$ & 0.20 \\ \hline - Mycelium & 34.9 \\ + Mycelium & 34.9 & 48.6 & 1.4$\times$ & 1.49 \\ \hline \end{tabular} \end{table} @@ -296,13 +292,16 @@ moderate overhead. Then there is Mycelium at 34.9\,ms, so far removed from the rest that Section~\ref{sec:mycelium_routing} gives it a dedicated analysis. -% TODO: The max RTT claim (8.6 ms) is not visible in the Average RTT -% plot. Add a max-RTT figure or table, or reference the raw data -% source. -ZeroTier's average of 1.28\,ms looks unremarkable, but its maximum -RTT spikes to 8.6\,ms, a 6.8$\times$ jump and the largest for any -sub-2\,ms VPN. These spikes point to periodic control-plane -interference that the average hides. +The spike-ratio column in Table~\ref{tab:latency_baseline} exposes two +outliers among the low-latency VPNs. 
VpnCloud leads at +2.8$\times$ (avg 1.13\,ms, max 3.14\,ms) and ZeroTier follows at +2.3$\times$ (avg 1.28\,ms, max 3.00\,ms); both share the highest +jitter in the table (0.25\,ms). Tinc and Headscale, by contrast, +stay at 1.1$\times$ with jitter of at most 0.09\,ms, so their packet +timing is nearly as stable as bare metal. The spikes in VpnCloud and +ZeroTier are consistent with periodic +control-plane work such as key rotation or peer heartbeats that +briefly stalls the data path. \begin{figure}[H] \centering @@ -315,43 +314,42 @@ interference that the average hides. Tinc presents a paradox: it has the third-lowest latency (1.19\,ms) but only the second-lowest throughput (336\,Mbps). Packets traverse -the tunnel quickly, yet something caps the overall rate. The qperf -benchmark reports Tinc maxing out at 14.9\,\% total system CPU while -delivering 336\,Mbps. On a multi-core host this figure is consistent +the tunnel quickly, yet something caps the overall rate. +Figure~\ref{fig:tcp_cpu} shows that Tinc uses only 12.3\,\% host CPU +during the TCP test. On a multi-core host this figure is consistent with a single saturated core, which fits Tinc's single-threaded userspace architecture: one core encrypts, copies, and forwards -packets, and the remaining cores sit idle. But VpnCloud reports the -same 14.9\,\% and still reaches 539\,Mbps (60\,\% more than Tinc), -so whole-system CPU alone cannot explain the gap, and a per-packet -processing cost difference must also be in play. -% TODO: 14.9\% total CPU does not pin the bottleneck on its own. -% This is whole-system utilization on a multi-core machine, and a -% single saturated core fits the budget — but VpnCloud reports the -% same 14.9\% \emph{and} reaches 539\,Mbps. Verify with per-thread -% CPU sampling or eBPF profiling to confirm the single-core story -% and quantify the per-packet cost difference. +packets, and the remaining cores sit idle. 
+ +\begin{figure}[H] + \centering + \includegraphics[width=\textwidth]{{Figures/baseline/tcp/TCP CPU + Utilization}.png} + \caption{CPU utilization during TCP throughput tests, split by host + (sender) and remote (receiver). Tinc (12.3\,\%) and VpnCloud + (14.2\,\%) use similar CPU, yet VpnCloud achieves 60\,\% higher + throughput. Yggdrasil's low CPU (2.7\,\%) reflects its + kernel-level forwarding with jumbo segments.} + \label{fig:tcp_cpu} +\end{figure} + +VpnCloud is also +single-threaded and uses slightly more CPU (14.2\,\%), yet reaches +539\,Mbps (60\,\% more throughput). The gap comes down to per-packet +cost. Tinc uses a hand-written ChaCha20-Poly1305 implementation +without hardware acceleration, allocates a fresh stack buffer and +copies the payload for each packet, and routes through a splay-tree +lookup. VpnCloud uses the \texttt{ring} cryptographic library, which +employs optimized assembly and can select AES-128-GCM with hardware +AES-NI instructions at runtime; it encrypts in place with no extra +buffer copies and routes through an $O(1)$ hash-map lookup. These +differences compound in a tight single-threaded loop: every +microsecond saved per packet raises the maximum packet rate the one +available core can sustain. + Figure~\ref{fig:latency_throughput} makes this disconnect easy to spot. -% TODO: These CPU numbers are stated inline but never shown in a plot -% or table. Add a CPU utilization figure or table so readers can -% verify. Also, the claim that WireGuard's CPU usage "goes to -% cryptographic processing" is unsubstantiated: no profiling data -% is presented. Either add profiling evidence or soften to -% "likely" / "presumably." -The qperf measurements also reveal a wide spread in CPU usage. -Hyprspace (55.1\,\%) and Yggdrasil -(52.8\,\%) consume 5--6$\times$ as much CPU as Internal's -9.7\,\%. 
WireGuard sits at 30.8\,\%, higher than expected for a -kernel-level implementation; in-kernel cryptographic processing -is the likely cause, though no profiling data confirms this. -On the efficient end, VpnCloud -(14.9\,\%), Tinc (14.9\,\%), and EasyTier (15.4\,\%) use the least -CPU time. Nebula and Headscale are missing from -this comparison because qperf failed for both. - -%TODO: Explain why they consistently failed - \begin{figure}[H] \centering \includegraphics[width=\textwidth]{Figures/baseline/latency-vs-throughput.png} @@ -365,10 +363,7 @@ this comparison because qperf failed for both. \subsection{Parallel TCP Scaling} -The single-stream benchmark tests one link direction at a time. % -% TODO: The plot labels this benchmark "10-stream parallel" but this -% description says "six unidirectional flows." Verify the actual test -% configuration and reconcile the two. +The single-stream benchmark tests one link direction at a time. The parallel benchmark changes this setup: all three link directions (lom$\rightarrow$yuki, yuki$\rightarrow$luna, @@ -411,26 +406,25 @@ Table~\ref{tab:parallel_scaling} lists the results. \end{table} The VPNs that gain the most are those most constrained in -single-stream mode. Mycelium's 34.9\,ms RTT means a lone TCP stream -can never fill the pipe: the bandwidth-delay product (the amount - of in-flight data a TCP flow needs to saturate a link, equal to the -link bandwidth times the round-trip time) demands a window larger -than any single flow maintains, so multiple concurrent flows -compensate for that constraint and push throughput to 2.20$\times$ -the single-stream figure. Hyprspace scales almost as well -(2.18$\times$) for the same reason but with a different -bottleneck. Its libp2p send pipeline accumulates roughly -2\,800\,ms of under-load latency -(Section~\ref{sec:hyprspace_bloat}), which gives any single TCP -flow a bandwidth-delay product on the order of hundreds of -megabytes to fill, far beyond any single kernel cwnd. 
And -because Hyprspace keys \texttt{activeStreams} by destination -\texttt{peer.ID} (Listing~\ref{lst:hyprspace_sendpacket}), the -three concurrent peer pairs in the parallel benchmark each get -their own libp2p stream, their own mutex, and their own yamux -flow-control window. The three TCP senders therefore maintain -three independent windows in flight, and three windows fill -more of the bloated pipeline than one can. +single-stream mode. Mycelium's 34.9\,ms RTT gives it a +bandwidth-delay product (Equation~\ref{eq:bdp}) of roughly +4.4\,MB on a 1\,Gbps link. No single TCP flow maintains a +congestion window that large, so the link is never fully utilized. +Multiple concurrent flows each contribute their own window, and +their aggregate in-flight data approaches the BDP, which pushes +throughput to 2.20$\times$ the single-stream figure. + +Hyprspace scales almost as well (2.18$\times$) for the same +structural reason, but the bottleneck is different. Its libp2p send +pipeline accumulates roughly 2\,800\,ms of under-load latency +(Section~\ref{sec:hyprspace_bloat}), which inflates the effective BDP +to hundreds of megabytes, far beyond any single kernel congestion +window. Because Hyprspace keys \texttt{activeStreams} by destination +\texttt{peer.ID} (Listing~\ref{lst:hyprspace_sendpacket}), the three +concurrent peer pairs in the parallel benchmark each get their own +libp2p stream, their own mutex, and their own yamux flow-control +window. Three independent windows in flight fill more of the bloated +pipeline than one can. % TODO: This is still a hypothesis: it generalises the same % bandwidth-delay-product argument used for Mycelium directly % above, and is now grounded in the per-peer @@ -445,23 +439,41 @@ Tinc picks up a single-threaded CPU busy during what would otherwise be idle gaps in a single flow. -% TODO: "zero retransmits" in parallel mode is not shown in any table -% or figure. Add parallel-mode retransmit data or remove the claim. 
WireGuard and Internal both scale cleanly at around -1.48--1.50$\times$ with zero retransmits. This is consistent -with WireGuard's overhead being a fixed per-packet cost that does -not worsen under multiplexing. +1.48--1.50$\times$ with a 0.00\,\% retransmit rate in both modes. +This is consistent with WireGuard's overhead being a fixed per-packet +cost that does not worsen under multiplexing. Nebula is the only VPN that actually gets \emph{slower} with more streams: throughput drops from 706\,Mbps to 648\,Mbps -(0.92$\times$) while retransmits jump from 955 to 2\,462. The -streams are clearly fighting each other for resources inside the -tunnel. +(0.92$\times$). The cause is lock contention in Nebula's firewall +connection tracker (Listing~\ref{lst:nebula_conntrack}). A single +\texttt{sync.Mutex} protects the global \texttt{Conns} map, and every +packet in both directions must acquire it. The lock holder also +purges the timer wheel before releasing the lock, so other goroutines +stall while that housekeeping runs. Nebula mitigates this with a +per-routine cache that bypasses the global lock for known flows, but +the cache is invalidated every second, at which point all goroutines +contend on the mutex again. With parallel streams, the increased +goroutine count turns this periodic contention into a throughput +bottleneck. -More streams also amplify existing retransmit problems. Hyprspace -climbs from 4\,965 to 17\,426~retransmits; -VpnCloud from 857 to 6\,023. VPNs that were clean in single-stream -mode stay clean under load, while the stressed ones only get worse. +\lstinputlisting[language=Go,caption={Nebula's firewall conntrack: a + global mutex protects the connection map and is acquired on every + packet. + \textit{nebula/firewall.go:79--84, +486--558}},label={lst:nebula_conntrack}]{Listings/nebula_conntrack.go} + +Retransmit rates under parallel load shift in two directions. 
+VpnCloud's rate climbs from 0.06\,\% to 0.14\,\% (2.5$\times$) and +Yggdrasil's from 0.09\,\% to 0.23\,\% (2.7$\times$), so +multiplexing genuinely increases loss for these VPNs. Hyprspace's +rate, by contrast, drops slightly from 0.49\,\% to 0.39\,\% even +though it sends far more data in parallel; the per-packet loss +probability does not worsen, but the absolute count still triples +because three pairs are transmitting simultaneously. VPNs that were +clean in single-stream mode (WireGuard, Internal) stay clean under +parallel load. \begin{figure}[H] \centering @@ -938,81 +950,109 @@ no flow-control signal coupling the two. \textit{hyprspace/node/node.go:36--39, 282, 328--348}},label={lst:hyprspace_sendpacket}]{Listings/hyprspace_sendpacket.go} -\paragraph{Mycelium: Routing Anomaly.} +\paragraph{Mycelium: routing anomaly.} \label{sec:mycelium_routing} -Mycelium's 34.9\,ms average latency appears to be the cost of -routing through a global overlay. The per-path -numbers, however, -reveal a bimodal distribution: +Mycelium's 34.9\,ms average latency looks like a +straightforward cost of routing through a global +overlay. The per-path numbers do not fit this +explanation: \begin{itemize} - \bitem{luna$\rightarrow$lom:} 1.63\,ms (direct - path, comparable + \bitem{luna$\rightarrow$lom:} 1.63\,ms (comparable to Headscale at 1.64\,ms) - \bitem{lom$\rightarrow$yuki:} 51.47\,ms (overlay-routed) - \bitem{yuki$\rightarrow$luna:} 51.60\,ms (overlay-routed) + \bitem{lom$\rightarrow$yuki:} 51.47\,ms + \bitem{yuki$\rightarrow$luna:} 51.60\,ms \end{itemize} -One of the three links has found a direct route; the -other two still -bounce through the overlay. All three machines sit on the same -% TODO: Characterising path discovery as "failing -% intermittently" assumes -% direct routing is the expected outcome on a LAN. -% Mycelium is designed -% as a global overlay and may intentionally route -% through supernodes. 
-% If this is by-design behaviour, rephrase to avoid -% implying a bug. -% This characterisation also propagates to the -% impairment ping analysis -% in Section sec:impairment, which says impairment "pushes path -% discovery toward shorter routes." -% TODO: The throughput data INVERTS the latency split -% rather than -% "mirroring" it. The direct path (luna→lom, 1.63 ms -% RTT) achieves -% only 122 Mbps, while the overlay-routed path -% (yuki→luna, 51.60 ms -% RTT) reaches 379 Mbps: the opposite of what TCP -% theory predicts. -% The plot also shows luna→lom receiver throughput at -% only 57.2 Mbps -% (a 53% sender/receiver gap on that link). Explain -% why the direct -% path is 3× slower than the overlay path, or acknowledge the -% contradiction. The current wording "mirrors the -% split" is incorrect. -physical network, so Mycelium's path discovery is not -consistently -selecting the direct route, a more specific problem -than blanket overlay -overhead. Throughput shows a similarly lopsided split: -yuki$\rightarrow$luna reaches 379\,Mbps while -luna$\rightarrow$lom manages only 122\,Mbps, a 3:1 gap. In -bidirectional mode, the reverse direction on that -worst link drops -to 58.4\,Mbps, the lowest single-direction figure in the entire -dataset. +One link found a direct LAN path; the other two +bounced through the overlay. All three machines sit on +the same physical network, so the split is not a matter +of topology. + +The throughput results invert the latency ranking. +The link with the low ping latency, +luna$\rightarrow$lom at 1.63\,ms, should be the fastest +according to TCP congestion theory. It is the slowest: +122\,Mbps, with the reverse direction dropping to +58.4\,Mbps in bidirectional mode. Meanwhile +yuki$\rightarrow$luna, whose ICMP~RTT was 30$\times$ +higher, reaches 379\,Mbps +(Figure~\ref{fig:mycelium_paths}). The throughput +ranking is the exact inverse of what the ping data +predicts. + +The explanation is in the iperf3 logs. 
Each TCP stream +reports a kernel-measured RTT that is independent of +ICMP ping. For the luna$\rightarrow$lom stream, this +TCP~RTT starts at 51.6\,ms and climbs to a mean of +144\,ms over the 30-second run, with +757~retransmits---the link was clearly overlay-routed +during the throughput test, even though ping had found a +direct path eight minutes earlier. For +yuki$\rightarrow$luna the reverse happened: the TCP +stream measured only 12--22\,ms, and its bidirectional +return path recorded 1.0\,ms, a direct LAN connection +that the earlier ICMP test had not seen. The routes +changed between the two tests. + +Mycelium uses the Babel routing protocol +(Section~\ref{sec:babel}) to discover and select paths. +Two properties of its implementation explain why routes +shifted mid-benchmark. First, Mycelium advertises +routes at a five-minute interval +(Listing~\ref{lst:mycelium_constants}): + +\lstinputlisting[language=Rust,caption={Mycelium's + Babel timing constants. Routes are re-advertised + every 300\,s; the router will not learn about a new + path until the next cycle. +\textit{mycelium/src/router.rs:33--59}},label={lst:mycelium_constants}]{Listings/mycelium_route_constants.rs} + +A direct path that appears between update cycles is +invisible to the router until the next advertisement +arrives. The benchmark's ping and throughput tests ran +sequentially with several minutes between them, so each +test observed whichever route happened to be selected at +that point in Babel's five-minute cycle. + +Second, even when a better route \emph{is} advertised, +the router resists switching to it. +Listing~\ref{lst:mycelium_best_route} shows the +\texttt{find\_best\_route} function: a candidate route +is rejected unless its metric improves on the current +route by more than 10, or unless it is directly +connected (metric~0). 
This hysteresis prevents +flapping but also means that an overlay path, once +established, can persist for the remainder of the +update interval even after a shorter path becomes +available. + +\lstinputlisting[language=Rust,caption={Route + selection with hysteresis. Lines~16--25 reject a + candidate route unless it is directly connected or + improves the composite metric by more than + \texttt{SIGNIFICANT\_METRIC\_IMPROVEMENT}\,(10). +\textit{mycelium/src/router.rs:1213--1238}},label={lst:mycelium_best_route}]{Listings/mycelium_find_best_route.rs} + +The five-minute update interval and the switching +hysteresis together explain the throughput asymmetry. +The TCP-measured RTTs +are consistent with the observed throughput on every +link; only the ICMP~RTTs, measured minutes earlier under +a different routing state, give the impression of an +inversion. \begin{figure}[H] \centering \includegraphics[width=\textwidth]{{Figures/baseline/tcp/Mycelium/Average Throughput}.png} - % TODO: The caption attributes the asymmetry to - % "inconsistent direct - % route discovery" but the direct-route link - % (luna→lom, 1.63 ms RTT) - % is actually the SLOWEST (122 Mbps). The caption - % should address - % why the direct path underperforms the overlay paths. - \caption{Per-link TCP throughput for Mycelium, showing extreme - path asymmetry. The 3:1 ratio between best - (yuki$\rightarrow$luna, 379\,Mbps) and worst - (luna$\rightarrow$lom, 122\,Mbps) links does not - correlate with - the latency split (Section~\ref{sec:mycelium_routing}).} + \caption{Per-link TCP throughput for Mycelium. The + luna$\rightarrow$lom link appears slow despite its + low ping latency because Babel had switched to an + overlay route by the time the throughput test ran. 
+ The TCP-level RTTs reported by iperf3, not the + earlier ICMP measurements, explain the 3:1 ratio.} \label{fig:mycelium_paths} \end{figure} diff --git a/Figures/baseline/ping/Zerotier_RTT Metrics Comparison.png b/Figures/baseline/ping/Zerotier_RTT Metrics Comparison.png new file mode 100644 index 0000000..0e2fdc7 Binary files /dev/null and b/Figures/baseline/ping/Zerotier_RTT Metrics Comparison.png differ diff --git a/Figures/baseline/quic/CPU Usage.png b/Figures/baseline/quic/CPU Usage.png new file mode 100644 index 0000000..0745e92 Binary files /dev/null and b/Figures/baseline/quic/CPU Usage.png differ diff --git a/Figures/baseline/quic/QUIC Bandwidth.png b/Figures/baseline/quic/QUIC Bandwidth.png new file mode 100644 index 0000000..10ffadc Binary files /dev/null and b/Figures/baseline/quic/QUIC Bandwidth.png differ diff --git a/Figures/baseline/tcp/Max TCP Window Size.png b/Figures/baseline/tcp/Max TCP Window Size.png new file mode 100644 index 0000000..347f535 Binary files /dev/null and b/Figures/baseline/tcp/Max TCP Window Size.png differ diff --git a/Figures/baseline/tcp/TCP CPU Utilization.png b/Figures/baseline/tcp/TCP CPU Utilization.png new file mode 100644 index 0000000..8a3fcdd Binary files /dev/null and b/Figures/baseline/tcp/TCP CPU Utilization.png differ diff --git a/Listings/mycelium_find_best_route.rs b/Listings/mycelium_find_best_route.rs new file mode 100644 index 0000000..6109c9a --- /dev/null +++ b/Listings/mycelium_find_best_route.rs @@ -0,0 +1,29 @@ +fn find_best_route<'a>(&self, routes: &'a RouteList) + -> Option<&'a RouteEntry> +{ + let source_table = self.source_table.read().unwrap(); + let current = routes.selected(); + let best = routes + .iter() + .filter(|re| !re.metric().is_infinite() + && source_table.route_feasible(re)) + .min_by_key(|re| + re.metric() + Metric::from(re.neighbour().link_cost())); + + if let (Some(best), Some(current)) = (best, current) { + // Only switch if the metric is significantly better + // OR if the route 
is directly connected (metric 0). + if (best.source() != current.source() + || best.neighbour() != current.neighbour()) + && !(best.metric() + + Metric::from(best.neighbour().link_cost()) + < current.metric() + + Metric::from(current.neighbour().link_cost()) + - SIGNIFICANT_METRIC_IMPROVEMENT + || best.metric().is_direct()) + { + return Some(current); // keep existing route + } + } + best +} diff --git a/Listings/mycelium_route_constants.rs b/Listings/mycelium_route_constants.rs new file mode 100644 index 0000000..835eee8 --- /dev/null +++ b/Listings/mycelium_route_constants.rs @@ -0,0 +1,9 @@ +/// Time between HELLO messages, in seconds +const HELLO_INTERVAL: u64 = 20; +/// Max time used in UPDATE packets. +const UPDATE_INTERVAL: Duration = + Duration::from_secs(HELLO_INTERVAL * 3 * 5); // 300 s + +/// The amount a metric of a route needs to improve +/// before we will consider switching to it. +const SIGNIFICANT_METRIC_IMPROVEMENT: Metric = Metric::new(10); diff --git a/Listings/nebula_conntrack.go b/Listings/nebula_conntrack.go new file mode 100644 index 0000000..a0c5418 --- /dev/null +++ b/Listings/nebula_conntrack.go @@ -0,0 +1,39 @@ +type FirewallConntrack struct { + sync.Mutex + + Conns map[firewall.Packet]*conn + TimerWheel *TimerWheel[firewall.Packet] +} + +func (f *Firewall) inConns( + fp firewall.Packet, h *HostInfo, + caPool *cert.CAPool, + localCache firewall.ConntrackCache, +) bool { + if localCache != nil { + if _, ok := localCache[fp]; ok { + return true + } + } + conntrack := f.Conntrack + conntrack.Lock() + + // Purge every time we test + ep, has := conntrack.TimerWheel.Purge() + if has { + f.evict(ep) + } + + c, ok := conntrack.Conns[fp] + if !ok { + conntrack.Unlock() + return false + } + // ... update expiry ... 
+ conntrack.Unlock() + + if localCache != nil { + localCache[fp] = struct{}{} + } + return true +} diff --git a/example.bib b/example.bib new file mode 100644 index 0000000..e69de29 diff --git a/main.tex b/main.tex index 836c9fd..016c240 100644 --- a/main.tex +++ b/main.tex @@ -98,6 +98,16 @@ morestring=[b]", sensitive=true, } +\lstdefinelanguage{Rust}{ + morekeywords={as,break,const,continue,crate,else,enum,extern,false,fn,for, + if,impl,in,let,loop,match,mod,move,mut,pub,ref,return,self,Self,static, + struct,super,trait,true,type,unsafe,use,where,while,async,await,dyn, + Some,None,Option,Result,Ok,Err,Duration}, + morecomment=[l]{//}, + morecomment=[s]{/*}{*/}, + morestring=[b]", + sensitive=true, +} \lstdefinelanguage{Go}{ morekeywords={break,case,chan,const,continue,default,defer,else,fallthrough, for,func,go,goto,if,import,interface,map,package,range,return,select, diff --git a/master_citations.bib b/master_citations.bib index f575833..e9cbe95 100644 --- a/master_citations.bib +++ b/master_citations.bib @@ -617,3 +617,25 @@ PDF:/home/lhebendanz/Zotero/storage/KM9D625Y/Whitner et al. - 2008 - Improved Packet Reordering Metrics.pdf:application/pdf}, } + +@misc{rfc5681, + title = {TCP Congestion Control}, + author = {Allman, Mark and Paxson, Vern and Blanton, Ethan}, + year = {2009}, + month = sep, + howpublished = {RFC 5681}, + doi = {10.17487/RFC5681}, + url = {https://www.rfc-editor.org/rfc/rfc5681}, + note = {Obsoletes RFC 2581}, +} + +@misc{chroboczek_babel_2021, + title = {The {Babel} Routing Protocol}, + author = {Chroboczek, Juliusz and Schinazi, David}, + year = {2021}, + month = jun, + howpublished = {RFC 8966}, + doi = {10.17487/RFC8966}, + url = {https://www.rfc-editor.org/rfc/rfc8966}, + note = {Obsoletes RFC 6126}, +}