diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 0000000..7585238 --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1 @@ +book diff --git a/doc/book.toml b/doc/book.toml new file mode 100644 index 0000000..338ad63 --- /dev/null +++ b/doc/book.toml @@ -0,0 +1,9 @@ +[book] +authors = ["Quentin Dufour"] +language = "en" +multilingual = false +src = "src" +title = "Aerogramme - Encrypted e-mail storage over Garage" + +[output.html] +mathjax-support = true diff --git a/doc/src/SUMMARY.md b/doc/src/SUMMARY.md new file mode 100644 index 0000000..6dd8034 --- /dev/null +++ b/doc/src/SUMMARY.md @@ -0,0 +1,29 @@ +# Summary + +[Introduction](./index.md) + +# Quick start + +- [Installation](./installation.md) +- [Setup](./setup.md) +- [Validation](./validate.md) + +# Cookbook + + - [Not ready for production]() + +# Design + +- [Overview](./overview.md) +- [Mailboxes](./mailbox.md) +- [Mutation Log](./log.md) +- [IMAP UID proof](./imap_uid.md) + +# Reference + +- [Data format](./data_format.md) +- [Cryptography & key management](./crypt-key.md) + +# Development + +- [Notes](./notes.md) diff --git a/doc/src/aero-compo.png b/doc/src/aero-compo.png new file mode 100644 index 0000000..fb81b46 Binary files /dev/null and b/doc/src/aero-compo.png differ diff --git a/doc/src/aero-paranoid.png b/doc/src/aero-paranoid.png new file mode 100644 index 0000000..f9e2df1 Binary files /dev/null and b/doc/src/aero-paranoid.png differ diff --git a/doc/src/aero-schema.png b/doc/src/aero-schema.png new file mode 100644 index 0000000..3206245 Binary files /dev/null and b/doc/src/aero-schema.png differ diff --git a/doc/src/aero-states.png b/doc/src/aero-states.png new file mode 100644 index 0000000..c3b015a Binary files /dev/null and b/doc/src/aero-states.png differ diff --git a/doc/src/aero-states2.png b/doc/src/aero-states2.png new file mode 100644 index 0000000..ed2077d Binary files /dev/null and b/doc/src/aero-states2.png differ diff --git a/doc/src/aerogramme.jpg b/doc/src/aerogramme.jpg new file mode 100644 index 0000000..c1fe11b Binary files /dev/null and b/doc/src/aerogramme.jpg differ diff --git a/doc/src/crypt-key.md b/doc/src/crypt-key.md new file mode 100644 index 0000000..9fb199b --- /dev/null +++ b/doc/src/crypt-key.md @@ -0,0 +1,82 @@ +# Cryptography & key management + +Keys that are used: + +- master secret key (for indexes) +- curve25519 public/private key pair (for incoming mail) + +Keys that are stored in K2V under PK `keys`: + +- `public`: the public curve25519 key (plain text) +- `salt`: the 32-byte salt `S` used to calculate digests that index keys below +- if a password is used, `password:`: + - a 32-byte salt `Skey` + - followed a secret box + - that is encrypted with a strong argon2 digest of the password (using the salt `Skey`) and a user secret (see below) + - that contains the master secret key and the curve25519 private key + +User secret: an additionnal secret that is added to the password when deriving the encryption key for the secret box. +This additionnal secret should not be stored in K2V/S3, so that just knowing a user's password isn't enough to be able +to decrypt their mailbox (supposing the attacker has a dump of their K2V/S3 bucket). +This user secret should typically be stored in the LDAP database or just in the configuration file when using +the static login provider. + +Operations: + +- **Initialize**(`user_secret`, `password`): + - if `"salt"` or `"public"` already exist, BAIL + - generate salt `S` (32 random bytes) + - generate `public`, `private` (curve25519 keypair) + - generate `master` (secretbox secret key) + - calculate `digest = argon2_S(password)` + - generate salt `Skey` (32 random bytes) + - calculate `key = argon2_Skey(user_secret + password)` + - serialize `box_contents = (private, master)` + - seal box `blob = seal_key(box_contents)` + - write `S` at `"salt"` + - write `concat(Skey, blob)` at `"password:{hex(digest[..16])}"` + - write `public` at `"public"` + +- **InitializeWithoutPassword**(`private`, `master`): + - if `"salt"` or `"public"` already exist, BAIL + - generate salt `S` (32 random bytes) + - write `S` at `"salt"` + - calculate `public` the public key associated with `private` + - write `public` at `"public"` + +- **Open**(`user_secret`, `password`): + - load `S = read("salt")` + - calculate `digest = argon2_S(password)` + - load `blob = read("password:{hex(digest[..16])}") + - set `Skey = blob[..32]` + - calculate `key = argon2_Skey(user_secret + password)` + - open secret box `box_contents = open_key(blob[32..])` + - retrieve `master` and `private` from `box_contents` + - retrieve `public = read("public")` + +- **OpenWithoutPassword**(`private`, `master`): + - load `public = read("public")` + - check that `public` is the correct public key associated with `private` + +- **AddPassword**(`user_secret`, `existing_password`, `new_password`): + - load `S = read("salt")` + - calculate `digest = argon2_S(existing_password)` + - load `blob = read("existing_password:{hex(digest[..16])}") + - set `Skey = blob[..32]` + - calculate `key = argon2_Skey(user_secret + existing_password)` + - open secret box `box_contents = open_key(blob[32..])` + - retrieve `master` and `private` from `box_contents` + + - calculate `digest_new = argon2_S(new_password)` + - generate salt `Skeynew` (32 random bytes) + - calculate `key_new = argon2_Skeynew(user_secret + new_password)` + - serialize `box_contents_new = (private, master)` + - seal box `blob_new = seal_key_new(box_contents_new)` + - write `concat(Skeynew, blob_new)` at `"new_password:{hex(digest_new[..16])}"` + +- **RemovePassword**(`password`): + - load `S = read("salt")` + - calculate `digest = argon2_S(existing_password)` + - check that `"password:{hex(digest[..16])}"` exists + - check that other passwords exist ?? (or not) + - delete `"password:{hex(digest[..16])}"` diff --git a/doc/src/data_format.md b/doc/src/data_format.md new file mode 100644 index 0000000..32aa2c3 --- /dev/null +++ b/doc/src/data_format.md @@ -0,0 +1,50 @@ +# Data format + +## Bay(ou) + +Checkpoints are stored in S3 at `/checkpoint/`. Example: + +``` +348 TestMailbox/checkpoint/00000180d77400dc126b16aac546b769 +369 TestMailbox/checkpoint/00000180d776e509b68fdc5c376d0abc +357 TestMailbox/checkpoint/00000180d77a7fe68f4f76e3b45aa751 +``` + +Operations are stored in K2V at PK ``, SK ``. Example: + +``` +TestMailbox 00000180d77400dc126b16aac546b769 RcIsESv7WrjMuHwyI/dvCnkIfy6op5Tiylf0WSnn94aMS2uagl7YeMBwdv09TiSXBpu5nJ5e/9QFSfuEI/NqKrdQkX54MOsnaIGhRb0oqUG3KNaar3BiVSvYvXuzYhk4ii+TUS2Eyd6fCCaNVNM5 +TestMailbox 00000180d775f27f5542a13fc21c665e RrTSOup/zO1Ei+QrjBcDLt4vvFSY+WJPBodwY64wy2ftW+Oh3VSArvlO4SAEPmdsx1gt0HPBZYR/OkVWsZpmix1ZLFUmvdib+rjNkorHQW1p+oLVK8tolGrqk4SRwl88cqu466T4vBEpDu7tRbH0 +TestMailbox 00000180d775f292b3c8da00718389b4 VAwd8SRycIwsipZW5AcSG+EIYZVWn/Uj/TADbWhb4x5LVMceiRBHWVquY08RgT/lJKdhIcUqBA15bVG3klIg8tLsWJVG784NbsZwdGRczWmngcA= +TestMailbox 00000180d775f29d24842cf375d679e0 /FbXtEwm/bijtvOdqM1XFvKUalQFAOPHp+vF9jZThZn/viY5a6W1PyHeI8kTusF6EsVPAwPHpQyjIv/ghskC0f+zUEsSUhDwQANdwLNqDLAvTA== +TestMailbox 00000180d7768ab1dc01ff504e887c62 W/fF0WitpxJ05yHeOv96BlpGymT1kVOjkIW00t9e6UE7mxkvNflu9cZSCd8PDJd2ymC0sC9bLVFAXKmNZsmCFEEHMQSyrX61qTYo4KFCZMp5zm6fXubaYuurrzjXzfUP/R7kBvICFZlF0daf0SwX +TestMailbox 00000180d7768aba629c7ad6adf25228 IPzYGNsSepCX2AEnee/1Eas9a3c5esPSmrNkvaj4XcFb6Ft2KC8N6ubUR3wB+K0oYCTQym6nhHG5dlAxf6NRu7Rk8YtBTBmSqtGqd6kMZ3bU5b8= +TestMailbox 00000180d7768ac1870cda61784114d4 aaLiaWxfx1mxh6aoKE3xUUfZWhivZ/K7ixabflFDW7FO/qbpvCaa+Y6w4lQemTy6m+leAhXGN+Dbyv2qP20yJ9O4oJF5d3Lz5Iv5uF18OxhVZzw= +TestMailbox 00000180d776e4fb294ccdab2612b406 EtUPrLgEeOyab2QRnSie4I3Me9dDh10UdwWnUKdGa/8ezMJDtiy7XlW+tUfJdqtu6Vj7nduT0emDOXbBZsNwlcmzgYNwuNu3I9AfhZTFWtwLgB+wnAgB/jim82DDrJfLia8kB2eA2ao5jfJ3uMSZ +TestMailbox 00000180d776e501528546d340490291 Lz4Z9wCTk1lZ86lL01urhAan4oHcr1NBqdRe+CDpA51D9IncA5+Fhc8I6knUIh2qQ5/woWgISLAVwzSS+0+TxrYoqxf5FumIQtUJfwDER5La3n0= +TestMailbox 00000180d776e509b68fdc5c376d0abc RUGE2xB3fFX/wRH/p2fHIUa+rMaXSRd7fY9zglw0pRfVPqJfpniOjAe4GHIwGlwbwjtFOwS5a+Q7yr0Wez6QwD+ohhqRFKpbjcFcN7VfMyVAf+k= +TestMailbox 00000180d7784b987a8ad8106dc400c9 K+0LVEtBbTnWNS67jy9DtTvQyd5arovduvu490tLOE2TzVhuVoF4pfvTMTN12bH3KwEAHeDfuwKkKJFqldOywouTYPzEjZFkJzyagHrkl6dfnE5CqmlDv+Vc5TOQRskxjW+wQiZdjU8wGiBiBGYh +TestMailbox 00000180d7784bede69ac3cff2c6b724 XMFY3+b1r1//uolVz80JSI3g/84XCk3Tm7/S0BFv+Qe/Xv3/poLrOvAKEe+GzD2s22j8p/T2RXR/JSZckzgjEZeO0wbPDXVQd94di2Pff7jxAH8= +TestMailbox 00000180d7784bffe2595abe7ed81858 QQZhF+7wSHfikoAp93a+UY/XDIX7TVnnVYOtmQ2XHnDKA2F6snRJCPbYBO4IRHCRfVrjDGi32c41it2C3Mu5PBepabxapsW1rfIV3rlX2lkKHtI= +TestMailbox 00000180d77a7fb3f01dbb147c20cf7f IHOlOa1JI11RUKVvQUq3HQPxiRr4UCeE+pHmL8DtNMkOh62V4spuP0VvvQTJCQcPQ1EQR/QcxZ3s7uHLkrZAHF30BkpUkGqsLBWpnyug/puhdiixWsMyLLb6G90zFjiComUwptnDc/CCXtGEHdSW +TestMailbox 00000180d77a7fbb54b100f521ceb347 Ze4KyyTCgrYbZlXlJSY5hNob8sMXvBAmwIx2cADbX5P0M1IHXwXfloEzvvd6WYOtatFC2GnDSrmQ6RdCfeZ3WV9TZilqa0Fv0XEg48sVyVCcguw= +TestMailbox 00000180d77a7fe68f4f76e3b45aa751 cJJVvvRzTVNKUaIHPCCDY2uY7/HlmkxGgo3ozWBlBSRDeBqU65zgZD3QIPCxa6xaqB/Gc0bQ9BGzfU0cvVmO5jgNeeDnbqqs3oeA2jml/Qv2YO9upApfNQtDT1GiwJ8vrgaIow== +TestMailbox 00000180d8e513d3ea58c679a13178ac Ce5su2YOxNmTzk2dK8SX8V/Uue5uAC7oklEjhesY9wCMqGphhOkdWjzCqq0xOzcb/ZzzZ58t+mTksNSYIU4kddHIHBFPgqIwKthVk2mlUdqYiN/Y2vEGqv+YmtKY+GST/7Ee87ZHpU/5sv0GoXxT +TestMailbox 00000180d8e5145a23f8faee86283900 sp3D8xFZcM9icNlDJXIUDJb3mo6VGD9f1aDHD+4RbPdx6mTYF+qNTsPHKCxHHxT/9NfNe8XPg2+8xYRtm7SXfgERZBDB8ye+Xt3fM1k+wbL6RsaJmDHVECeXeL5KHuITzpI22A== +TestMailbox 00000180d8e51465c38f0585f9bb760e FF0VId2O/bBNzYD5ABWReMs5hHoHwynOoJRKj9vyaUMZ3JykInFmvvRgtCbJBDjTQPwPU8apphKQfwuicO76H7GtZqH009Cbv5l8ZTRJKrmzOQmtjzBQc2eGEUMPfbml5t0GCg== +``` + +The timestamp of a checkpoint corresponds to the timestamp of the first operation NOT included in the checkpoint. +In other words, to reconstruct the final state: + +- find timestamp `` of last checkpoint +- load checkpoint `` +- load and apply all operations starting from ``, included + +## UID index + +The UID index is an application of the Bayou storage module +used to assign UID numbers to e-mails. +See document we sent to NGI for properties on UIDVALIDITY. + + diff --git a/doc/src/imap_uid.md b/doc/src/imap_uid.md new file mode 100644 index 0000000..ecdd52b --- /dev/null +++ b/doc/src/imap_uid.md @@ -0,0 +1,203 @@ +# IMAP UID proof + +**Notations** + +- $h$: the hash of a message, $\mathbb{H}$ is the set of hashes +- $i$: the UID of a message $(i \in \mathbb{N})$ +- $f$: a flag attributed to a message (it's a string), we write + $\mathbb{F}$ the set of possible flags +- if $M$ is a map (aka a dictionnary), if $x$ has no assigned value in + $M$ we write $M [x] = \bot$ or equivalently $x \not\in M$. If $x$ has a value + in the map we write $x \in M$ and $M [x] \neq \bot$ + +**State** + +- A map $I$ such that $I [h]$ is the UID of the message whose hash is + $h$ is the mailbox, or $\bot$ if there is no such message + +- A map $F$ such that $F [h]$ is the set of flags attributed to the + message whose hash is $h$ + +- $v$: the UIDVALIDITY value + +- $n$: the UIDNEXT value + +- $s$: an internal sequence number that is mostly equal to UIDNEXT but + also grows when mails are deleted + +**Operations** + + - MAIL\_ADD$(h, i)$: the value of $i$ that is put in this operation is + the value of $s$ in the state resulting of all already known operations, + i.e. $s (O_{gen})$ in the notation below where $O_{gen}$ is + the set of all operations known at the time when the MAIL\_ADD is generated. + Moreover, such an operation can only be generated if $I (O_{gen}) [h] + = \bot$, i.e. for a mail $h$ that is not already in the state at + $O_{gen}$. + + - MAIL\_DEL$(h)$ + + - FLAG\_ADD$(h, f)$ + + - FLAG\_DEL$(h, f)$ + +**Algorithms** + + +**apply** MAIL\_ADD$(h, i)$: +   *if* $i < s$: +     $v \leftarrow v + s - i$ +   *if* $F [h] = \bot$: +     $F [h] \leftarrow F_{initial}$ +  $I [h] \leftarrow s$ +  $s \leftarrow s + 1$ +  $n \leftarrow s$ + +**apply** MAIL\_DEL$(h)$: +   $I [h] \leftarrow \bot$ +  $F [h] \leftarrow \bot$ +  $s \leftarrow s + 1$ + +**apply** FLAG\_ADD$(h, f)$: +   *if* $h \in F$: +     $F [h] \leftarrow F [h] \cup \{ f \}$ + +**apply** FLAG\_DEL$(h, f)$: +   *if* $h \in F$: +     $F [h] \leftarrow F [h] \backslash \{ f \}$ + + +**More notations** + +- $o$ is an operation such as MAIL\_ADD, MAIL\_DEL, etc. $O$ is a set of + operations. Operations embed a timestamp, so a set of operations $O$ can be + written as $O = [o_1, o_2, \ldots, o_n]$ by ordering them by timestamp. + +- if $o \in O$, we write $O_{\leqslant o}$, $O_{< o}$, $O_{\geqslant + o}$, $O_{> o}$ the set of items of $O$ that are respectively earlier or + equal, strictly earlier, later or equal, or strictly later than $o$. In + other words, if we write $O = [o_1, \ldots, o_n]$, where $o$ is a certain + $o_i$ in this sequence, then: +$$ +\begin{aligned} +O_{\leqslant o} &= \{ o_1, \ldots, o_i \}\\ +O_{< o} &= \{ o_1, \ldots, o_{i - 1} \}\\ +O_{\geqslant o} &= \{ o_i, \ldots, o_n \}\\ +O_{> o} &= \{ o_{i + 1}, \ldots, o_n \} +\end{aligned} +$$ + +- If $O$ is a set of operations, we write $I (O)$, $F (O)$, $n (O), s + (O)$, and $v (O)$ the values of $I, F, n, s$ and $v$ in the state that + results of applying all of the operations in $O$ in their sorted order. (we + thus write $I (O) [h]$ the value of $I [h]$ in this state) + +**Hypothesis:** +An operation $o$ can only be in a set $O$ if it was +generated after applying operations of a set $O_{gen}$ such that +$O_{gen} \subset O$ (because causality is respected in how we deliver +operations). Sets of operations that do not respect this property are excluded +from all of the properties, lemmas and proofs below. + +**Simplification:** We will now exclude FLAG\_ADD and FLAG\_DEL +operations, as they do not manipulate $n$, $s$ and $v$, and adding them should +have no impact on the properties below. + +**Small lemma:** If there are no FLAG\_ADD and FLAG\_DEL operations, +then $s (O) = | O |$. This is easy to see because the possible operations are +only MAIL\_ADD and MAIL\_DEL, and both increment the value of $s$ by 1. + +**Defnition:** If $o$ is a MAIL\_ADD$(h, i)$ operation, and $O$ is a +set of operations such that $o \in O$, then we define the following value: +$$ +C (o, O) = s (O_{< o}) - i +$$ +We say that $C (o, O)$ is the *number of conflicts of $o$ in $O$*: it +corresponds to the number of operations that were added before $o$ in $O$ that +were not in $O_{gen}$. + +**Property:** + +We have that: + +$$ +v (O) = \sum_{o \in O} C (o, O) +$$ + +Or in English: $v (O)$ is the sum of the number of conflicts of all of the +MAIL\_ADD operations in $O$. This is easy to see because indeed $v$ is +incremented by $C (o, O)$ for each operation $o \in O$ that is applied. + + +**Property:** + If $O$ and $O'$ are two sets of operations, and $O \subseteq O'$, then: + +$$ +\begin{aligned} +\forall o \in O, \qquad C (o, O) \leqslant C (o, O') +\end{aligned} +$$ + +This is easy to see because $O_{< o} \subseteq O'_{< o}$ and $C (o, O') - C + (o, O) = s (O'_{< o}) - s (O_{< o}) = | O'_{< o} | - | O_{< o} | \geqslant + 0$ + +**Theorem:** + +If $O$ and $O'$ are two sets of operations: + +$$ +\begin{aligned} +O \subseteq O' & \Rightarrow & v (O) \leqslant v (O') +\end{aligned} +$$ + +**Proof:** + +$$ +\begin{aligned} +v (O') &= \sum_{o \in O'} C (o, O')\\ + & \geqslant \sum_{o \in O} C (o, O') \qquad \text{(because $O \subseteq + O'$)}\\ + & \geqslant \sum_{o \in O} C (o, O) \qquad \text{(because $\forall o \in + O, C (o, O) \leqslant C (o, O')$)}\\ + & \geqslant v (O) +\end{aligned} +$$ + +**Theorem:** + +If $O$ and $O'$ are two sets of operations, such that $O \subset O'$, + +and if there are two different mails $h$ and $h'$ $(h \neq h')$ such that $I + (O) [h] = I (O') [h']$ + + then: + $$v (O) < v (O')$$ + +**Proof:** + +We already know that $v (O) \leqslant v (O')$ because of the previous theorem. +We will now look at the sum: +$$ +v (O') = \sum_{o \in O'} C (o, O') +$$ +and show that there is at least one term in this sum that is strictly larger +than the corresponding term in the other sum: +$$ +v (O) = \sum_{o \in O} C (o, O) +$$ +Let $o$ be the last MAIL\_ADD$(h, \_)$ operation in $O$, i.e. the operation +that gives its definitive UID to mail $h$ in $O$, and similarly $o'$ be the +last MAIL\_ADD($h', \_$) operation in $O'$. + +Let us write $I = I (O) [h] = I (O') [h']$ + +$o$ is the operation at position $I$ in $O$, and $o'$ is the operation at +position $I$ in $O'$. But $o \neq o'$, so if $o$ is not the operation at +position $I$ in $O'$ then it has to be at a later position $I' > I$ in $O'$, +because no operations are removed between $O$ and $O'$, the only possibility +is that some other operations (including $o'$) are added before $o$. Therefore +we have that $C (o, O') > C (o, O)$, i.e. at least one term in the sum above +is strictly larger in the first sum than in the second one. Since all other +terms are greater or equal, we have $v (O') > v (O)$. diff --git a/doc/src/index.md b/doc/src/index.md new file mode 100644 index 0000000..9d8f910 --- /dev/null +++ b/doc/src/index.md @@ -0,0 +1,22 @@ +# Introduction + +

+ A scan of an Aerogramme dating from 1955 +
+[ Documentation +| Git repository +] +
+stability status: technical preview (do not use in production) +

+ +Aerogramme is an open-source **IMAP server** targeted at **distributed** infrastructures and written in **Rust**. +It is designed to be resilient, easy to operate and private by design. + +**Resilient** - Aerogramme is built on top of Garage, a (geographically) distributed object storage software. Aerogramme thus inherits Garage resiliency: its mailboxes are spread on multiple distant regions, regions can go offline while keeping mailboxes available, storage nodes can be added or removed on the fly, etc. + +**Easy to operate** - Aerogramme mutualizes the burden of data management by storing all its data in an object store and nothing on the local filesystem or any relational database. It can be seen as a proxy between the IMAP protocol and Garage protocols (S3 and K2V). It can thus be freely moved between machines. Multiple instances can also be run in parallel. + +**Private by design** - As emails are very sensitive, Aerogramme encrypts users' mailboxes with their passwords. Data is decrypted in RAM upon user login: the Garage storage layer handles only encrypted blobs. It is even possible to run locally Aerogramme while connecting it to a remote, third-party, untrusted Garage provider; in this case clear text emails never leak outside of your computer. + +Our main use case is to provide a modern email stack for autonomously hosted communities such as [Deuxfleurs](https://deuxfleurs.fr). More generally, we want to set new standards in term of email ethic by lowering the bar to become an email provider while making it harder to spy users' emails. diff --git a/doc/src/installation.md b/doc/src/installation.md new file mode 100644 index 0000000..1b5d485 --- /dev/null +++ b/doc/src/installation.md @@ -0,0 +1,25 @@ +# Installation + +Install a Rust nightly toolchain: [go to Rustup](https://rustup.rs/). + +Install and deploy a Garage cluster: [go to Garage documentation](https://garagehq.deuxfleurs.fr/documentation/quick-start/). + +Clone Aerogramme's repository: + +```bash +git clone https://git.deuxfleurs.fr/Deuxfleurs/aerogramme/ +``` + +Compile Aerogramme: + +```bash +cargo build +``` + +Check that your compiled binary works: + +```bash +cargo run +``` + +You are now ready to [setup Aerogramme!](./setup.md) diff --git a/doc/src/log.md b/doc/src/log.md new file mode 100644 index 0000000..f29ecee --- /dev/null +++ b/doc/src/log.md @@ -0,0 +1,149 @@ +# Mutation Log + + +Back to our data structure, we note that one major challenge with this project is to *correctly* handle mutable data. +With our current design, multiple processes can interact with the same mutable data without coordination, and we need a way to detect and solve conflicts. +Directly storing the result in a single k2v key would not work as we have no transaction or lock mechanism, and our state would be always corrupted. +Instead, we choose to record an ordered log of operations, ie. transitions, that each client can use locally to rebuild the state, each transition has its own immutable identifier. +This technique is sometimes referred to as event sourcing. + +With this system, we can't have conflict anymore at Garage level, but conflicts at the IMAP level can still occur, like 2 processes assigning the same identifier to different emails. +We thus need a logic to handle these conflicts that is flexible enough to accommodate the application's specific logic. + +Our solution is inspired by the work conducted by Terry et al. on [Bayou](https://dl.acm.org/doi/10.1145/224056.224070). +Clients fetch regularly the log from Garage, each entry is ordered by a timestamp and a unique identifier. +One of the 2 conflicting clients will be in the state where it has executed a log entry in the wrong order according to the specified ordering. +This client will need to roll back its changes to reapply the log in the same order as the others, and on conflicts, the same logic will be applied by all the clients to get, in the end, the same state. + +**Command definitions** + +The log is made of a sequence of ordered commands that can be run to get a deterministic state in the end. +We define the following commands: + +`FLAG_ADD ` - Add a flag to the target email +`FLAG_DEL ` - Remove a flag from a target email +`MAIL_DEL ` - Remove an email +`MAIL_ADD ` - Register an email in the mailbox with the given identifier +`REMOTE ` - Command is not directly stored here, instead it must be fetched from S3, see batching to understand why. + +*Note: FLAG commands could be enhanced with a MODSEQ field similar to the uid field for the emails, in order to implement IMAP RFC4551. Adding this field would force us to handle conflicts on flags +the same way as on emails, as MODSEQ must be monotonically incremented but is reset by a uid-validity change. This is out of the scope of this document.* + +**A note on UUID** + +When adding an email to the system, we associate it with a *universally unique identifier* or *UUID.* +We can then reference this email in the rest of the system without fearing a conflict or a race condition are we are confident that this UUID is unique. + +We could have used the email hash instead, but we identified some benefits in using UUID. +First, sometimes a mail must be duplicated, because the user received it from 2 different sources, so it is more correct to have 2 entries in the system. +Additionally, UUIDs are smaller and better compressible than a hash, which will lead to better performances. + +**Batching commands** + +Commands that are executed at the same time can be batched together. +Let's imagine a user is deleting its trash containing thousands of emails. +Instead of writing thousands of log lines, we can append them in a single entry. +If this entry becomes big (eg. > 100 commands), we can store it to S3 with the `REMOTE` command. +Batching is important as we want to keep the number of log entries small to be able to fetch them regularly and quickly. + +## Fixing conflicts in the operation log + +The log is applied in order from the last checkpoint. +To stay in sync, the client regularly asks the server for the last commands. + +When the log is applied, our system must enforce the following invariants: + +- For all emails e1 and e2 in the log, such as e2.order > e1.order, then e2.uid > e1.uid + +- For all emails e1 and e2 in the log, such as e1.uuid == e2.uuid, then e1.order == e2.order + +If an invariant is broken, the conflict is solved with the following algorithm and the `uidvalidity` value is increased. + + +```python +def apply_mail_add(uuid, imap_uid): + if imap_uid < internalseq: + uidvalidity += internalseq - imap_uid + mails.insert(uuid, internalseq, flags=["\Recent"]) + internalseq = internalseq + 1 + uidnext = internalseq + +def apply_mail_del(uuid): + mails.remove(uuid) + internalseq = internalseq + 1 +``` + +A mathematical demonstration in Appendix D. shows that this algorithm indeed guarantees that under the same `uidvalidity`, different e-mails cannot share the same IMAP UID. + +To illustrate, let us imagine two processes that have a first operation A in common, and then had a divergent state when one applied an operation B, and another one applied an operation C. For process 1, we have: + +```python +# state: uid-validity = 1, uid_next = 1, internalseq = 1 +(A) MAIL_ADD x 1 +# state: uid-validity = 1, x = 1, uid_next = 2, internalseq = 2 +(B) MAIL_ADD y 2 +# state: uid-validity = 1, x = 1, y = 2, uid_next = 3, internalseq = 3 +``` + +And for process 2 we have: + +```python +# state: uid-validity = 1, uid_next = 1, internalseq = 1 +(A) MAIL_ADD x 1 +# state: uid-validity = 1, x = 1, uid_next = 2, internalseq = 2 +(C) MAIL_ADD z 2 +# state: uid-validity = 1, x = 1, z = 2, uid_next = 3, internalseq = 3 +``` + +Suppose that a new client connects to one of the two processes after the conflicting operations have been communicated between them. They may have before connected either to process 1 or to process 2, so they might have observed either mail `y` or mail `z` with UID 2. The only way to make sure that the client will not be confused about mail UIDs is to bump the uidvalidity when the conflict is solved. This is indeed what happens with our algorithm: for both processes, once they have learned of the other's conflicting operation, they will execute the following set of operations and end in a deterministic state: + +```python +# state: uid-validity = 1, uid_next = 1, internalseq = 1 +(A) MAIL_ADD x 1 +# state: uid-validity = 1, x = 1, uid_next = 2, internalseq = 2 +(B) MAIL_ADD y 2 +# state: uid-validity = 1, x = 1, y = 2, uid_next = 3, internalseq = 3 +(C) MAIL_ADD z 2 +# conflict detected ! +# state: uid-validity = 2, x = 1, y = 2, z = 3, uid_next = 4, internalseq = 4 +``` + +## A computed state for efficient requests + +From a data structure perspective, a list of commands is very inefficient to get the current state of the mailbox. +Indeed, we don't want an `O(n)` complexity (where `n` is the number of log commands in the log) each time we want to know how many emails are stored in the mailbox. + +To address this issue, and thus query the mailbox efficiently, the MDA keeps an in-memory computed version of the logs, ie. the computed state. + +**Mapping IMAP identifiers to email identifiers with B-Tree** + +Core features of IMAP are synchronization and listing of emails. +Its associated command is `FETCH`, it has 2 parameters, a range of `uid` (or `seq`) and a filter. +For us, it means that we must be able to efficiently select a range of emails by their identifier, otherwise the user experience will be bad, and compute resources will be wasted. + +We identified that by using an ordered map based on a B-Tree, we can satisfy this requirement in an optimal manner. +For example, Rust defines a [BTreeMap](https://doc.rust-lang.org/std/collections/struct.BTreeMap.html) object in its standard library. +We define the following structure for our mailbox: + +```rust +struct mailbox { + emails: BTreeMap, + flags: BTreeMap>, + name: String, + uid_next: u32, + uid_validity: u32, + /* other fields */ +} +``` + +This data structure allows us to efficiently select a range of emails by their identifier by walking the tree, allowing the server to be responsive to syncronisation request from clients. + +**Checkpoints** + +Having an in-memory computed state does not solve all the problems of operation on a log only, as 1) bootstrapping a fresh client is expensive as we have to replay possibly thousand of logs, and 2) logs would be kept indefinitely, wasting valuable storage resources. + +As a solution to these limitations, the MDA regularly checkpoints the in-memory state. More specifically, it serializes it (eg. with MessagePack), compresses it (eg. with zstd), and then stores it on Garage through the S3 API. +A fresh client would then only have to download the latest checkpoint and the range of logs between the checkpoint and now, allowing swift bootstraping while retaining all of the value of the log model. + +Old logs and old checkpoints can be garbage collected after a few days for example as long as 1) the most recent checkpoint remains, 2) that all the logs after this checkpoint remain and 3) that we are confident enough that no log before this checkpoint will appear in the future. + diff --git a/doc/src/mailbox.md b/doc/src/mailbox.md new file mode 100644 index 0000000..02d0e5a --- /dev/null +++ b/doc/src/mailbox.md @@ -0,0 +1,56 @@ +# Mailboxes + +IMAP servers, at their root, handle mailboxes. +In this document, we explain the domain logic of IMAP and how we map it to Garage data +with Aerogramme. + +## IMAP Domain Logic + +The main specification of IMAP is defined in [RFC3501](https://datatracker.ietf.org/doc/html/rfc3501). +It defines 3 main objects: Mailboxes, Emails, and Flags. The following figure depicts how they work together: + +![An IMAP mailbox schema](./mailbox.png) + +Emails are stored ordered inside the mailbox, and for legacy reasons, the mailbox assigns 2 identifiers to each email we name `uid` and `seq`. + +`seq` is the legacy identifier, it numbers messages in a sequence. Each time an email is deleted, the message numbering will change to keep a continuous sequence without holes. +While this numbering is convenient for interactive operations, it is not efficient to synchronize mail locally and quickly detect missing new emails. + +To solve this problem, `uid` identifiers were introduced later. They are monotonically increasing integers that must remain stable across time and sessions: when an email is deleted, its identifier is never reused. +This is what Thunderbird uses for example when it synchronizes its mailboxes. + +If this ordering cannot be kept, for example because two independent IMAP daemons were adding an email to the same mailbox at the same time, it is possible to change the ordering as long as we change a value named `uid-validity` to trigger a full resynchronization of all clients. As this operation is expensive, we want to minimize the probability of having to trigger a full resynchronization, but in practice, having this recovery mechanism simplifies the operation of an IMAP server by providing a rather simple solution to rare collision situations. + +Flags are tags put on an email, some are defined at the protocol level, like `\Recent`, `\Deleted` or `\Seen`, which can be assigned or removed directly by the IMAP daemon. +Others can be defined arbitrarily by the client, for which the MUA will apply its own logic. +There is no mechanism in RFC3501 to synchronize flags between MUA besides listing the flags of all the emails. + +IMAP has many extensions, such as [RFC5465](https://www.rfc-editor.org/rfc/rfc5465.html) or [RFC7162](https://datatracker.ietf.org/doc/html/rfc7162). +They are referred to as capabilities and are [referenced by the IANA](https://www.iana.org/assignments/imap-capabilities/imap-capabilities.xhtml). +For this project, we are aiming to implement only IMAP4rev1 and no extension at all. + + +## Aerogramme Implementation + +From a high-level perspective, we will handle _immutable_ emails differently from _mutable_ mailboxes and flags. +Immutable data can be stored directly on Garage, as we do not fear reading an outdated value. +For mutable data, we cannot store them directly in Garage. +Instead, we choose to store a log of operations. Each client then applies this log of operation locally to rebuild its local state. + +During this design phase, we noted that the S3 API semantic was too limited for us, so we introduced a second API, K2V, to have more flexibility. +K2V is designed to store and fetch small values in batches, it uses 2 different keys: one to spread the data on the cluster (`P`), and one to sort linked data on the same node (`S`). +Having data on the same node allows for more efficient queries among this data. + +For performance reasons, we plan to introduce 2 optimizations. +First, we store an email summary in K2V that allows fetching multiple entries at once. +Second, we also store checkpoints of the logs in S3 to avoid keeping and replaying all the logs each time a client starts a session. +We have the following data handled by Garage: + +![Aerogramme Datatypes](./aero-states.png) + +In Garage, it is important to carefully choose the key(s) that are used to store data to have fast queries, we propose the following model: + +![Aerogramme Key Choice](./aero-states2.png) + + + diff --git a/doc/src/mailbox.png b/doc/src/mailbox.png new file mode 100644 index 0000000..038e3ac Binary files /dev/null and b/doc/src/mailbox.png differ diff --git a/doc/src/notes.md b/doc/src/notes.md new file mode 100644 index 0000000..33c98a0 --- /dev/null +++ b/doc/src/notes.md @@ -0,0 +1,4 @@ +# Notes + + +*No note currently* diff --git a/doc/src/overview.md b/doc/src/overview.md new file mode 100644 index 0000000..ca75a29 --- /dev/null +++ b/doc/src/overview.md @@ -0,0 +1,61 @@ +# Overview + +Aérogramme stands at the interface between the Garage storage server, and the user's e-mail client. It provides regular IMAP access on the client-side, and stores encrypted e-mail data on the server-side. Aérogramme also provides an LMTP server interface through which incoming mail can be forwarded by the MTA (e.g. Postfix). + +
+Aerogramme components +
+Figure 1: Aérogramme, our IMAP daemon, stores its data encrypted in Garage and provides regular IMAP access to mail clients
+ + +**Overview of architecture** + +Figure 2 below shows an overview of Aérogramme's architecture. Each user has a personal Garage bucket in which to store their mailbox contents. We will document below the details of the components that make up Aérogramme, but let us first provide a high-level overview. The two main classes, `User` and `Mailbox`, define how data is stored in this bucket, and provide a high-level interface with primitives such as reading the message index, loading a mail's content, copying, moving, and deleting messages, etc. This mail storage system is supported by two important primitives: a cryptography management system that provides encryption keys for user's data, and a simple log-like database system inspired by Bayou [1] which we have called Bay, that we use to store the index of messages in each mailbox. The mail storage system is made accessible to the outside world by two subsystems: an LMTP server that allows for incoming mail to be received and stored in a user's bucket, in a staging area, and the IMAP server itself which allows full-fledged manipulation of mailbox data by users. + +
+Aerogramme internals +Figure 2: Overview of Aérogramme's architecture and internal data structures for a given user, Alice
+ + +**Cryptography** + +Our cryptography module is taking care of: authenticating users against a data source (using their IMAP login and password), returning a set of credentials that allow read/write access to a Garage bucket, as well as a set of secret encryption keys used to encrypt and decrypt data stored in the bucket. +The cryptography module makes use of the user's authentication password as a passphrase to decrypt the user's secret keys, which are stored in the user's bucket in a dedicated K2V section. + +This module can use either of two data sources for user authentication: + +- LDAP, in which case the password (which is also the passphrase for decrypting the user's secret keys) must match the LDAP password of the user. +- Static, in which case the users are statically declared in Aérogramme's configuration file, and can have any password. + +The static authentication source can be used in a deployment scenario shown in Figure 3, where Aérogramme is not running on the side of the service provider, but on the user's device itself. In this case, the user can use any password to encrypt their data in the bucket; the only credentials they need for authentication against the service provider are the S3 and K2V API access keys. + +
+user side encryption +
+Figure 3: alternative deployment of Aérogramme on the user's device: the service provider never gets access to the plaintext data.
+ +The cryptography module also has a "public authentication" method, which allows the LMTP module to retrieve only a public key for the user to write incoming messages to the user's bucket but without having access to all of the existing encrypted data. + +The cryptography module of Aérogramme is based on standard cryptographic primitives from `libsodium` and follows best practices in the domain. + +**Bay, a simplification of Bayou** + +In our last milestone report, we described how we intended to implement the message index for IMAP mailboxes, based on an eventually-consistent log-like data structure. The principles of this system have been established in Bayou in 1995 [1], allowing users to use a weakly-coordinated datastore to exchange data and solve write conflicts. Bayou is based on a sequential specification, which defines the action that operations in the log have on the shared object's state. To handle concurrent modification, Bayou allows for log entries to be appended in non-sequential order: in case a process reads a log entry that was written earlier by another process, it can rewind its execution of the sequential specification to the point where the newly acquired operation should have been executed, and then execute the log again starting from this point. The challenge then consists in defining a sequential specification that provides the desired semantics for the application. In our last milestone report (milestone 3.A), we described a sequential specification that solves the UID assignment problem in IMAP and proved it correct. We refer the reader to that document for more details. + +For milestone 3B, we have implemented our customized version of Bayou, which we call Bay. Bay implements the log-like semantics and the rewind ability of Bayou, however, it makes use of a much simpler data system: Bay is not operating on a relational database that is stored on disk, but simply on a data structure in RAM, for which a full checkpoint is written regularly. We decided against using a complex database as we observed that the expected size of the data structures we would be handling (the message indexes for each mailbox) wouldn't be so big most of the time, and having a full copy in RAM was perfectly acceptable. This allows for a drastic simplification in comparison to the proposal of the original Bayou paper [1]. On the other side, we added encryption in Bay so that both log entries and checkpoints are stored encrypted in Garage using the user's secret key, meaning that a malicious Garage administrator cannot read the content of a user's mailbox index. + +**LMTP server and incoming mail handler** + +To handle incoming mail, we had to add a simple LMTP server to Aérogramme. This server uses the public authentication method of the cryptography module to retrieve a set of public credentials (in particular, a public key for asymmetric encryption) for storing incoming messages. The incoming messages are stored in their raw RFC822 form (encrypted) in a specific folder of the Garage bucket called `incoming/`. When a user logs in with their username and password, at which time Aérogramme can decrypt the user's secret keys, a special process is launched that watches the incoming folder and moves these messages to the `INBOX` folder. This task can only be done by a process that knows the user's secret keys, as it has to modify the mailbox index of the `INBOX` folder, which is encrypted using the user's secret keys. In later versions of Aérogramme, this process would be the perfect place to implement mail filtering logic using user-specified rules. These rules could be stored in a dedicated section of the bucket, again encrypted with the user's secret keys. + +To implement the LMTP server, we chose to make use of the `smtp-server` crate from the [Kannader](https://github.com/Ekleog/kannader) project (an MTA written in Rust). The `smtp-server` crate had all of the necessary functionality for building SMTP servers, however, it did not handle LMTP. As LMTP is extremely close to SMTP, we were able to extend the `smtp-server` module to allow it to be used for the implementation of both SMTP and LMTP servers. Our work has been proposed as a [pull request](https://github.com/Ekleog/kannader/pull/178) to be merged back upstream in Kannader, which should be integrated soon. + +**IMAP server** + +The last part that remains to build Aérogramme is to implement the logic behind the IMAP protocol and to link it with the mail storage primitives. We started by implementing a state machine that handled the transitions between the different states in the IMAP protocol: ANONYMOUS (before login), AUTHENTICATED (after login), and SELECTED (once a mailbox has been selected for reading/writing). In the SELECTED state, the IMAP session is linked to a given mailbox of the user. In addition, the IMAP server has to keep track of which updates to the mailbox it has sent (or not) to the client so that it can produce IMAP messages consistent with what the client believes to be in the mailbox. In particular, many IMAP commands make use of mail sequence numbers to identify messages, which are indices in the sorted array of all of the messages in the mailbox. However, if messages are added or removed concurrently, these sequence numbers change: hence we must keep a snapshot of the mailbox's index *as the client knows it*, which is not necessarily the same as what is _actually_ in the mailbox, to generate messages that the client will understand correctly. This snapshot is called a *mailbox view* and is synced regularly with the actual mailbox, at which time the corresponding IMAP updates are sent. This can be done only at specific moments when permitted by the IMAP protocol. + +The second part of this task consisted in implementing all of the IMAP protocol commands. Most are relatively straightforward, however, one command, in particular, needed special care: the FETCH command. The FETCH command in the IMAP protocol can return the contents of a message to the client. However, it must also understand precisely the semantics of the content of an e-mail message, as the client can specify very precisely how the message should be returned. For instance, in the case of a multipart message with attachments, the client can emit a FECTH command requesting only a certain attachment of the message to be returned, and not the whole message. To implement such semantics, we have based ourselves on the [`mail-parser`](https://docs.rs/mail-parser/latest/mail_parser/) crate, which can fully parse an RFC822-formatted e-mail message, and also supports some extensions such as MIME. To validate that we were correctly converting the parsed message structure to IMAP messages, we designed a test suite composed of several weirdly shaped e-mail messages, whose IMAP structure definition we extracted by taking Dovecot as a reference. We were then able to compare the output of Aérogramme on these messages with the reference consisting in what was returned by Dovecot. + +## References + +- [1] Terry, D. B., Theimer, M. M., Petersen, K., Demers, A. J., Spreitzer, M. J., & Hauser, C. H. (1995). Managing update conflicts in Bayou, a weakly connected replicated storage system. *ACM SIGOPS Operating Systems Review*, 29(5), 172-182. ([PDF](https://dl.acm.org/doi/pdf/10.1145/224057.224070)) diff --git a/doc/src/setup.md b/doc/src/setup.md new file mode 100644 index 0000000..66a1b7e --- /dev/null +++ b/doc/src/setup.md @@ -0,0 +1,74 @@ +# Setup + +You must start by creating a user profile in Garage. Run the following command after adjusting the parameters to your configuration: + +```bash +cargo run -- first-login \ + --region garage \ + --k2v-endpoint http://127.0.0.1:3904 \ + --s3-endpoint http://127.0.0.1:3900 \ + --aws-access-key-id GK... \ + --aws-secret-access-key c0ffee... --bucket mailrage-me \ + --user-secret s3cr3t +``` + +The program will interactively ask you some questions and finally generates for you a snippet of configuration: + +``` +Please enter your password for key decryption. +If you are using LDAP login, this must be your LDAP password. +If you are using the static login provider, enter any password, and this will also become your password for local IMAP access. +Enter password: +Confirm password: + +Cryptographic key setup is complete. + +If you are using the static login provider, add the following section to your .toml configuration file: + +[login_static.users.] +password = "$argon2id$v=19$m=4096,t=3,p=1$..." +aws_access_key_id = "GK..." +aws_secret_access_key = "c0ffee..." +``` + +In this tutorial, we will use the static login provider (and not the LDAP one). +We will thus create a config file named `aerogramme.toml` in which we will paste the previous snippet. You also need to enter some other keys. In the end, your file should look like that: + +```toml +s3_endpoint = "http://127.0.0.1:3900" +k2v_endpoint = "http://127.0.0.1:3904" +aws_region = "garage" + +[login_static] +default_bucket = "mailrage" +[login_static.users.me] +bucket = "mailrage-me" +user_secret = "s3cr3t" +alternate_user_secrets = [] +password = "$argon2id$v=19$m=4096,t=3,p=1$..." +aws_access_key_id = "GK..." +aws_secret_access_key = "c0ffee..." +``` + +If you fear to loose your password, you can backup your key with the following command: + +```bash +cargo run -- show-keys \ + --region garage \ + --k2v-endpoint http://127.0.0.1:3904 \ + --s3-endpoint http://127.0.0.1:3900 \ + --aws-access-key-id GK... \ + --aws-secret-access-key c0ffee... \ + --bucket mailrage-me + --user-secret s3cr3t +``` + +You will then be asked for your key decryption password: + +``` +Enter key decryption password: +master_key = "..." +secret_key = "..." +``` + +You are now ready to [validate your installation](./validate.md). diff --git a/doc/src/validate.md b/doc/src/validate.md new file mode 100644 index 0000000..ed7dfa7 --- /dev/null +++ b/doc/src/validate.md @@ -0,0 +1,9 @@ +# Validate + +Start a server as follow: + +```bash +cargo run -- server +``` + + diff --git a/mailrage.toml b/mailrage.toml new file mode 100644 index 0000000..4cba391 --- /dev/null +++ b/mailrage.toml @@ -0,0 +1,13 @@ +s3_endpoint = "http://[::1]:3900" +k2v_endpoint = "http://[::1]:3904" +aws_region = "garage" + +[imap] +bind_addr = "[::1]:4567" + +[login_static.users.quentin] +password = "$argon2id$v=19$m=4096,t=3,p=1$jR52Nq76f8yO0UXdhK+FiQ$KeIzDI4PJ/2bX+expyyaRkMZus0/1FsgTXtnvPUjwyw" +aws_access_key_id = "GK68198c3b4148f61dcd625b7e" +aws_secret_access_key = "1d4bd3853a4f7810b97cbb2f8eb52c7603eb93c202fe98ca40f4e3f6b7e70fa0" +user_secret = "poupou" +bucket = "quentin-mailrage"