2.4. etcd

Etcd is a distributed service configuration repository, natively accessible through gRPC.

Etcd Data Model

Single cluster wide key value store

  • Keys and values are byte arrays

  • Keys are sorted lexicographically

  • Store has monotonically increasing revisions

  • Used to timestamp key creation and modification

  • Historical revisions available until compaction

  • Keys have increasing versions

  • Keys can have leases

// A single entry of the cluster wide key value store.
message KeyValue {
    // Key as a byte array
    bytes key = 1;
    // Store revision that timestamps the key creation
    int64 create_revision = 2;
    // Store revision that timestamps the key modification
    int64 mod_revision = 3;
    // Per key version, keys have increasing versions
    int64 version = 4;
    // Value as a byte array
    bytes value = 5;
    // Lease associated with the key
    // NOTE(review): presumably 0 for no lease as in PutRequest - confirm
    int64 lease = 6;
}

Put Request

// Unary call that takes one PutRequest and returns one PutResponse
rpc Put (PutRequest) returns (PutResponse) { }

// Arguments of the Put call.
message PutRequest {
    // Key and value as byte arrays
    bytes key = 1;
    bytes value = 2;

    // Lease identifier or 0 for no lease
    int64 lease = 3;

    // Optionally return previous key value pair
    // (see prev_kv in PutResponse)
    bool prev_kv = 4;

    // Optionally update using existing value
    // NOTE(review): presumably the value field is then ignored - confirm
    bool ignore_value = 5;

    // Optionally update using existing lease
    // NOTE(review): presumably the lease field is then ignored - confirm
    bool ignore_lease = 6;
}

// Result of the Put call.
message PutResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // Previous key value pair, optionally requested
    // through prev_kv in PutRequest
    KeyValue prev_kv = 2;
}

Range Request

// Unary call that takes one RangeRequest and returns one RangeResponse
rpc Range (RangeRequest) returns (RangeResponse) { }

// Arguments of the Range call, which queries keys
// between key and range_end.
message RangeRequest {

    // Direction in which the result is sorted
    enum SortOrder {
        NONE = 0;
        ASCEND = 1;
        DESCEND = 2;
    }

    // Attribute by which the result is sorted
    // NOTE(review): values presumably refer to the key, version,
    // create_revision, mod_revision and value fields of KeyValue - confirm
    enum SortTarget {
        KEY = 0;
        VERSION = 1;
        CREATE = 2;
        MOD = 3;
        VALUE = 4;
    }

    // Bounds of the queried key range as byte arrays
    // NOTE(review): presumably the half open interval
    // from key to range_end - confirm
    bytes key = 1;
    bytes range_end = 2;

    // Restrict number of keys returned
    int64 limit = 3;

    // Possibly query historical revision
    // (historical revisions are available until compaction)
    int64 revision = 4;

    SortOrder sort_order = 5;
    SortTarget sort_target = 6;

    // Linearizable returns cluster consensus
    // Serializable can return stale data
    bool serializable = 7;

    // NOTE(review): presumably return keys without values
    // or only the count of keys - confirm
    bool keys_only = 8;
    bool count_only = 9;

    // NOTE(review): presumably restrict the result
    // by bounds on key modification revision - confirm
    int64 min_mod_revision = 10;
    int64 max_mod_revision = 11;

    // NOTE(review): presumably restrict the result
    // by bounds on key creation revision - confirm
    int64 min_create_revision = 12;
    int64 max_create_revision = 13;
}

// Result of the Range call.
message RangeResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // Returned key value pairs
    repeated KeyValue kvs = 2;
    // NOTE(review): presumably set when more keys exist
    // than were returned - confirm
    bool more = 3;
    // NOTE(review): presumably the number of keys
    // in the queried range - confirm
    int64 count = 4;
}
  • also delete range request

Transaction Request

// Unary call that takes one TxnRequest and returns one TxnResponse
rpc Txn (TxnRequest) returns (TxnResponse) { }

// Arguments of the Txn call, a list of tests
// and two alternative lists of operations.
message TxnRequest {
    // List of tests to perform before transaction
    repeated Compare compare = 1;
    // List of operations to perform when all tests succeed
    repeated RequestOp success = 2;
    // List of operations to perform when any test fails
    repeated RequestOp failure = 3;
}

// A single test of a transaction, see compare in TxnRequest.
message Compare {

    // Relation checked by the test
    enum CompareResult {
        EQUAL = 0;
        GREATER = 1;
        LESS = 2;
        NOT_EQUAL = 3;
    }

    // Key attribute examined by the test
    enum CompareTarget {
        VERSION = 0;
        CREATE = 1;
        MOD = 2;
        VALUE = 3;
        LEASE = 4;
    }

    CompareResult result = 1;
    CompareTarget target = 2;

    // Key whose attribute is tested
    bytes key = 3;
    // Operand to compare with, at most one member is set,
    // members mirror the CompareTarget values
    oneof target_union {
        int64 version = 4;
        int64 create_revision = 5;
        int64 mod_revision = 6;
        bytes value = 7;
        int64 lease = 8;
    }

    // Can compare key range rather than just one key
    bytes range_end = 64;
}

// A single operation of a transaction,
// see success and failure in TxnRequest.
message RequestOp {
    // At most one member is set, a transaction can nest
    // another transaction through request_txn
    // DeleteRangeRequest is declared outside this excerpt
    oneof request {
        RangeRequest request_range = 1;
        PutRequest request_put = 2;
        DeleteRangeRequest request_delete_range = 3;
        TxnRequest request_txn = 4;
    }
}

// Result of the Txn call.
message TxnResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // NOTE(review): presumably true when all tests succeeded
    // and the success operations were performed - confirm
    bool succeeded = 2;
    // ResponseOp is declared outside this excerpt
    // NOTE(review): presumably one response
    // per performed operation - confirm
    repeated ResponseOp responses = 3;
}

The Watch interface returns a stream of key change events.

Watch Request

// Bidirectional streaming call, the client sends a stream
// of WatchRequest and receives a stream of WatchResponse
rpc Watch (stream WatchRequest) returns (stream WatchResponse) { }

// A message on the request stream of the Watch call.
message WatchRequest {
    // At most one member is set
    // WatchCancelRequest and WatchProgressRequest
    // are declared outside this excerpt
    oneof request_union {
        WatchCreateRequest create_request = 1;
        WatchCancelRequest cancel_request = 2;
        WatchProgressRequest progress_request = 3;
    }
}

// Request to create a watch, sent as create_request in WatchRequest.
message WatchCreateRequest {

    // NOTE(review): presumably NOPUT and NODELETE suppress
    // PUT and DELETE events respectively - confirm
    enum FilterType {
        NOPUT = 0;
        NODELETE = 1;
    }

    // Key or key range to watch as byte arrays
    bytes key = 1;
    bytes range_end = 2;
    // NOTE(review): presumably the revision
    // from which events are reported - confirm
    int64 start_revision = 3;

    // Request keepalive notifications
    bool progress_notify = 4;

    repeated FilterType filters = 5;

    // NOTE(review): presumably asks for prev_kv
    // to be set in reported events - confirm
    bool prev_kv = 6;
    // NOTE(review): presumably a watch identifier
    // chosen by the client - confirm
    int64 watch_id = 7;
    // NOTE(review): presumably permits splitting
    // large responses into fragments - confirm
    bool fragment = 8;
}

// A message on the response stream of the Watch call.
message WatchResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // NOTE(review): presumably identifies the watch
    // that the response belongs to - confirm
    int64 watch_id = 2;
    // NOTE(review): presumably acknowledge watch
    // creation and cancellation - confirm
    bool created = 3;
    bool canceled = 4;

    // Indicates attempt to watch already compacted revision
    int64 compact_revision = 5;

    // NOTE(review): presumably explains
    // why the watch was canceled - confirm
    string cancel_reason = 6;

    // NOTE(review): presumably marks a response
    // that is split into fragments - confirm
    bool fragment = 7;

    // Key change events
    repeated Event events = 11;
}

// A single key change event, see events in WatchResponse.
message Event {

    // Kind of change that the event reports
    enum EventType {
        PUT = 0;
        DELETE = 1;
    }

    EventType type = 1;
    // Key value pair that the event concerns
    KeyValue kv = 2;
    // NOTE(review): presumably the key value pair before the change,
    // set when requested through prev_kv in WatchCreateRequest - confirm
    KeyValue prev_kv = 3;
}
  • events reported before cluster consensus

The Lease interface monitors client liveness. Entries can be associated with leases; such entries are deleted on lease expiration.

Lease Request

// Unary call that takes one LeaseGrantRequest
// and returns one LeaseGrantResponse
rpc LeaseGrant (LeaseGrantRequest) returns (LeaseGrantResponse) { }

// Arguments of the LeaseGrant call.
message LeaseGrantRequest {
    // Advisory time to live in seconds
    int64 TTL = 1;
    // NOTE(review): presumably the requested
    // lease identifier - confirm
    int64 ID = 2;
}

// Result of the LeaseGrant call.
message LeaseGrantResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // NOTE(review): presumably the identifier
    // of the granted lease - confirm
    int64 ID = 2;
    // NOTE(review): presumably the time to live in seconds that
    // was granted, the requested value being advisory - confirm
    int64 TTL = 3;
    string error = 4;
}
  • also lease revoke request

  • also lease keep alive request

  • also lease time to live query request

Lock Request

// Unary calls, each takes one request and returns one response
rpc Lock (LockRequest) returns (LockResponse) { }
rpc Unlock (UnlockRequest) returns (UnlockResponse) { }

// Arguments of the Lock call.
message LockRequest {
    // Lock name as a byte array
    bytes name = 1;
    // NOTE(review): presumably the identifier of the lease
    // associated with the lock key - confirm
    int64 lease = 2;
}

// Result of the Lock call.
message LockResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // Lock key as a byte array
    // NOTE(review): presumably the key created
    // for the lock name - confirm
    bytes key = 2;
}

// Arguments of the Unlock call.
message UnlockRequest {
    // NOTE(review): presumably the lock key
    // returned in LockResponse - confirm
    bytes key = 1;
}

// Result of the Unlock call, carries only the response header.
message UnlockResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
}
  • lock name creates key name/uuid

  • lock key must be checked in exclusive operations to ensure consistency

Leader Election Request

// Unary calls except for Observe, which takes one LeaderRequest
// and returns a stream of LeaderResponse
// ResignRequest and ResignResponse are declared outside this excerpt
rpc Campaign (CampaignRequest) returns (CampaignResponse) { }
rpc Proclaim (ProclaimRequest) returns (ProclaimResponse) { }
rpc Leader (LeaderRequest) returns (LeaderResponse) { }
rpc Observe (LeaderRequest) returns (stream LeaderResponse) { }
rpc Resign (ResignRequest) returns (ResignResponse) { }

// Arguments of the Campaign call.
message CampaignRequest {
    // NOTE(review): presumably the election name - confirm
    bytes name = 1;
    // NOTE(review): presumably the identifier of the lease
    // associated with the candidate - confirm
    int64 lease = 2;
    // NOTE(review): presumably the value proclaimed
    // when the candidate becomes leader - confirm
    bytes value = 3;
}

// Result of the Campaign call.
message CampaignResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // NOTE(review): presumably describes the leadership
    // acquired by the caller - confirm
    LeaderKey leader = 2;
}

// Leadership description, returned in CampaignResponse
// and passed in ProclaimRequest.
message LeaderKey {
    // NOTE(review): presumably the election name - confirm
    bytes name = 1;
    // NOTE(review): presumably the key created
    // for the leader - confirm
    bytes key = 2;
    // NOTE(review): presumably the creation revision
    // of the key - confirm
    int64 rev = 3;
    // NOTE(review): presumably the identifier
    // of the leader lease - confirm
    int64 lease = 4;
}

// Arguments of the Proclaim call.
message ProclaimRequest {
    // NOTE(review): presumably the leadership description
    // from CampaignResponse - confirm
    LeaderKey leader = 1;
    // NOTE(review): presumably the new value to proclaim - confirm
    bytes value = 2;
}

// Result of the Proclaim call, carries only the response header.
message ProclaimResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
}

// Arguments of the Leader and Observe calls.
message LeaderRequest {
    // NOTE(review): presumably the election name - confirm
    bytes name = 1;
}

// Result of the Leader call and stream element of the Observe call.
message LeaderResponse {
    // ResponseHeader is declared outside this excerpt
    ResponseHeader header = 1;
    // NOTE(review): presumably the leader key
    // with the proclaimed value - confirm
    KeyValue kv = 2;
}
  • leader proclaims shared value

  • followers observe proclaimed value

A server cluster is configured either statically, with the list of server addresses provided in the server configuration or in SRV DNS records, or through server discovery, where another etcd server is used to store the current server list. The target cluster size must be given but can be changed at runtime.

Experiments

# This assumes starting from revision 1

> etcdctl put somekey somevalue
OK
> etcdctl put anotherkey anothervalue
OK
> etcdctl get a b
anotherkey
anothervalue
> etcdctl get a z
anotherkey
anothervalue
somekey
somevalue
> etcdctl put somekey updatedvalue
OK

> etcdctl get --rev=1 a z
> etcdctl get --rev=2 a z
somekey
somevalue
> etcdctl get --rev=3 a z
anotherkey
anothervalue
somekey
somevalue
> etcdctl get --rev=4 a z
anotherkey
anothervalue
somekey
updatedvalue

> etcdctl lock somelock
somelock/536f4d65436f4465
# Launched from another window
> etcdctl get somelock somelocl
somelock/536f4d65436f4465
# Empty value
^C

> etcdctl elect someelection --listen
# Launched from another window
> etcdctl elect someelection someproposal
someelection/536f4d65436f4465
someproposal
# Launched from another window
> etcdctl elect someelection anotherproposal
# Waits until current leader resigns
...

2.4.1. References

  1. Diego Ongaro, John Ousterhout: In Search of an Understandable Consensus Algorithm. https://raft.github.io/raft.pdf