From 862f8a6644ddee9a2220244097e18a9bccf9e1da Mon Sep 17 00:00:00 2001 From: Yimin Jiang Date: Thu, 24 Oct 2019 18:46:04 +0800 Subject: [PATCH] rdma: allow binding to given interface (#7) * rdma: allow binding to given interface * tests: add key log * tests: add knob for log frequency --- src/rdma_van.h | 46 +++++++++++++++++++++++++++++----- tests/test_kv_app_benchmark.cc | 7 ++++-- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/src/rdma_van.h b/src/rdma_van.h index 53e1cae6a..09beecd1d 100644 --- a/src/rdma_van.h +++ b/src/rdma_van.h @@ -49,6 +49,9 @@ static const int kMaxHostnameLength = 16; static const int kMaxDataFields = 4; static const size_t kAlignment = 8; +static const int kMaxResolveRetry = 50000; +static const int kBasePort = 9010; + template <typename T> static inline T align_floor(T v, T align) { return v - (v % align); @@ -513,9 +516,16 @@ class RDMAVan : public Van { int Bind(const Node &node, int max_retry) override { CHECK(rdma_create_id(event_channel_, &listener_, nullptr, RDMA_PS_TCP) == 0) << "Create RDMA connection identifier failed"; - + struct sockaddr_in addr; - memset(&addr, 0, sizeof(addr)); + memset(&addr, 0, sizeof(addr)); + + auto val = Environment::Get()->find("DMLC_NODE_HOST"); + if (val) { + PS_VLOG(1) << "bind to DMLC_NODE_HOST: " << std::string(val); + addr.sin_addr.s_addr = inet_addr(val); + } + addr.sin_family = AF_INET; int port = node.port; unsigned seed = static_cast<unsigned>(time(NULL) + port); @@ -584,10 +594,34 @@ class RDMAVan : public Van { << "Create RDMA connection identifier failed"; endpoint->cm_id->context = endpoint; - CHECK_EQ(rdma_resolve_addr(endpoint->cm_id, nullptr, - remote_addr->ai_addr, kTimeoutms), - 0) - << "Resolve RDMA address failed with errno: " << errno; + int max_retry = kMaxResolveRetry; + int port = kBasePort; + unsigned seed = static_cast<unsigned>(time(NULL) + port); + auto val = Environment::Get()->find("DMLC_NODE_HOST"); + if (val) { + struct sockaddr_in addr; + memset(&addr, 0, sizeof(addr)); + 
addr.sin_addr.s_addr = inet_addr(val); + addr.sin_family = AF_INET; + for (int i = 0; i < max_retry + 1; ++i) { + addr.sin_port = htons(port); + if (rdma_resolve_addr(endpoint->cm_id, + reinterpret_cast<struct sockaddr *>(&addr), + remote_addr->ai_addr, kTimeoutms) == 0) { + break; + } + if (i == max_retry) { + port = -1; + } else { + port = 10000 + rand_r(&seed) % 40000; + } + } + } else { + CHECK_EQ(rdma_resolve_addr(endpoint->cm_id, nullptr, + remote_addr->ai_addr, kTimeoutms), + 0) + << "Resolve RDMA address failed with errno: " << strerror(errno); + } endpoint->cv.wait(lk, [endpoint] { return endpoint->status != Endpoint::CONNECTING; diff --git a/tests/test_kv_app_benchmark.cc b/tests/test_kv_app_benchmark.cc index fad3ffa1d..cc600414a 100644 --- a/tests/test_kv_app_benchmark.cc +++ b/tests/test_kv_app_benchmark.cc @@ -19,6 +19,7 @@ void EmptyHandler(const KVMeta &req_meta, const KVPairs<Val> &req_data, KVServer CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[0]); if (mem_map.find(key) == mem_map.end()) { + PS_VLOG(1) << "key " << key << " from worker-" << req_meta.sender; size_t len = (size_t) req_data.vals.size(); mem_map[key].keys.push_back(key); mem_map[key].vals.CopyFrom(req_data.vals); @@ -142,6 +143,8 @@ void RunWorker(int argc, char *argv[]) { auto end = std::chrono::high_resolution_clock::now(); auto val = Environment::Get()->find("THRESHOLD"); unsigned int threshold = val ? atoi(val) : 10; + val = Environment::Get()->find("LOG_DURATION"); + unsigned int log_duration = val ? atoi(val) : 500; int cnt = 0; while (1) { for (int server = 0; server < num_servers; server++) { @@ -160,9 +163,9 @@ void RunWorker(int argc, char *argv[]) { } timestamp_list.clear(); cnt++; - if (cnt % 100 == 0) { + if (cnt % log_duration == 0) { end = std::chrono::high_resolution_clock::now(); - LL << "Benchmark throughput: " + LL << "Application goodput: " << 8.0 * len * sizeof(float) * num_servers * cnt * threshold / (end - start).count() << " Gbps"; cnt = 0;