Hi Patrik,

(1)
In the kernel code, I put some print in the sk_common_release function in the sock.c and some print in the udp_queue_rcv_skb in the udp.c. I did see the packet in the receive in udp_queue_rcv_skb before socket closed in sk_common_release. 
I suspect 2 reasons.  First, DHCP request packet was txed, then CPU switch to other process(some interrupts happened), and later called close socket. But the packet was received before close was called. 
Second(I believe this is more possible),  the sk_common_release function calls udp_destroy_sock first, then calls unhash function like below.
void sk_common_release(struct sock *sk)
{
if (sk->sk_prot->destroy)
sk->sk_prot->destroy(sk);  /* This is udp_destroy_sock */

/*
* Observation: when sock_common_release is called, processes have
* no access to socket. But net still has.
* Step one, detach it from networking:
*
* A. Remove from hash tables.
*/

sk->sk_prot->unhash(sk);

/*
* In this point socket cannot receive new packets, but it is possible
* that some packets are in flight because some CPU runs receiver and
* did hash table lookup before we unhashed socket. They will achieve
* receive queue and will be purged by socket destructor.
*
* Also we still have packets pending on receive queue and probably,
* our own packets waiting in device queues. sock_destroy will drain
* receive queue, but transmitted packets will delay socket destruction
* until the last reference will be released.
*/

sock_orphan(sk);

xfrm_sk_free_policy(sk);

sk_refcnt_debug_release(sk);
sock_put(sk);
}


udp_destroy_sock -> lock_sock_fast -> might_sleep which can cause reschedule.

(2)For the errno return value issue, you want me to change like below?

if (bind(fd, (struct sockaddr *) &client, sizeof(client)) < 0) {
                        ret = -errno;
close(fd);
return ret;
                }

(3)For the combine code, I can do it. 

If it is ok, I will send a new patch for review.

Thanks,

Feng


 
On Feb 12, 2016, at 12:39 AM, Patrik Flykt <Patrik.Flykt@linux.intel.com> wrote:


Hi,

On Fri, 2016-01-29 at 16:47 -0800, Feng Wang wrote:
It fix DHCP ACK lost issue when doing DHCP renewal.
When doing DHCP renew, 2 sockets are opened. One is for
listening DHCP ACK, the other is for transmitting DHCP request
which is closed immediately after transmitting is done. But in
some cases, the socket is closed after the DHCP ACK is received.

Are there any more details on the "some cases" ?

The kernel will route the packet to the transmitting socket
because it has a better match result(dst ip/port etc). And the
packet was dropped when the socket was closed.
---
gdhcp/client.c |  6 ++++--
gdhcp/common.c | 60 +++++++++++++++++++++++++++++++++++-----------------------
gdhcp/common.h |  2 +-
3 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/gdhcp/client.c b/gdhcp/client.c
index 3bf8cb2..ad587b1 100644
--- a/gdhcp/client.c
+++ b/gdhcp/client.c
@@ -502,7 +502,8 @@ static int send_request(GDHCPClient *dhcp_client)
if (dhcp_client->state == RENEWING)
return dhcp_send_kernel_packet(&packet,
dhcp_client->requested_ip, CLIENT_PORT,
- dhcp_client->server_ip, SERVER_PORT);
+ dhcp_client->server_ip, SERVER_PORT,
+ dhcp_client->listener_sockfd);

return dhcp_send_raw_packet(&packet, INADDR_ANY, CLIENT_PORT,
INADDR_BROADCAST, SERVER_PORT,
@@ -526,7 +527,8 @@ static int send_release(GDHCPClient *dhcp_client,
dhcp_add_option_uint32(&packet, DHCP_SERVER_ID, server);

return dhcp_send_kernel_packet(&packet, ciaddr, CLIENT_PORT,
- server, SERVER_PORT);
+ server, SERVER_PORT,
+ dhcp_client->listener_sockfd);
}

static gboolean ipv4ll_probe_timeout(gpointer dhcp_data);
diff --git a/gdhcp/common.c b/gdhcp/common.c
index f3d4677..f0a9aa6 100644
--- a/gdhcp/common.c
+++ b/gdhcp/common.c
@@ -626,44 +626,56 @@ int dhcp_send_raw_packet(struct dhcp_packet *dhcp_pkt,

int dhcp_send_kernel_packet(struct dhcp_packet *dhcp_pkt,
uint32_t source_ip, int source_port,
- uint32_t dest_ip, int dest_port)
+ uint32_t dest_ip, int dest_port, int fd)
{
struct sockaddr_in client;
- int fd, n, opt = 1;
+ int n, opt = 1;

enum {
DHCP_SIZE = sizeof(struct dhcp_packet) -
EXTEND_FOR_BUGGY_SERVERS,
};

- fd = socket(PF_INET, SOCK_DGRAM | SOCK_CLOEXEC, IPPROTO_UDP);
- if (fd < 0)
- return -errno;
+ if (fd < 0) {
+ /* no socket opened, open a new socket to tx the packet and close it */
+ fd = socket(PF_INET, SOCK_DGRAM | SOCK_CLOEXEC, IPPROTO_UDP);
+ if (fd < 0)
+ return -errno;
+
+ setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+ memset(&client, 0, sizeof(client));
+ client.sin_family = AF_INET;
+ client.sin_port = htons(source_port);
+ client.sin_addr.s_addr = htonl(source_ip);
+ if (bind(fd, (struct sockaddr *) &client, sizeof(client)) < 0) {
+ close(fd);
+ return -errno;

Here -errno is returned for the close() function, which succeeds. So the
returned error is zero. Please fix.

+ }

- setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+ memset(&client, 0, sizeof(client));
+ client.sin_family = AF_INET;
+ client.sin_port = htons(dest_port);
+ client.sin_addr.s_addr = htonl(dest_ip);
+ if (connect(fd, (struct sockaddr *) &client, sizeof(client)) < 0) {
+ close(fd);
+ return -errno;
+ }

This part is repeated below, please combine.

- memset(&client, 0, sizeof(client));
- client.sin_family = AF_INET;
- client.sin_port = htons(source_port);
- client.sin_addr.s_addr = htonl(source_ip);
- if (bind(fd, (struct sockaddr *) &client, sizeof(client)) < 0) {
- close(fd);
- return -errno;
- }
+ n = write(fd, dhcp_pkt, DHCP_SIZE);

- memset(&client, 0, sizeof(client));
- client.sin_family = AF_INET;
- client.sin_port = htons(dest_port);
- client.sin_addr.s_addr = htonl(dest_ip);
- if (connect(fd, (struct sockaddr *) &client, sizeof(client)) < 0) {
close(fd);

Can the socket really be closed here so that we don't repeat the
original problem?

- return -errno;
+ } else {
+ /* Using existed socket to transmit the packet */
+ memset(&client, 0, sizeof(client));
+ client.sin_family = AF_INET;
+ client.sin_port = htons(dest_port);
+ client.sin_addr.s_addr = htonl(dest_ip);
+
+ n = sendto(fd, dhcp_pkt, DHCP_SIZE, MSG_DONTWAIT,
+ (struct sockaddr *) &client, sizeof(client));
}

- n = write(fd, dhcp_pkt, DHCP_SIZE);
-
- close(fd);
-
if (n < 0)
return -errno;

diff --git a/gdhcp/common.h b/gdhcp/common.h
index 75abc18..b92d214 100644
--- a/gdhcp/common.h
+++ b/gdhcp/common.h
@@ -209,7 +209,7 @@ int dhcp_send_raw_packet(struct dhcp_packet *dhcp_pkt,
int dhcpv6_send_packet(int index, struct dhcpv6_packet *dhcp_pkt, int len);
int dhcp_send_kernel_packet(struct dhcp_packet *dhcp_pkt,
uint32_t source_ip, int source_port,
- uint32_t dest_ip, int dest_port);
+ uint32_t dest_ip, int dest_port, int fd);
int dhcp_l3_socket(int port, const char *interface, int family);
int dhcp_recv_l3_packet(struct dhcp_packet *packet, int fd);
int dhcpv6_recv_l3_packet(struct dhcpv6_packet **packet, unsigned char *buf,


Patrik