Skip to Content.
Sympa Menu

ndt-dev - [ndt] r348 committed - Improve error handling when communicating with child. Use select() wi...

Subject: NDT-DEV email list created

List archive

[ndt] r348 committed - Improve error handling when communicating with child. Use select() wi...


Chronological Thread 
  • From:
  • To:
  • Subject: [ndt] r348 committed - Improve error handling when communicating with child. Use select() wi...
  • Date: Thu, 22 Apr 2010 00:45:54 +0000

Revision: 348
Author: rcarlson501
Date: Wed Apr 21 17:45:18 2010
Log: Improve error handling when communicating with child. Use select() with
a timer to prevent child processes from hanging in an accept() state.
The timer will now expire and the child will return an error flag back to the parent
process.

Also handle write errors and terminate if a non-EINTR error is encountered
while processing a write() call.

Bumped version number to 3.6.3.

RAC 4/21/10

http://code.google.com/p/ndt/source/detail?r=348

Modified:
/trunk/Applet/Tcpbw100.java
/trunk/configure
/trunk/configure.ac
/trunk/src/testoptions.c
/trunk/src/web100srv.c

=======================================
--- /trunk/Applet/Tcpbw100.java Fri Apr 9 09:49:47 2010
+++ /trunk/Applet/Tcpbw100.java Wed Apr 21 17:45:18 2010
@@ -98,7 +98,7 @@

public class Tcpbw100 extends JApplet implements ActionListener
{
- private static final String VERSION = "3.6.2b";
+ private static final String VERSION = "3.6.3";
private static final byte TEST_MID = (1 << 0);
private static final byte TEST_C2S = (1 << 1);
private static final byte TEST_S2C = (1 << 2);
=======================================
--- /trunk/configure Fri Apr 9 10:34:54 2010
+++ /trunk/configure Wed Apr 21 17:45:18 2010
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.63 for NDT 3.6.2b.
+# Generated by GNU Autoconf 2.63 for NDT 3.6.3.
#
# Report bugs to
<>.
#
@@ -596,8 +596,8 @@
# Identity of this package.
PACKAGE_NAME='NDT'
PACKAGE_TARNAME='ndt'
-PACKAGE_VERSION='3.6.2b'
-PACKAGE_STRING='NDT 3.6.2b'
+PACKAGE_VERSION='3.6.3'
+PACKAGE_STRING='NDT 3.6.3'

'

ac_unique_file="src/analyze.c"
@@ -1331,7 +1331,7 @@
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures NDT 3.6.2b to adapt to many kinds of systems.
+\`configure' configures NDT 3.6.3 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1397,7 +1397,7 @@

if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of NDT 3.6.2b:";;
+ short | recursive ) echo "Configuration of NDT 3.6.3:";;
esac
cat <<\_ACEOF

@@ -1495,7 +1495,7 @@
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-NDT configure 3.6.2b
+NDT configure 3.6.3
generated by GNU Autoconf 2.63

Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1509,7 +1509,7 @@
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

-It was created by NDT $as_me 3.6.2b, which was
+It was created by NDT $as_me 3.6.3, which was
generated by GNU Autoconf 2.63. Invocation command line was

$ $0 $@
@@ -2359,7 +2359,7 @@

# Define the identity of the package.
PACKAGE='ndt'
- VERSION='3.6.2b'
+ VERSION='3.6.3'


cat >>confdefs.h <<_ACEOF
@@ -8645,7 +8645,7 @@
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by NDT $as_me 3.6.2b, which was
+This file was extended by NDT $as_me 3.6.3, which was
generated by GNU Autoconf 2.63. Invocation command line was

CONFIG_FILES = $CONFIG_FILES
@@ -8708,7 +8708,7 @@
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_version="\\
-NDT config.status 3.6.2b
+NDT config.status 3.6.3
configured by $0, generated by GNU Autoconf 2.63,
with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"

=======================================
--- /trunk/configure.ac Fri Apr 9 10:34:54 2010
+++ /trunk/configure.ac Wed Apr 21 17:45:18 2010
@@ -29,7 +29,7 @@
# Process this file with autoconf to produce a configure script.
#
# AC_PREREQ(2.57)
-AC_INIT([NDT],[3.6.2b],[])
+AC_INIT([NDT],[3.6.3],[])
AC_CONFIG_AUX_DIR(config)
# AM_INIT_AUTOMAKE(NDT, v1.0, [no-define])
AM_INIT_AUTOMAKE
=======================================
--- /trunk/src/testoptions.c Fri Apr 9 09:23:03 2010
+++ /trunk/src/testoptions.c Wed Apr 21 17:45:18 2010
@@ -263,7 +263,7 @@
int maxseg=1456;
/* int maxseg=1456, largewin=16*1024*1024; */
/* int seg_size, win_size; */
- int midfd, j, ret;
+ int midsfd, j, ret;
struct sockaddr_storage cli_addr;
/* socklen_t optlen, clilen; */
socklen_t clilen;
@@ -274,6 +274,8 @@
int msgLen;
web100_connection* conn;
char tmpstr[256];
+ struct timeval sel_tv;
+ fd_set rfd;

assert(ctlsockfd != -1);
assert(agent);
@@ -359,33 +361,47 @@
* and does NAT detection. More analysis functions (window scale)
* will be done in the future.
*/
- j = 0;
clilen = sizeof(cli_addr);
- for (;;) {
- if ((midfd = accept(options->midsockfd, (struct sockaddr *) &cli_addr, &clilen)) > 0)
+ FD_ZERO(&rfd);
+ FD_SET(options->midsockfd, &rfd);
+ sel_tv.tv_sec = 5;
+ sel_tv.tv_usec = 0;
+ for (j=0; j<5; j++) {
+ ret = select((options->midsockfd)+1, &rfd, NULL, NULL, &sel_tv);
+ if ((ret == -1) && (errno == EINTR))
+ continue;
+ if (ret == 0)
+ return -100; /* timeout */
+ if (ret < 0)
+ return -errno;
+ if (j == 4)
+ return -101;
+midfd:
+ if ((midsfd = accept(options->midsockfd, (struct sockaddr *) &cli_addr, &clilen)) > 0)
break;

- if ((midfd == -1) && (errno == EINTR))
- continue;
+ if ((midsfd == -1) && (errno == EINTR))
+ goto midfd;

sprintf(tmpstr, "------- middlebox connection setup returned because (%d)", errno);
if (get_debuglvl() > 1)
perror(tmpstr);
- if (++j == 4)
- /* break; */
- return -2;
+ if (midsfd < 0)
+ return -errno;
+ if (j == 4)
+ return -102;
}
memcpy(&meta.c_addr, &cli_addr, clilen);
/* meta.c_addr = cli_addr; */
meta.family = ((struct sockaddr *) &cli_addr)->sa_family;

buff[0] = '\0';
- if ((conn = web100_connection_from_socket(agent, midfd)) == NULL) {
+ if ((conn = web100_connection_from_socket(agent, midsfd)) == NULL) {
log_println(0, "!!!!!!!!!!! test_mid() failed to get web100 connection data, rc=%d", errno);
/* exit(-1); */
return -3;
}
- web100_middlebox(midfd, agent, conn, buff);
+ web100_middlebox(midsfd, agent, conn, buff);
send_msg(ctlsockfd, TEST_MSG, buff, strlen(buff));
msgLen = sizeof(buff);
if (recv_msg(ctlsockfd, &msgType, buff, &msgLen)) {
@@ -409,8 +425,8 @@
*s2c2spd = atof(buff);
log_println(4, "CWND limited throughput = %0.0f kbps (%s)", *s2c2spd, buff);

- shutdown(midfd, SHUT_WR);
- close(midfd);
+ shutdown(midsfd, SHUT_WR);
+ close(midsfd);
close(options->midsockfd);
send_msg(ctlsockfd, TEST_FINALIZE, "", 0);
log_println(1, " <--------- %d ----------->", options->child0);
@@ -535,9 +551,22 @@
return ret;

clilen = sizeof(cli_addr);
- /* j = 0; */
log_println(6, "child %d - sent c2s prepare to client", testOptions->child0);
+ FD_ZERO(&rfd);
+ FD_SET(testOptions->c2ssockfd, &rfd);
+ sel_tv.tv_sec = 5;
+ sel_tv.tv_usec = 0;
for (j=0; j<5; j++) {
+ ret = select((testOptions->c2ssockfd)+1, &rfd, NULL, NULL, &sel_tv);
+ if ((ret == -1) && (errno == EINTR))
+ continue;
+ if (ret == 0)
+ return -100; /* timeout */
+ if (ret < 0)
+ return -errno;
+ if (j == 4)
+ return -101;
+recfd:
recvsfd = accept(testOptions->c2ssockfd, (struct sockaddr *) &cli_addr, &clilen);
if (recvsfd > 0) {
log_println(6, "accept() for %d completed", testOptions->child0);
@@ -546,13 +575,15 @@
if ((recvsfd == -1) && (errno == EINTR)) {
log_println(6, "Child %d interrupted while waiting for accept() to complete",
testOptions->child0);
- continue;
+ goto recfd;
}
log_println(6, "------- C2S connection setup for %d returned because (%d)",
testOptions->child0, errno);
- if (++j == 4) {
+ if (recvsfd < 0)
+ return -errno;
+ if (j == 4) {
log_println(6, "c2s child %d, uable to open connection, return from test", testOptions->child0);
- return -2;
+ return -102;
}
}
log_println(6, "child %d - c2s ready for test with fd=%d", testOptions->child0, recvsfd);
@@ -967,24 +998,42 @@
* This is the second throughput test, with data streaming from
* the server back to the client. Again stream data for 10 seconds.
*/
- log_println(1, "waiting for data on testOptions->s2csockfd");
-
- j = 0;
+ log_println(1, "%d waiting for data on testOptions->s2csockfd", testOptions->child0);
+
clilen = sizeof(cli_addr);
- for (;;) {
+ FD_ZERO(&rfd);
+ FD_SET(testOptions->c2ssockfd, &rfd);
+ sel_tv.tv_sec = 5;
+ sel_tv.tv_usec = 0;
+ for (j=0; j<5; j++) {
+ ret = select((testOptions->s2csockfd)+1, &rfd, NULL, NULL, &sel_tv);
+ if ((ret == -1) && (errno == EINTR))
+ continue;
+ if (ret == 0)
+ return -100; /* timeout */
+ if (ret < 0)
+ return -errno;
+ if (j == 4)
+ return -101;
+ximfd:
xmitsfd = accept(testOptions->s2csockfd, (struct sockaddr *) &cli_addr, &clilen);
if (xmitsfd > 0) {
- log_println(6, "S2C %d, has sfd=%d, read to stream data", testOptions->child0, xmitsfd);
- break;
- }
- if ((xmitsfd == -1) && (errno == EINTR))
- continue;
-
- sprintf(tmpstr, "------- S2C connection setup returned because (%d)", errno);
- if (get_debuglvl() > 1)
- perror(tmpstr);
- if (++j == 4)
- return -2;
+ log_println(6, "accept() for %d completed", testOptions->child0);
+ break;
+ }
+ if ((xmitsfd == -1) && (errno == EINTR)) {
+ log_println(6, "Child %d interrupted while waiting for accept() to complete",
+ testOptions->child0);
+ goto ximfd;
+ }
+ log_println(6, "------- S2C connection setup for %d returned because (%d)",
+ testOptions->child0, errno);
+ if (xmitsfd < 0)
+ return -errno;
+ if (++j == 4) {
+ log_println(6, "s2c child %d, uable to open connection, return from test", testOptions->child0);
+ return -102;
+ }
}
src_addr = I2AddrByLocalSockFD(get_errhandle(), xmitsfd, 0);
conn = web100_connection_from_socket(agent, xmitsfd);
=======================================
--- /trunk/src/web100srv.c Fri Apr 9 09:49:47 2010
+++ /trunk/src/web100srv.c Wed Apr 21 17:45:18 2010
@@ -842,7 +842,7 @@
return NULL;
}

-void
+int
run_test(web100_agent* agent, int ctlsockfd, TestOptions* testopt, char *test_suite)
{

@@ -931,6 +931,7 @@
log_println(6, "Middlebox test failed with rc=%d", ret);
log_println(0, "Middlebox test FAILED!, rc=%d", ret);
testopt->midopt = TOPT_DISABLED;
+ return ret;
}

/* alarm(20); */
@@ -938,8 +939,6 @@
if ((ret = test_sfw_srv(ctlsockfd, agent, &*testopt, conn_options)) != 0) {
if (ret < 0)
log_println(6, "SFW test failed with rc=%d", ret);
- log_println(0, "Simple firewall test FAILED!, rc=%d", ret);
- testopt->sfwopt = TOPT_DISABLED;
}

/* alarm(25); */
@@ -950,6 +949,7 @@
log_println(6, "C2S test failed with rc=%d", ret);
log_println(0, "C2S throughput test FAILED!, rc=%d", ret);
testopt->c2sopt = TOPT_DISABLED;
+ return ret;
}

/* alarm(25); */
@@ -960,6 +960,7 @@
log_println(6, "S2C test failed with rc=%d", ret);
log_println(0, "S2C throughput test FAILED!, rc=%d", ret);
testopt->s2copt = TOPT_DISABLED;
+ return ret;
}

log_println(4, "Finished testing C2S = %0.2f Mbps, S2C = %0.2f Mbps", c2sspd/1000, s2cspd/1000);
@@ -1280,6 +1281,7 @@
}
shutdown(ctlsockfd, SHUT_WR);
/* shutdown(ctlsockfd, SHUT_RDWR); */
+ return (0);
}

int
@@ -1695,6 +1697,7 @@

for(;;){

+mainloop:
if (head_ptr == NULL)
log_println(3, "nothing in queue");
else
@@ -1856,52 +1859,6 @@
goto sel_11;
tt = time(0);

-/*
- if (head_ptr != NULL) {
- log_println(3, "now = %ld Process started at %ld, run time = %ld",
- tt, head_ptr->stime, (tt - head_ptr->stime));
- if ((tt - head_ptr->stime) > 60) {
- /-* process is stuck at the front of the queue. *-/
- fp = fopen(get_logfile(),"a");
- if (fp != NULL) {
- fprintf(fp, "%d children waiting in queue: Killing off stuck process %d at %15.15s\n",
- waiting, head_ptr->pid, ctime(&tt)+4);
- fclose(fp);
- }
- log_println(6, "%d children waiting in queue: Killing off stuck process %d at %15.15s\n",
- waiting, head_ptr->pid, ctime(&tt)+4);
- -* kill(tmp_ptr->pid, SIGTERM); *-
- -* kill(head_ptr->pid, SIGCHLD); *-
- -* clean up more and inform the client that the test is ending
- * rac 2/27/10
- *-
- log_println(6, "pid=%d, client='%s', stime=%ld, qtime=%ld now=%ld", head_ptr->pid, head_ptr->addr,
- head_ptr->stime, head_ptr->qtime, time(0));
- log_println(6, "pipe-fd=%d, running=%d, ctlsockfd=%d, client-type=%d, tests='%s'",
- head_ptr->pipe, head_ptr->running,
head_ptr->ctlsockfd,
- head_ptr->oldclient, head_ptr->tests);
- send_msg(head_ptr->ctlsockfd, SRV_QUEUE, "9555", 4);
- shutdown(head_ptr->ctlsockfd, SHUT_WR);
- close(head_ptr->ctlsockfd);
- tpid = head_ptr->pid;
- child_sig(-1);
- kill(tpid, SIGTERM);
- child_sig(tpid);
-
- if (((multiple == 0) && (waiting == 1)) ||
- ((multiple == 1) && (mclients == 0)))
- testing = 0;
- -* should not decrement waiting here, it was decrementd in the child_sig() routine
- * RAC 2/27/09
- *-
- -* if (waiting > 0)
- * waiting--;
- *-
- if (waiting == 0)
- mclients = 0;
- }
- }
- */
}
else {
/* Nothing is in the queue, so wait forever until a new connection request arrives */
@@ -1976,7 +1933,26 @@
continue;
if (rc == 13)
break;
+ if (rc == -1) {
+ log_println(1, "Initial contact with client failed errno=%d",
errno);
+ close(chld_pipe[0]);
+ close(chld_pipe[1]);
+ shutdown(ctlsockfd, SHUT_WR);
+ close(ctlsockfd);
+ goto mainloop;
+ }
+
+ log_println(6, "xxx, calling initialize_tests()");
+ t_opts = initialize_tests(ctlsockfd, &testopt, test_suite);
+ if (t_opts < 1) {
+ log_println(3, "Invalid test suite string '%s' received, terminate child", test_suite);
+ close(chld_pipe[0]);
+ close(chld_pipe[1]);
+ shutdown(ctlsockfd, SHUT_WR);
+ close(ctlsockfd);
+
/* todo: handle other error contitions */
+ }
}
new_child = (struct ndtchild *) malloc(sizeof(struct ndtchild));
memset(new_child, 0, sizeof(struct ndtchild));
@@ -2196,7 +2172,7 @@
}
}

-multi_client:
+/* multi_client: */
if ((multiple == 1) && (mclients < max_clients)) {
if (mwaiting == 0)
continue;
@@ -2244,6 +2220,13 @@
log_println(6, "Failed to write 'GO' message to client %d, reason=%d, errno=%d",
mchild->pid, rc, errno);
/* TODO: handle other error conditions */
+ if (rc == -1) {
+ log_println(1, "Dispatch multi-client failed because '%s'", strerror(errno));
+ shutdown(mchild->ctlsockfd, SHUT_WR);
+ close(mchild->ctlsockfd);
+ kill(chld_pid, SIGTERM);
+ goto mainloop;
+ }
}
close(mchild->pipe);
close(mchild->ctlsockfd);
@@ -2261,6 +2244,13 @@
if (rc == strlen(tmpstr))
break;
/* TODO: handle other error conditions */
+ if (rc == -1) {
+ log_println(1, "Dispatch multi-client failed because '%s'", strerror(errno));
+ shutdown(head_ptr->ctlsockfd, SHUT_WR);
+ close(head_ptr->ctlsockfd);
+ kill(chld_pid, SIGTERM);
+ goto mainloop;
+ }
}
close(head_ptr->pipe);
close(head_ptr->ctlsockfd);
@@ -2384,10 +2374,15 @@

if (strncmp(test_suite, "Invalid", 7) != 0) {
log_println(3, "Valid test sequence requested, run test for client=%d", getpid());
- run_test(agent, ctlsockfd, &testopt, test_suite);
+ rc = run_test(agent, ctlsockfd, &testopt, test_suite);
}

- log_println(3, "Successfully returned from run_test() routine");
+ if (rc == 0)
+ log_println(3, "Successfully returned from run_test() routine");
+ else {
+ log_println(3, "Child %d returned non-zero (%d) from run_test() results some test failed!", getpid(), rc);
+ child_sig(0);
+ }
close(ctlsockfd);
web100_detach(agent);
log_free();


  • [ndt] r348 committed - Improve error handling when communicating with child. Use select() wi..., ndt, 04/21/2010

Archive powered by MHonArc 2.6.16.

Top of Page