this weekend. It would be interesting to learn that someone else
is using Linkbench. I use it in addition to sysbench. After some effort tuning InnoDB and a few changes to the source, I was able to almost double the Linkbench QPS, but I really need MySQL 5.7 because the per-index latch for InnoDB indexes is the primary bottleneck.
In addition to networking at conferences, I recently spent a day looking at networking in MySQL 5.1 and 5.6. A good overview is the output from strace -c -p $PID where $PID is a thread busy with sysbench read-only queries for a cached database. Below I describe the results from MySQL 5.1.63 and 5.6.12 using official MySQL and the Facebook patch. Each result is from a sample of about 10 seconds (give or take a few seconds).
This strace output is from official MySQL 5.1.63. There are two interesting things in these results. The first is the frequent calls to sched_setparam, all of which return an error. That is a bug
which was fixed in MySQL 5.6. Removing the calls in 5.1 improved performance by about 0.3% on my test server. That isn't a big deal but I am happy the code is gone. The second interesting result is the high number of calls to fcntl. I filed a bug report
asking for them to be removed. They were a big problem for performance on older Linux kernels that used a big kernel mutex for some of the fcntl processing. See the linked report
for details on the impact. This is not a performance problem on the kernels I have been using recently.
This strace output is from the Facebook patch for MySQL 5.1.63. It still has the frequent errors from calls to sched_setparam. But instead of too many calls to fcntl it has too many calls to setsockopt. That was a good tradeoff on some Linux kernels, as described above.
This result is from MySQL 5.6.12 and is the same for both official MySQL and the Facebook patch. Hooray, the calls to sched_setparam are gone! There are many calls to recvfrom that return errors. I assume these are the non-blocking calls that find no data to read. There are also many calls to poll. I prefer to see fewer calls to poll, so I hacked on MySQL to do blocking calls to recv. That made the poll calls go away but didn't have a significant impact on performance. Perhaps it will help in the future when other bottlenecks are removed.
The pattern for server-side network reads is to first do a non-blocking read and if that doesn't return the expected amount of data then a read with timeout is done. I don't have all of the history behind the design decision but my guess is that there had to be support for interrupting reads on shutdown. The implementation of read with timeout has changed over time and it can be hard to figure out some of the code unless you look at the preprocessor output.
/*
  Keep calling vio_read() until `count` bytes have been consumed, advancing
  `buf` past whatever each call delivered. The loop stops early when an error
  is not retried or when a read reports end of file.
*/
while (count)
{
  size_t nread= vio_read(net->vio, buf, count);

  if (nread == VIO_SOCKET_ERROR)
  {
    /* Give up unless net_should_retry() says this error may be retried. */
    if (!net_should_retry(net, &retry_count))
      break;
    continue;
  }

  if (nread == 0)
  {
    /* A zero-length read indicates end of file. */
    eof= true;
    break;
  }

  /* Partial progress: step past the bytes that were just received. */
  buf+= nread;
  count-= nread;
}
vio_read is:
/**
  Read up to `size` bytes from the socket behind `vio` into `buf`.

  The receive is attempted first; only when it fails with
  SOCKET_EAGAIN / SOCKET_EWOULDBLOCK does the function wait for the socket
  to become readable (vio_socket_io_wait) and then try again.

  @param vio   connection to read from
  @param buf   destination buffer
  @param size  maximum number of bytes to receive

  @return the result of the last mysql_socket_recv() call, or the non-zero
          result of vio_socket_io_wait() when the wait did not succeed.
          The value is held in an ssize_t and converted to size_t on return,
          so a failure of -1 reaches the caller as (size_t) -1.
*/
size_t vio_read(Vio *vio, uchar *buf, size_t size)
{
  ssize_t ret;
  int flags= 0;

  /* If timeout is enabled, do not block if data is unavailable. */
  if (vio->read_timeout >= 0)
    flags= VIO_DONTWAIT;

  /*
    NOTE: VIO_DONTWAIT is MSG_DONTWAIT. When this path was traced, every
    call to mysql_socket_recv had read_timeout > 0 and therefore used
    MSG_DONTWAIT.
  */
  while ((ret= mysql_socket_recv(vio->mysql_socket, (SOCKBUF_T *)buf, size, flags)) == -1)
  {
    int error= socket_errno;

    /* Anything other than "the operation would block" is a real failure. */
    if (error != SOCKET_EAGAIN && error != SOCKET_EWOULDBLOCK)
      break;

    /* Wait for input data to become available; a non-zero result ends the read. */
    if ((ret= vio_socket_io_wait(vio, VIO_IO_EVENT_READ)))
      break;
  }

  /*
    Plain return: this function has no DBUG_ENTER, so there is no debug
    stack frame for DBUG_RETURN to pair with.
  */
  return ret;
}
/**
  Wait until the socket behind `vio` is ready for the requested I/O event,
  using the read or write timeout stored in `vio`.

  This is the single definition of the function; it must not be repeated
  in the same translation unit.

  @param vio    connection to wait on
  @param event  VIO_IO_EVENT_READ or VIO_IO_EVENT_WRITE (asserted)

  @retval 0   vio_io_wait() returned a value other than -1 or 0
              (an I/O event was reported)
  @retval -1  vio_io_wait() failed (-1) or timed out (0)
*/
int vio_socket_io_wait(Vio *vio, enum enum_vio_io_event event)
{
  int timeout, ret;

  DBUG_ASSERT(event == VIO_IO_EVENT_READ || event == VIO_IO_EVENT_WRITE);

  /* Choose the timeout that matches the direction of the I/O. */
  timeout= (event == VIO_IO_EVENT_READ) ? vio->read_timeout
                                        : vio->write_timeout;

  /* Wait for the socket to become ready. */
  switch (vio_io_wait(vio, event, timeout))
  {
  case -1:  /* Upon failure, vio_read/write() shall return -1. */
  case 0:   /* The wait timed out. */
    ret= -1;
    break;
  default:  /* A positive value indicates an I/O event. */
    ret= 0;
    break;
  }

  return ret;
}
/**
  Wait, via poll(), for an I/O event on the socket behind `vio`.

  @param vio      connection whose socket descriptor is polled
  @param event    which kind of readiness to wait for
  @param timeout  passed unchanged to poll() as its timeout argument

  @return the value returned by poll():
          -1 on error, 0 on timeout (errno is then set to SOCKET_ETIMEDOUT),
          otherwise a positive value.
*/
int vio_io_wait(Vio *vio, enum enum_vio_io_event event, int timeout)
{
  int ret;
  /* Only consulted by the DBUG_ASSERT below. */
  short DBUG_ONLY revents= 0;
  struct pollfd pfd;
  my_socket sd= mysql_socket_getfd(vio->mysql_socket);

  memset(&pfd, 0, sizeof(pfd));
  pfd.fd= sd;

  /*
    Set the poll bitmask describing the type of events.
    The error flags are only valid in the revents bitmask.
  */
  switch (event)
  {
  case VIO_IO_EVENT_READ:
    pfd.events= MY_POLL_SET_IN;
    revents= MY_POLL_SET_IN | MY_POLL_SET_ERR | POLLRDHUP;
    break;
  case VIO_IO_EVENT_WRITE:
  case VIO_IO_EVENT_CONNECT:
    pfd.events= MY_POLL_SET_OUT;
    revents= MY_POLL_SET_OUT | MY_POLL_SET_ERR;
    break;
  default:
    /* Any other value leaves pfd.events and revents at zero, as before. */
    break;
  }

  /* Wait for the I/O event and return early in case of error or timeout. */
  switch ((ret= poll(&pfd, 1, timeout)))
  {
  case -1:
    /* Error: hand poll()'s -1 back to the caller. */
    break;
  case 0:
    /* Set errno to indicate a timeout error. */
    errno= SOCKET_ETIMEDOUT;
    break;
  default:
    /* Ensure that the requested I/O event has completed. */
    DBUG_ASSERT(pfd.revents & revents);
    break;
  }

  /*
    Plain return: this function has no DBUG_ENTER, so there is no debug
    stack frame for DBUG_RETURN to pair with.
  */
  return ret;
}
/**
  Non-instrumented receive: forwards directly to recv() on the descriptor
  stored in `mysql_socket`.

  @param mysql_socket  socket wrapper; only its `fd` member is used here
  @param buf           destination buffer
  @param n             maximum number of bytes to receive
  @param flags         flags passed through to recv()

  @return whatever recv() returns
*/
static inline ssize_t
inline_mysql_socket_recv(MYSQL_SOCKET mysql_socket, SOCKBUF_T *buf, size_t n, int flags)
{
  /* Non instrumented code */
  return recv(mysql_socket.fd, buf, IF_WIN((int),) n, flags);
}
View comments