|
| 1 | +From a2e87cc6ae847ec3de21cae3dc4c3e24b306bd71 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Rainer Gerhards <rgerhards@adiscon.com> |
| 3 | +Date: Tue, 26 Sep 2023 14:38:44 +0200 |
| 4 | +Subject: [PATCH] fix startup issue on modern systemd systems |
| 5 | + |
| 6 | +close all unneeded file descriptors. Not doing this has some |
| 7 | +security implications. Traditionally, we do this by iterating |
| 8 | +over all possible file descriptor values. This is fairly compatible, |
| 9 | +because we need no OS-specific method. However, modern systemd configs |
| 10 | +tend to not limit the number of fds, so there are potentially 2^30(*) |
| 11 | +fds to close. While this is OKish, it takes some time and makes |
| 12 | +systemd think that rsyslog did not properly start up. |
| 13 | + |
| 14 | +We have now solved this by using the /proc filesystem to obtain our |
| 15 | +currently open fds. This works for Linux, as well as Cygwin, NetBSD, |
| 16 | +FreeBDS and MacOS. Where not available,and close_range() is available |
| 17 | +on the (build) platform, we try to use it. If that fails as well, we |
| 18 | +fall back to the traditional method. In our opionion, this fallback |
| 19 | +is unproblematic, as on these platforms there is no systemd and in |
| 20 | +almost all cases a decent number of fds to close. |
| 21 | + |
| 22 | +Very special thanks go out to Brennan Kinney, who clearly described |
| 23 | +the issue to us on github and also provided ample ways to solve it. |
| 24 | +What we did is just implement what we think is the best fit from |
| 25 | +rsyslog's PoV. |
| 26 | + |
| 27 | +(*) Some details below on the number of potentially to close fds. |
| 28 | + This is directly from a github posting from Brennan Kinney. |
| 29 | +Just to clarify, by default since systemd v240 (2018Q4), that |
| 30 | +should be `1024:524288` limit. As in the soft limit is the expected |
| 31 | +`1024`. |
| 32 | + |
| 33 | +The problem is other software shipping misconfiguration in systemd |
| 34 | +services that overrides this to something silly like |
| 35 | +`LimitNOFILE=infinity`. |
| 36 | +- Which will map to the sysctl `fs.nr_open` (_a value systemd |
| 37 | + v240 also raises from `2^20` to 2^30`, some distro like Debian are |
| 38 | + known to opt-out via patch for the `fs.nr_open` change_). |
| 39 | +- With the biggest issue there being that the soft limit was also |
| 40 | + set to `infinity` instead of their software requesting to raise |
| 41 | + the soft limit to a higher value that the hard limit permits. |
| 42 | + `infinity` isn't at all sane though. |
| 43 | +- The known source of this misconfiguration is container software such |
| 44 | + as Docker and `containerd` (_which would often sync with the |
| 45 | + systemd `.service` config from the Docker daemon `dockerd.service`_). |
| 46 | + |
| 47 | +closes https://github.com/rsyslog/rsyslog/issues/5158 |
| 48 | + |
| 49 | +--- |
| 50 | + configure.ac | 4 +-- |
| 51 | + tools/rsyslogd.c | 68 +++++++++++++++++++++++++++++++++++++++++++----- |
| 52 | + 2 files changed, 63 insertions(+), 9 deletions(-) |
| 53 | + |
| 54 | +diff --git a/configure.ac b/configure.ac |
| 55 | +index 247a155a1..95ce0287e 100644 |
| 56 | +--- a/configure.ac |
| 57 | ++++ b/configure.ac |
| 58 | +@@ -200,7 +200,7 @@ AC_CHECK_HEADERS([malloc.h],[],[],[ |
| 59 | + #endif |
| 60 | + ] |
| 61 | + ]) |
| 62 | +-AC_CHECK_HEADERS([fcntl.h locale.h netdb.h netinet/in.h paths.h stddef.h stdlib.h string.h sys/file.h sys/ioctl.h sys/param.h sys/socket.h sys/time.h sys/stat.h unistd.h utmp.h utmpx.h sys/epoll.h sys/prctl.h sys/select.h getopt.h]) |
| 63 | ++AC_CHECK_HEADERS([fcntl.h locale.h netdb.h netinet/in.h paths.h stddef.h stdlib.h string.h sys/file.h sys/ioctl.h sys/param.h sys/socket.h sys/time.h sys/stat.h unistd.h utmp.h utmpx.h sys/epoll.h sys/prctl.h sys/select.h getopt.h linux/close_range.h]) |
| 64 | + |
| 65 | + # Checks for typedefs, structures, and compiler characteristics. |
| 66 | + AC_C_CONST |
| 67 | +@@ -233,7 +233,7 @@ AC_TYPE_SIGNAL |
| 68 | + AC_FUNC_STAT |
| 69 | + AC_FUNC_STRERROR_R |
| 70 | + AC_FUNC_VPRINTF |
| 71 | +-AC_CHECK_FUNCS([flock recvmmsg basename alarm clock_gettime gethostbyname gethostname gettimeofday localtime_r memset mkdir regcomp select setsid socket strcasecmp strchr strdup strerror strndup strnlen strrchr strstr strtol strtoul uname ttyname_r getline malloc_trim prctl epoll_create epoll_create1 fdatasync syscall lseek64 asprintf]) |
| 72 | ++AC_CHECK_FUNCS([flock recvmmsg basename alarm clock_gettime gethostbyname gethostname gettimeofday localtime_r memset mkdir regcomp select setsid socket strcasecmp strchr strdup strerror strndup strnlen strrchr strstr strtol strtoul uname ttyname_r getline malloc_trim prctl epoll_create epoll_create1 fdatasync syscall lseek64 asprintf close_range]) |
| 73 | + AC_CHECK_FUNC([setns], [AC_DEFINE([HAVE_SETNS], [1], [Define if setns exists.])]) |
| 74 | + AC_CHECK_TYPES([off64_t]) |
| 75 | + |
| 76 | +diff --git a/tools/rsyslogd.c b/tools/rsyslogd.c |
| 77 | +index 6b8aa93a9..0c6e4cdec 100644 |
| 78 | +--- a/tools/rsyslogd.c |
| 79 | ++++ b/tools/rsyslogd.c |
| 80 | +@@ -3,7 +3,7 @@ |
| 81 | + * because it was either written from scratch by me (rgerhards) or |
| 82 | + * contributors who agreed to ASL 2.0. |
| 83 | + * |
| 84 | +- * Copyright 2004-2019 Rainer Gerhards and Adiscon |
| 85 | ++ * Copyright 2004-2023 Rainer Gerhards and Adiscon |
| 86 | + * |
| 87 | + * This file is part of rsyslog. |
| 88 | + * |
| 89 | +@@ -27,6 +27,7 @@ |
| 90 | + #include <stdlib.h> |
| 91 | + #include <sys/types.h> |
| 92 | + #include <sys/wait.h> |
| 93 | ++#include <dirent.h> |
| 94 | + #include <unistd.h> |
| 95 | + #include <errno.h> |
| 96 | + #ifdef ENABLE_LIBLOGGING_STDLOG |
| 97 | +@@ -37,6 +38,9 @@ |
| 98 | + #ifdef HAVE_LIBSYSTEMD |
| 99 | + # include <systemd/sd-daemon.h> |
| 100 | + #endif |
| 101 | ++#if defined(HAVE_LINUX_CLOSE_RANGE_H) |
| 102 | ++# include <linux/close_range.h> |
| 103 | ++#endif |
| 104 | + |
| 105 | + #include "rsyslog.h" |
| 106 | + #include "wti.h" |
| 107 | +@@ -339,6 +343,36 @@ finalize_it: |
| 108 | + RETiRet; |
| 109 | + } |
| 110 | + |
| 111 | ++ |
| 112 | ++ |
| 113 | ++/* note: this function is specific to OS'es which provide |
| 114 | ++ * the ability to read open file descriptors via /proc. |
| 115 | ++ * returns 0 - success, something else otherwise |
| 116 | ++ */ |
| 117 | ++static int |
| 118 | ++close_unneeded_open_files(const char *const procdir, |
| 119 | ++ const int beginClose, const int parentPipeFD) |
| 120 | ++{ |
| 121 | ++ DIR *dir; |
| 122 | ++ struct dirent *entry; |
| 123 | ++ |
| 124 | ++ dir = opendir(procdir); |
| 125 | ++ if (dir == NULL) { |
| 126 | ++ dbgprintf("closes unneeded files: opendir failed for %s\n", procdir); |
| 127 | ++ return 1; |
| 128 | ++ } |
| 129 | ++ |
| 130 | ++ while ((entry = readdir(dir)) != NULL) { |
| 131 | ++ const int fd = atoi(entry->d_name); |
| 132 | ++ if(fd >= beginClose && (((fd != dbgGetDbglogFd()) && (fd != parentPipeFD)))) { |
| 133 | ++ close(fd); |
| 134 | ++ } |
| 135 | ++ } |
| 136 | ++ |
| 137 | ++ closedir(dir); |
| 138 | ++ return 0; |
| 139 | ++} |
| 140 | ++ |
| 141 | + /* prepares the background processes (if auto-backbrounding) for |
| 142 | + * operation. |
| 143 | + */ |
| 144 | +@@ -384,12 +418,32 @@ prepareBackground(const int parentPipeFD) |
| 145 | + } |
| 146 | + #endif |
| 147 | + |
| 148 | +- /* close unnecessary open files */ |
| 149 | +- const int endClose = getdtablesize(); |
| 150 | +- close(0); |
| 151 | +- for(int i = beginClose ; i <= endClose ; ++i) { |
| 152 | +- if((i != dbgGetDbglogFd()) && (i != parentPipeFD)) { |
| 153 | +- aix_close_it(i); /* AIXPORT */ |
| 154 | ++ /* close unnecessary open files - first try to use /proc file system, |
| 155 | ++ * if that is not possible iterate through all potentially open file |
| 156 | ++ * descriptors. This can be lenghty, but in practice /proc should work |
| 157 | ++ * for almost all current systems, and the fallback is primarily for |
| 158 | ++ * Solaris and AIX, where we do expect a decent max numbers of fds. |
| 159 | ++ */ |
| 160 | ++ close(0); /* always close stdin, we do not need it */ |
| 161 | ++ |
| 162 | ++ /* try Linux, Cygwin, NetBSD */ |
| 163 | ++ if(close_unneeded_open_files("/proc/self/fd", beginClose, parentPipeFD) != 0) { |
| 164 | ++ /* try MacOS, FreeBSD */ |
| 165 | ++ if(close_unneeded_open_files("/proc/fd", beginClose, parentPipeFD) != 0) { |
| 166 | ++ /* did not work out, so let's close everything... */ |
| 167 | ++ const int endClose = getdtablesize(); |
| 168 | ++# if defined(HAVE_CLOSE_RANGE) |
| 169 | ++ if(close_range(beginClose, endClose, 0) != 0) { |
| 170 | ++ dbgprintf("errno %d after close_range(), fallback to loop\n", errno); |
| 171 | ++# endif |
| 172 | ++ for(int i = beginClose ; i <= endClose ; ++i) { |
| 173 | ++ if((i != dbgGetDbglogFd()) && (i != parentPipeFD)) { |
| 174 | ++ aix_close_it(i); /* AIXPORT */ |
| 175 | ++ } |
| 176 | ++ } |
| 177 | ++# if defined(HAVE_CLOSE_RANGE) |
| 178 | ++ } |
| 179 | ++# endif |
| 180 | + } |
| 181 | + } |
| 182 | + seedRandomNumberForChild(); |
| 183 | +-- |
| 184 | +2.33.8 |
| 185 | + |
0 commit comments