[svn-inject] Installing original source of unscd
[unscd.git] / nscd-0.36.c
1 /* This file is part of unscd, a complete nscd replacement.
2  * Copyright (C) 2007 Denys Vlasenko. Licensed under the GPL version 2. */
3
4 /* unscd is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; version 2 of the License.
7  *
8  * unscd is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You can download the GNU General Public License from the GNU website
14  * at http://www.gnu.org/ or write to the Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
16
17 /*
18 Build instructions:
19
20 gcc -Os -o nscd nscd.c
21
22 gcc -fomit-frame-pointer -Wl,--sort-section -Wl,alignment -Wl,--sort-common
23       -Os -o nscd nscd.c
24
25 Description:
26
27 nscd problems are not exactly unheard of. Over the years, there were
28 quite a few bugs in it. This leads people to invent babysitters
29 which restart crashed/hung nscd. This is ugly.
30
31 After looking at nscd source in glibc I arrived at the conclusion
32 that its design is contributing to this significantly. Even if nscd's
33 code is 100.00% perfect and bug-free, it can still suffer from bugs
34 in libraries it calls.
35
36 As designed, it's a multithreaded program which calls NSS libraries.
37 These libraries are not part of libc, they may be provided
38 by third-party projects (samba, ldap, you name it).
39
40 Thus nscd cannot be sure that libraries it calls do not have memory
41 or file descriptor leaks and other bugs.
42
43 Since nscd is multithreaded program with single shared cache,
44 any resource leak in any NSS library has cumulative effect.
45 Even if an NSS library leaks a file descriptor 0.01% of the time,
46 this will make nscd crash or hang after some time.
47
48 Of course bugs in NSS .so modules should be fixed, but meanwhile
49 I do want nscd which does not crash or lock up.
50
51 So I went ahead and wrote a replacement.
52
53 It is a single-threaded server process which offloads all NSS
54 lookups to worker children (not threads, but fully independent
55 processes). Cache hits are handled by parent. Only cache misses
56 start worker children. This design is immune against
57 resource leaks and hangs in NSS libraries.
58
59 It is also many times smaller.
60
61 Currently (v0.36) it emulates glibc nscd pretty closely
62 (handles same command line flags and config file), and is moderately tested.
63
64 Please note that as of 2008-08 it is not in wide use (yet?).
65 If you have trouble compiling it, see an incompatibility with
66 "standard" one or experience hangs/crashes, please report it to
67 vda.linux@googlemail.com
68
69 ***********************************************************************/
70
71 /* Make struct ucred appear in sys/socket.h */
72 #define _GNU_SOURCE 1
73 /* For all good things */
74 #include <stdio.h>
75 #include <stddef.h>
76 #include <stdlib.h>
77 #include <stdarg.h>
78 #include <unistd.h>
79 #include <string.h>
80 #include <ctype.h>
81 #include <errno.h>
82 #include <fcntl.h>
83 #include <signal.h>
84 #include <time.h>
85 #include <netdb.h>
86 #include <pwd.h>
87 #include <grp.h>
88 #include <getopt.h>
89 #include <syscall.h>
90 #include <sys/socket.h>
91 #include <sys/time.h>
92 #include <sys/types.h>
93 #include <sys/stat.h>
94 #include <sys/poll.h>
95 #include <sys/un.h>
96 /* For INT_MAX */
97 #include <limits.h>
98 /* For inet_ntoa (for debug build only) */
99 #include <arpa/inet.h>
100
101 /*
102  * 0.21 add SEGV reporting to worker
103  * 0.22 don't do freeaddrinfo() in GETAI worker, it's crashy
104  * 0.23 add parameter parsing
105  * 0.24 add conf file parsing, not using results yet
106  * 0.25 used some of conf file settings (not tested)
107  * 0.26 almost all conf file settings are wired up
108  * 0.27 a bit more of almost all conf file settings are wired up
109  * 0.28 optimized cache aging
110  * 0.29 implemented invalidate and shutdown options
111  * 0.30 fixed buglet (sizeof(ptr) != sizeof(array))
112  * 0.31 reduced client_info by one member
113  * 0.32 fix nttl/size defaults; simpler check for worker child in main()
114  * 0.33 tweak includes so that it builds on my new machine (64-bit userspace);
115  *      do not die on unknown service name, just warn
116  *      ("services" is a new service we don't support)
117  * 0.34 create /var/run/nscd/nscd.pid pidfile like glibc nscd 2.8 does;
118  *      delay setuid'ing itself to server-user after log and pidfile are open
119  * 0.35 readlink /proc/self/exe and use result if execing /proc/self/exe fails
120  * 0.36 exercise extreme paranoia handling server-user option;
121  *      a little bit more verbose logging:
122  *      L_DEBUG2 log level added, use debug-level 7 to get it
123  */
124 #define PROGRAM_VERSION "0.36"
125
126 #define DEBUG_BUILD 1
127
128
129 /*
130 ** Generic helpers
131 */
132
/* Marks a function as never returning to its caller (GCC attribute). */
#define NORETURN __attribute__ ((__noreturn__))


/* Small integer type used for flags and tiny tables.
 * Define MY_CPU_HATES_CHARS at build time to use a full int instead. */
#ifdef MY_CPU_HATES_CHARS
typedef int smallint;
#else
typedef signed char smallint;
#endif


/* Bit flags stored in the global "debug" variable.
 * L_* bits are log masks tested by nscd_log(); the debug-only ones are
 * multiplied by DEBUG_BUILD so that they become 0 when DEBUG_BUILD is 0.
 * D_* bits are mode flags (D_STAMP makes verror() prefix a timestamp;
 * D_DAEMON is presumably "run as a daemon" - its use is outside this part
 * of the file). */
enum {
        L_INFO   = (1 << 0),
        L_DEBUG  = ((1 << 1) * DEBUG_BUILD),
        L_DEBUG2 = ((1 << 2) * DEBUG_BUILD),
        L_DUMP   = ((1 << 3) * DEBUG_BUILD),
        L_ALL    = 0xf,
        D_DAEMON = (1 << 6),
        D_STAMP  = (1 << 5),
};

/* Current log mask and mode flags; initially only D_DAEMON is set. */
static smallint debug = D_DAEMON;
154
155 static void verror(const char *s, va_list p, const char *strerr)
156 {
157         char msgbuf[1024];
158         int sz, rem, strerr_len;
159         struct timeval tv;
160
161         sz = 0;
162         if (debug & D_STAMP) {
163                 gettimeofday(&tv, NULL);
164                 sz = sprintf(msgbuf, "%02u:%02u:%02u.%05u ",
165                         (unsigned)((tv.tv_sec / (60*60)) % 24),
166                         (unsigned)((tv.tv_sec / 60) % 60),
167                         (unsigned)(tv.tv_sec % 60),
168                         (unsigned)(tv.tv_usec / 10));
169         }
170         rem = sizeof(msgbuf) - sz;
171         sz += vsnprintf(msgbuf + sz, rem, s, p);
172         rem = sizeof(msgbuf) - sz; /* can be negative after this! */
173
174         if (strerr) {
175                 strerr_len = strlen(strerr);
176                 if (rem >= strerr_len + 4) { /* ": STRERR\n\0" */
177                         msgbuf[sz++] = ':';
178                         msgbuf[sz++] = ' ';
179                         strcpy(msgbuf + sz, strerr);
180                         sz += strerr_len;
181                 }
182         }
183         if (rem >= 2) {
184                 msgbuf[sz++] = '\n';
185                 msgbuf[sz] = '\0';
186         }
187         fflush(NULL);
188         fputs(msgbuf, stderr);
189 }
190
191 static void error(const char *msg, ...)
192 {
193         va_list p;
194         va_start(p, msg);
195         verror(msg, p, NULL);
196         va_end(p);
197 }
198
199 static void error_and_die(const char *msg, ...) NORETURN;
200 static void error_and_die(const char *msg, ...)
201 {
202         va_list p;
203         va_start(p, msg);
204         verror(msg, p, NULL);
205         va_end(p);
206         _exit(1);
207 }
208
209 static void perror_and_die(const char *msg, ...) NORETURN;
210 static void perror_and_die(const char *msg, ...)
211 {
212         va_list p;
213         va_start(p, msg);
214         /* Guard against "<error message>: Success" */
215         verror(msg, p, errno ? strerror(errno) : NULL);
216         va_end(p);
217         _exit(1);
218 }
219
220 static void nscd_log(int mask, const char *msg, ...)
221 {
222         if (debug & mask) {
223                 va_list p;
224                 va_start(p, msg);
225                 verror(msg, p, NULL);
226                 va_end(p);
227         }
228 }
229
230 #define log(lvl, ...) do { if (lvl) nscd_log(lvl, __VA_ARGS__); } while (0)
231
#if DEBUG_BUILD
/*
 * Hex dump of len bytes starting at ptr, written to stderr.
 * Does nothing unless L_DUMP is set in "debug".
 *
 * Each output line shows up to 16 bytes as "xx " pairs, padded with
 * spaces to a fixed width, followed by the same bytes as text, with
 * everything outside printable ASCII (32..126) shown as '.'.
 *
 * Only the bytes that belong to the current chunk are ever read, so a
 * final short chunk does not touch memory past ptr + len.
 */
static void dump(const void *ptr, int len)
{
        const unsigned char *buf = ptr;

        if (!(debug & L_DUMP))
                return;

        while (len > 0) {
                char text[18]; /* 16 chars + '\n' + '\0' */
                int chunk = ((len >= 16) ? 16 : len);
                int i;

                for (i = 0; i < chunk; i++) {
                        unsigned char c = buf[i];
                        fprintf(stderr, "%02x ", c);
                        text[i] = (c >= 32 && c < 127 ? c : '.');
                }
                /* pad a short last line so the text column lines up */
                fprintf(stderr, "%*s", (16-chunk) * 3, "");
                text[chunk] = '\n';
                text[chunk + 1] = '\0';
                fputs(text, stderr);

                buf += chunk;
                len -= chunk;
        }
}
#else
void dump(const void *ptr, int len);
#endif

/* With L_DUMP being a compile-time 0 the call is optimized out. */
#define hex_dump(p,n) do { if (L_DUMP) dump(p,n); } while (0)
268
/* open() wrapper: returns a valid descriptor or dies with an error message. */
static int xopen3(const char *pathname, int flags, int mode)
{
        int fd;

        fd = open(pathname, flags, mode);
        if (fd < 0) {
                perror_and_die("open");
        }
        return fd;
}
276
/* pipe() wrapper: fills fds[0] and fds[1], or dies with an error message. */
static void xpipe(int *fds)
{
        int rc = pipe(fds);

        if (rc < 0) {
                perror_and_die("pipe");
        }
}
282
283 static void xexecve(const char *filename, char **argv, char **envp) NORETURN;
284 static void xexecve(const char *filename, char **argv, char **envp)
285 {
286         execve(filename, argv, envp);
287         perror_and_die("cannot re-exec %s", filename);
288 }
289
/* Switch fd to non-blocking mode, keeping its other status flags.
 * Dies if either fcntl() call fails. */
static void ndelay_on(int fd)
{
        int flags;

        flags = fcntl(fd, F_GETFL);
        if (flags < 0) {
                perror_and_die("F_GETFL");
        }
        flags |= O_NONBLOCK;
        if (fcntl(fd, F_SETFL, flags) < 0) {
                perror_and_die("setting O_NONBLOCK");
        }
}
298
/* Set the descriptor flags of fd to FD_CLOEXEC; dies if fcntl() fails. */
static void close_on_exec(int fd)
{
        int rc = fcntl(fd, F_SETFD, FD_CLOEXEC);

        if (rc < 0) {
                perror_and_die("setting FD_CLOEXEC");
        }
}
304
305 static unsigned monotonic_ms(void)
306 {
307         struct timespec ts;
308         if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, &ts))
309                 perror_and_die("clock_gettime(MONOTONIC)");
310         return ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
311 }
312
/* Number of bytes str occupies, terminating NUL included. */
static unsigned strsize(const char *str)
{
        size_t len = strlen(str);

        return len + 1;
}
317
/* Like strsize(), but the result is rounded up to a multiple of 4. */
static unsigned strsize_aligned4(const char *str)
{
        size_t padded = strlen(str) + 1 + 3;

        /* dropping the remainder is the same as masking off two low bits */
        return padded - (padded % 4);
}
322
/* read() which is transparently restarted when interrupted by a signal
 * (EINTR). Returns whatever the final read() call returned. */
static ssize_t safe_read(int fd, void *buf, size_t count)
{
        for (;;) {
                ssize_t got = read(fd, buf, count);

                if (got >= 0 || errno != EINTR)
                        return got;
        }
}
331
/* Keep reading until len bytes have arrived or EOF is seen.
 * Returns the number of bytes read (less than len only on EOF),
 * or the negative result of the failed read. */
static ssize_t full_read(int fd, void *buf, size_t len)
{
        char *dst = buf;
        ssize_t done = 0;

        while (len != 0) {
                ssize_t got = safe_read(fd, dst, len);

                if (got < 0)
                        return got;     /* read() returns -1 on failure. */
                if (got == 0)
                        break;          /* EOF */
                dst += got;
                done += got;
                len -= got;
        }
        return done;
}
349
350 /* unused
351 static void xsafe_read(int fd, void *buf, size_t len)
352 {
353         if (len != safe_read(fd, buf, len))
354                 perror_and_die("short read");
355 }
356 static void xfull_read(int fd, void *buf, size_t len)
357 {
358         if (len != full_read(fd, buf, len))
359                 perror_and_die("short read");
360 }
361 */
362
/* write() which is transparently restarted when interrupted by a signal
 * (EINTR). Returns whatever the final write() call returned. */
static ssize_t safe_write(int fd, const void *buf, size_t count)
{
        for (;;) {
                ssize_t put = write(fd, buf, count);

                if (put >= 0 || errno != EINTR)
                        return put;
        }
}
371
/* Keep writing until all len bytes have been written.
 * Returns len on success, or the negative result of the failed write. */
static ssize_t full_write(int fd, const void *buf, size_t len)
{
        const char *src = buf;
        ssize_t done = 0;

        while (len != 0) {
                ssize_t put = safe_write(fd, src, len);

                if (put < 0)
                        return put;     /* write() returns -1 on failure. */
                src += put;
                done += put;
                len -= put;
        }
        return done;
}
388
/* A single safe_write() which must transfer exactly count bytes;
 * an error or a partial write kills the process. */
static void xsafe_write(int fd, const void *buf, size_t count)
{
        ssize_t written = safe_write(fd, buf, count);

        if ((size_t)written != count) {
                perror_and_die("short write of %ld bytes", (long)count);
        }
}
/* full_write() which must transfer exactly count bytes;
 * anything else kills the process. */
static void xfull_write(int fd, const void *buf, size_t count)
{
        ssize_t written = full_write(fd, buf, count);

        if ((size_t)written != count) {
                perror_and_die("short write of %ld bytes", (long)count);
        }
}
399
/* Make to_fd refer to what from_fd refers to, then close from_fd.
 * No-op when both are the same descriptor. Dies if dup2() fails. */
static void xmovefd(int from_fd, int to_fd)
{
        if (from_fd == to_fd)
                return;

        if (dup2(from_fd, to_fd) < 0) {
                perror_and_die("dup2");
        }
        close(from_fd);
}
408
/*
 * Parse a non-negative decimal number.
 *
 * The string must start with a digit and consist of digits only, and the
 * value must not exceed INT_MAX / 1000, so that it still fits into an int
 * after being multiplied by 1000. Anything else is fatal: the process
 * dies with an error message.
 */
static unsigned getnum(const char *str)
{
        if (str[0] >= '0' && str[0] <= '9') {
                char *end;
                unsigned long val = strtoul(str, &end, 10);
                /* must not overflow int even after x1000 */
                if (*end == '\0' && val <= INT_MAX / 1000)
                        return val;
        }
        error_and_die("malformed or too big number '%s'", str);
}
420
/*
 * Return a pointer to the first non-whitespace character of s
 * (possibly to its terminating NUL).
 *
 * The character is converted to unsigned char before it is handed to
 * isspace(): passing a negative plain char is undefined behavior.
 */
static char *skip_whitespace(const char *s)
{
        /* NB: isspace('\0') returns 0 */
        while (isspace((unsigned char)*s))
                ++s;
        return (char *) s;
}
427
/*
 * Return a pointer to the first whitespace character of s, or to its
 * terminating NUL if there is none.
 *
 * The character is converted to unsigned char before it is handed to
 * isspace(): passing a negative plain char is undefined behavior.
 */
static char *skip_non_whitespace(const char *s)
{
        while (*s && !isspace((unsigned char)*s))
                ++s;
        return (char *) s;
}
433
/* malloc() which never returns NULL: a failed allocation is fatal. */
static void *xmalloc(unsigned sz)
{
        void *mem;

        mem = malloc(sz);
        if (mem == NULL) {
                error_and_die("out of memory");
        }
        return mem;
}
441
/* xmalloc() followed by zero-filling of the whole block. */
static void *xzalloc(unsigned sz)
{
        void *mem;

        mem = xmalloc(sz);
        return memset(mem, 0, sz); /* memset returns its first argument */
}
448
/* realloc() whose NULL result is treated as a fatal out-of-memory error. */
static void *xrealloc(void *p, unsigned size)
{
        void *resized;

        resized = realloc(p, size);
        if (resized == NULL) {
                error_and_die("out of memory");
        }
        return resized;
}
456
/* strdup() which never returns NULL: a failed allocation is fatal. */
static const char *xstrdup(const char *str)
{
        const char *copy;

        copy = strdup(str);
        if (copy == NULL) {
                error_and_die("out of memory");
        }
        return copy;
}
464
465
466 /*
467 ** Config data
468 */
469
/* Service numbers. type_to_srv[] maps request types to these; they
 * presumably also index the three-element arrays in "config" below
 * (the indexing code is outside this part of the file). */
enum {
        SRV_PASSWD,
        SRV_GROUP,
        SRV_HOSTS,
};

/* Run-time settings, initialized to built-in defaults.
 * Each array has one slot per service.
 * NOTE(review): pttl/nttl look like positive/negative cache TTLs and
 * "size" like a per-service cache size - confirm against the code that
 * reads them. */
static struct {
        const char *logfile;
        const char *user;
        smallint srv_enable[3];
        smallint check_files[3];
        unsigned pttl[3];
        unsigned nttl[3];
        unsigned size[3];
} config = {
        /* We try to closely mimic glibc nscd */
        .logfile     = NULL, /* default is to not have a log file */
        .user        = NULL,
        .srv_enable  = { 0, 0, 0 },
        .check_files = { 1, 1, 1 },
        .pttl        = { 3600, 3600, 3600 },
        .nttl        = { 20, 60, 20 },
        /* huh, what is the default cache size in glibc nscd? */
        .size        = { 256 * 8 / 3, 256 * 8 / 3, 256 * 8 / 3 },
};

/* Default configuration file path */
static const char default_conffile[] = "/etc/nscd.conf";
/* Non-const pointer, so it can be redirected to another path at run time
 * (per the 0.35 changelog entry: the readlink result of /proc/self/exe) */
static const char *self_exe_points_to = "/proc/self/exe";
498
499
500 /*
501 ** Clients, workers machinery
502 */
503
/* Header common to all requests.
 * Kept as a macro so that the same three members can be pasted both into
 * user_req_header and into the anonymous struct inside user_req. */
#define USER_REQ_STRUCT \
        int32_t version; /* Version number of the daemon interface */ \
        int32_t type;    /* Service requested */ \
        int32_t key_len; /* Key length */

typedef struct user_req_header {
        USER_REQ_STRUCT
} user_req_header;

/* Protocol constants and timeouts */
enum {
        NSCD_VERSION = 2,
        MAX_USER_REQ_SIZE = 1024,
        USER_HDR_SIZE = sizeof(user_req_header),
        /* DNS queries time out after 20 seconds,
         * we will allow for a bit more */
        WORKER_TIMEOUT_SEC = 30,
        CLIENT_TIMEOUT_MS = 100,
        SMALL_POLL_TIMEOUT_MS = 200,
};

/* A complete request: header plus key bytes, exactly MAX_USER_REQ_SIZE
 * bytes in total (enforced by the compile-time check below).
 * The first 32 bits are used in two ways through the anonymous union:
 * as "version" while the request is being received, and as
 * refcount + timestamp24 once the request is stored in the cache. */
typedef struct user_req {
        union {
                struct { /* as came from client */
                        USER_REQ_STRUCT
                };
                struct { /* when stored in cache, overlaps .version */
                        unsigned refcount:8; /* actually, can be 1 or 0 only */
                        /* (timestamp24 * 256) == timestamp in ms */
                        unsigned timestamp24:24;
                };
        };
        char reqbuf[MAX_USER_REQ_SIZE - USER_HDR_SIZE];
} user_req;

/* Compile-time check for correct size:
 * a wrong sizeof(user_req) yields a negative array size, which does
 * not compile */
struct BUG_wrong_user_req_size {
        char BUG_wrong_user_req_size[sizeof(user_req) == MAX_USER_REQ_SIZE ? 1 : -1];
};
543
/* Request types (values of the "type" member of the request header).
 * The order matters: typestr[] and type_to_srv[] below are indexed by
 * these values, and unsupported_ureq_type() relies on the first eight
 * being the supported lookups. */
enum {
        GETPWBYNAME,
        GETPWBYUID,
        GETGRBYNAME,
        GETGRBYGID,
        GETHOSTBYNAME,
        GETHOSTBYNAMEv6,
        GETHOSTBYADDR,
        GETHOSTBYADDRv6,
        SHUTDOWN,               /* Shut the server down */
        GETSTAT,                /* Get the server statistic */
        INVALIDATE,             /* Invalidate one special cache */
        GETFDPW,
        GETFDGR,
        GETFDHST,
        GETAI,
        INITGROUPS,
        GETSERVBYNAME,
        GETSERVBYPORT,
        GETFDSERV,
        LASTREQ
};
#if DEBUG_BUILD
/* Request type names, in the same order as the enum above.
 * Trailing comments record the implementation status of each type. */
static const char *const typestr[] = {
        "GETPWBYNAME",     /* done */
        "GETPWBYUID",      /* done */
        "GETGRBYNAME",     /* done */
        "GETGRBYGID",      /* done */
        "GETHOSTBYNAME",   /* done */
        "GETHOSTBYNAMEv6", /* done */
        "GETHOSTBYADDR",   /* done */
        "GETHOSTBYADDRv6", /* done */
        "SHUTDOWN",        /* done */
        "GETSTAT",         /* info? */
        "INVALIDATE",      /* done */
        /* won't do: nscd passes a name of shmem segment
         * which client can map and "see" the db */
        "GETFDPW",
        "GETFDGR",         /* won't do */
        "GETFDHST",        /* won't do */
        "GETAI",           /* done */
        "INITGROUPS",      /* done */
        "GETSERVBYNAME",   /* prio 3 (no caching?) */
        "GETSERVBYPORT",   /* prio 3 (no caching?) */
        "GETFDSERV"        /* won't do */
};
#else
/* Declaration only: there is no definition in non-debug builds */
extern const char *const typestr[];
#endif
/* Request type -> SRV_* service number.
 * Types not listed here (SHUTDOWN .. GETFDHST) are implicitly
 * zero-initialized, i.e. they read as SRV_PASSWD; the array ends
 * at INITGROUPS. */
static const smallint type_to_srv[] = {
        [GETPWBYNAME     ] = SRV_PASSWD,
        [GETPWBYUID      ] = SRV_PASSWD,
        [GETGRBYNAME     ] = SRV_GROUP,
        [GETGRBYGID      ] = SRV_GROUP,
        [GETHOSTBYNAME   ] = SRV_HOSTS,
        [GETHOSTBYNAMEv6 ] = SRV_HOSTS,
        [GETHOSTBYADDR   ] = SRV_HOSTS,
        [GETHOSTBYADDRv6 ] = SRV_HOSTS,
        [GETAI           ] = SRV_HOSTS,
        [INITGROUPS      ] = SRV_GROUP,
};
605
606 static int unsupported_ureq_type(unsigned type)
607 {
608         if (type == GETAI) return 0;
609         if (type == INITGROUPS) return 0;
610         if (type > GETHOSTBYADDRv6) return 1;
611         return 0;
612 }
613
614 /* Possible reductions:
615  * fd, bufidx - uint8_t
616  * started_ms -> uint16_t started_s
617  * ureq - eliminate (derivable from bufidx?)
618  * cell - eliminate (derivable from resptr?)
619  */
/* Per-connection state; cinfo[i] belongs to the descriptor in pfd[i]
 * (see close_client(), which uses the same index for both). */
typedef struct client_info {
        /* if client_fd != 0, we are waiting for the reply from worker
         * on pfd[i].fd, and client_fd is saved client's fd
         * (we need to put it back into pfd[i].fd later) */
        int client_fd;
        unsigned bytecnt;       /* bytes read from client */
        unsigned bufidx;        /* buffer# in global client_buf[] */
        unsigned started_ms;
        unsigned respos;        /* response */
        //unsigned resp_sz;
        user_req *resptr;       /* response */
        user_req *ureq;         /* request (points to client_buf[x]) */
        user_req **cell;        /* cache cell ptr */
} client_info;

/* Bookkeeping for closed slots: close_client() counts them in cnt_closed
 * and remembers the lowest closed index in min_closed */
static int min_closed = INT_MAX;
static int cnt_closed = 0;
static int num_clients = 2; /* two listening sockets are "clients" too */

/* We read up to max_reqnum requests in parallel */
static unsigned max_reqnum = 14;
/* Where alloc_buf_no() starts its search for a free buffer */
static int next_buf;
/* Each of these points to [max_reqnum] sized array */
static char          (*client_buf)[MAX_USER_REQ_SIZE];
static char          *busy_cbuf;  /* busy_cbuf[n] != 0: client_buf[n] is in use */
static struct pollfd *pfd;
static client_info   *cinfo;
647
648 static inline unsigned ureq_size(const user_req *ureq)
649 {
650         return sizeof(user_req_header) + ureq->key_len;
651 }
652
653 static unsigned cache_age(unsigned now_ms, const user_req *ureq)
654 {
655         return (uint32_t)now_ms - (ureq->timestamp24 << 8);
656 }
657
658 static void set_cache_timestamp(user_req *ureq, unsigned now_ms)
659 {
660         ureq->timestamp24 = now_ms >> 8;
661 }
662
663 static int alloc_buf_no(void)
664 {
665         int n = next_buf;
666         do {
667                 int cur = next_buf;
668                 next_buf = (next_buf + 1) % max_reqnum;
669                 if (!busy_cbuf[cur]) {
670                         busy_cbuf[cur] = 1;
671                         return cur;
672                 }
673         } while (next_buf != n);
674         error_and_die("no free bufs?!");
675 }
676
677 static inline void *bufno2buf(int i)
678 {
679         return client_buf[i];
680 }
681
682 static void close_client(int i)
683 {
684         log(L_DEBUG, "closing client %d (fd %d)", i, pfd[i].fd);
685         close(pfd[i].fd);
686         pfd[i].fd = 0; /* flag as unused */
687         busy_cbuf[cinfo[i].bufidx] = 0;
688         cnt_closed++;
689         if (i < min_closed)
690                 min_closed = i;
691 }
692
693
694 /*
695 ** nscd API <-> C API conversion
696 */
697
/* Leading part shared by all response structures below.
 * The first member is named version_or_size because the marshalling
 * functions in this file store the total response size in it. */
typedef struct response_header {
        uint32_t version_or_size;
        int32_t found;
        char body[0];
} response_header;

/* Reply to INITGROUPS query: fixed part, followed by ngrps group ids */
typedef struct initgr_response_header {
        uint32_t version_or_size;
        int32_t found;
        int32_t ngrps;
        /* code assumes gid_t == int32, let's check that */
        /* (zero-length array if the sizes match, negative size - that is,
         * a compile error - if they do not) */
        int32_t gid[sizeof(gid_t) == sizeof(int32_t) ? 0 : -1];
        /* char user_str[as_needed]; */
} initgr_response_header;
712
713 static initgr_response_header *obtain_initgroups(const char *username)
714 {
715         struct initgr_response_header *resp;
716         struct passwd *pw;
717         enum { MAGIC_OFFSET = sizeof(*resp) / sizeof(int32_t) };
718         unsigned sz;
719         int ngroups;
720
721         pw = getpwnam(username);
722         if (!pw) {
723                 resp = xzalloc(8);
724                 resp->version_or_size = sizeof(*resp);
725                 /*resp->found = 0;*/
726                 /*resp->ngrps = 0;*/
727                 goto ret;
728         }
729
730         /* getgrouplist may be very expensive, it's much better to allocate
731          * a bit more than to run getgrouplist twice */
732         ngroups = 128;
733         resp = NULL;
734         do {
735                 sz = sizeof(*resp) + sizeof(resp->gid[0]) * ngroups;
736                 resp = xrealloc(resp, sz);
737         } while (getgrouplist(username, pw->pw_gid, (gid_t*) &resp->gid, &ngroups) == -1);
738         log(L_DEBUG, "ngroups=%d", ngroups);
739
740         sz = sizeof(*resp) + sizeof(resp->gid[0]) * ngroups;
741         /* resp = xrealloc(resp, sz); - why bother */
742         resp->version_or_size = sz;
743         resp->found = 1;
744         resp->ngrps = ngroups;
745  ret:
746         return resp;
747 }
748
/* Reply to passwd queries: fixed part, followed by the five strings
 * listed at the end, in that order. Every *_len value counts the
 * terminating NUL (see marshal_passwd()). */
typedef struct pw_response_header {
        uint32_t version_or_size;
        int32_t found;
        int32_t pw_name_len;
        int32_t pw_passwd_len;
        int32_t pw_uid;
        int32_t pw_gid;
        int32_t pw_gecos_len;
        int32_t pw_dir_len;
        int32_t pw_shell_len;
        /* char pw_name[pw_name_len]; */
        /* char pw_passwd[pw_passwd_len]; */
        /* char pw_gecos[pw_gecos_len]; */
        /* char pw_dir[pw_dir_len]; */
        /* char pw_shell[pw_shell_len]; */
} pw_response_header;
765
766 static pw_response_header *marshal_passwd(struct passwd *pw)
767 {
768         char *p;
769         pw_response_header *resp;
770         unsigned pw_name_len;
771         unsigned pw_passwd_len;
772         unsigned pw_gecos_len;
773         unsigned pw_dir_len;
774         unsigned pw_shell_len;
775         unsigned sz = sizeof(*resp);
776         if (pw) {
777                 sz += (pw_name_len = strsize(pw->pw_name));
778                 sz += (pw_passwd_len = strsize(pw->pw_passwd));
779                 sz += (pw_gecos_len = strsize(pw->pw_gecos));
780                 sz += (pw_dir_len = strsize(pw->pw_dir));
781                 sz += (pw_shell_len = strsize(pw->pw_shell));
782         }
783         resp = xzalloc(sz);
784         resp->version_or_size = sz;
785         if (!pw) {
786                 /*resp->found = 0;*/
787                 goto ret;
788         }
789         resp->found = 1;
790         resp->pw_name_len = pw_name_len;
791         resp->pw_passwd_len = pw_passwd_len;
792         resp->pw_uid = pw->pw_uid;
793         resp->pw_gid = pw->pw_gid;
794         resp->pw_gecos_len = pw_gecos_len;
795         resp->pw_dir_len = pw_dir_len;
796         resp->pw_shell_len = pw_shell_len;
797         p = (char*)(resp + 1);
798         strcpy(p, pw->pw_name); p += pw_name_len;
799         strcpy(p, pw->pw_passwd); p += pw_passwd_len;
800         strcpy(p, pw->pw_gecos); p += pw_gecos_len;
801         strcpy(p, pw->pw_dir); p += pw_dir_len;
802         strcpy(p, pw->pw_shell); p += pw_shell_len;
803         log(L_DEBUG, "sz:%u realsz:%u", sz, p - (char*)resp);
804  ret:
805         return resp;
806 }
807
/* Reply to group queries: fixed part, followed by the variable-size
 * data in the order produced by marshal_group(): member length array,
 * group name, group password, member names. */
typedef struct gr_response_header {
        uint32_t version_or_size;
        int32_t found;
        int32_t gr_name_len;    /* strlen(gr->gr_name) + 1; */
        int32_t gr_passwd_len;  /* strlen(gr->gr_passwd) + 1; */
        int32_t gr_gid;         /* gr->gr_gid */
        int32_t gr_mem_cnt;     /* while (gr->gr_mem[gr_mem_cnt]) ++gr_mem_cnt; */
        /* int32_t gr_mem_len[gr_mem_cnt]; */
        /* char gr_name[gr_name_len]; */
        /* char gr_passwd[gr_passwd_len]; */
        /* char gr_mem[gr_mem_cnt][gr_mem_len[i]]; */
        /* char gr_gid_str[as_needed]; - huh? */
        /* char orig_key[as_needed]; - needed?? I don't do this ATM... */
/*
 glibc adds gr_gid_str, but client doesn't get/use it
 (strace of a client looking up group "root"):
 writev(3, [{"\2\0\0\0\2\0\0\0\5\0\0\0", 12}, {"root\0", 5}], 2) = 17
 poll([{fd=3, events=POLLIN|POLLERR|POLLHUP, revents=POLLIN}], 1, 5000) = 1
 read(3, "\2\0\0\0\1\0\0\0\10\0\0\0\4\0\0\0\0\0\0\0\0\0\0\0", 24) = 24
 readv(3, [{"", 0}, {"root\0\0\0\0\0\0\0\0", 12}], 2) = 12
 read(3, NULL, 0)        = 0
*/
} gr_response_header;
830
831 static gr_response_header *marshal_group(struct group *gr)
832 {
833         char *p;
834         gr_response_header *resp;
835         unsigned gr_mem_cnt;
836         unsigned sz = sizeof(*resp);
837         if (gr) {
838                 sz += strsize(gr->gr_name);
839                 sz += strsize(gr->gr_passwd);
840                 gr_mem_cnt = 0;
841                 while (gr->gr_mem[gr_mem_cnt]) {
842                         sz += strsize(gr->gr_mem[gr_mem_cnt]);
843                         gr_mem_cnt++;
844                 }
845                 /* for int32_t gr_mem_len[gr_mem_cnt]; */
846                 sz += gr_mem_cnt * sizeof(int32_t);
847         }
848         resp = xzalloc(sz);
849         resp->version_or_size = sz;
850         if (!gr) {
851                 /*resp->found = 0;*/
852                 goto ret;
853         }
854         resp->found = 1;
855         resp->gr_name_len = strsize(gr->gr_name);
856         resp->gr_passwd_len = strsize(gr->gr_passwd);
857         resp->gr_gid = gr->gr_gid;
858         resp->gr_mem_cnt = gr_mem_cnt;
859         p = (char*)(resp + 1);
860 /* int32_t gr_mem_len[gr_mem_cnt]; */
861         gr_mem_cnt = 0;
862         while (gr->gr_mem[gr_mem_cnt]) {
863                 *(uint32_t*)p = strsize(gr->gr_mem[gr_mem_cnt]);
864                 p += 4;
865                 gr_mem_cnt++;
866         }
867 /* char gr_name[gr_name_len]; */
868         strcpy(p, gr->gr_name);
869         p += strsize(gr->gr_name);
870 /* char gr_passwd[gr_passwd_len]; */
871         strcpy(p, gr->gr_passwd);
872         p += strsize(gr->gr_passwd);
873 /* char gr_mem[gr_mem_cnt][gr_mem_len[i]]; */
874         gr_mem_cnt = 0;
875         while (gr->gr_mem[gr_mem_cnt]) {
876                 strcpy(p, gr->gr_mem[gr_mem_cnt]);
877                 p += strsize(gr->gr_mem[gr_mem_cnt]);
878                 gr_mem_cnt++;
879         }
880         log(L_DEBUG, "sz:%u realsz:%u", sz, p - (char*)resp);
881  ret:
882         return resp;
883 }
884
/* Reply to host queries: fixed part, followed by the variable-size data
 * listed at the end, in that order (see marshal_hostent()). */
typedef struct hst_response_header {
        uint32_t version_or_size;
        int32_t found;
        int32_t h_name_len;
        int32_t h_aliases_cnt;
        int32_t h_addrtype;     /* AF_INET or AF_INET6 */
        int32_t h_length;       /* 4 or 16 */
        int32_t h_addr_list_cnt;
        int32_t error;          /* HOST_NOT_FOUND in "not found" replies, 0 otherwise */
        /* char h_name[h_name_len]; - we pad it to 4 bytes */
        /* uint32_t h_aliases_len[h_aliases_cnt]; */
        /* char h_addr_list[h_addr_list_cnt][h_length]; - every one is the same size [h_length] (4 or 16) */
        /* char h_aliases[h_aliases_cnt][h_aliases_len[i]]; */
} hst_response_header;
899
900 static hst_response_header *marshal_hostent(struct hostent *h)
901 {
902         char *p;
903         hst_response_header *resp;
904         unsigned h_name_len;
905         unsigned h_aliases_cnt;
906         unsigned h_addr_list_cnt;
907         unsigned sz = sizeof(*resp);
908         if (h) {
909 /* char h_name[h_name_len] */
910                 sz += h_name_len = strsize_aligned4(h->h_name);
911                 h_addr_list_cnt = 0;
912                 while (h->h_addr_list[h_addr_list_cnt]) {
913                         h_addr_list_cnt++;
914                 }
915 /* char h_addr_list[h_addr_list_cnt][h_length] */
916                 sz += h_addr_list_cnt * h->h_length;
917                 h_aliases_cnt = 0;
918                 while (h->h_aliases[h_aliases_cnt]) {
919 /* char h_aliases[h_aliases_cnt][h_aliases_len[i]] */
920                         sz += strsize(h->h_aliases[h_aliases_cnt]);
921                         h_aliases_cnt++;
922                 }
923 /* uint32_t h_aliases_len[h_aliases_cnt] */
924                 sz += h_aliases_cnt * 4;
925         }
926         resp = xzalloc(sz);
927         resp->version_or_size = sz;
928         if (!h) {
929                 /*resp->found = 0;*/
930                 resp->error = HOST_NOT_FOUND;
931                 goto ret;
932         }
933         resp->found = 1;
934         resp->h_name_len = h_name_len;
935         resp->h_aliases_cnt = h_aliases_cnt;
936         resp->h_addrtype = h->h_addrtype;
937         resp->h_length = h->h_length;
938         resp->h_addr_list_cnt = h_addr_list_cnt;
939         /*resp->error = 0;*/
940         p = (char*)(resp + 1);
941 /* char h_name[h_name_len]; */
942         strcpy(p, h->h_name);
943         p += h_name_len;
944 /* uint32_t h_aliases_len[h_aliases_cnt]; */
945         h_aliases_cnt = 0;
946         while (h->h_aliases[h_aliases_cnt]) {
947                 *(uint32_t*)p = strsize(h->h_aliases[h_aliases_cnt]);
948                 p += 4;
949                 h_aliases_cnt++;
950         }
951 /* char h_addr_list[h_addr_list_cnt][h_length]; */
952         h_addr_list_cnt = 0;
953         while (h->h_addr_list[h_addr_list_cnt]) {
954                 memcpy(p, h->h_addr_list[h_addr_list_cnt], h->h_length);
955                 p += h->h_length;
956                 h_addr_list_cnt++;
957         }
958 /* char h_aliases[h_aliases_cnt][h_aliases_len[i]]; */
959         h_aliases_cnt = 0;
960         while (h->h_aliases[h_aliases_cnt]) {
961                 strcpy(p, h->h_aliases[h_aliases_cnt]);
962                 p += strsize(h->h_aliases[h_aliases_cnt]);
963                 h_aliases_cnt++;
964         }
965         log(L_DEBUG, "sz:%u realsz:%u", sz, p - (char*)resp);
966  ret:
967         return resp;
968 }
969
/* Reply to addrinfo query.
 * Fixed-size header built by obtain_addrinfo(); the variable-length data
 * listed in the trailing comments follows it directly, in that order. */
typedef struct ai_response_header {
	uint32_t version_or_size; /* obtain_addrinfo() stores the total reply size here */
	int32_t found;            /* 1 if getaddrinfo() succeeded, 0 otherwise */
	int32_t naddrs;           /* number of addresses */
	int32_t addrslen;         /* total size of all addresses, in bytes */
	int32_t canonlen;         /* size of the canonical name incl. NUL, 0 if none */
	int32_t error;            /* getaddrinfo() return value */
	/* char ai_addr[naddrs][4 or 16]; - addrslen bytes in total */
	/* char ai_family[naddrs]; - AF_INET[6] each (determines ai_addr[i] length) */
	/* char ai_canonname[canonlen]; */
} ai_response_header;
982
/* Resolve hostname with getaddrinfo() and serialize the result.
 *
 * Returns a block obtained from xzalloc(): an ai_response_header followed by
 * the raw addresses (4 bytes for AF_INET entries, 16 bytes for all others),
 * one address family byte per address, and the canonical name (if any).
 * On getaddrinfo() failure only the header is returned, with found left
 * at 0 and error set to the getaddrinfo() return value.
 * version_or_size always holds the total size of the block in bytes.
 *
 * The addrinfo list is intentionally not freed, see the note at the end. */
static ai_response_header *obtain_addrinfo(const char *hostname)
{
	struct addrinfo hints;
	struct addrinfo *ai;
	struct addrinfo *ap;
	ai_response_header *resp;
	char *p, *family;
	int err;
	unsigned sz;
	unsigned naddrs = 0;
	unsigned addrslen = 0;
	unsigned canonlen = 0;

	memset(&hints, 0, sizeof(hints));
	hints.ai_flags = AI_CANONNAME;
	/* hints.ai_socktype = SOCK_STREAM; - can kill dups (one for each possible SOCK_xxx) */
	ai = NULL; /* on failure getaddrinfo may leave it as-is */
	err = getaddrinfo(hostname, NULL, &hints, &ai);

	/* Pass 1: compute the total size */
	sz = sizeof(*resp);
	if (!err) {
		if (ai->ai_canonname) {
			canonlen = strsize(ai->ai_canonname);
			sz += canonlen;
		}
		for (ap = ai; ap; ap = ap->ai_next) {
			naddrs++;
			addrslen += (ap->ai_family == AF_INET ? 4 : 16);
		}
		sz += naddrs + addrslen;
	}

	resp = xzalloc(sz);
	resp->version_or_size = sz;
	resp->error = err;
	if (err) {
		/*resp->found = 0;*/
		goto ret;
	}

	/* Pass 2: fill in the header and the variable-length tail */
	resp->found = 1;
	resp->naddrs = naddrs;
	resp->addrslen = addrslen;
	resp->canonlen = canonlen;
	p = (char*)(resp + 1);
	family = p + addrslen; /* family bytes start right after all addresses */
	for (ap = ai; ap; ap = ap->ai_next) {
		/* char ai_family[naddrs]; */
		*family++ = ap->ai_family;
		/* char ai_addr[naddrs][4 or 16]; */
		if (ap->ai_family == AF_INET) {
			memcpy(p, &(((struct sockaddr_in*)(ap->ai_addr))->sin_addr), 4);
			p += 4;
		} else {
			memcpy(p, &(((struct sockaddr_in6*)(ap->ai_addr))->sin6_addr), 16);
			p += 16;
		}
	}
	/* char ai_canonname[canonlen]; */
	if (ai->ai_canonname)
		strcpy(family, ai->ai_canonname);
	/* Use canonlen here: ai_canonname may be NULL and must not be measured
	 * again. Pointer difference is converted explicitly to match %u */
	log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(family + canonlen - (char*)resp));
 ret:
	/* glibc 2.3.6 segfaults here sometimes
	 * (maybe my mistake, fixed by "ai = NULL;" above).
	 * Since we are in worker and are going to exit anyway, why bother? */
	/*freeaddrinfo(ai);*/
	return resp;
}
1052
1053
1054 /*
1055 ** Cache management
1056 */
1057
/* one 8-element "cacheline": all entries whose hash selects the same row */
typedef user_req *cacheline_t[8];
/* Number of cachelines in cache[] (lookups take the hash modulo this) */
static unsigned cache_size;
/* Points to cacheline_t  cache[cache_size] array, or in other words,
 * points to user_req*    cache[cache_size][8] array */
static cacheline_t *cache;
/* Number of currently occupied cache slots */
static unsigned cached_cnt;
/* Lookup statistics, updated by find_cell_and_response() */
static unsigned cache_access_cnt = 1; /* prevent division by zero */
static unsigned cache_hit_cnt = 1;
static unsigned last_age_time;
/* Poll timeout used by main_loop() between aging passes; reset to
 * min_aging_interval_ms whenever a new entry is stored */
static unsigned aging_interval_ms;
/* Set by main_loop() to half of the smallest negative TTL (made odd) */
static unsigned min_aging_interval_ms;
1070
1071 static response_header *ureq_response(user_req *ureq)
1072 {
1073         /* Skip query part, find answer part
1074          * (answer is 32-bit aligned) */
1075         return (void*) ((char*)ureq + ((ureq_size(ureq) + 3) & ~3));
1076 }
1077
/* This hash is supposed to be good for short textual data.
 *
 * p:    start of the data to hash
 * sz:   number of bytes to hash; 0 is allowed and returns hash unchanged
 * hash: initial value (seed)
 *
 * Each step computes hash = (hash * 33) ^ byte. */
static uint32_t bernstein_hash(void *p, unsigned sz, uint32_t hash)
{
	const uint8_t *key = p;

	/* Test before the first byte: with a do-while loop sz == 0 would
	 * wrap around and walk far past the end of the buffer */
	while (sz--) {
		hash = (32 * hash + hash) ^ *key++;
	}
	return hash;
}
1087
1088 static user_req *find_cell_and_response(user_req ***cellp, user_req *ureq)
1089 {
1090         user_req **cell;
1091         unsigned hash;
1092         unsigned i;
1093         unsigned ureq_sz = ureq_size(ureq);
1094
1095         /* prevent overflow and division by zero */
1096         if ((int)(cache_access_cnt+1) < 0) {
1097                 cache_access_cnt = (cache_access_cnt >> 1) + 1;
1098                 cache_hit_cnt = (cache_hit_cnt >> 1) + 1;
1099         }
1100         cache_access_cnt++;
1101
1102         hash = bernstein_hash(&ureq->key_len, ureq_sz - offsetof(user_req, key_len), ureq->type);
1103         log(L_DEBUG, "hash:%08x", hash);
1104         hash = hash % cache_size;
1105         (*cellp) = cell = cache[hash];
1106
1107         for (i = 0; i < 8; i++) {
1108                 if (!cell[i])
1109                         continue;
1110 // TODO: do secondary hash match
1111                 /* ureq->version is always 2 and is reused in cache
1112                  * for other purposes, we need to skip it here */
1113                 if (memcmp(&ureq->type, &cell[i]->type, ureq_sz - offsetof(user_req, type)) == 0) {
1114                         log(L_DEBUG, "found in cache[%u][%u]", hash, i);
1115                         cache_hit_cnt++;
1116                         return cell[i];
1117                 }
1118         }
1119         log(L_DEBUG, "not found in cache[%u][x]", hash);
1120         return NULL;
1121 }
1122
1123 static void free_refcounted_ureq(user_req **ureqp)
1124 {
1125         user_req *ureq = *ureqp;
1126         /* is it in use? */
1127         if (ureq->refcount) {
1128                 ureq->refcount = 0; /* since it can be only 1 or 0... */
1129         } else {
1130                 free(ureq);
1131         }
1132         *ureqp = NULL;
1133 }
1134
1135 static void save_in_cell(user_req **cell, user_req *new_cached, unsigned now_ms)
1136 {
1137         unsigned oldest_idx = 0;
1138         unsigned oldest_age = 0;
1139         unsigned age;
1140         unsigned i;
1141
1142         for (i = 0; i < 8; i++) {
1143                 if (!cell[i]) {
1144                         log(L_DEBUG, "using free cache[x][%u]", i);
1145                         cached_cnt++;
1146                         cell[i] = new_cached;
1147                         aging_interval_ms = min_aging_interval_ms;
1148                         return;
1149                 }
1150                 age = cache_age(now_ms, cell[i]);
1151                 if (age > oldest_age) {
1152                         oldest_age = age;
1153                         oldest_idx = i;
1154                 }
1155         }
1156         log(L_DEBUG, "freeing and reusing cache[x][%u] (age %u)", oldest_idx, oldest_age);
1157         if (cell[oldest_idx]) {
1158                 free_refcounted_ureq(&cell[oldest_idx]);
1159         } else {
1160                 cached_cnt++;
1161         }
1162         cell[oldest_idx] = new_cached;
1163         aging_interval_ms = min_aging_interval_ms;
1164 }
1165
1166 static void age_cache(unsigned now_ms, int srv)
1167 {
1168         user_req **cp = *cache;
1169         int i;
1170         unsigned sv = cached_cnt;
1171
1172         log(L_DEBUG, "aging cache, srv:%d, now:%u", srv, now_ms);
1173         if (srv == -1 || !now_ms)
1174                 aging_interval_ms = INT_MAX;
1175         i = cache_size * 8;
1176         do {
1177                 user_req *cached = *cp;
1178                 if (cached) {
1179                         int csrv = type_to_srv[cached->type];
1180                         if (srv == -1 || srv == csrv) {
1181                                 if (!now_ms) {
1182                                         cached_cnt--;
1183                                         free_refcounted_ureq(cp);
1184                                 } else {
1185                                         unsigned age = cache_age(now_ms, cached);
1186                                         response_header *resp = ureq_response(cached);
1187                                         unsigned ttl = (resp->found ? config.pttl : config.nttl)[csrv];
1188                                         if (age >= ttl) {
1189                                                 log(L_DEBUG, "freeing: age %u positive %d ttl %u", age, resp->found, ttl);
1190                                                 cached_cnt--;
1191                                                 free_refcounted_ureq(cp);
1192                                         } else if (srv == -1) {
1193                                                 ttl -= age;
1194                                                 if (aging_interval_ms > ttl)
1195                                                         aging_interval_ms = ttl;
1196                                         }
1197                                 }
1198                         }
1199                 }
1200                 cp++;
1201         } while (--i);
1202         log(L_INFO, "aged cache, freed:%u, remain:%u", sv - cached_cnt, cached_cnt);
1203         if (srv == -1 || !now_ms)
1204                 log(L_DEBUG, "aging interval now %u ms", aging_interval_ms);
1205 }
1206
1207
1208 /*
1209 ** Worker child
1210 */
1211
1212 /* Spawns a worker and feeds it with user query on stdin */
1213 /* Returns stdout fd of the worker, in blocking mode */
1214 static int create_and_feed_worker(user_req *ureq)
1215 {
1216         static const char *const argv[] = { "worker_nscd", NULL };
1217
1218         pid_t pid;
1219         struct {
1220                 int rd;
1221                 int wr;
1222         } to_child, to_parent;
1223
1224         /* NB: these pipe fds are in blocking mode and non-CLOEXECed */
1225         xpipe(&to_child.rd);
1226         xpipe(&to_parent.rd);
1227
1228         pid = vfork();
1229         if (pid < 0) /* error */
1230                 perror_and_die("vfork");
1231         if (!pid) { /* child */
1232                 close(to_child.wr);
1233                 close(to_parent.rd);
1234                 xmovefd(to_child.rd, 0);
1235                 xmovefd(to_parent.wr, 1);
1236                 /* Re-exec ourself, cleaning up all allocated memory.
1237                  * fds in parent are marked CLOEXEC and will be closed too
1238                  * (modulo bugs) */
1239                 execve("/proc/self/exe", (char**)argv, (char**)(argv+1));
1240                 xexecve(self_exe_points_to, (char**)argv, (char**)(argv+1));
1241         }
1242
1243         /* parent */
1244         close(to_child.rd);
1245         close(to_parent.wr);
1246         /* We do not expect child to block for any noticeably long time,
1247          * and also we expect write to be one-piece one:
1248          * ureq size is <= 1k and pipes are guaranteed to accept
1249          * at least PIPE_BUF at once */
1250         xsafe_write(to_child.wr, ureq, ureq_size(ureq));
1251
1252         close(to_child.wr);
1253         return to_parent.rd;
1254 }
1255
/* Request currently handled by the worker process; set by worker() so that
 * worker_signal_handler() can report it */
static user_req *worker_ureq;
1257
#if DEBUG_BUILD
/* Returns a printable form of a request key, for log messages.
 *
 * type: request type
 * buf:  request key
 *
 * For GETHOSTBYADDR the key is a binary IPv4 address and is formatted with
 * inet_ntoa() (which returns a pointer to a static buffer);
 * for GETHOSTBYADDRv6 a fixed placeholder is returned;
 * all other keys are returned as-is. */
static const char *req_str(unsigned type, const char *buf)
{
	if (type == GETHOSTBYADDR) {
		struct in_addr in;
		/* buf is a char buffer with no alignment guarantee:
		 * copy the bytes instead of reading through a cast pointer */
		memcpy(&in.s_addr, buf, sizeof(in.s_addr));
		return inet_ntoa(in);
	}
	if (type == GETHOSTBYADDRv6) {
		return "IPv6";
	}
	return buf;
}
#else
/* Declaration only: no definition is provided in non-debug builds */
const char *req_str(unsigned type, const char *buf);
#endif
1274
1275 static void worker_signal_handler(int sig)
1276 {
1277 #if DEBUG_BUILD
1278         log(L_INFO, "worker:%d got sig:%d while handling req "
1279                 "type:%d(%s) key_len:%d '%s'",
1280                 getpid(), sig,
1281                 worker_ureq->type, typestr[worker_ureq->type],
1282                 worker_ureq->key_len,
1283                 req_str(worker_ureq->type, worker_ureq->reqbuf)
1284         );
1285 #else
1286         log(L_INFO, "worker:%d got sig:%d while handling req "
1287                 "type:%d key_len:%d",
1288                 getpid(), sig,
1289                 worker_ureq->type, worker_ureq->key_len);
1290 #endif
1291         _exit(0);
1292 }
1293
1294 static void worker(void) NORETURN;
1295 static void worker(void)
1296 {
1297         user_req ureq;
1298         void *resp;
1299
1300         worker_ureq = &ureq; /* for signal handler */
1301
1302         /* Make sure we won't hang, but rather die */
1303         if (WORKER_TIMEOUT_SEC)
1304                 alarm(WORKER_TIMEOUT_SEC);
1305
1306         /* NB: fds 0, 1 are in blocking mode */
1307
1308         /* We block here (for a short time) */
1309         /* Due to ureq size < PIPE_BUF read is atomic */
1310         /* No error or size checking: we trust the parent */
1311         safe_read(0, &ureq, sizeof(ureq));
1312
1313         signal(SIGSEGV,   worker_signal_handler);
1314         signal(SIGBUS,    worker_signal_handler);
1315         signal(SIGILL,    worker_signal_handler);
1316         signal(SIGFPE,    worker_signal_handler);
1317         signal(SIGABRT,   worker_signal_handler);
1318         signal(SIGSTKFLT, worker_signal_handler);
1319
1320         if (ureq.type == GETHOSTBYNAME
1321          || ureq.type == GETHOSTBYNAMEv6
1322         ) {
1323                 resp = marshal_hostent(
1324                         ureq.type == GETHOSTBYNAME
1325                         ? gethostbyname(ureq.reqbuf)
1326                         : gethostbyname2(ureq.reqbuf, AF_INET6)
1327                 );
1328         } else if (ureq.type == GETHOSTBYADDR
1329          || ureq.type == GETHOSTBYADDRv6
1330         ) {
1331                 resp = marshal_hostent(gethostbyaddr(ureq.reqbuf, ureq.key_len,
1332                         (ureq.type == GETHOSTBYADDR ? AF_INET : AF_INET6)
1333                 ));
1334         } else if (ureq.type == GETPWBYNAME) {
1335                 resp = marshal_passwd(getpwnam(ureq.reqbuf));
1336         } else if (ureq.type == GETPWBYUID) {
1337                 resp = marshal_passwd(getpwuid(atoi(ureq.reqbuf)));
1338         } else if (ureq.type == GETGRBYNAME) {
1339                 struct group *gr = getgrnam(ureq.reqbuf);
1340                 resp = marshal_group(gr);
1341         } else if (ureq.type == GETGRBYGID) {
1342                 struct group *gr = getgrgid(atoi(ureq.reqbuf));
1343                 resp = marshal_group(gr);
1344         } else if (ureq.type == GETAI) {
1345                 resp = obtain_addrinfo(ureq.reqbuf);
1346         } else /*if (ureq.type == INITGROUPS)*/ {
1347                 resp = obtain_initgroups(ureq.reqbuf);
1348         }
1349
1350         if (!((response_header*)resp)->found) {
1351                 /* Parent knows about this special case */
1352                 xfull_write(1, resp, 8);
1353         } else {
1354                 /* Responses can be big (getgrnam("guest") on a big user db),
1355                  * we cannot rely on them being atomic. full_write loops
1356                  * if needed */
1357                 xfull_write(1, resp, ((response_header*)resp)->version_or_size);
1358         }
1359         _exit(0);
1360 }
1361
1362
1363 /*
1364 ** Main loop
1365 */
1366
/* Per-service file watched by check_files(), indexed by service number.
 * Every element has room for "/etc/passwd" (the longest name) and its NUL. */
static const char check_filenames[][sizeof("/etc/passwd")] = {
	[SRV_PASSWD] = "/etc/passwd", /*  "/etc/shadow"? */
	[SRV_GROUP]  = "/etc/group",
	[SRV_HOSTS]  = "/etc/hosts", /* "/etc/resolv.conf" "/etc/nsswitch.conf"? */
};

/* Last seen stat() result for each file in check_filenames[] */
static struct stat check_statbuf[sizeof(check_filenames) / sizeof(check_filenames[0])];
1374
1375 static void check_files(int srv)
1376 {
1377         const char *file = check_filenames[srv];
1378         struct stat *sb = &check_statbuf[srv];
1379         struct stat tempbuf;
1380         
1381         memset(&tempbuf, 0, sizeof(tempbuf));
1382         stat(file, &tempbuf); /* ignore errors */
1383         tempbuf.st_atime = 0; /* this is not a change */
1384         if (memcmp(sb, &tempbuf, sizeof(tempbuf)) != 0) {
1385                 log(L_INFO, "detected change in %s", file);
1386                 memcpy(sb, &tempbuf, sizeof(tempbuf));
1387                 age_cache(0, srv); /* frees entire cache */
1388         }
1389 }
1390
1391 /* Returns 1 if we immediately have the answer */
1392 static int handle_client(int i)
1393 {
1394         int srv;
1395         user_req *ureq = cinfo[i].ureq;
1396         user_req **cell;
1397         user_req *ureq_and_resp;
1398
1399         log(L_DEBUG, "version:%d type:%d(%s) key_len:%d '%s'",
1400                         ureq->version, ureq->type, typestr[ureq->type],
1401                         ureq->key_len, req_str(ureq->type, ureq->reqbuf));
1402         hex_dump(cinfo[i].ureq, cinfo[i].bytecnt);
1403
1404         if (unsupported_ureq_type(ureq->type)) {
1405                 /* We don't know this request. Just close the connection */
1406                 /* (glibc client interprets this like "not supported by this nscd") */
1407                 log(L_INFO, "unsupported query, dropping");
1408                 close_client(i);
1409                 return 0;
1410         }
1411         srv = type_to_srv[ureq->type];
1412         if (!config.srv_enable[srv]) {
1413                 log(L_INFO, "service %d is disabled, dropping", srv);
1414                 close_client(i);
1415                 return 0;
1416         }
1417
1418         if (cinfo[i].bytecnt < USER_HDR_SIZE + ureq->key_len) {
1419                 log(L_INFO, "read %d, need %d more to read",
1420                         cinfo[i].bytecnt, USER_HDR_SIZE + ureq->key_len);
1421                 return 0; /* more to read */
1422         }
1423         if (cinfo[i].bytecnt > USER_HDR_SIZE + ureq->key_len) {
1424                 log(L_INFO, "read overflow");
1425                 close_client(i);
1426                 return 0;
1427         }
1428         if (ureq->version != NSCD_VERSION) {
1429                 log(L_INFO, "wrong version");
1430                 close_client(i);
1431                 return 0;
1432         }
1433         if (ureq->type != GETHOSTBYADDR
1434          && ureq->type != GETHOSTBYADDRv6
1435         ) {
1436                 if (ureq->key_len && ureq->reqbuf[ureq->key_len - 1] != '\0') {
1437                         log(L_INFO, "badly terminated buffer");
1438                         close_client(i);
1439                         return 0;
1440                 }
1441         }
1442
1443         if (config.check_files[srv]) {
1444                 check_files(srv);
1445         }
1446
1447         /* If in cache, save ptr to response into cinfo and return */
1448         ureq_and_resp = find_cell_and_response(&cell, ureq);
1449         if (ureq_and_resp) {
1450                 response_header *resp = ureq_response(ureq_and_resp);
1451                 unsigned sz = resp->version_or_size;
1452                 log(L_DEBUG, "sz:%u", sz);
1453                 hex_dump(resp, sz);
1454                 ureq_and_resp->refcount = 1; /* cache shouldn't free it under us! */
1455                 pfd[i].events = POLLOUT; /* we want to write out */
1456                 cinfo[i].resptr = ureq_and_resp;
1457                 cinfo[i].respos = 0;
1458                 //cinfo[i].resp_sz = sz;
1459                 return 1;
1460         }
1461
1462         /* Start worker thread */
1463         cinfo[i].cell = cell;
1464         /* Now we will wait on worker's fd, not client's! */
1465         cinfo[i].client_fd = pfd[i].fd;
1466         pfd[i].fd = create_and_feed_worker(ureq);
1467
1468 /* We can do it here, but we don't really need to.
1469  * We need to have client_buf[] big enough anyway for worst case scenario,
1470  * so we can simply keep cbuf allocated until we close a client.
1471         cinfo[i].ureq = NULL;
1472         busy_cbuf[cinfo[i].bufidx] = 0;
1473  */
1474         return 0;
1475 }
1476
1477 /* When we return, reply is fully read and stored in cache,
1478  * worker's fd is closed, pfd[i] and cinfo[i] are updated. */
1479 static void handle_worker_response(int i, unsigned now_ms)
1480 {
1481         response_header sz_and_found;
1482         user_req *cached;
1483         user_req *ureq = cinfo[i].ureq;
1484         response_header *resp;
1485         unsigned resp_sz;
1486         unsigned ureq_sz_aligned = (char*)ureq_response(ureq) - (char*)ureq;
1487
1488         /* Replies can be big (getgrnam("guest") on a big user db),
1489          * we cannot rely on them being atomic. However, we know that worker
1490          * _always_ gives reply in one full_write(), so loop and read it all
1491          * (looping is implemented inside full_read()) */
1492         resp_sz = full_read(pfd[i].fd, &sz_and_found, 8);
1493         if (resp_sz != 8) {
1494                 /* worker was killed? */
1495                 log(L_DEBUG, "worker gave short reply:%u != 8", resp_sz);
1496                 goto err;
1497         }
1498
1499         resp_sz = sz_and_found.version_or_size;
1500         if (resp_sz < 8 || resp_sz > 0xfffffff) { /* 256 mb */
1501                 error("BUG: bad size from worker:%u", resp_sz);
1502                 goto err;
1503         }
1504
1505         /* Create new block of cached info */
1506         cached = xzalloc(ureq_sz_aligned + resp_sz);
1507         resp = (void*) ((char*)cached + ureq_sz_aligned);
1508         memcpy(cached, ureq, ureq_size(ureq));
1509         resp->version_or_size = resp_sz;
1510         resp->found = sz_and_found.found;
1511         if (sz_and_found.found) {
1512                 /* We need to read data only if it's found
1513                  * (otherwise worker sends only 8 bytes) */
1514                 if (full_read(pfd[i].fd, resp->body, resp_sz - 8) != resp_sz - 8) {
1515                         /* worker was killed? */
1516                         log(L_DEBUG, "worker gave short reply");
1517                         free(cached);
1518  err:
1519                         cached = NULL;
1520                         goto wo;
1521                 }
1522         }
1523         hex_dump(resp, resp_sz);
1524         /* save in cache */
1525         cached->refcount = 1; /* cache shouldn't free it under us! */
1526         set_cache_timestamp(cached, now_ms);
1527         save_in_cell(cinfo[i].cell, cached, now_ms);
1528
1529  wo:
1530         close(pfd[i].fd);
1531
1532         /* schedule for writeout */
1533         pfd[i].fd = cinfo[i].client_fd;
1534         cinfo[i].client_fd = 0; /* no, we don't wait for worker reply anymore */
1535         pfd[i].events = POLLOUT;
1536         /* pfd[i].revents = 0; - not needed? */
1537
1538         /* writeout position etc */
1539         cinfo[i].resptr = cached;
1540         cinfo[i].respos = 0;
1541         //cinfo[i].resp_sz = resp_sz;
1542         /* if worker took some time to get info (e.g. DNS query),
1543          * prevent client timeout from triggering at once */
1544         cinfo[i].started_ms = now_ms;
1545 }
1546
1547 static void main_loop(void)
1548 {
1549         /* 1/2 of smallest negative TTL */
1550         min_aging_interval_ms = config.nttl[0];
1551         if (min_aging_interval_ms > config.nttl[1]) min_aging_interval_ms = config.nttl[1];
1552         if (min_aging_interval_ms > config.nttl[2]) min_aging_interval_ms = config.nttl[2];
1553         min_aging_interval_ms = (min_aging_interval_ms / 2) | 1;
1554         aging_interval_ms = min_aging_interval_ms;
1555
1556         while (1) {
1557                 int i, j;
1558                 int r;
1559                 unsigned now_ms;
1560
1561                 r = SMALL_POLL_TIMEOUT_MS;
1562                 if (num_clients <= 2 && !cached_cnt)
1563                         r = -1; /* infinite */
1564                 else if (num_clients < max_reqnum)
1565                         r = aging_interval_ms;
1566
1567 #if 0 /* Debug: leak detector */
1568                 {
1569                         static unsigned long long cnt;
1570                         void *p = malloc(240); /* should not be too small */
1571                         void *s = sbrk(0);
1572                         free(p);
1573                         log(L_INFO, "entering poll %llu (%d ms). num_clients:%u cached:%u %u/%u next malloc:%p, sbrk:%p",
1574                                 cnt, r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt, p, s);
1575                         cnt++;
1576                 }
1577 #else
1578                 log(L_DEBUG, "entering poll (%d ms). num_clients:%u cached:%u hit_ratio:%u/%u",
1579                                 r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt);
1580 #endif
1581
1582                 r = poll(pfd, num_clients, r);
1583                 log(L_DEBUG2, "poll returns %d", r);
1584                 if (r < 0) {
1585                         if (errno != EINTR)
1586                                 perror_and_die("poll");
1587                         continue;
1588                 }
1589
1590                 /* Everything between polls never sleeps.
1591                  * There is no blocking I/O (except when we talk to worker thread
1592                  * which is guaranteed to not block us for long) */
1593
1594                 now_ms = monotonic_ms();
1595                 if (r == 0)
1596                         goto skip_fd_checks;
1597
1598                 for (i = 0; i < 2; i++) {
1599                         int cfd;
1600                         if (!pfd[i].revents)
1601                                 continue;
1602                         /* pfd[i].revents = 0; - not needed */
1603                         cfd = accept(pfd[i].fd, NULL, NULL);
1604                         if (cfd < 0) {
1605                                 /* odd... poll() says we can accept but accept failed? */
1606                                 log(L_DEBUG2, "accept failed with %s", strerror(errno));
1607                                 continue;
1608                         }
1609                         ndelay_on(cfd);
1610                         close_on_exec(cfd);
1611                         /* x[num_clients] is next free element, taking it */
1612                         pfd[num_clients].fd = cfd;
1613                         pfd[num_clients].events = POLLIN;
1614                         /* this will make us do read() in next for() loop: */
1615                         pfd[num_clients].revents = POLLIN;
1616                         memset(&cinfo[num_clients], 0, sizeof(cinfo[num_clients]));
1617                         /* cinfo[num_clients].bytecnt = 0; - done */
1618                         cinfo[num_clients].started_ms = now_ms;
1619                         cinfo[num_clients].bufidx = alloc_buf_no();
1620                         cinfo[num_clients].ureq = bufno2buf(cinfo[num_clients].bufidx);
1621                         num_clients++;
1622                         if (num_clients >= max_reqnum) {
1623                                 /* stop accepting new connects for now */
1624                                 pfd[0].events = pfd[0].revents = 0;
1625                                 pfd[1].events = pfd[1].revents = 0;
1626                         }
1627                 }
1628                 for (; i < num_clients; i++) {
1629                         if (!pfd[i].revents)
1630                                 continue;
1631                         log(L_DEBUG2, "pfd[%d].revents:0x%x", i, pfd[i].revents);
1632                         /* pfd[i].revents = 0; - not needed */
1633
1634                         /* "Write out result" case */
1635                         if (pfd[i].revents == POLLOUT) {
1636                                 response_header *resp;
1637                                 uint32_t resp_sz;
1638                                 if (!cinfo[i].resptr) {
1639                                         /* corner case: worker gave bad response earlier */
1640                                         close_client(i);
1641                                         continue;
1642                                 }
1643  write_out:
1644                                 resp = ureq_response(cinfo[i].resptr);
1645                                 resp_sz = resp->version_or_size;
1646                                 resp->version_or_size = NSCD_VERSION;
1647                                 r = safe_write(pfd[i].fd, resp + cinfo[i].respos, resp_sz - cinfo[i].respos);
1648                                 resp->version_or_size = resp_sz;
1649
1650                                 if (r < 0 && errno == EAGAIN)
1651                                         continue;
1652                                 if (r <= 0) { /* client isn't there anymore */
1653  write_out_is_done:
1654                                         free_refcounted_ureq(&cinfo[i].resptr);
1655                                         close_client(i);
1656                                         continue;
1657                                 }
1658                                 cinfo[i].respos += r;
1659                                 if (cinfo[i].respos >= resp_sz) {
1660                                         /* We wrote everything */
1661                                         /* No point in trying to get next request, it won't come.
1662                                          * glibc 2.4 client closes its end after each request,
1663                                          * without testing for EOF from server. strace:
1664                                          * ...
1665                                          * read(3, "www.google.com\0\0", 16) = 16
1666                                          * close(3) = 0
1667                                          */
1668                                         goto write_out_is_done;
1669                                 }
1670                         }
1671
1672                         /* "Read reply from worker" case. Worker may be
1673                          * already dead, revents may contain other bits too */
1674                         if ((pfd[i].revents & POLLIN) && cinfo[i].client_fd) {
1675                                 log(L_DEBUG, "reading response for client %u", i);
1676                                 handle_worker_response(i, now_ms);
1677                                 /* We can immediately try to write a response
1678                                  * to client */
1679                                 goto write_out;
1680                         }
1681
1682                         /* All strange and unexpected cases */
1683                         if (pfd[i].revents != POLLIN) {
1684                                 /* Not just "can read" - prolly POLLHUP too */
1685                                 log(L_INFO, "client %u revents is strange:%x", i, pfd[i].revents);
1686                                 close_client(i);
1687                                 continue;
1688                         }
1689
1690                         /* "Read request from client" case */
1691                         r = safe_read(pfd[i].fd, (char*)(cinfo[i].ureq) + cinfo[i].bytecnt, MAX_USER_REQ_SIZE - cinfo[i].bytecnt);
1692                         if (r < 0) {
1693                                 log(L_DEBUG2, "error reading from client: %s", strerror(errno));
1694                                 if (errno == EAGAIN)
1695                                         continue;
1696                                 close_client(i);
1697                                 continue;
1698                         }
1699                         if (r == 0) {
1700                                 log(L_INFO, "premature EOF from client, dropping");
1701                                 close_client(i);
1702                                 continue;
1703                         }
1704                         cinfo[i].bytecnt += r;
1705                         if (cinfo[i].bytecnt >= sizeof(user_req_header)) {
1706                                 if (cinfo[i].ureq->type == SHUTDOWN
1707                                  || cinfo[i].ureq->type == INVALIDATE
1708                                 ) {
1709                                         const char *service;
1710                                         unsigned len;
1711 #ifdef SO_PEERCRED
1712                                         struct ucred caller;
1713                                         socklen_t optlen = sizeof(caller);
1714                                         if (getsockopt(pfd[i].fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0) {
1715                                                 log(L_INFO, "ignoring special request - cannot get caller's id: %s", strerror(errno));
1716                                                 close_client(i);
1717                                                 continue;
1718                                         }
1719                                         if (caller.uid != 0) {
1720                                                 log(L_INFO, "special request from non-root - ignoring");
1721                                                 close_client(i);
1722                                                 continue;
1723                                         }
1724 #endif
1725                                         if (cinfo[i].ureq->type == SHUTDOWN) {
1726                                                 log(L_INFO, "got shutdown request, exiting");
1727                                                 return; /* exits nscd */;
1728                                         }
1729                                         len = cinfo[i].ureq->key_len;
1730                                         service = (char*)&cinfo[i].ureq + len;
1731                                         if (sizeof(user_req_header) + len != cinfo[i].bytecnt
1732                                          || !len
1733                                          || service[len-1] != '\0'
1734                                         ) {
1735                                                 log(L_INFO, "malformed invalidate request - ignoring");
1736                                                 close_client(i);
1737                                                 continue;
1738                                         }
1739                                         log(L_INFO, "got invalidate request, flushing cache");
1740                                         age_cache(0, -1); /* frees entire cache. TODO: replace -1 with service */
1741                                         close_client(i);
1742                                         continue;
1743                                 }
1744                                 if (handle_client(i)) {
1745                                         /* Response is found in cache! */
1746                                         goto write_out;
1747                                 }
1748                         }
1749                 } /* for each client[2..num_clients-1] */
1750
1751  skip_fd_checks:
1752                 /* Age cache */
1753                 if ((now_ms - last_age_time) >= aging_interval_ms) {
1754                         last_age_time = now_ms;
1755                         age_cache(now_ms, -1);
1756                 }
1757
1758                 /* Close timed out client connections */
1759                 for (i = 2; i < num_clients; i++) {
1760                         if (pfd[i].fd && !cinfo[i].client_fd
1761                          && (now_ms - cinfo[i].started_ms) > CLIENT_TIMEOUT_MS
1762                         ) {
1763                                 log(L_INFO, "timed out waiting for client %u, dropping", i);
1764                                 close_client(i);
1765                         }
1766                 }
1767
1768                 if (!cnt_closed)
1769                         continue;
1770
1771                 /* We closed at least one client, coalesce pfd[], cinfo[] */
1772                 if (min_closed + cnt_closed >= num_clients) {
1773                         /* clients [min_closed..num_clients-1] are all closed */
1774                         /* log(L_DEBUG, "taking shortcut"); - almost always happens */
1775                         goto coalesce_done;
1776                 }
1777                 j = min_closed;
1778                 i = min_closed + 1;
1779                 while (i < num_clients) {
1780                         while (1) {
1781                                 if (pfd[i].fd)
1782                                         break;
1783                                 if (++i >= num_clients)
1784                                         goto coalesce_done;
1785                         }
1786                         pfd[j] = pfd[i];
1787                         cinfo[j++] = cinfo[i++];
1788                 }
1789
1790  coalesce_done:
1791                 num_clients -= cnt_closed;
1792                 log(L_DEBUG, "removing %d closed clients. num_clients:%d", cnt_closed, num_clients);
1793                 min_closed = INT_MAX;
1794                 cnt_closed = 0;
1795                 /* start accepting new connects */
1796                 pfd[0].events = POLLIN;
1797                 pfd[1].events = POLLIN;
1798         } /* while (1) */
1799 }
1800
1801
1802 /*
1803 ** Initialization
1804 */
1805
/* Runtime files. The pidfile and the primary socket live inside NSCD_DIR;
 * NSCD_SOCKET_OLD is a second socket path outside of that directory. */
#define NSCD_DIR        "/var/run/nscd"
#define NSCD_PIDFILE    NSCD_DIR "/nscd.pid"
#define NSCD_SOCKET     NSCD_DIR "/socket"
#define NSCD_SOCKET_OLD "/var/run/.nscd_socket"
1810
1811 static smallint wrote_pidfile;
1812
1813 static void cleanup_on_signal(int sig)
1814 {
1815         if (wrote_pidfile)
1816                 unlink(NSCD_PIDFILE);
1817         unlink(NSCD_SOCKET_OLD);
1818         unlink(NSCD_SOCKET);
1819         exit(0);
1820 }
1821
1822 static void write_pid(void)
1823 {
1824         FILE *pid = fopen(NSCD_PIDFILE, "w");
1825         if (!pid)
1826                 return;
1827         fprintf(pid, "%d\n", getpid());
1828         fclose(pid);
1829         wrote_pidfile = 1;
1830 }
1831
1832 /* Open a listening nscd server socket */
1833 static int open_socket(const char *name)
1834 {
1835         struct sockaddr_un sun;
1836         int sock = socket(AF_UNIX, SOCK_STREAM, 0);
1837         if (sock < 0)
1838                 perror_and_die("cannot create unix domain socket");
1839         ndelay_on(sock);
1840         close_on_exec(sock);
1841         sun.sun_family = AF_UNIX;
1842         strcpy(sun.sun_path, name);
1843         unlink(name);
1844         if (bind(sock, (struct sockaddr *) &sun, sizeof(sun)) < 0)
1845                 perror_and_die("bind(%s)", name);
1846         if (chmod(name, 0666) < 0)
1847                 perror_and_die("chmod(%s)", name);
1848         if (listen(sock, (max_reqnum/8) | 1) < 0)
1849                 perror_and_die("listen");
1850         return sock;
1851 }
1852
/* Long command line options for getopt_long().
 * Order matters: print_help_and_die() walks this table from the top,
 * in step with help[], and stops at the first entry whose val is '?'. */
static const struct option longopt[] = {
	{ .name = "debug",       .has_arg = no_argument,       .flag = NULL, .val = 'd' },
	{ .name = "config-file", .has_arg = required_argument, .flag = NULL, .val = 'f' },
	{ .name = "invalidate",  .has_arg = required_argument, .flag = NULL, .val = 'i' },
	{ .name = "shutdown",    .has_arg = no_argument,       .flag = NULL, .val = 'K' },
	{ .name = "nthreads",    .has_arg = required_argument, .flag = NULL, .val = 't' },
	{ .name = "version",     .has_arg = no_argument,       .flag = NULL, .val = 'V' },
	{ .name = "help",        .has_arg = no_argument,       .flag = NULL, .val = '?' },
	{ .name = "usage",       .has_arg = no_argument,       .flag = NULL, .val = '?' },
	/* just exit(0). TODO: "test" connect? */
	{ .name = "statistic",   .has_arg = no_argument,       .flag = NULL, .val = 'g' },
	{ .name = "secure",      .has_arg = no_argument,       .flag = NULL, .val = 'S' }, /* ? */
	/* all-zero entry terminates the table */
	{ .name = NULL,          .has_arg = 0,                 .flag = NULL, .val = 0   }
};
1868
/* One-line descriptions shown by print_help_and_die().
 * Entry N describes longopt[N]; only the options listed before
 * the first '?' entry of longopt[] have a description. */
static const char *const help[] = {
	[0] = "Do not daemonize; log to stderr",  /* -d */
	[1] = "File to read configuration from",  /* -f */
	[2] = "Invalidate cache",                 /* -i */
	[3] = "Shut the server down",             /* -K */
	[4] = "Serve N requests in parallel",     /* -t */
	[5] = "Version",                          /* -V */
};
1877
1878 static void print_help_and_die(void)
1879 {
1880         const struct option *opt = longopt;
1881         const char *const *h = help;
1882
1883         puts("Usage: nscd [OPTION...]\n"
1884              "Name Service Cache Daemon\n");
1885         do {
1886                 printf("\t" "-%c,--%-11s %s\n", opt->val, opt->name, *h);
1887                 h++;
1888                 opt++;
1889         } while (opt->val != '?');
1890         exit(1);
1891 }
1892
1893 static char *skip_service(int *srv, const char *s)
1894 {
1895         if (strcmp("passwd", s) == 0) {
1896                 *srv = SRV_PASSWD;
1897                 s++;
1898         } else if (strcmp("group", s) == 0) {
1899                 *srv = SRV_GROUP;
1900         } else if (strcmp("hosts", s) == 0) {
1901                 *srv = SRV_HOSTS;
1902         } else {
1903                 return NULL;
1904         }
1905         return skip_whitespace(s + 6);
1906 }
1907
/* Config handler for directives which are accepted but ignored */
static void handle_null(const char *str, int srv)
{
	(void)str;
	(void)srv;
}
1909
1910 static void handle_logfile(const char *str, int srv)
1911 {
1912         config.logfile = xstrdup(str);
1913 }
1914
1915 static void handle_debuglvl(const char *str, int srv)
1916 {
1917         debug |= getnum(str);
1918 }
1919
1920 static void handle_threads(const char *str, int srv)
1921 {
1922         unsigned n = getnum(str);
1923         if (max_reqnum < n)
1924                 max_reqnum = n;
1925 }
1926
1927 static void handle_user(const char *str, int srv)
1928 {
1929         config.user = xstrdup(str);
1930 }
1931
1932 static void handle_enable(const char *str, int srv)
1933 {
1934         config.srv_enable[srv] = ((str[0] | 0x20) == 'y');
1935 }
1936
1937 static void handle_pttl(const char *str, int srv)
1938 {
1939         config.pttl[srv] = getnum(str);
1940 }
1941
1942 static void handle_nttl(const char *str, int srv)
1943 {
1944         config.nttl[srv] = getnum(str);
1945 }
1946
1947 static void handle_size(const char *str, int srv)
1948 {
1949         config.size[srv] = getnum(str);
1950 }
1951
1952 static void handle_chfiles(const char *str, int srv)
1953 {
1954         config.check_files[srv] = ((str[0] | 0x20) == 'y');
1955 }
1956
1957 static void parse_conffile(const char *conffile, int warn)
1958 {
1959         static const struct confword {
1960                 const char *str;
1961                 void (*handler)(const char *, int);
1962         } conf_words[] = {
1963                 { "_" "logfile"               , handle_logfile  },
1964                 { "_" "debug-level"           , handle_debuglvl },
1965                 { "_" "threads"               , handle_threads  },
1966                 { "_" "max-threads"           , handle_threads  },
1967                 { "_" "server-user"           , handle_user     },
1968                 /* ignore: any user can stat */
1969                 { "_" "stat-user"             , handle_null     },
1970                 { "_" "paranoia"              , handle_null     }, /* ? */
1971                 /* ignore: design goal is to never crash/hang */
1972                 { "_" "reload-count"          , handle_null     },
1973                 { "_" "restart-interval"      , handle_null     },
1974                 { "S" "enable-cache"          , handle_enable   },
1975                 { "S" "positive-time-to-live" , handle_pttl     },
1976                 { "S" "negative-time-to-live" , handle_nttl     },
1977                 { "S" "suggested-size"        , handle_size     },
1978                 { "S" "check-files"           , handle_chfiles  },
1979                 { "S" "persistent"            , handle_null     }, /* ? */
1980                 { "S" "shared"                , handle_null     }, /* ? */
1981                 { "S" "auto-propagate"        , handle_null     }, /* ? */
1982                 { }
1983         };
1984
1985         char buf[128];
1986         FILE *file = fopen(conffile, "r");
1987         int lineno = 0;
1988
1989         if (!file) {
1990                 if (conffile != default_conffile)
1991                         perror_and_die("cannot open %s", conffile);
1992                 return;
1993         }
1994
1995         while (fgets(buf, sizeof(buf), file) != NULL) {
1996                 const struct confword *word;
1997                 char *p;
1998                 int len = strlen(buf);
1999
2000                 lineno++;
2001                 if (len) {
2002                         if (buf[len-1] != '\n') {
2003                                 if (len >= sizeof(buf) - 1)
2004                                         error_and_die("%s:%d: line is too long", conffile, lineno);
2005                                 len++; /* last line, not terminated by '\n' */
2006                         }
2007                         buf[len-1] = '\0';
2008                 }
2009                 p = strchr(buf, '#');
2010                 if (p)
2011                         *p = '\0';
2012
2013                 p = skip_whitespace(buf);
2014                 if (!*p)
2015                         continue;
2016                 *skip_non_whitespace(p) = '\0';
2017                 word = conf_words;
2018                 while (1) {
2019                         if (strcmp(word->str + 1, p) == 0) {
2020                                 int srv;
2021                                 p = skip_whitespace(p + strlen(p) + 1);
2022                                 *skip_non_whitespace(p) = '\0';
2023                                 if (word->str[0] == 'S') {
2024                                         char *p2 = skip_service(&srv, p);
2025                                         if (!p2) {
2026                                                 if (warn)
2027                                                         error("%s:%d: ignoring unknown service name '%s'", conffile, lineno, p);
2028                                                 break;
2029                                         }
2030                                         p = p2;
2031                                         *skip_non_whitespace(p) = '\0';
2032                                 }
2033                                 word->handler(p, srv);
2034                                 break;
2035                         }
2036                         word++;
2037                         if (!word->str) {
2038                                 if (warn)
2039                                         error("%s:%d: ignoring unknown directive '%s'", conffile, lineno, p);
2040                                 break;
2041                         }
2042                 }
2043         }
2044         fclose(file);
2045 }
2046
2047
/* "XX,XX[,XX]..." -> gid_t[]
 *
 * Parse a comma-separated list of hexadecimal numbers, as produced by
 * user_to_env_U() (without the "U=" prefix).
 *
 * str:   the list.
 * sizep: receives the number of values parsed (always >= 1).
 * Returns a freshly xmalloc'ed array holding the values in order;
 * the caller frees it.
 * Any malformed input - stray characters, an empty field, a conversion
 * error - is fatal: an empty field must not silently turn into id 0.
 */
static gid_t* env_U_to_uid_and_gids(const char *str, int *sizep)
{
	const char *sp;
	char *end;
	gid_t *ug;
	int ng;

	/* N commas separate N+1 values */
	ng = 1;
	for (sp = str; *sp; sp++)
		if (*sp == ',')
			ng++;
	ug = xmalloc(ng * sizeof(ug[0]));

	ng = 0;
	sp = str;
	while (1) {
		errno = 0;
		ug[ng++] = strtoul(sp, &end, 16);
		/* end == sp: strtoul found no digits at all */
		if (errno || end == sp || (*end != ',' && *end != '\0'))
			error_and_die("internal error");
		if (*end == '\0')
			break;
		sp = end + 1; /* step over ',' */
	}

	*sizep = ng;
	return ug;
}
2079
2080
/* Build the environment string "U=uid,gid[,gid]..." for the given user.
 *
 * Looks the user up with getpwnam() and fetches its group list with
 * getgrouplist(); all numbers are written in hexadecimal.
 * Returns a freshly xmalloc'ed string in "NAME=value" form.
 * An unknown user or a failing group lookup is fatal.
 */
static char* user_to_env_U(const char *user)
{
	struct passwd *pw;
	gid_t *ids;
	char *env_str, *out;
	int ngroups, total, i;

	pw = getpwnam(user);
	if (!pw)
		perror_and_die("user '%s' is not known", user);

	/* ids[0] is reserved for the uid, groups start at ids[1] */
	ngroups = 64;
	ids = xmalloc((1 + ngroups) * sizeof(ids[0]));
	if (getgrouplist(user, pw->pw_gid, &ids[1], &ngroups) < 0) {
		/* Initial guess was too small. getgrouplist() has stored
		 * the required count in ngroups: resize and retry once */
		ids = xrealloc(ids, (1 + ngroups) * sizeof(ids[0]));
		if (getgrouplist(user, pw->pw_gid, &ids[1], &ngroups) < 0)
			perror_and_die("can't get groups of user '%s'", user);
	}
	ids[0] = pw->pw_uid;
	total = ngroups + 1; /* uid + groups */

	/* Room for "U=", and for each value: hex digits plus a separator */
	env_str = xmalloc((sizeof(unsigned long)+1)*2 * total + 3);
	out = env_str;
	*out++ = 'U';
	*out++ = '=';
	for (i = 0; i < total; i++)
		out += sprintf(out, "%lx,", (unsigned long)ids[i]);
	/* the trailing comma becomes the string terminator */
	out[-1] = '\0';

	free(ids);
	return env_str;
}
2117
2118
2119 /* not static - don't inline me, compiler! */
2120 void readlink_self_exe(void)
2121 {
2122         char buf[PATH_MAX + 1];
2123         ssize_t sz = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
2124         if (sz < 0)
2125                 perror_and_die("readlink %s failed", "/proc/self/exe");
2126         buf[sz] = 0;
2127         self_exe_points_to = xstrdup(buf);
2128 }
2129
2130
2131 static void special_op(const char *arg) NORETURN;
2132 static void special_op(const char *arg)
2133 {
2134         static const user_req_header ureq = { NSCD_VERSION, SHUTDOWN, 0 };
2135
2136         struct sockaddr_un addr;
2137         int sock;
2138
2139         sock = socket(PF_UNIX, SOCK_STREAM, 0);
2140         if (sock < 0)
2141                 error_and_die("cannot create AF_UNIX socket");
2142
2143         addr.sun_family = AF_UNIX;
2144         strcpy(addr.sun_path, NSCD_SOCKET);
2145         if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0)
2146                 error_and_die("cannot connect to %s", NSCD_SOCKET);
2147
2148         if (!arg) { /* shutdown */
2149                 xfull_write(sock, &ureq, sizeof(ureq));
2150                 dup2(2, 1);
2151                 error_and_die("sent shutdown request, exiting");
2152         } else { /* invalidate */
2153                 size_t arg_len = strlen(arg) + 1;
2154                 struct {
2155                         user_req_header req;
2156                         char arg[arg_len];
2157                 } reqdata;
2158                 reqdata.req.version = NSCD_VERSION;
2159                 reqdata.req.type = INVALIDATE;
2160                 reqdata.req.key_len = arg_len;
2161                 memcpy(reqdata.arg, arg, arg_len);
2162                 xfull_write(sock, &reqdata, arg_len + sizeof(ureq));
2163                 dup2(2, 1);
2164                 error_and_die("sent invalidate(%s) request, exiting", arg);
2165         }
2166 }
2167
2168
2169 /* This internal glibc function is called to disable trying to contact nscd.
2170  * We _are_ nscd, so we need to do the lookups, and not recurse. */
2171 void __nss_disable_nscd(void);
2172
2173 int main(int argc, char **argv)
2174 {
2175         int n;
2176         const char *env_U;
2177         const char *conffile;
2178
2179         /* make sure we don't get recursive calls */
2180         __nss_disable_nscd();
2181
2182         if (argv[0][0] == 'w') /* "worker_nscd" */
2183                 worker();
2184
2185         setlinebuf(stdout);
2186         setlinebuf(stderr);
2187
2188         /* For idiotic kernels which disallow "exec /proc/self/exe" */
2189         readlink_self_exe();
2190
2191         conffile = default_conffile;
2192         while ((n = getopt_long(argc, argv, "df:i:KVgt:", longopt, NULL)) != -1) {
2193                 switch (n) {
2194                 case 'd':
2195                         debug &= ~D_DAEMON;
2196                         break;
2197                 case 'f':
2198                         conffile = optarg;
2199                         break;
2200                 case 'i':
2201                         /* invalidate */
2202                         special_op(optarg); /* exits */
2203                 case 'K':
2204                         /* shutdown server */
2205                         special_op(NULL); /* exits */
2206                 case 'V':
2207                         puts("unscd - nscd which does not hang, v."PROGRAM_VERSION);
2208                         exit(0);
2209                 case 'g':
2210                         exit(0);
2211                 case 't':
2212                         /* N threads */
2213                         max_reqnum = getnum(optarg);
2214                         break;
2215                 case 'S':
2216                         /* secure (?) */
2217                         break;
2218                 default:
2219                         print_help_and_die();
2220                 }
2221         }
2222
2223         env_U = getenv("U");
2224         /* Avoid duplicate warnings if $U exists */
2225         parse_conffile(conffile, /* warn? */ (env_U == NULL));
2226
2227         /* I have a user report of (broken?) ldap nss library
2228          * opening and never closing a socket to a ldap server,
2229          * even across fork() and exec(). This messes up
2230          * worker child's operations for the reporter.
2231          *
2232          * This strenghtens my belief that nscd _must not_ trust
2233          * nss libs to be written correctly.
2234          *
2235          * Here, we need to jump through the hoops to guard against
2236          * such problems. If config file has server-user setting, we need
2237          * to setgroups + setuid. For that, we need to get uid and gid vector.
2238          * And that means possibly using buggy nss libs.
2239          * We will do it here, but then we will re-exec, passing uid+gids
2240          * in an environment variable.
2241          */
2242         if (!env_U && config.user) {
2243                 /* user_to_env_U() does getpwnam and getgrouplist */
2244                 if (putenv(user_to_env_U(config.user)))
2245                         error_and_die("out of memory");
2246                 /* fds leaked by nss will be closed by execed copy */
2247                 execv("/proc/self/exe", argv);
2248                 xexecve(self_exe_points_to, argv, environ);
2249         }
2250
2251         /* Allocate dynamically sized stuff */
2252         max_reqnum += 2; /* account for 2 first "fake" clients */
2253         if (max_reqnum < 8) max_reqnum = 8; /* sanitize */
2254         if (max_reqnum > 0xffff) max_reqnum = 0xffff;
2255         log(L_DEBUG, "will handle %u requests in parallel", max_reqnum - 2);
2256         client_buf = xzalloc(max_reqnum * sizeof(client_buf[0]));
2257         busy_cbuf  = xzalloc(max_reqnum * sizeof(busy_cbuf[0]));
2258         pfd        = xzalloc(max_reqnum * sizeof(pfd[0]));
2259         cinfo      = xzalloc(max_reqnum * sizeof(cinfo[0]));
2260
2261         cache_size = (config.size[0] + config.size[1] + config.size[2]) / 8;
2262         if (cache_size < 64) cache_size = 64; /* 8*64 = 512 entries min */
2263         if (cache_size > 0xffff) cache_size = 0xffff; /* 8*64k entries max */
2264         cache_size |= 1; /* force it to be odd */
2265         log(L_DEBUG, "cache size %u x 8 entries", cache_size);
2266         cache = xzalloc(cache_size * sizeof(cache[0]));
2267
2268         /* Make sure stdio is not closed */
2269         n = xopen3("/dev/null", O_RDWR, 0);
2270         while (n < 2)
2271                 n = dup(n);
2272         /* Close unexpected open file descriptors */
2273         n |= 0xff; /* start from at least fd# 255 */
2274         do {
2275                 close(n--);
2276         } while (n > 2);
2277
2278         /* Register cleanup hooks */
2279         signal(SIGINT, cleanup_on_signal);
2280         signal(SIGTERM, cleanup_on_signal);
2281         /* Don't die if a client closes a socket on us */
2282         signal(SIGPIPE, SIG_IGN);
2283         /* Avoid creating zombies */
2284         signal(SIGCHLD, SIG_IGN);
2285 #if !DEBUG_BUILD
2286         /* Ensure workers don't have SIGALRM ignored */
2287         signal(SIGALRM, SIG_DFL);
2288 #endif
2289
2290         mkdir(NSCD_DIR, 0777);
2291         pfd[0].fd = open_socket(NSCD_SOCKET);
2292         pfd[1].fd = open_socket(NSCD_SOCKET_OLD);
2293         pfd[0].events = POLLIN;
2294         pfd[1].events = POLLIN;
2295         
2296         if (debug & D_DAEMON) {
2297                 daemon(/*nochdir*/ 1, /*noclose*/ 0);
2298                 if (config.logfile) {
2299                         /* nochdir=1: relative paths still work as expected */
2300                         xmovefd(xopen3(config.logfile, O_WRONLY|O_CREAT|O_TRUNC, 0666), 2);
2301                         debug |= D_STAMP;
2302                 } else {
2303                         debug = 0; /* why bother? it's /dev/null'ed anyway */
2304                 }
2305                 chdir("/"); /* compat */
2306                 write_pid();
2307                 setsid();
2308                 /* ignore job control signals */
2309                 signal(SIGTTOU, SIG_IGN);
2310                 signal(SIGTTIN, SIG_IGN);
2311                 signal(SIGTSTP, SIG_IGN);
2312         }
2313
2314         if (env_U) {
2315                 int size;
2316                 gid_t *ug = env_U_to_uid_and_gids(env_U, &size);
2317                 if (setgroups(size, &ug[1]))
2318                         perror_and_die("cannot set groups for user '%s'", config.user);
2319                 if (setuid(ug[0]))
2320                         perror_and_die("cannot set uid to %u", (unsigned)(ug[0]));
2321                 free(ug);
2322         }
2323
2324         log(L_ALL, "nscd v" PROGRAM_VERSION ", debug level %x", debug & L_ALL);
2325         log(L_DEBUG, "passwd cache: %d pttl %u nttl %u",
2326                                 config.srv_enable[SRV_PASSWD],
2327                                 config.pttl[SRV_PASSWD],
2328                                 config.nttl[SRV_PASSWD]);
2329         log(L_DEBUG, " group cache: %d pttl %u nttl %u",
2330                                 config.srv_enable[SRV_GROUP ],
2331                                 config.pttl[SRV_GROUP],
2332                                 config.nttl[SRV_GROUP]);
2333         log(L_DEBUG, " hosts cache: %d pttl %u nttl %u",
2334                                 config.srv_enable[SRV_HOSTS ],
2335                                 config.pttl[SRV_HOSTS],
2336                                 config.nttl[SRV_HOSTS]);
2337
2338         for (n = 0; n < 3; n++) {
2339                 config.pttl[n] *= 1000;
2340                 config.nttl[n] *= 1000;
2341         }
2342
2343         main_loop();
2344
2345         return 0;
2346 }