Source file
src/syscall/exec_linux.go
Documentation: syscall
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "internal/itoa"
11 "runtime"
12 "unsafe"
13 )
14
15
16
// Flag bits for the clone family of system calls. In this file they are
// tested against and ORed into SysProcAttr.Cloneflags and
// SysProcAttr.Unshareflags.
// NOTE(review): the values presumably mirror the kernel's CLONE_* definitions;
// verify against the kernel headers before changing any of them.
17 const (
18 CLONE_VM = 0x00000100
19 CLONE_FS = 0x00000200
20 CLONE_FILES = 0x00000400
21 CLONE_SIGHAND = 0x00000800
22 CLONE_PIDFD = 0x00001000
23 CLONE_PTRACE = 0x00002000
24 CLONE_VFORK = 0x00004000
25 CLONE_PARENT = 0x00008000
26 CLONE_THREAD = 0x00010000
27 CLONE_NEWNS = 0x00020000
28 CLONE_SYSVSEM = 0x00040000
29 CLONE_SETTLS = 0x00080000
30 CLONE_PARENT_SETTID = 0x00100000
31 CLONE_CHILD_CLEARTID = 0x00200000
32 CLONE_DETACHED = 0x00400000
33 CLONE_UNTRACED = 0x00800000
34 CLONE_CHILD_SETTID = 0x01000000
35 CLONE_NEWCGROUP = 0x02000000
36 CLONE_NEWUTS = 0x04000000
37 CLONE_NEWIPC = 0x08000000
38 CLONE_NEWUSER = 0x10000000
39 CLONE_NEWPID = 0x20000000
40 CLONE_NEWNET = 0x40000000
41 CLONE_IO = 0x80000000
42
43 // The next two values need more than 32 bits. In this file CLONE_INTO_CGROUP
44 // is only ever ORed into the 64-bit cloneArgs.flags used for clone3.
45 CLONE_CLEAR_SIGHAND = 0x100000000
46 CLONE_INTO_CGROUP = 0x200000000
47
48 // CLONE_NEWTIME lies in the low byte of the flags word. The non-clone3 path
49 // in forkAndExecInChild1 ORs SIGCHLD into that same word, and requesting
50 // CLONE_NEWTIME makes forkAndExecInChild1 use clone3 instead.
51 CLONE_NEWTIME = 0x00000080
52 )
53
54
55
// SysProcIDMap is one entry of a user or group ID mapping.
// formatIDMappings renders each entry as the line
// "ContainerID HostID Size\n", and the result is written to a uid_map or
// gid_map file under /proc.
56 type SysProcIDMap struct {
57 ContainerID int // first field of the rendered mapping line
58 HostID int // second field of the rendered mapping line
59 Size int // third field of the rendered mapping line
60 }
61
// SysProcAttr holds the Linux-specific options that forkAndExecInChild1
// applies in the child between the clone call and execve.
62 type SysProcAttr struct {
63 Chroot string // not read in this file; NOTE(review): presumably converted by the caller into the chroot argument
64 Credential *Credential // if non-nil, the child calls setgroups (unless suppressed), then setgid, then setuid
65 // Ptrace makes the child issue ptrace(PTRACE_TRACEME) as its last step
66 // before execve.
67
68 Ptrace bool
69 Setsid bool // child calls setsid
70 // Setpgid makes the child call setpgid(0, Pgid).
71
72 Setpgid bool
73 // Setctty makes the child issue ioctl(Ctty, TIOCSCTTY, 1). This runs after
74 // the descriptors from ProcAttr.Files have been moved into place, so Ctty
75 // is interpreted as a descriptor number in the child.
76
77 Setctty bool
78 Noctty bool // child issues ioctl(0, TIOCNOTTY)
79 Ctty int // terminal descriptor used by Setctty and Foreground
80 // Foreground makes the child call setpgid(0, Pgid) and then
81 // ioctl(Ctty, TIOCSPGRP) with Pgid, or with its own pid when Pgid is 0.
82 // This runs before the descriptors from ProcAttr.Files are moved, so Ctty
83 // is interpreted as a descriptor number inherited from the parent.
84
85 Foreground bool
86 Pgid int // process group used by Setpgid and Foreground
87 // Pdeathsig, if non-zero, is passed to prctl(PR_SET_PDEATHSIG). If the
88 // child's parent pid no longer matches the pid recorded before the fork,
89 // the child sends the signal to itself.
90
91 Pdeathsig Signal
92 Cloneflags uintptr // starting value of the flags passed to clone/clone3
93 Unshareflags uintptr // if non-zero, the child calls unshare with these flags
94 UidMappings []SysProcIDMap // if non-nil, written to the child's uid_map
95 GidMappings []SysProcIDMap // if non-nil, written to the child's gid_map
96 // GidMappingsEnableSetgroups selects what is written to the child's
97 // setgroups file before gid_map: "allow" when true, "deny" when false.
98 // It is only consulted when GidMappings is non-nil.
99
100 GidMappingsEnableSetgroups bool
101 AmbientCaps []uintptr // capabilities added to permitted/inheritable and then raised via prctl(PR_CAP_AMBIENT)
102 UseCgroupFD bool // use clone3 with CLONE_INTO_CGROUP and CgroupFD
103 CgroupFD int // stored in cloneArgs.cgroup when UseCgroupFD is set
104 // PidFD, if non-nil, requests CLONE_PIDFD. After a successful fork the
105 // pidfd value is stored through this pointer; that value starts as -1.
106
107 PidFD *int
108 }
109
// none and slash are NUL-terminated byte strings whose addresses are passed
// as the first two arguments of the mount call in forkAndExecInChild1.
110 var (
111 none = [...]byte{'n', 'o', 'n', 'e', 0}
112 slash = [...]byte{'/', 0}
113
114 forceClone3 = false // when true, forkAndExecInChild1 always uses clone3; NOTE(review): presumably toggled by tests
115 )
116
117
// Fork hooks declared without bodies; their implementations live outside
// this file (NOTE(review): presumably in the runtime package — confirm).
// Call sites in this file:
//   - runtime_BeforeFork: in forkAndExecInChild1, immediately before clone.
//   - runtime_AfterFork: in forkAndExecInChild, when locked is true.
//   - runtime_AfterForkInChild: in the child, after the TIOCSPGRP handling.
118 func runtime_BeforeFork()
119 func runtime_AfterFork()
120 func runtime_AfterForkInChild()
121
122
123
124
125
126
127
128
129
130
131
132
133 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
134
135
136 upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
137 if locked {
138 runtime_AfterFork()
139 }
140 if err != 0 {
141 return 0, err
142 }
143
144
145 pid = int(upid)
146 if sys.PidFD != nil {
147 *sys.PidFD = int(pidfd)
148 }
149
150 if sys.UidMappings != nil || sys.GidMappings != nil {
151 Close(mapPipe[0])
152 var err2 Errno
153
154
155 if sys.Unshareflags&CLONE_NEWUSER == 0 {
156 if err := writeUidGidMappings(pid, sys); err != nil {
157 err2 = err.(Errno)
158 }
159 }
160 RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
161 Close(mapPipe[1])
162 }
163
164 return pid, 0
165 }
166
// _LINUX_CAPABILITY_VERSION_3 is the value forkAndExecInChild1 stores in
// capHeader.version before its capget and capset calls.
167 const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
168
// capHeader is passed by address as the first argument of the capget and
// capset system calls issued in forkAndExecInChild1.
169 type capHeader struct {
170 version uint32 // set to _LINUX_CAPABILITY_VERSION_3 before use
171 pid int32 // never assigned in this file, so it stays zero
172 }
173 // capData holds one 32-bit word of each capability set.
174 type capData struct {
175 effective uint32
176 permitted uint32
177 inheritable uint32
178 }
// caps bundles the header with two capData words. capToIndex selects the
// word (capability number >> 5) and capToMask the bit within it, so the two
// words cover capability numbers 0 through 63.
179 type caps struct {
180 hdr capHeader
181 data [2]capData
182 }
183
184
// capToIndex returns which 32-bit capability word holds capability number
// cap: capabilities 0-31 map to word 0, 32-63 to word 1, and so on.
func capToIndex(cap uintptr) uintptr {
	return cap / 32
}
186
187
// capToMask returns the single-bit mask for capability number cap within its
// 32-bit word; only the low five bits of cap select the bit position.
func capToMask(cap uintptr) uint32 {
	bit := cap % 32
	return uint32(1) << bit
}
189
190
// cloneArgs is the argument block handed to the clone3 system call:
// forkAndExecInChild1 passes its address and unsafe.Sizeof(cloneArgs).
// Only flags, pidFD, exitSignal and cgroup are assigned in this file; the
// remaining fields stay zero.
// NOTE(review): the field order and widths presumably have to match the
// kernel's clone3 argument structure — do not reorder without checking.
191 type cloneArgs struct {
192 flags uint64 // clone flags, plus CLONE_INTO_CGROUP when a cgroup fd is used
193 pidFD uint64 // address of the int32 that receives the pidfd, when requested
194 childTID uint64
195 parentTID uint64
196 exitSignal uint64 // set to SIGCHLD
197 stack uint64
198 stackSize uint64
199 tls uint64
200 setTID uint64
201 setTIDSize uint64
202 cgroup uint64 // SysProcAttr.CgroupFD when UseCgroupFD is set
203 }
204
205
206
207
208
209
210
211
212
213
214
215
// forkAndExecInChild1 performs the clone call and, in the child, every step
// up to and including execve.
//
// Parameters:
//   - argv0, argv, envv: passed unchanged to execve.
//   - chroot, dir: path pointers for the chroot and chdir calls; nil skips
//     the corresponding call.
//   - attr.Files: descriptors to install as 0..len-1 in the child; an entry
//     of -1 closes that slot.
//   - sys: Linux-specific options applied in the child.
//   - pipe: descriptor on which the child writes err1 if any step fails.
//
// In the parent it returns right after the clone call with the child's pid,
// the pidfd value (initialised to -1), err1, the mapping pipe, and locked.
// locked is true once runtime_BeforeFork has been called. If creating the
// mapping pipe fails, it returns before forking with locked false.
//
// In the child it never returns: it either execs successfully or writes err1
// to pipe and exits with status 253.
//
// NOTE(review): this listing has had its comments stripped; upstream may
// attach compiler directives (for example noinline or norace pragmas) to
// this function — confirm against the real file.
216 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
217 // prctl option values used below to raise ambient capabilities.
218 const (
219 PR_CAP_AMBIENT = 0x2f
220 PR_CAP_AMBIENT_RAISE = 0x2
221 )
222
223 // Every variable the child path uses is declared here, before the fork:
224 // the code after runtime_BeforeFork only assigns with '=', and the
225 // 'goto childerror' jumps could not skip over later declarations anyway.
226 // NOTE(review): presumably this also keeps any allocation a declaration
227 // might need (e.g. err2, whose address is taken) on the parent side of
228 // the fork — confirm against upstream commentary.
229
230
231
232
233
234
235
236
237 var (
238 err2 Errno
239 nextfd int
240 i int
241 caps caps
242 fd1, flags uintptr
243 puid, psetgroups, pgid []byte
244 uidmap, setgroups, gidmap []byte
245 clone3 *cloneArgs
246 pgrp int32
247 dirfd int
248 cred *Credential
249 ngroups, groups uintptr
250 c uintptr
251 )
252 pidfd = -1
253 // Loaded now; re-applied as RLIMIT_NOFILE in the child just before exec when non-nil.
254 rlim := origRlimitNofile.Load()
255 // Pre-build the NUL-terminated /proc/self paths and payloads that the child writes after unshare(CLONE_NEWUSER).
256 if sys.UidMappings != nil {
257 puid = []byte("/proc/self/uid_map\000")
258 uidmap = formatIDMappings(sys.UidMappings)
259 }
260
261 if sys.GidMappings != nil {
262 psetgroups = []byte("/proc/self/setgroups\000")
263 pgid = []byte("/proc/self/gid_map\000")
264
265 if sys.GidMappingsEnableSetgroups {
266 setgroups = []byte("allow\000")
267 } else {
268 setgroups = []byte("deny\000")
269 }
270 gidmap = formatIDMappings(sys.GidMappings)
271 }
272
273 // Record our own pid; the child compares it with getppid after PR_SET_PDEATHSIG.
274 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
275
276 // Copy attr.Files into fd and choose nextfd above both len(attr.Files) and
277 // the largest descriptor listed, so the temporary dups made in pass 1
278 // below cannot land on any descriptor named in attr.Files.
279 fd := make([]int, len(attr.Files))
280 nextfd = len(attr.Files)
281 for i, ufd := range attr.Files {
282 if nextfd < int(ufd) {
283 nextfd = int(ufd)
284 }
285 fd[i] = int(ufd)
286 }
287 nextfd++
288
289 // Pipe over which the parent reports the outcome of writing the ID
290 // mappings; the child blocks on it before continuing.
291 if sys.UidMappings != nil || sys.GidMappings != nil {
292 if err := forkExecPipe(mapPipe[:]); err != nil {
293 err1 = err.(Errno)
294 return
295 }
296 }
297 // CLONE_VFORK|CLONE_VM are added unless a new user namespace is requested through Cloneflags or Unshareflags.
298 flags = sys.Cloneflags
299 if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
300 flags |= CLONE_VFORK | CLONE_VM
301 }
302 if sys.PidFD != nil {
303 flags |= CLONE_PIDFD
304 }
305 // Use clone3 for cgroup placement, for CLONE_NEWTIME, or when forceClone3 is set.
306 if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
307 clone3 = &cloneArgs{
308 flags: uint64(flags),
309 exitSignal: uint64(SIGCHLD),
310 }
311 if sys.UseCgroupFD {
312 clone3.flags |= CLONE_INTO_CGROUP
313 clone3.cgroup = uint64(sys.CgroupFD)
314 }
315 if sys.PidFD != nil {
316 clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
317 }
318 }
319
320 // Fork. runtime_BeforeFork is matched by runtime_AfterFork in the parent
321 // (called by forkAndExecInChild when locked is true) and by runtime_AfterForkInChild in the child.
322 runtime_BeforeFork()
323 locked = true
324 if clone3 != nil {
325 pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
326 } else {
327 flags |= uintptr(SIGCHLD)
328 if runtime.GOARCH == "s390x" {
329 // On s390x the flags go in the second clone argument, with 0 first.
330 pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
331 } else {
332 pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
333 }
334 }
335 if err1 != 0 || pid != 0 {
336 // Either the clone call failed or this is the parent (pid != 0).
337 // Return at once; forkAndExecInChild does the remaining parent-side
338 // work (runtime_AfterFork, PidFD, ID mappings).
339
340
341
342 return
343 }
344
345 // pid == 0: this is the child. Every failure below jumps to childerror.
346
347 // Set PR_SET_KEEPCAPS now, ahead of the setgid/setuid calls further down.
348 if len(sys.AmbientCaps) > 0 {
349 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
350 if err1 != 0 {
351 goto childerror
352 }
353 }
354
355 // Close our copy of the pipe's write end, then read the one Errno the parent sends.
356 if sys.UidMappings != nil || sys.GidMappings != nil {
357 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
358 goto childerror
359 }
360 pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) // pid is reused to hold the byte count
361 if err1 != 0 {
362 goto childerror
363 }
364 if pid != unsafe.Sizeof(err2) {
365 err1 = EINVAL
366 goto childerror
367 }
368 if err2 != 0 {
369 err1 = err2
370 goto childerror
371 }
372 }
373
374 // New session.
375 if sys.Setsid {
376 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
377 if err1 != 0 {
378 goto childerror
379 }
380 }
381
382 // Process group; Foreground implies the same setpgid call as Setpgid.
383 if sys.Setpgid || sys.Foreground {
384
385 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
386 if err1 != 0 {
387 goto childerror
388 }
389 }
390
391 if sys.Foreground {
392 pgrp = int32(sys.Pgid)
393 if pgrp == 0 {
394 pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
395
396 pgrp = int32(pid)
397 }
398
399 // Make pgrp the foreground process group of the terminal sys.Ctty.
400 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
401 if err1 != 0 {
402 goto childerror
403 }
404 }
405
406 // Called only after the TIOCSPGRP handling above. NOTE(review): presumably
407 // restores signal state in the child; the function is defined outside this file.
408 runtime_AfterForkInChild()
409
410 // Unshare namespaces; for a new user namespace the child writes its own ID mappings.
411 if sys.Unshareflags != 0 {
412 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
413 if err1 != 0 {
414 goto childerror
415 }
416 // Group side: write the setgroups policy first, then gid_map.
417 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
418 dirfd = int(_AT_FDCWD)
419 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
420 goto childerror
421 }
422 pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
423 if err1 != 0 {
424 goto childerror
425 }
426 if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
427 goto childerror
428 }
429 // gid_map
430 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
431 goto childerror
432 }
433 pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
434 if err1 != 0 {
435 goto childerror
436 }
437 if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
438 goto childerror
439 }
440 }
441 // uid_map
442 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
443 dirfd = int(_AT_FDCWD)
444 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
445 goto childerror
446 }
447 pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
448 if err1 != 0 {
449 goto childerror
450 }
451 if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
452 goto childerror
453 }
454 }
455
456 // When a new mount namespace was unshared, remount "/" with
457 // MS_REC|MS_PRIVATE (source "none", no filesystem type or data).
458 // NOTE(review): presumably so that mount changes made in the child do
459 // not propagate back through shared mounts — confirm.
460
461
462
463 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
464 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
465 if err1 != 0 {
466 goto childerror
467 }
468 }
469 }
470
471 // Chroot.
472 if chroot != nil {
473 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
474 if err1 != 0 {
475 goto childerror
476 }
477 }
478
479 // Credentials: supplementary groups first, then gid, then uid.
480 if cred = sys.Credential; cred != nil {
481 ngroups = uintptr(len(cred.Groups))
482 groups = uintptr(0)
483 if ngroups > 0 {
484 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
485 }
486 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups { // skipped when setgroups was denied via the gid mapping and there are no groups, or when NoSetGroups is set
487 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
488 if err1 != 0 {
489 goto childerror
490 }
491 }
492 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
493 if err1 != 0 {
494 goto childerror
495 }
496 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
497 if err1 != 0 {
498 goto childerror
499 }
500 }
501
502 if len(sys.AmbientCaps) != 0 {
503 // Read the current capability sets, add every requested capability to
504 // permitted and inheritable, write the sets back, then raise each one as ambient.
505 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
506
507 if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
508 goto childerror
509 }
510
511 for _, c = range sys.AmbientCaps {
512 // capToIndex picks the 32-bit word (data has two of them) and
513 // capToMask the bit within that word.
514 caps.data[capToIndex(c)].permitted |= capToMask(c)
515 caps.data[capToIndex(c)].inheritable |= capToMask(c)
516 }
517
518 if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
519 goto childerror
520 }
521
522 for _, c = range sys.AmbientCaps {
523 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
524 if err1 != 0 {
525 goto childerror
526 }
527 }
528 }
529
530 // Chdir.
531 if dir != nil {
532 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
533 if err1 != 0 {
534 goto childerror
535 }
536 }
537
538 // Parent death signal.
539 if sys.Pdeathsig != 0 {
540 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
541 if err1 != 0 {
542 goto childerror
543 }
544
545 // If getppid no longer returns the pid recorded before the fork, the
546 // original parent is already gone, so deliver the signal to ourselves.
547
548 pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
549 if pid != ppid {
550 pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
551 _, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
552 if err1 != 0 {
553 goto childerror
554 }
555 }
556 }
557
558 // Pass 1: move pipe (if it is below nextfd) and every fd[i] < i up to fresh
559 // close-on-exec descriptors, so pass 2 cannot overwrite one it still needs.
560 if pipe < nextfd {
561 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
562 if err1 != 0 {
563 goto childerror
564 }
565 pipe = nextfd
566 nextfd++
567 }
568 for i = 0; i < len(fd); i++ {
569 if fd[i] >= 0 && fd[i] < i {
570 if nextfd == pipe { // skip the slot occupied by pipe
571 nextfd++
572 }
573 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
574 if err1 != 0 {
575 goto childerror
576 }
577 fd[i] = nextfd
578 nextfd++
579 }
580 }
581
582 // Pass 2: install fd[i] as descriptor i; an entry of -1 closes descriptor i.
583 for i = 0; i < len(fd); i++ {
584 if fd[i] == -1 {
585 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
586 continue
587 }
588 if fd[i] == i {
589 // Already in place: set its descriptor flags to 0 (F_SETFD)
590 // instead of duplicating it onto itself.
591 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
592 if err1 != 0 {
593 goto childerror
594 }
595 continue
596 }
597
598 // dup3 with flags 0: the new descriptor i is created without O_CLOEXEC.
599 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
600 if err1 != 0 {
601 goto childerror
602 }
603 }
604
605 // Close whichever of descriptors 0, 1 and 2 attr.Files did not cover;
606 // errors from close are ignored.
607
608
609 for i = len(fd); i < 3; i++ {
610 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
611 }
612
613 // Issue TIOCNOTTY on descriptor 0.
614 if sys.Noctty {
615 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
616 if err1 != 0 {
617 goto childerror
618 }
619 }
620
621 // Make sys.Ctty the controlling terminal.
622 if sys.Setctty {
623 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
624 if err1 != 0 {
625 goto childerror
626 }
627 }
628
629 // Re-apply the RLIMIT_NOFILE value loaded at the top; the result is ignored.
630 if rlim != nil {
631 rawSetrlimit(RLIMIT_NOFILE, rlim)
632 }
633
634 // Request tracing as the last step before execve.
635
636
637 if sys.Ptrace {
638 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
639 if err1 != 0 {
640 goto childerror
641 }
642 }
643
644 // Exec. If execve returns at all it has failed, and control falls through to childerror.
645 _, _, err1 = RawSyscall(SYS_EXECVE,
646 uintptr(unsafe.Pointer(argv0)),
647 uintptr(unsafe.Pointer(&argv[0])),
648 uintptr(unsafe.Pointer(&envv[0])))
649
650 childerror:
651 // Report err1 to the parent over pipe, then exit with status 253; the loop retries should exit ever return.
652 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
653 for {
654 RawSyscall(SYS_EXIT, 253, 0, 0)
655 }
656 }
657
658 func formatIDMappings(idMap []SysProcIDMap) []byte {
659 var data []byte
660 for _, im := range idMap {
661 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
662 }
663 return data
664 }
665
666
667 func writeIDMappings(path string, idMap []SysProcIDMap) error {
668 fd, err := Open(path, O_RDWR, 0)
669 if err != nil {
670 return err
671 }
672
673 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
674 Close(fd)
675 return err
676 }
677
678 if err := Close(fd); err != nil {
679 return err
680 }
681
682 return nil
683 }
684
685
686
687
688
689 func writeSetgroups(pid int, enable bool) error {
690 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
691 fd, err := Open(sgf, O_RDWR, 0)
692 if err != nil {
693 return err
694 }
695
696 var data []byte
697 if enable {
698 data = []byte("allow")
699 } else {
700 data = []byte("deny")
701 }
702
703 if _, err := Write(fd, data); err != nil {
704 Close(fd)
705 return err
706 }
707
708 return Close(fd)
709 }
710
711
712
713 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
714 if sys.UidMappings != nil {
715 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
716 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
717 return err
718 }
719 }
720
721 if sys.GidMappings != nil {
722
723 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
724 return err
725 }
726 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
727 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
728 return err
729 }
730 }
731
732 return nil
733 }
734
View as plain text