xnu-12377.1.9

Imported from xnu-12377.1.9.tar.gz
Apple OSS Distributions 2025-10-04 02:38:34 +00:00
parent e3723e1f17
commit f6217f891a
1471 changed files with 130806 additions and 54469 deletions

View file

@ -3,3 +3,4 @@ bsd/man/man2/access.2 freebsd lib/libc/sys/access.2 5b882020081a138285227631c46a
bsd/man/man7/sticky.7 freebsd share/man/man7/sticky.7 5b882020081a138285227631c46a406c08e17bc8
bsd/man/man2/utimensat.2 freebsd lib/libc/sys/utimensat.2 89c1fcc0d088065021703b658ef547f46b5481f0
tools/tests/darwintests/netbsd_utimensat.c freebsd contrib/netbsd-tests/lib/libc/c063/t_utimensat.c 89c1fcc0d088065021703b658ef547f46b5481f0
bsd/man/man9/byteorder.9 freebsd share/man/man9/byteorder.9 5b882020081a138285227631c46a406c08e17bc8

View file

@ -34,6 +34,8 @@ KERNEL_FILES = \
ptrauth.h
LIBCXX_DATAFILES = \
_inttypes.h \
inttypes.h \
stddef.h \
stdint.h

View file

@ -0,0 +1,225 @@
/*
* Copyright (c) 2023 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
/*
* <inttypes.h> -- Standard C header, defined in ISO/IEC 9899:1999
* (aka "C99"), section 7.8. This defines format string conversion
* specifiers suitable for use within arguments to fprintf and fscanf
* and their ilk.
*/
#if !defined(__INTTYPES_H_)
#define __INTTYPES_H_
# define __PRI_8_LENGTH_MODIFIER__ "hh"
# define __PRI_64_LENGTH_MODIFIER__ "ll"
# define __SCN_64_LENGTH_MODIFIER__ "ll"
# define __PRI_MAX_LENGTH_MODIFIER__ "j"
# define __SCN_MAX_LENGTH_MODIFIER__ "j"
# define PRId8 __PRI_8_LENGTH_MODIFIER__ "d"
# define PRIi8 __PRI_8_LENGTH_MODIFIER__ "i"
# define PRIo8 __PRI_8_LENGTH_MODIFIER__ "o"
# define PRIu8 __PRI_8_LENGTH_MODIFIER__ "u"
# define PRIx8 __PRI_8_LENGTH_MODIFIER__ "x"
# define PRIX8 __PRI_8_LENGTH_MODIFIER__ "X"
# define PRId16 "hd"
# define PRIi16 "hi"
# define PRIo16 "ho"
# define PRIu16 "hu"
# define PRIx16 "hx"
# define PRIX16 "hX"
# define PRId32 "d"
# define PRIi32 "i"
# define PRIo32 "o"
# define PRIu32 "u"
# define PRIx32 "x"
# define PRIX32 "X"
# define PRId64 __PRI_64_LENGTH_MODIFIER__ "d"
# define PRIi64 __PRI_64_LENGTH_MODIFIER__ "i"
# define PRIo64 __PRI_64_LENGTH_MODIFIER__ "o"
# define PRIu64 __PRI_64_LENGTH_MODIFIER__ "u"
# define PRIx64 __PRI_64_LENGTH_MODIFIER__ "x"
# define PRIX64 __PRI_64_LENGTH_MODIFIER__ "X"
# define PRIdLEAST8 PRId8
# define PRIiLEAST8 PRIi8
# define PRIoLEAST8 PRIo8
# define PRIuLEAST8 PRIu8
# define PRIxLEAST8 PRIx8
# define PRIXLEAST8 PRIX8
# define PRIdLEAST16 PRId16
# define PRIiLEAST16 PRIi16
# define PRIoLEAST16 PRIo16
# define PRIuLEAST16 PRIu16
# define PRIxLEAST16 PRIx16
# define PRIXLEAST16 PRIX16
# define PRIdLEAST32 PRId32
# define PRIiLEAST32 PRIi32
# define PRIoLEAST32 PRIo32
# define PRIuLEAST32 PRIu32
# define PRIxLEAST32 PRIx32
# define PRIXLEAST32 PRIX32
# define PRIdLEAST64 PRId64
# define PRIiLEAST64 PRIi64
# define PRIoLEAST64 PRIo64
# define PRIuLEAST64 PRIu64
# define PRIxLEAST64 PRIx64
# define PRIXLEAST64 PRIX64
# define PRIdFAST8 PRId8
# define PRIiFAST8 PRIi8
# define PRIoFAST8 PRIo8
# define PRIuFAST8 PRIu8
# define PRIxFAST8 PRIx8
# define PRIXFAST8 PRIX8
# define PRIdFAST16 PRId16
# define PRIiFAST16 PRIi16
# define PRIoFAST16 PRIo16
# define PRIuFAST16 PRIu16
# define PRIxFAST16 PRIx16
# define PRIXFAST16 PRIX16
# define PRIdFAST32 PRId32
# define PRIiFAST32 PRIi32
# define PRIoFAST32 PRIo32
# define PRIuFAST32 PRIu32
# define PRIxFAST32 PRIx32
# define PRIXFAST32 PRIX32
# define PRIdFAST64 PRId64
# define PRIiFAST64 PRIi64
# define PRIoFAST64 PRIo64
# define PRIuFAST64 PRIu64
# define PRIxFAST64 PRIx64
# define PRIXFAST64 PRIX64
/* int32_t is 'int', but intptr_t is 'long'. */
# define PRIdPTR "ld"
# define PRIiPTR "li"
# define PRIoPTR "lo"
# define PRIuPTR "lu"
# define PRIxPTR "lx"
# define PRIXPTR "lX"
# define PRIdMAX __PRI_MAX_LENGTH_MODIFIER__ "d"
# define PRIiMAX __PRI_MAX_LENGTH_MODIFIER__ "i"
# define PRIoMAX __PRI_MAX_LENGTH_MODIFIER__ "o"
# define PRIuMAX __PRI_MAX_LENGTH_MODIFIER__ "u"
# define PRIxMAX __PRI_MAX_LENGTH_MODIFIER__ "x"
# define PRIXMAX __PRI_MAX_LENGTH_MODIFIER__ "X"
# define SCNd8 __PRI_8_LENGTH_MODIFIER__ "d"
# define SCNi8 __PRI_8_LENGTH_MODIFIER__ "i"
# define SCNo8 __PRI_8_LENGTH_MODIFIER__ "o"
# define SCNu8 __PRI_8_LENGTH_MODIFIER__ "u"
# define SCNx8 __PRI_8_LENGTH_MODIFIER__ "x"
# define SCNd16 "hd"
# define SCNi16 "hi"
# define SCNo16 "ho"
# define SCNu16 "hu"
# define SCNx16 "hx"
# define SCNd32 "d"
# define SCNi32 "i"
# define SCNo32 "o"
# define SCNu32 "u"
# define SCNx32 "x"
# define SCNd64 __SCN_64_LENGTH_MODIFIER__ "d"
# define SCNi64 __SCN_64_LENGTH_MODIFIER__ "i"
# define SCNo64 __SCN_64_LENGTH_MODIFIER__ "o"
# define SCNu64 __SCN_64_LENGTH_MODIFIER__ "u"
# define SCNx64 __SCN_64_LENGTH_MODIFIER__ "x"
# define SCNdLEAST8 SCNd8
# define SCNiLEAST8 SCNi8
# define SCNoLEAST8 SCNo8
# define SCNuLEAST8 SCNu8
# define SCNxLEAST8 SCNx8
# define SCNdLEAST16 SCNd16
# define SCNiLEAST16 SCNi16
# define SCNoLEAST16 SCNo16
# define SCNuLEAST16 SCNu16
# define SCNxLEAST16 SCNx16
# define SCNdLEAST32 SCNd32
# define SCNiLEAST32 SCNi32
# define SCNoLEAST32 SCNo32
# define SCNuLEAST32 SCNu32
# define SCNxLEAST32 SCNx32
# define SCNdLEAST64 SCNd64
# define SCNiLEAST64 SCNi64
# define SCNoLEAST64 SCNo64
# define SCNuLEAST64 SCNu64
# define SCNxLEAST64 SCNx64
# define SCNdFAST8 SCNd8
# define SCNiFAST8 SCNi8
# define SCNoFAST8 SCNo8
# define SCNuFAST8 SCNu8
# define SCNxFAST8 SCNx8
# define SCNdFAST16 SCNd16
# define SCNiFAST16 SCNi16
# define SCNoFAST16 SCNo16
# define SCNuFAST16 SCNu16
# define SCNxFAST16 SCNx16
# define SCNdFAST32 SCNd32
# define SCNiFAST32 SCNi32
# define SCNoFAST32 SCNo32
# define SCNuFAST32 SCNu32
# define SCNxFAST32 SCNx32
# define SCNdFAST64 SCNd64
# define SCNiFAST64 SCNi64
# define SCNoFAST64 SCNo64
# define SCNuFAST64 SCNu64
# define SCNxFAST64 SCNx64
# define SCNdPTR "ld"
# define SCNiPTR "li"
# define SCNoPTR "lo"
# define SCNuPTR "lu"
# define SCNxPTR "lx"
# define SCNdMAX __SCN_MAX_LENGTH_MODIFIER__ "d"
# define SCNiMAX __SCN_MAX_LENGTH_MODIFIER__ "i"
# define SCNoMAX __SCN_MAX_LENGTH_MODIFIER__ "o"
# define SCNuMAX __SCN_MAX_LENGTH_MODIFIER__ "u"
# define SCNxMAX __SCN_MAX_LENGTH_MODIFIER__ "x"
#include <stdint.h>
#endif /* !__INTTYPES_H_ */
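For illustration, a minimal sketch of how these conversion-specifier macros are typically used from a hosted C program (assuming <stdio.h> is available; on this target PRId64 expands to "lld" and SCNu32 to "u", per the definitions above):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    int64_t big = 1099511627776LL;   /* 2^40, needs a 64-bit conversion */
    uint32_t parsed = 0;

    /* The macro supplies the length modifier; the format string becomes "%lld\n". */
    printf("big = %" PRId64 "\n", big);

    /* SCNu32 matches the uint32_t destination for the fscanf family. */
    sscanf("42", "%" SCNu32, &parsed);
    printf("parsed = %" PRIu32 "\n", parsed);
    return 0;
}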

View file

@ -1,4 +1,4 @@
/* Copyright (c) (2010-2012,2014-2022) Apple Inc. All rights reserved.
/* Copyright (c) (2010-2012,2014-2022,2024) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
@ -41,8 +41,12 @@ struct ccdigest_info {
void(* CC_SPTR(ccdigest_info, final))(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
unsigned char *digest);
cc_impl_t impl;
void(* CC_SPTR(ccdigest_info, compress_parallel))(ccdigest_state_t state1, size_t nblocks1,
const void *data1, ccdigest_state_t state2, size_t nblocks2, const void *data2);
};
typedef const struct ccdigest_info *(*ccdigest_info_selector_t)(void);
/* Return sizeof a ccdigest_ctx for a given size_t _state_size_ and
size_t _block_size_. */
#define ccdigest_ctx_size(_state_size_, _block_size_) ((_state_size_) + sizeof(uint64_t) + (_block_size_) + sizeof(unsigned int))
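As a worked example of this formula (assuming an LP64 target where sizeof(uint64_t) is 8 and sizeof(unsigned int) is 4), SHA-256's 32-byte state and 64-byte block give a 108-byte context:

/* Illustrative only: 32 (state) + 8 + 64 (block) + 4 == 108 bytes on LP64. */
_Static_assert(ccdigest_ctx_size(32, 64) == 108, "SHA-256 digest ctx size on LP64");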
@ -89,6 +93,24 @@ void ccdigest_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned
void ccdigest(const struct ccdigest_info *di, size_t len,
const void *data, void *digest);
/*!
@function ccdigest_parallel
@abstract Hashes two inputs of the same size, in parallel where hardware support is available.
@param di digest info struct specifying the hash to use
@param data_nbytes the size of the inputs
@param data1 pointer to the first input
@param digest1 output pointer for the hash of data1
@param data2 pointer to the second input
@param digest2 output pointer for the hash of data2
@discussion This is intended for use in the construction of Merkle trees.
*/
CC_NONNULL_ALL
void ccdigest_parallel(const struct ccdigest_info *di, size_t data_nbytes,
const void *data1, void *digest1,
const void *data2, void *digest2);
#define OID_DEF(_VALUE_) ((const unsigned char *)_VALUE_)
// https://csrc.nist.gov/projects/computer-security-objects-register/algorithm-registration#Hash
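A hedged usage sketch for the new ccdigest_parallel entry point, hashing two equal-size Merkle-tree leaves with one call (assumes the usual ccsha256_di() selector and CCSHA256_OUTPUT_SIZE from <corecrypto/ccsha2.h>; the helper name is illustrative):

#include <stddef.h>
#include <stdint.h>
#include <corecrypto/ccdigest.h>
#include <corecrypto/ccsha2.h>

static void
hash_sibling_leaves(const uint8_t *leaf_a, const uint8_t *leaf_b, size_t leaf_nbytes,
    uint8_t digest_a[CCSHA256_OUTPUT_SIZE], uint8_t digest_b[CCSHA256_OUTPUT_SIZE])
{
    const struct ccdigest_info *di = ccsha256_di();

    /* Both inputs must be leaf_nbytes long; parallel hardware is used when available. */
    ccdigest_parallel(di, leaf_nbytes, leaf_a, digest_a, leaf_b, digest_b);
}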

View file

@ -0,0 +1,36 @@
/*
* Copyright (c) 2000-2004, 2013, 2023 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
/*
* <inttypes.h> -- Standard C header, defined in ISO/IEC 9899:1999
* (aka "C99"), section 7.8. This defines format string conversion
* specifiers suitable for use within arguments to fprintf and fscanf
* and their ilk.
*/
#if !defined(_INTTYPES_H_) || __has_feature(modules)
#define _INTTYPES_H_
#include <_inttypes.h>
#endif /* !_INTTYPES_H_ */

View file

@ -210,6 +210,7 @@ TOP_TARGETS = \
install install_desktop install_embedded \
install_release_embedded install_development_embedded \
install_release_desktop install_development_desktop \
install_release_embedded_nohdrs install_release_desktop_nohdrs \
install_kernels \
cscope tags TAGS \
help
@ -336,6 +337,12 @@ xnu_tests_driverkit:
$(MAKE) -C $(SRCROOT)/tests/driverkit $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \
SRCROOT=$(SRCROOT)/tests/driverkit
xnu_unittests:
$(MAKE) -C $(SRCROOT)/tests/unit $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \
SRCROOT=$(SRCROOT)/tests/unit
$(MAKE) -C $(SRCROOT)/tests $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) sched/install_userspace_unit_tests \
SRCROOT=$(SRCROOT)/tests
include $(MakeInc_cmd)

View file

@ -119,7 +119,7 @@ This can be customized by setting the `RC_DARWIN_KERNEL_VERSION` variable in
the environment or on the `make` command line.
See doc/xnu_version.md for more details.
See doc/building/xnu_version.md for more details.
### Debug Information Formats
@ -421,6 +421,8 @@ DriverKit SDK headers used by userspace drivers.
ExclaveKit SDK headers.
9. `EXCLAVECORE`: If defined, enclosed code is visible exclusively in the
ExclaveCore SDK headers.
10. `MODULES_SUPPORTED`: If defined, enclosed code is visible exclusively
in locations that support modules/Swift (i.e. not System or Kernel frameworks); a usage sketch follows below.
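A hedged illustration of how such a guard is typically used in a header (the declaration is a placeholder, not an actual xnu symbol):

#if defined(MODULES_SUPPORTED)
/* Exported only to SDK locations that support modules/Swift. */
void example_modules_only_function(void);
#endif /* MODULES_SUPPORTED */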
## VM header file name convention
The VM headers follow the following naming conventions:

View file

@ -104,6 +104,17 @@ do_count(const char *dev, const char *hname, int search)
do_header(dev, hname, count);
}
static void
free_file_list(struct file_list *fl)
{
struct file_list *fl_prev;
while (fl != 0) {
fl_prev = fl;
fl = fl->f_next;
free((char *)fl_prev);
}
}
static void
do_header(const char *dev, const char *hname, int count)
{
@ -111,7 +122,7 @@ do_header(const char *dev, const char *hname, int count)
const char *inw;
char *inwcopy;
struct file_list *fl = NULL; /* may exit for(;;) uninitted */
struct file_list *fl_head, *fl_prev;
struct file_list *fl_head;
FILE *inf, *outf;
int inc, oldcount;
@ -169,11 +180,7 @@ do_header(const char *dev, const char *hname, int count)
}
(void) fclose(inf);
if (count == oldcount) {
while (fl != 0) {
fl_prev = fl;
fl = fl->f_next;
free((char *)fl_prev);
}
free_file_list(fl_head);
return;
}
if (oldcount == -1) {
@ -192,8 +199,8 @@ do_header(const char *dev, const char *hname, int count)
for (fl = fl_head; fl != 0; fl = fl->f_next) {
fprintf(outf, "#define %s %d\n",
fl->f_fn, count ? fl->f_type : 0);
free((char *)fl);
}
free_file_list(fl_head);
(void) fclose(outf);
}

View file

@ -42,6 +42,7 @@ EXPINC_SUBDIRS = \
bsm \
crypto/entropy \
dev \
kern \
libkern \
machine \
miscfs \

View file

@ -619,6 +619,7 @@
#define AUE_PREADV 43216 /* Darwin. */
#define AUE_PWRITEV 43217 /* Darwin. */
#define AUE_FREADLINK 43218
#define AUE_FUNMOUNT 43219 /* Darwin. */
#define AUE_SESSION_START 44901 /* Darwin. */
#define AUE_SESSION_UPDATE 44902 /* Darwin. */

View file

@ -180,6 +180,7 @@ tty_compat.o_CWARNFLAGS_ADD += -Wno-cast-align
tty_dev.o_CWARNFLAGS_ADD += -Wno-cast-align
ubc_subr.o_CWARNFLAGS_ADD += -Wno-cast-align
uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-cast-align
uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-cast-align
uipc_usrreq.o_CWARNFLAGS_ADD += -Wno-cast-align
vfs_attrlist.o_CWARNFLAGS_ADD += -Wno-cast-align
vfs_fsevents.o_CWARNFLAGS_ADD += -Wno-cast-align
@ -247,6 +248,7 @@ systrace.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion
sysv_msg.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion
sysv_sem.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion
uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion
uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion
vfs_quota.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion
vsock_domain.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion
# -Wno-shorten-64-to-32
@ -306,6 +308,7 @@ sysv_msg.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
sysv_sem.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
sysv_shm.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
unix_signal.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
ux_exception.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
vfs_cluster.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32
@ -326,6 +329,7 @@ radix.o_CWARNFLAGS_ADD += -Wno-sign-compare
route6.o_CWARNFLAGS_ADD += -Wno-sign-compare
scope6.o_CWARNFLAGS_ADD += -Wno-sign-compare
uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-sign-compare
uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-sign-compare
# -Wno-sign-conversion
audit.o_CWARNFLAGS_ADD += -Wno-sign-conversion
audit_arg.o_CWARNFLAGS_ADD += -Wno-sign-conversion
@ -494,6 +498,7 @@ tcp_cc.o_CWARNFLAGS_ADD += -Wno-sign-conversion
tcp_cubic.o_CWARNFLAGS_ADD += -Wno-sign-conversion
ubc_subr.o_CWARNFLAGS_ADD += -Wno-sign-conversion
uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-sign-conversion
uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-sign-conversion
unix_signal.o_CWARNFLAGS_ADD += -Wno-sign-conversion
unix_startup.o_CWARNFLAGS_ADD += -Wno-sign-conversion
ux_exception.o_CWARNFLAGS_ADD += -Wno-sign-conversion
@ -601,6 +606,10 @@ $(COMPONENT).filelist: $(OBJS)
$(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \
done > $(COMPONENT).filelist
ifeq ($(XNU_LibAllFiles),1)
LIBOBJS := $(OBJS)
endif
$(COMPONENT).libfilelist: $(LIBOBJS)
@$(LOG_LDFILELIST) "lib$(COMPONENT)"
$(_v)for obj in ${LIBOBJS}; do \

View file

@ -58,7 +58,6 @@ OPTIONS/kctl_test optional kctl_test
OPTIONS/skywalk optional skywalk
OPTIONS/config_nexus_user_pipe optional config_nexus_user_pipe
OPTIONS/config_nexus_kernel_pipe optional config_nexus_kernel_pipe
OPTIONS/config_nexus_monitor optional config_nexus_monitor
OPTIONS/config_nexus_flowswitch optional config_nexus_flowswitch
OPTIONS/config_nexus_netif optional config_nexus_netif
@ -169,7 +168,7 @@ bsd/net/if_bridge.c optional if_bridge bound-checks
bsd/net/bridgestp.c optional bridgestp
bsd/net/if.c optional networking bound-checks
bsd/net/init.c optional sockets bound-checks
bsd/net/dlil.c optional networking bound-checks-pending
bsd/net/dlil.c optional networking bound-checks
bsd/net/dlil_ctl.c optional networking bound-checks
bsd/net/dlil_input.c optional networking bound-checks
bsd/net/dlil_output.c optional networking bound-checks
@ -188,7 +187,7 @@ bsd/net/multicast_list.c optional networking bound-checks
bsd/net/if_bond.c optional bond bound-checks
bsd/net/devtimer.c optional bond bound-checks
bsd/net/ndrv.c optional networking bound-checks
bsd/net/radix.c optional networking
bsd/net/radix.c optional networking bound-checks-pending
bsd/net/raw_cb.c optional networking bound-checks
bsd/net/raw_usrreq.c optional networking bound-checks
bsd/net/route.c optional networking bound-checks
@ -206,7 +205,7 @@ bsd/net/kpi_interfacefilter.c optional networking bound-checks
bsd/net/net_str_id.c optional networking bound-checks
bsd/net/if_utun.c optional networking bound-checks
bsd/net/if_ipsec.c optional ipsec bound-checks
bsd/net/necp.c optional necp
bsd/net/necp.c optional necp bound-checks
bsd/net/necp_client.c optional necp bound-checks
bsd/net/network_agent.c optional networking bound-checks
bsd/net/bloom_filter.c optional networking bound-checks
@ -226,6 +225,7 @@ bsd/net/pktap.c optional networking bound-checks
bsd/net/droptap.c optional networking bound-checks
bsd/net/if_llreach.c optional networking bound-checks
bsd/net/flowhash.c optional networking bound-checks
bsd/net/siphash.c optional networking bound-checks
bsd/net/flowadv.c optional networking bound-checks
bsd/net/content_filter.c optional content_filter bound-checks
bsd/net/content_filter_crypto.c optional content_filter bound-checks
@ -245,6 +245,9 @@ bsd/net/classq/classq_fq_codel.c optional networking bound-checks
bsd/net/pktsched/pktsched.c optional networking bound-checks
bsd/net/pktsched/pktsched_fq_codel.c optional networking bound-checks
bsd/net/pktsched/pktsched_netem.c optional networking bound-checks
bsd/net/pktsched/pktsched_ops.c optional networking bound-checks
bsd/net/aop/kpi_aop.c optional networking bound-checks
bsd/netinet/cpu_in_cksum_gen.c standard bound-checks
bsd/netinet/in_cksum.c optional inet bound-checks
@ -277,10 +280,12 @@ bsd/netinet/tcp_cc.c optional inet bound-checks
bsd/netinet/tcp_newreno.c optional inet bound-checks
bsd/netinet/tcp_cubic.c optional inet bound-checks
bsd/netinet/tcp_prague.c optional inet bound-checks
bsd/netinet/tcp_pacing.c optional inet bound-checks
bsd/netinet/cbrtf.c optional inet bound-checks
bsd/netinet/tcp_ledbat.c optional inet bound-checks
bsd/netinet/tcp_rledbat.c optional inet bound-checks
bsd/netinet/tcp_rack.c optional inet bound-checks
bsd/netinet/tcp_syncookie.c optional inet bound-checks
bsd/netinet/tcp_log.c optional inet bound-checks
bsd/netinet/tcp_sysctls.c optional inet bound-checks
bsd/netinet/tcp_ccdbg.c optional inet bound-checks
@ -390,6 +395,7 @@ bsd/kern/kern_authorization.c standard
bsd/kern/kern_backtrace.c standard
bsd/kern/kern_clock.c standard
bsd/kern/kern_core.c optional config_coredump
bsd/kern/kern_core.c optional config_ucoredump
bsd/kern/kern_credential.c standard
bsd/kern/kern_crossarch.c standard
bsd/kern/kern_cs.c standard
@ -436,6 +442,7 @@ bsd/kern/kern_xxx.c standard
bsd/kern/lockdown_mode.c standard
bsd/kern/mach_process.c standard
bsd/kern/mcache.c optional sockets config_mbuf_mcache
bsd/kern/mem_acct.c optional sockets bound-checks
bsd/kern/stackshot.c standard
bsd/kern/subr_log.c standard
bsd/kern/subr_log_stream.c standard
@ -466,13 +473,14 @@ bsd/kern/tty_tty.c standard
bsd/kern/ubc_subr.c standard
bsd/kern/uipc_domain.c optional sockets bound-checks
bsd/kern/uipc_mbuf.c optional sockets bound-checks
bsd/kern/uipc_mbuf2.c optional sockets bound-checks
bsd/kern/uipc_mbuf_mcache.c optional sockets config_mbuf_mcache
bsd/kern/uipc_mbuf2.c optional sockets bound-checks-soft
bsd/kern/uipc_proto.c optional sockets bound-checks
bsd/kern/uipc_socket.c optional sockets bound-checks
bsd/kern/uipc_socket2.c optional sockets bound-checks
bsd/kern/uipc_syscalls.c optional sockets bound-checks
bsd/kern/uipc_usrreq.c optional sockets bound-checks
bsd/kern/vsock_domain.c optional sockets
bsd/kern/vsock_domain.c optional sockets bound-checks-soft
bsd/kern/sysv_ipc.c standard
bsd/kern/sysv_shm.c standard
bsd/kern/sysv_sem.c standard
@ -568,6 +576,8 @@ bsd/skywalk/nexus/nexus_mbq.c optional skywalk bound-checks
bsd/skywalk/nexus/nexus_pktq.c optional skywalk bound-checks
bsd/skywalk/nexus/nexus_syscalls.c optional skywalk bound-checks
bsd/skywalk/nexus/nexus_traffic_rule.c optional skywalk bound-checks
bsd/skywalk/nexus/nexus_traffic_rule_inet.c optional skywalk bound-checks
bsd/skywalk/nexus/nexus_traffic_rule_eth.c optional skywalk bound-checks
bsd/skywalk/nexus/flowswitch/nx_flowswitch.c optional config_nexus_flowswitch bound-checks
bsd/skywalk/nexus/flowswitch/fsw.c optional config_nexus_flowswitch bound-checks
bsd/skywalk/nexus/flowswitch/fsw_vp.c optional config_nexus_flowswitch bound-checks
@ -590,7 +600,6 @@ bsd/skywalk/nexus/flowswitch/flow/flow_route.c optional config_nexus_flowswitch
bsd/skywalk/nexus/flowswitch/flow/flow_stats.c optional config_nexus_flowswitch bound-checks
bsd/skywalk/nexus/flowswitch/flow/flow_track.c optional config_nexus_flowswitch bound-checks
bsd/skywalk/nexus/flowswitch/flow/flow_agg.c optional config_nexus_flowswitch bound-checks
bsd/skywalk/nexus/monitor/nx_monitor.c optional config_nexus_monitor
bsd/skywalk/nexus/netif/nx_netif.c optional config_nexus_netif bound-checks
bsd/skywalk/nexus/netif/nx_netif_compat.c optional config_nexus_netif bound-checks
bsd/skywalk/nexus/netif/nx_netif_host.c optional config_nexus_netif bound-checks

View file

@ -110,9 +110,7 @@ int maxfiles = 3 * OPEN_MAX;
int maxfiles = OPEN_MAX + 2048;
#endif
unsigned int ncallout = 16 + 2 * NPROC;
unsigned int nmbclusters = NMBCLUSTERS;
int nport = NPROC / 2;
/*
* async IO (aio) configurable limits

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
* Copyright (c) 2000-2024 Apple Inc. All rights reserved.
*/
/*
* Copyright (C) 1990, NeXT, Inc.
@ -17,6 +17,8 @@
#include <machine/exec.h>
#include <pexpert/arm64/board_config.h>
int ml_grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool);
#if __arm64__
static cpu_subtype_t cpu_subtype32(void);
#endif /* __arm64__ */
@ -51,18 +53,18 @@ grade_arm64e_binary(cpu_subtype_t execfeatures)
#endif /* XNU_TARGET_OS_IOS || XNU_TARGET_OS_XR */
/* The current ABI version is preferred over arm64 */
if (CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(execfeatures) ==
CPU_SUBTYPE_ARM64_PTR_AUTH_CURRENT_VERSION) {
if (CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(execfeatures) <=
CPU_SUBTYPE_ARM64_PTR_AUTH_MAX_PREFERRED_VERSION) {
return 12;
}
/* Future ABIs are allowed, but exec_mach_imgact will treat it like an arm64 slice */
/* Non-preferred future and older ABIs are allowed, but exec_mach_imgact may treat them like an arm64 slice */
return 11;
}
#endif /* __arm64__ */
/**********************************************************************
* Routine: grade_binary()
* Routine: ml_grade_binary()
*
* Function: Return a relative preference for exectypes and
* execsubtypes in fat executable files. The higher the
@ -70,7 +72,7 @@ grade_arm64e_binary(cpu_subtype_t execfeatures)
* not acceptable.
**********************************************************************/
int
grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused)
ml_grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused)
{
#if __arm64__
cpu_subtype_t hostsubtype =

View file

@ -23,6 +23,9 @@
#include <kern/bits.h>
#endif
#define __STR(x) #x
#define STRINGIFY(x) __STR(x)
extern uint64_t wake_abstime;
#if DEVELOPMENT || DEBUG
@ -438,6 +441,22 @@ SYSCTL_PROC(_machdep, OID_AUTO, ptrauth_enabled,
0, 0,
machdep_ptrauth_enabled, "I", "");
static const char _ctrr_type[] =
#if defined(KERNEL_CTRR_VERSION)
"ctrrv" STRINGIFY(KERNEL_CTRR_VERSION);
#elif defined(KERNEL_INTEGRITY_KTRR)
"ktrr";
#elif defined(KERNEL_INTEGRITY_PV_CTRR)
"pv";
#else
"none";
#endif
SYSCTL_STRING(_machdep, OID_AUTO, ctrr_type,
CTLFLAG_KERN | CTLFLAG_RD | CTLFLAG_LOCKED,
__DECONST(char *, _ctrr_type), 0,
"CTRR type supported by hardware/kernel");
#if CONFIG_TELEMETRY && (DEBUG || DEVELOPMENT)
extern unsigned long trap_telemetry_reported_events;
SYSCTL_ULONG(_debug, OID_AUTO, trap_telemetry_reported_events,
@ -466,3 +485,11 @@ dram_ecc_error_injection_capable SYSCTL_HANDLER_ARGS
SYSCTL_PROC(_vm, OID_AUTO, dram_ecc_error_injection_capable, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, &dram_ecc_error_injection_capable, "I", "");
#endif /* DEBUG || DEVELOPMENT */
#if DEBUG || DEVELOPMENT
extern _Atomic unsigned int ipcpv_telemetry_count;
SYSCTL_UINT(_debug, OID_AUTO, ipcpv_telemetry_count,
CTLFLAG_RD | CTLFLAG_LOCKED, &ipcpv_telemetry_count,
0, "Number of ipc policy violation telemetry emitted");
#endif /* DEBUG || DEVELOPMENT */

View file

@ -258,7 +258,7 @@ static uint8_t dtrace_kerneluuid[16]; /* the 128-bit uuid */
*/
static ZONE_DEFINE_TYPE(dtrace_probe_t_zone, "dtrace.dtrace_probe_t",
dtrace_probe_t, ZC_PGZ_USE_GUARDS);
dtrace_probe_t, ZC_NONE);
static ZONE_DEFINE(dtrace_state_pcpu_zone, "dtrace.dtrace_dstate_percpu_t",
sizeof(dtrace_dstate_percpu_t), ZC_PERCPU);
@ -564,7 +564,7 @@ dtrace_load##bits(uintptr_t addr) \
int i; \
volatile uint16_t *flags = (volatile uint16_t *) \
&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
uintptr_t caddr = vm_memtag_canonicalize_kernel(addr); \
uintptr_t caddr = VM_KERNEL_STRIP_PTR(addr); \
\
DTRACE_ALIGNCHECK(addr, size, flags); \
\
@ -19277,8 +19277,6 @@ static int gMajDevNo;
void dtrace_early_init (void)
{
dtrace_restriction_policy_load();
/*
* See dtrace_impl.h for a description of kernel symbol modes.
* The default is to wait for symbols from userspace (lazy symbols).

View file

@ -35,14 +35,11 @@
#include <kern/debug.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <machine/machine_routines.h>
#if CONFIG_CSR
#include <sys/codesign.h>
#include <sys/csr.h>
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
extern bool csr_unsafe_kernel_text;
#endif
#endif
/*
@ -414,13 +411,6 @@ dtrace_state_free(minor_t minor)
kfree_type(dtrace_state_t, state);
}
void
dtrace_restriction_policy_load(void)
{
}
/*
* Check if DTrace has been restricted by the current security policy.
*/
@ -449,7 +439,8 @@ dtrace_are_restrictions_relaxed(void)
boolean_t
dtrace_fbt_probes_restricted(void)
{
if (!ml_unsafe_kernel_text())
return TRUE;
#if CONFIG_CSR
if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
return TRUE;
@ -462,6 +453,8 @@ boolean_t
dtrace_sdt_probes_restricted(void)
{
if (!ml_unsafe_kernel_text())
return TRUE;
return FALSE;
}

View file

@ -592,7 +592,7 @@ fasttrap_setdebug(proc_t *p)
* should not be possible for the process to actually
* disappear.
*/
struct proc_ident pident = proc_ident(p);
struct proc_ident pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT);
sprunlock(p);
p = PROC_NULL;
@ -2428,8 +2428,8 @@ fasttrap_check_cred_priv(cred_t *cr, proc_t *p)
#if CONFIG_MACF
/* Check with MAC framework when enabled. */
struct proc_ident cur_ident = proc_ident(current_proc());
struct proc_ident p_ident = proc_ident(p);
struct proc_ident cur_ident = proc_ident_with_policy(current_proc(), IDENT_VALIDATION_PROC_EXACT);
struct proc_ident p_ident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT);
/* Do not hold ref to proc here to avoid deadlock. */
proc_rele(p);

View file

@ -234,5 +234,7 @@ inline int EOWNERDEAD = 105;
#pragma D binding "1.0" EOWNERDEAD
inline int EQFULL = 106;
#pragma D binding "1.0" EQFULL
inline int ELAST = 106;
inline int ENOTCAPABLE = 107;
#pragma D binding "1.0" ENOTCAPABLE
inline int ELAST = 107;
#pragma D binding "1.0" ELAST

View file

@ -996,6 +996,18 @@ sdt_argdesc_t sdt_args[] = {
{"vminfo", "vm_sanitize", 4, 4, "uint64_t", "uint64_t" },
{"vminfo", "vm_sanitize", 5, 5, "uint64_t", "uint64_t" },
{"vminfo", "vm_sanitize", 6, 6, "uint64_t", "uint64_t" },
{"vminfo", "corpse_footprint_collect", 0, 0, "uint32_t", "uint32_t" },
{"vminfo", "corpse_footprint_collect", 1, 1, "vm_map_offset_t", "vm_map_offset_t" },
{"vminfo", "corpse_footprint_collect", 2, 2, "uint32_t", "uint32_t" },
{"vminfo", "corpse_footprint_collect", 3, 3, "vm_map_offset_t", "vm_map_offset_t" },
{"vminfo", "corpse_footprint_collect_new_region", 0, 0, "vm_map_offset_t", "vm_map_offset_t" },
{"vminfo", "corpse_footprint_collect_new_region", 1, 1, "vm_map_offset_t", "vm_map_offset_t" },
{"vminfo", "corpse_footprint_collect_new_region", 2, 2, "uint64_t", "uint64_t" },
{"vminfo", "corpse_footprint_collect_zero_gap", 0, 0, "vm_map_offset_t", "vm_map_offset_t" },
{"vminfo", "corpse_footprint_collect_zero_gap", 1, 1, "vm_map_offset_t", "vm_map_offset_t" },
{"vminfo", "corpse_footprint_collect_zero_gap", 2, 2, "uint64_t", "uint64_t" },
{"vminfo", "corpse_footprint_collect_page_info", 0, 0, "vm_map_offset_t", "vm_map_offset_t" },
{"vminfo", "corpse_footprint_collect_page_info", 1, 1, "uint8_t", "uint8_t" },
{"vminfo", "reclaim_ring_allocate", 0, 0, "mach_vm_address_t", "mach_vm_address_t" },
{"vminfo", "reclaim_ring_allocate", 1, 1, "mach_vm_reclaim_count_t", "mach_vm_reclaim_count_t" },
{"vminfo", "reclaim_ring_allocate", 2, 2, "mach_vm_reclaim_count_t", "mach_vm_reclaim_count_t" },

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000-2013 Apple Inc. All rights reserved.
* Copyright (c) 2000-2024 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@ -44,15 +44,17 @@
extern int bootarg_no32exec; /* bsd_init.c */
#endif
int ml_grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool);
/**********************************************************************
* Routine: grade_binary()
* Routine: ml_grade_binary()
*
* Function: Say OK to CPU types that we can actually execute on the given
* system. 64-bit binaries have the highest preference, followed
* by 32-bit binaries. 0 means unsupported.
**********************************************************************/
int
grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused)
ml_grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused)
{
cpu_subtype_t hostsubtype = cpu_subtype();

View file

@ -59,11 +59,6 @@ extern uint32_t kern_maxvnodes;
extern vm_map_t mb_map;
#endif /* CONFIG_MBUF_MCACHE */
#if INET
extern uint32_t tcp_sendspace;
extern uint32_t tcp_recvspace;
#endif
void bsd_bufferinit(void);
unsigned int bsd_mbuf_cluster_reserve(boolean_t *);
@ -174,27 +169,6 @@ bsd_startupearly(void)
buf_headers = (struct buf *)bufferhdr_range.min_address;
#if SOCKETS
{
static const unsigned int maxspace = 128 * 1024;
int scale;
#if INET
if ((scale = nmbclusters / NMBCLUSTERS) > 1) {
tcp_sendspace *= scale;
tcp_recvspace *= scale;
if (tcp_sendspace > maxspace) {
tcp_sendspace = maxspace;
}
if (tcp_recvspace > maxspace) {
tcp_recvspace = maxspace;
}
}
#endif /* INET */
}
#endif /* SOCKETS */
if (vnodes_sized == 0) {
if (!PE_get_default("kern.maxvnodes", &desiredvnodes, sizeof(desiredvnodes))) {
/*
@ -331,51 +305,42 @@ done:
#if defined(__LP64__)
extern int tcp_tcbhashsize;
extern int max_cached_sock_count;
#endif
void
bsd_scale_setup(int scale)
{
#if defined(__LP64__)
if ((scale > 0) && (serverperfmode == 0)) {
maxproc *= scale;
maxprocperuid = (maxproc * 2) / 3;
if (scale > 2) {
maxfiles *= scale;
maxfilesperproc = maxfiles / 2;
}
}
/* Apply server scaling rules */
if ((scale > 0) && (serverperfmode != 0)) {
maxproc = 2500 * scale;
hard_maxproc = maxproc;
/* no fp usage */
maxprocperuid = (maxproc * 3) / 4;
maxfiles = (150000 * scale);
maxfilesperproc = maxfiles / 2;
desiredvnodes = maxfiles;
vnodes_sized = 1;
tcp_tfo_backlog = 100 * scale;
if (scale > 4) {
/* clip somaxconn at 32G level */
somaxconn = 2048;
/*
* For scale > 4 (> 32G), clip
* tcp_tcbhashsize to 32K
*/
tcp_tcbhashsize = 32 * 1024;
if (scale > 7) {
/* clip at 64G level */
max_cached_sock_count = 165000;
} else {
max_cached_sock_count = 60000 + ((scale - 1) * 15000);
if (scale > 0) {
if (!serverperfmode) {
maxproc *= scale;
maxprocperuid = (maxproc * 2) / 3;
if (scale > 2) {
maxfiles *= scale;
maxfilesperproc = maxfiles / 2;
}
} else {
somaxconn = 512 * scale;
tcp_tcbhashsize = 4 * 1024 * scale;
max_cached_sock_count = 60000 + ((scale - 1) * 15000);
maxproc = 2500 * scale;
hard_maxproc = maxproc;
/* no fp usage */
maxprocperuid = (maxproc * 3) / 4;
maxfiles = (150000 * scale);
maxfilesperproc = maxfiles / 2;
desiredvnodes = maxfiles;
vnodes_sized = 1;
tcp_tfo_backlog = 100 * scale;
if (scale > 4) {
/* clip somaxconn at 32G level */
somaxconn = 2048;
/*
* For scale > 4 (> 32G), clip
* tcp_tcbhashsize to 32K
*/
tcp_tcbhashsize = 32 * 1024;
} else {
somaxconn = 512 * scale;
tcp_tcbhashsize = 4 * 1024 * scale;
}
}
}

bsd/kern/Makefile (new file, 18 additions)
View file

@ -0,0 +1,18 @@
export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
include $(MakeInc_cmd)
include $(MakeInc_def)
EXPORT_MI_DIR = kern
EXPORT_MI_LIST = qsort.h
# Don't want these XNU-internal headers installed in the SDK
INSTALL_KF_MI_LIST = $(empty)
INSTALL_KF_MI_LCL_LIST = $(empty)
include $(MakeInc_rule)
include $(MakeInc_dir)

View file

@ -163,6 +163,7 @@
#include <net/restricted_in_port.h> /* for restricted_in_port_init() */
#include <net/remote_vif.h> /* for rvi_init() */
#include <net/kctl_test.h> /* for kctl_test_init() */
#include <net/aop/kpi_aop.h> /* for kern_aop_net_init() */
#include <netinet/kpi_ipfilter_var.h> /* for ipfilter_init() */
#include <kern/assert.h> /* for assert() */
#include <sys/kern_overrides.h> /* for init_system_override() */
@ -270,7 +271,9 @@ extern void bsd_bufferinit(void);
extern void throttle_init(void);
vm_map_t bsd_pageable_map;
#if CONFIG_MBUF_MCACHE
vm_map_t mb_map;
#endif /* CONFIG_MBUF_MCACHE */
static int bsd_simul_execs;
static int bsd_pageable_map_size;
@ -491,8 +494,8 @@ bsd_init(void)
boolean_t netboot = FALSE;
#endif
#if (DEVELOPMENT || DEBUG)
platform_stall_panic_or_spin(PLATFORM_STALL_XNU_LOCATION_BSD_INIT);
#if HAS_UPSI_FAILURE_INJECTION
check_for_failure_injection(XNU_STAGE_BSD_INIT_START);
#endif
#define DEBUG_BSDINIT 0
@ -705,6 +708,7 @@ bsd_init(void)
#endif
#if SOCKETS
net_update_uptime();
#if CONFIG_MBUF_MCACHE
/* Initialize per-CPU cache allocator */
mcache_init();
@ -774,6 +778,7 @@ bsd_init(void)
necp_init();
#endif
netagent_init();
net_aop_init();
#endif /* NETWORKING */
#if CONFIG_FREEZE
@ -1067,6 +1072,10 @@ bsd_init(void)
machine_timeout_bsd_init();
#endif /* DEVELOPMENT || DEBUG */
#if HAS_UPSI_FAILURE_INJECTION
check_for_failure_injection(XNU_STAGE_BSD_INIT_END);
#endif
bsd_init_kprintf("done\n");
}

View file

@ -1,4 +0,0 @@
Ensure any new syscalls added:
- Fill in any gaps before being added to the end of the list.
- Have been reviewed by a security engineer.

View file

@ -336,6 +336,20 @@ ppl_reconstitute_code_signature(
#pragma mark Address Spaces
kern_return_t
ppl_setup_nested_address_space(
__unused pmap_t pmap,
__unused const vm_address_t region_addr,
__unused const vm_size_t region_size)
{
/*
* We don't need to do anything here from the code-signing-monitor's perspective
* because the PMAP's base address fields are set up when someone eventually calls
* pmap_nest on the PMAP object.
*/
return KERN_SUCCESS;
}
kern_return_t
ppl_associate_code_signature(
pmap_t pmap,
@ -380,7 +394,6 @@ ppl_associate_debug_region(
const vm_address_t region_addr,
const vm_size_t region_size)
{
volatile bool force_true = true;
bool debugger_mapping = false;
/*
@ -409,16 +422,6 @@ ppl_associate_debug_region(
}
#endif
/*
* For now, we're just going to revert back to our previous policy and continue
* to allow a debugger mapped to be created by a process on its own.
*
* For more information: rdar://145588999.
*/
if (force_true == true) {
debugger_mapping = true;
}
if (debugger_mapping == false) {
printf("disallowed non-debugger initiated debug mapping\n");
return KERN_DENIED;

View file

@ -621,25 +621,16 @@ get_code_signing_info(void)
txm_restricted_mode_state = txm_ro_data->restrictedModeState;
}
#if kTXMKernelAPIVersion >= 11
research_mode_enabled = txm_ro_data->buildType.research;
extended_research_mode_enabled = txm_ro_data->buildType.extendedResearch;
#endif
/* Setup the number of boot trust caches */
num_static_trust_caches = os_atomic_load(&txm_metrics->trustCaches.numStatic, relaxed);
num_engineering_trust_caches = os_atomic_load(&txm_metrics->trustCaches.numEngineering, relaxed);
}
static void
set_shared_region_base_address(void)
{
txm_call_t txm_call = {
.selector = kTXMKernelSelectorSetSharedRegionBaseAddress,
.failure_fatal = true,
.num_input_args = 2,
};
txm_kernel_call(&txm_call,
SHARED_REGION_BASE,
SHARED_REGION_SIZE);
}
void
code_signing_init(void)
{
@ -662,12 +653,6 @@ code_signing_init(void)
lck_mtx_init(&compilation_service_lock, &txm_lck_grp, 0);
lck_mtx_init(&unregister_sync_lock, &txm_lck_grp, 0);
/*
* We need to let TXM know what the shared region base address is going
* to be for this boot.
*/
set_shared_region_base_address();
/* Require signed code when monitor is enabled */
if (code_signing_enabled == true) {
cs_debug_fail_on_unsigned_code = 1;
@ -1228,6 +1213,26 @@ txm_unregister_address_space(
return KERN_SUCCESS;
}
kern_return_t
txm_setup_nested_address_space(
pmap_t pmap,
const vm_address_t region_addr,
const vm_size_t region_size)
{
txm_call_t txm_call = {
.selector = kTXMKernelSelectorSetupNestedAddressSpace,
.num_input_args = 3
};
TXMAddressSpace_t *txm_addr_space = pmap_txm_addr_space(pmap);
kern_return_t ret = KERN_DENIED;
pmap_txm_acquire_exclusive_lock(pmap);
ret = txm_kernel_call(&txm_call, txm_addr_space, region_addr, region_size);
pmap_txm_release_exclusive_lock(pmap);
return ret;
}
kern_return_t
txm_associate_code_signature(
pmap_t pmap,
@ -1260,7 +1265,7 @@ txm_associate_code_signature(
*/
vm_address_t adjusted_region_addr = region_addr;
if (txm_addr_space->addrSpaceID.type == kTXMAddressSpaceIDTypeSharedRegion) {
adjusted_region_addr += SHARED_REGION_BASE;
adjusted_region_addr += txm_addr_space->baseAddr;
}
/*

View file

@ -1347,7 +1347,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp
* alignment requirements.
*/
err = VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL,
VNODE_VERIFY_DEFAULT, NULL);
VNODE_VERIFY_DEFAULT, NULL, NULL);
if (err) {
ErrorLogWithPath("VNOP_VERIFY returned error = %d\n", err);
goto out;
@ -1597,7 +1597,7 @@ decompress:
if (!err && verify_block_size) {
size_t cur_verify_block_size = verify_block_size;
if ((err = VNOP_VERIFY(vp, uplPos, vec.buf, rounded_uplSize, &cur_verify_block_size, NULL, 0, NULL))) {
if ((err = VNOP_VERIFY(vp, uplPos, vec.buf, rounded_uplSize, &cur_verify_block_size, NULL, 0, NULL, NULL))) {
ErrorLogWithPath("Verification failed with error %d, uplPos = %lld, uplSize = %d, did_read = %d, valid_pages = %d, invalid_pages = %d, tail_page_valid = %d\n",
err, (long long)uplPos, (int)rounded_uplSize, (int)did_read, num_valid_pages, num_invalid_pages, file_tail_page_valid);
}
@ -1749,7 +1749,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c
*/
/* If the verify block size is larger than the page size, the UPL needs to aligned to it */
err = VNOP_VERIFY(vp, uplPos, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL);
err = VNOP_VERIFY(vp, uplPos, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, NULL);
if (err) {
goto out;
} else if (verify_block_size) {
@ -1858,7 +1858,7 @@ decompress:
if (!err && verify_block_size) {
size_t cur_verify_block_size = verify_block_size;
if ((err = VNOP_VERIFY(vp, curUplPos, data, curUplSize, &cur_verify_block_size, NULL, 0, NULL))) {
if ((err = VNOP_VERIFY(vp, curUplPos, data, curUplSize, &cur_verify_block_size, NULL, 0, NULL, NULL))) {
ErrorLogWithPath("Verification failed with error %d\n", err);
abort_read = 1;
}

View file

@ -37,6 +37,7 @@
#include <sys/vnode_internal.h>
#include <sys/imageboot.h>
#include <kern/assert.h>
#include <vm/vm_far.h>
#include <sys/namei.h>
#include <sys/fcntl.h>
@ -243,7 +244,8 @@ imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char
size_t bufsz = 0;
void *buf = NULL;
error_func = "imageboot_read_file";
error = imageboot_read_file_pageable(image_path, &buf, &bufsz);
// no_softlimit: di_root_ramfile_buf is OK to handle a no_softlimit buffer
error = imageboot_read_file_pageable(image_path, &buf, &bufsz, /* no_softlimit */ true);
if (error == 0) {
error_func = "di_root_ramfile_buf";
error = di_root_ramfile_buf(buf, bufsz, devname, sizeof(devname), &dev);
@ -572,7 +574,7 @@ errorout:
}
static int
imageboot_read_file_internal(const char *path, const off_t offset, const bool pageable, void **bufp, size_t *bufszp, off_t *fsizep)
imageboot_read_file_internal(const char *path, const off_t offset, const bool pageable, void **bufp, size_t *bufszp, off_t *fsizep, bool no_softlimit)
{
int err = 0;
struct nameidata ndp = {};
@ -639,26 +641,41 @@ imageboot_read_file_internal(const char *path, const off_t offset, const bool pa
PE_parse_boot_argn("rootdmg-maxsize", &maxsize, sizeof(maxsize));
if (maxsize && (maxsize < (size_t)fsize)) {
AUTHPRNT("file is too large (%lld > %lld)", (long long) fsize, (long long) maxsize);
err = ENOMEM;
err = EFBIG;
goto out;
}
if (pageable) {
vm_offset_t addr = 0;
kma_flags_t kma_flags = 0;
kma_flags = KMA_PAGEABLE | KMA_DATA_SHARED;
if (no_softlimit) {
kma_flags |= KMA_NOSOFTLIMIT;
}
if (kmem_alloc(kernel_map, &addr, (vm_size_t)fsize,
KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_FILE) == KERN_SUCCESS) {
kma_flags, VM_KERN_MEMORY_FILE) == KERN_SUCCESS) {
buf = (char *)addr;
} else {
buf = NULL;
}
} else {
zalloc_flags_t zflags = 0;
//limit kalloc data calls to only 2GB.
if (fsize > IMAGEBOOT_MAX_KALLOCSIZE) {
AUTHPRNT("file is too large for non-pageable (%lld)", (long long) fsize);
err = ENOMEM;
goto out;
}
buf = (char *)kalloc_data((vm_size_t)fsize, Z_WAITOK);
zflags = Z_WAITOK;
if (no_softlimit) {
zflags |= Z_NOSOFTLIMIT;
}
buf = (char *)kalloc_data((vm_size_t)fsize, zflags);
}
if (buf == NULL) {
err = ENOMEM;
@ -699,7 +716,7 @@ imageboot_read_file_internal(const char *path, const off_t offset, const bool pa
}
}
readbuf = &readbuf[chunksize];
readbuf = VM_FAR_ADD_PTR_UNBOUNDED(readbuf, chunksize);
readsize -= chunksize;
readoff += chunksize;
}
@ -734,21 +751,21 @@ out:
}
int
imageboot_read_file_pageable(const char *path, void **bufp, size_t *bufszp)
imageboot_read_file_pageable(const char *path, void **bufp, size_t *bufszp, bool no_softlimit)
{
return imageboot_read_file_internal(path, 0, true, bufp, bufszp, NULL);
return imageboot_read_file_internal(path, 0, true, bufp, bufszp, NULL, no_softlimit);
}
int
imageboot_read_file_from_offset(const char *path, const off_t offset, void **bufp, size_t *bufszp)
{
return imageboot_read_file_internal(path, offset, false, bufp, bufszp, NULL);
return imageboot_read_file_internal(path, offset, false, bufp, bufszp, NULL, /* no_softlimit */ false);
}
int
imageboot_read_file(const char *path, void **bufp, size_t *bufszp, off_t *fsizep)
{
return imageboot_read_file_internal(path, 0, false, bufp, bufszp, fsizep);
return imageboot_read_file_internal(path, 0, false, bufp, bufszp, fsizep, /* no_softlimit */ false);
}
#if CONFIG_IMAGEBOOT_IMG4 || CONFIG_IMAGEBOOT_CHUNKLIST
@ -896,8 +913,14 @@ imageboot_mount_ramdisk(const char *path)
vnode_t tvp;
mount_t new_rootfs;
/* Read our target image from disk */
err = imageboot_read_file_pageable(path, &buf, &bufsz);
/*
* Read our target image from disk
*
* We override the allocator soft-limit in order to allow booting large RAM
* disks. As a consequence, we are responsible for manipulating the
* buffer only through vm_far safe APIs.
*/
err = imageboot_read_file_pageable(path, &buf, &bufsz, /* no_softlimit */ true);
if (err) {
printf("%s: failed: imageboot_read_file_pageable() = %d\n", __func__, err);
goto out;
@ -1091,8 +1114,13 @@ imageboot_setup_new(imageboot_type_t type)
}
if (error) {
panic("Failed to mount root image (err=%d, auth=%d, ramdisk=%d)",
error, auth_root, ramdisk_root);
if (error == EFBIG) {
panic("root imagefile is too large (err=%d, auth=%d, ramdisk=%d)",
error, auth_root, ramdisk_root);
} else {
panic("Failed to mount root image (err=%d, auth=%d, ramdisk=%d)",
error, auth_root, ramdisk_root);
}
}
#if CONFIG_IMAGEBOOT_CHUNKLIST

File diff suppressed because it is too large.

View file

@ -58,6 +58,7 @@ TUNABLE(unsigned int, write_trace_on_panic, "trace_panic", 0);
// Obsolete leak logging system.
TUNABLE(int, log_leaks, "-l", 0);
__startup_func
void
kdebug_startup(void)
{
@ -137,7 +138,7 @@ create_buffers(
kd_data_page->kdb_region_count++;
}
if (kd_data_page->kdcopybuf == 0) {
if (kd_ctrl_page->kdebug_kdcopybuf_size > 0 && kd_data_page->kdcopybuf == NULL) {
if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_data_page->kdcopybuf,
(vm_size_t) kd_ctrl_page->kdebug_kdcopybuf_size,
KMA_DATA | KMA_ZERO, tag) != KERN_SUCCESS) {
@ -252,9 +253,20 @@ delete_buffers(struct kd_control *kd_ctrl_page,
kd_ctrl_page->kdc_flags &= ~KDBG_BUFINIT;
}
static bool
allocate_storage_unit(struct kd_control *kd_ctrl_page,
struct kd_buffer *kd_data_page, int cpu)
static void
_register_out_of_space(struct kd_control *kd_ctrl_page)
{
kd_ctrl_page->kdc_emit = KDEMIT_DISABLE;
kdebug_enable = 0;
kd_ctrl_page->enabled = 0;
commpage_update_kdebug_state();
}
bool
kdebug_storage_alloc(
struct kd_control *kd_ctrl_page,
struct kd_buffer *kd_data_page,
int cpu)
{
union kds_ptr kdsp;
struct kd_storage *kdsp_actual, *kdsp_next_actual;
@ -292,11 +304,8 @@ allocate_storage_unit(struct kd_control *kd_ctrl_page,
* storage unit we can find.
*/
if (kd_ctrl_page->kdc_live_flags & KDBG_NOWRAP) {
kd_ctrl_page->kdc_emit = KDEMIT_DISABLE;
_register_out_of_space(kd_ctrl_page);
kd_ctrl_page->kdc_live_flags |= KDBG_WRAPPED;
kdebug_enable = 0;
kd_ctrl_page->enabled = 0;
commpage_update_kdebug_state();
kdbp->kd_lostevents = true;
retval = false;
goto out;
@ -339,10 +348,7 @@ allocate_storage_unit(struct kd_control *kd_ctrl_page,
}
}
if (kdbp_vict == NULL && kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
kd_ctrl_page->kdc_emit = KDEMIT_DISABLE;
kdebug_enable = 0;
kd_ctrl_page->enabled = 0;
commpage_update_kdebug_state();
_register_out_of_space(kd_ctrl_page);
retval = false;
goto out;
}
@ -388,607 +394,3 @@ out:
return retval;
}
static void
release_storage_unit(struct kd_control *kd_ctrl_page, struct kd_buffer *kd_data_page, int cpu, uint32_t kdsp_raw)
{
struct kd_storage *kdsp_actual;
struct kd_bufinfo *kdbp;
union kds_ptr kdsp;
kdbp = &kd_data_page->kdb_info[cpu];
kdsp.raw = kdsp_raw;
int intrs_en = kdebug_storage_lock(kd_ctrl_page);
if (kdsp.raw == kdbp->kd_list_head.raw) {
/*
* it's possible for the storage unit pointed to
* by kdsp to have already been stolen... so
* check to see if it's still the head of the list
* now that we're behind the lock that protects
* adding and removing from the queue...
* since we only ever release and steal units from
* that position, if it's no longer the head
* we have nothing to do in this context
*/
kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kdsp);
kdbp->kd_list_head = kdsp_actual->kds_next;
kdsp_actual->kds_next = kd_ctrl_page->kds_free_list;
kd_ctrl_page->kds_free_list = kdsp;
kd_ctrl_page->kdc_storage_used--;
}
kdebug_storage_unlock(kd_ctrl_page, intrs_en);
}
bool
kdebug_disable_wrap(struct kd_control *ctl,
kdebug_emit_filter_t *old_emit, kdebug_live_flags_t *old_live)
{
int intrs_en = kdebug_storage_lock(ctl);
*old_emit = ctl->kdc_emit;
*old_live = ctl->kdc_live_flags;
bool wrapped = ctl->kdc_live_flags & KDBG_WRAPPED;
ctl->kdc_live_flags &= ~KDBG_WRAPPED;
ctl->kdc_live_flags |= KDBG_NOWRAP;
kdebug_storage_unlock(ctl, intrs_en);
return wrapped;
}
static void
_enable_wrap(struct kd_control *kd_ctrl_page, kdebug_emit_filter_t emit)
{
int intrs_en = kdebug_storage_lock(kd_ctrl_page);
kd_ctrl_page->kdc_live_flags &= ~KDBG_NOWRAP;
if (emit) {
kd_ctrl_page->kdc_emit = emit;
}
kdebug_storage_unlock(kd_ctrl_page, intrs_en);
}
__attribute__((always_inline))
void
kernel_debug_write(struct kd_control *kd_ctrl_page,
struct kd_buffer *kd_data_page,
struct kd_record kd_rec)
{
uint64_t now = 0;
uint32_t bindx;
kd_buf *kd;
int cpu;
struct kd_bufinfo *kdbp;
struct kd_storage *kdsp_actual;
union kds_ptr kds_raw;
disable_preemption();
if (kd_ctrl_page->enabled == 0) {
goto out;
}
if (kd_rec.cpu == -1) {
cpu = cpu_number();
} else {
cpu = kd_rec.cpu;
}
kdbp = &kd_data_page->kdb_info[cpu];
bool timestamp_is_continuous = kdbp->continuous_timestamps;
if (kd_rec.timestamp != -1) {
if (kdebug_using_continuous_time()) {
if (!timestamp_is_continuous) {
kd_rec.timestamp = absolutetime_to_continuoustime(kd_rec.timestamp);
}
} else {
if (timestamp_is_continuous) {
kd_rec.timestamp = continuoustime_to_absolutetime(kd_rec.timestamp);
}
}
kd_rec.timestamp &= KDBG_TIMESTAMP_MASK;
if (kd_rec.timestamp < kd_ctrl_page->kdc_oldest_time) {
if (kdbp->latest_past_event_timestamp < kd_rec.timestamp) {
kdbp->latest_past_event_timestamp = kd_rec.timestamp;
}
goto out;
}
}
retry_q:
kds_raw = kdbp->kd_list_tail;
if (kds_raw.raw != KDS_PTR_NULL) {
kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kds_raw);
bindx = kdsp_actual->kds_bufindx;
} else {
kdsp_actual = NULL;
bindx = kd_ctrl_page->kdebug_events_per_storage_unit;
}
if (kdsp_actual == NULL || bindx >= kd_ctrl_page->kdebug_events_per_storage_unit) {
if (allocate_storage_unit(kd_ctrl_page, kd_data_page, cpu) == false) {
/*
* this can only happen if wrapping
* has been disabled
*/
goto out;
}
goto retry_q;
}
if (kd_rec.timestamp != -1) {
/*
* IOP entries can be allocated before xnu allocates and inits the buffer
* And, Intel uses a special 0 value as an early tracing timestamp sentinel
* to set the start of trace-time-start-of-interest.
*/
if (kd_rec.timestamp < kdsp_actual->kds_timestamp) {
kdsp_actual->kds_timestamp = kd_rec.timestamp;
}
now = kd_rec.timestamp;
} else {
if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
now = kdebug_timestamp() & KDBG_TIMESTAMP_MASK;
} else {
now = mach_continuous_time() & KDBG_TIMESTAMP_MASK;
}
}
if (!OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) {
goto retry_q;
}
kd = &kdsp_actual->kds_records[bindx];
if (kd_ctrl_page->kdc_flags & KDBG_DEBUGID_64) {
/*DebugID has been passed in arg 4*/
kd->debugid = 0;
} else {
kd->debugid = kd_rec.debugid;
}
kd->arg1 = kd_rec.arg1;
kd->arg2 = kd_rec.arg2;
kd->arg3 = kd_rec.arg3;
kd->arg4 = kd_rec.arg4;
kd->arg5 = kd_rec.arg5;
kdbg_set_timestamp_and_cpu(kd, now, cpu);
OSAddAtomic(1, &kdsp_actual->kds_bufcnt);
out:
enable_preemption();
}
// Read events from kdebug storage units into a user space buffer or file.
//
// This code runs while events are emitted -- storage unit allocation and
// deallocation will synchronize with the emitters. Only one reader per control
// structure is allowed.
int
kernel_debug_read(struct kd_control *kd_ctrl_page,
struct kd_buffer *kd_data_page, user_addr_t buffer, size_t *number,
vnode_t vp, vfs_context_t ctx, uint32_t file_version)
{
size_t count;
unsigned int cpu, min_cpu;
uint64_t barrier_min = 0, barrier_max = 0, t, earliest_time;
int error = 0;
kd_buf *tempbuf;
uint32_t rcursor;
kd_buf lostevent;
union kds_ptr kdsp;
bool traced_retrograde = false;
struct kd_storage *kdsp_actual;
struct kd_bufinfo *kdbp;
struct kd_bufinfo *min_kdbp;
size_t tempbuf_count;
uint32_t tempbuf_number;
kdebug_emit_filter_t old_emit;
uint32_t old_live_flags;
bool out_of_events = false;
bool wrapped = false;
bool set_preempt = true;
bool should_disable = false;
struct kd_bufinfo *kdbip = kd_data_page->kdb_info;
struct kd_region *kd_bufs = kd_data_page->kd_bufs;
assert(number != NULL);
count = *number / sizeof(kd_buf);
*number = 0;
if (count == 0 || !(kd_ctrl_page->kdc_flags & KDBG_BUFINIT) || kd_data_page->kdcopybuf == 0) {
return EINVAL;
}
if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
/*
* A corpse can be created due to 'TASK_HAS_TOO_MANY_THREADS'
* and that can be handled by a callout thread that already
* has the eager-preemption set.
* So check to see if we are dealing with one such thread.
*/
set_preempt = !(thread_is_eager_preempt(current_thread()));
}
if (set_preempt) {
thread_set_eager_preempt(current_thread());
}
memset(&lostevent, 0, sizeof(lostevent));
lostevent.debugid = TRACE_LOST_EVENTS;
/*
* Capture the current time. Only sort events that have occurred
* before now. Since the IOPs are being flushed here, it is possible
* that events occur on the AP while running live tracing.
*/
if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
barrier_max = kdebug_timestamp() & KDBG_TIMESTAMP_MASK;
} else {
barrier_max = mach_continuous_time() & KDBG_TIMESTAMP_MASK;
}
/*
* Disable wrap so storage units cannot be stolen out from underneath us
* while merging events.
*
* Because we hold ktrace_lock, no other control threads can be playing
* with kdc_flags. The code that emits new events could be running,
* but it grabs kdc_storage_lock if it needs to acquire a new storage
* chunk, which is where it examines kdc_flags. If it is adding to
* the same chunk we're reading from, check for that below.
*/
wrapped = kdebug_disable_wrap(kd_ctrl_page, &old_emit, &old_live_flags);
if (count > kd_data_page->kdb_event_count) {
count = kd_data_page->kdb_event_count;
}
if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) {
tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count;
}
/*
* If the buffers have wrapped, do not emit additional lost events for the
* oldest storage units.
*/
if (wrapped) {
kd_ctrl_page->kdc_live_flags &= ~KDBG_WRAPPED;
for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
continue;
}
kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
kdsp_actual->kds_lostevents = false;
}
}
if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
/*
* In TRIAGE mode we want to extract all the current
* records regardless of where we stopped reading last
* time so that we have the best shot at getting older
* records for threads before the buffers are wrapped.
* So set:-
* a) kd_prev_timebase to 0 so we (re-)consider older records
* b) readlast to 0 to initiate the search from the
* 1st record.
*/
for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
kdbp->kd_prev_timebase = 0;
if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
continue;
}
kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
kdsp_actual->kds_readlast = 0;
}
}
/*
* Capture the earliest time where there are events for all CPUs and don't
* emit events with timestamps prior.
*/
barrier_min = kd_ctrl_page->kdc_oldest_time;
while (count) {
tempbuf = kd_data_page->kdcopybuf;
tempbuf_number = 0;
if (wrapped) {
/*
* Emit a lost events tracepoint to indicate that previous events
* were lost -- the thread map cannot be trusted. A new one must
* be taken so tools can analyze the trace in a backwards-facing
* fashion.
*/
kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0);
*tempbuf = lostevent;
wrapped = false;
goto nextevent;
}
/* While space left in merged events scratch buffer. */
while (tempbuf_count) {
bool lostevents = false;
int lostcpu = 0;
earliest_time = UINT64_MAX;
min_kdbp = NULL;
min_cpu = 0;
/* Check each CPU's buffers for the earliest event. */
for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
/* Skip CPUs without data in their oldest storage unit. */
if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
next_cpu:
continue;
}
/* From CPU data to buffer header to buffer. */
kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
next_event:
/* The next event to be read from this buffer. */
rcursor = kdsp_actual->kds_readlast;
/* Skip this buffer if there are no events left. */
if (rcursor == kdsp_actual->kds_bufindx) {
continue;
}
if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
/*
* TRIAGE mode record keeping doesn't (currently)
* use lostevent markers. It also doesn't want to
* call release_storage_unit() in this read call.
* It expects the buffers to wrap and records reclaimed
* in that way solely.
*/
t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
goto skip_record_checks;
}
/*
* Check that this storage unit wasn't stolen and events were
* lost. This must have happened while wrapping was disabled
* in this function.
*/
if (kdsp_actual->kds_lostevents) {
lostevents = true;
kdsp_actual->kds_lostevents = false;
/*
* The earliest event we can trust is the first one in this
* stolen storage unit.
*/
uint64_t lost_time =
kdbg_get_timestamp(&kdsp_actual->kds_records[0]);
if (kd_ctrl_page->kdc_oldest_time < lost_time) {
/*
* If this is the first time we've seen lost events for
* this gap, record its timestamp as the oldest
* timestamp we're willing to merge for the lost events
* tracepoint.
*/
kd_ctrl_page->kdc_oldest_time = barrier_min = lost_time;
lostcpu = cpu;
}
}
t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
if (t > barrier_max) {
goto next_cpu;
}
if (t < kdsp_actual->kds_timestamp) {
/*
* This indicates the event emitter hasn't completed
* filling in the event (because we're looking at the
* buffer that the record head is using). The max barrier
* timestamp should have saved us from seeing these kinds
* of things, but other CPUs might be slow on the up-take.
*
* Bail out so we don't get out-of-order events by
* continuing to read events from other CPUs' buffers.
*/
out_of_events = true;
break;
}
/*
* Ignore events that have aged out due to wrapping or storage
* unit exhaustion while merging events.
*/
if (t < barrier_min) {
kdsp_actual->kds_readlast++;
if (kdsp_actual->kds_readlast >= kd_ctrl_page->kdebug_events_per_storage_unit) {
release_storage_unit(kd_ctrl_page, kd_data_page, cpu, kdsp.raw);
if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
goto next_cpu;
}
kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
}
goto next_event;
}
/*
* Don't worry about merging any events -- just walk through
* the CPUs and find the latest timestamp of lost events.
*/
if (lostevents) {
continue;
}
skip_record_checks:
if (t < earliest_time) {
earliest_time = t;
min_kdbp = kdbp;
min_cpu = cpu;
}
}
if (lostevents) {
/*
* If any lost events were hit in the buffers, emit an event
* with the latest timestamp.
*/
kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, lostcpu);
*tempbuf = lostevent;
tempbuf->arg1 = 1;
goto nextevent;
}
if (min_kdbp == NULL) {
/* All buffers ran empty. */
out_of_events = true;
}
if (out_of_events) {
break;
}
kdsp = min_kdbp->kd_list_head;
kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
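/*
 * Report any past (out-of-window) event noted for this CPU: emit a
 * TRACE_PAST_EVENTS marker carrying its timestamp in arg1, then clear
 * the note.
 */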
if (min_kdbp->latest_past_event_timestamp != 0) {
if (kdbg_debug) {
printf("kdebug: PAST EVENT: debugid %#8x: "
"time %lld from CPU %u "
"(barrier at time %lld)\n",
kdsp_actual->kds_records[rcursor].debugid,
t, cpu, barrier_min);
}
kdbg_set_timestamp_and_cpu(tempbuf, earliest_time, min_cpu);
tempbuf->arg1 = (kd_buf_argtype)min_kdbp->latest_past_event_timestamp;
tempbuf->arg2 = 0;
tempbuf->arg3 = 0;
tempbuf->arg4 = 0;
tempbuf->debugid = TRACE_PAST_EVENTS;
min_kdbp->latest_past_event_timestamp = 0;
goto nextevent;
}
/* Copy earliest event into merged events scratch buffer. */
*tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++];
kd_buf *earliest_event = tempbuf;
if (kd_control_trace.kdc_flags & KDBG_MATCH_DISABLE) {
kd_event_matcher *match = &kd_control_trace.disable_event_match;
kd_event_matcher *mask = &kd_control_trace.disable_event_mask;
if ((earliest_event->debugid & mask->kem_debugid) == match->kem_debugid &&
(earliest_event->arg1 & mask->kem_args[0]) == match->kem_args[0] &&
(earliest_event->arg2 & mask->kem_args[1]) == match->kem_args[1] &&
(earliest_event->arg3 & mask->kem_args[2]) == match->kem_args[2] &&
(earliest_event->arg4 & mask->kem_args[3]) == match->kem_args[3]) {
should_disable = true;
}
}
if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
if (kdsp_actual->kds_readlast == kd_ctrl_page->kdebug_events_per_storage_unit) {
release_storage_unit(kd_ctrl_page, kd_data_page, min_cpu, kdsp.raw);
}
}
/*
* Watch for out of order timestamps (from IOPs).
*/
if (earliest_time < min_kdbp->kd_prev_timebase) {
/*
* If we haven't already, emit a retrograde events event.
* Otherwise, ignore this event.
*/
if (traced_retrograde) {
continue;
}
if (kdbg_debug) {
printf("kdebug: RETRO EVENT: debugid %#8x: "
"time %lld from CPU %u "
"(barrier at time %lld)\n",
kdsp_actual->kds_records[rcursor].debugid,
t, cpu, barrier_min);
}
kdbg_set_timestamp_and_cpu(tempbuf, min_kdbp->kd_prev_timebase,
kdbg_get_cpu(tempbuf));
tempbuf->arg1 = tempbuf->debugid;
tempbuf->arg2 = (kd_buf_argtype)earliest_time;
tempbuf->arg3 = 0;
tempbuf->arg4 = 0;
tempbuf->debugid = TRACE_RETROGRADE_EVENTS;
traced_retrograde = true;
} else {
min_kdbp->kd_prev_timebase = earliest_time;
}
nextevent:
tempbuf_count--;
tempbuf_number++;
tempbuf++;
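/*
 * In trace mode, stop filling the scratch buffer once enough bytes have
 * accumulated so the merged events can be flushed to the output file.
 */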
if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE &&
(RAW_file_written += sizeof(kd_buf)) >= RAW_FLUSH_SIZE) {
break;
}
}
if (tempbuf_number) {
/*
* Remember the latest timestamp of events that we've merged so we
* don't think we've lost events later.
*/
uint64_t latest_time = kdbg_get_timestamp(tempbuf - 1);
if (kd_ctrl_page->kdc_oldest_time < latest_time) {
kd_ctrl_page->kdc_oldest_time = latest_time;
}
if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
extern int kernel_debug_trace_write_to_file(user_addr_t *buffer,
size_t *number, size_t *count, size_t tempbuf_number,
vnode_t vp, vfs_context_t ctx, uint32_t file_version);
error = kernel_debug_trace_write_to_file(&buffer, number,
&count, tempbuf_number, vp, ctx, file_version);
} else if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
memcpy((void*)buffer, kd_data_page->kdcopybuf,
tempbuf_number * sizeof(kd_buf));
buffer += tempbuf_number * sizeof(kd_buf);
} else {
panic("kdebug: invalid kdebug mode %d", kd_ctrl_page->mode);
}
if (error) {
*number = 0;
error = EINVAL;
break;
}
count -= tempbuf_number;
*number += tempbuf_number;
}
if (out_of_events) {
break;
}
if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) {
tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count;
}
}
if ((old_live_flags & KDBG_NOWRAP) == 0) {
_enable_wrap(kd_ctrl_page, old_emit);
}
if (set_preempt) {
thread_clear_eager_preempt(current_thread());
}
if (should_disable) {
kernel_debug_disable();
}
return error;
}

View file

@ -22,6 +22,7 @@
#include <sys/kdebug_common.h>
#include <sys/kdebug_triage.h>
#include <machine/atomic.h>
#define TRIAGE_KDCOPYBUF_COUNT 128
#define TRIAGE_KDCOPYBUF_SIZE (TRIAGE_KDCOPYBUF_COUNT * sizeof(kd_buf))
@ -31,8 +32,6 @@ struct kd_control kd_control_triage = {
.mode = KDEBUG_MODE_TRIAGE,
.kdebug_events_per_storage_unit = TRIAGE_EVENTS_PER_STORAGE_UNIT,
.kdebug_min_storage_units_per_cpu = TRIAGE_MIN_STORAGE_UNITS_PER_CPU,
.kdebug_kdcopybuf_count = TRIAGE_KDCOPYBUF_COUNT,
.kdebug_kdcopybuf_size = TRIAGE_KDCOPYBUF_SIZE,
.kdc_flags = KDBG_DEBUGID_64,
.kdc_emit = KDEMIT_DISABLE,
.kdc_oldest_time = 0
@ -45,10 +44,9 @@ struct kd_buffer kd_buffer_triage = {
.kdb_region_count = 0,
.kdb_info = NULL,
.kd_bufs = NULL,
.kdcopybuf = NULL
.kdcopybuf = NULL,
};
static LCK_GRP_DECLARE(ktriage_grp, "ktriage");
static LCK_MTX_DECLARE(ktriage_mtx, &ktriage_grp);
@ -64,54 +62,28 @@ ktriage_unlock(void)
lck_mtx_unlock(&ktriage_mtx);
}
int
__startup_func
void
create_buffers_triage(void)
{
int error = 0;
int events_per_storage_unit, min_storage_units_per_cpu;
if (kd_control_triage.kdc_flags & KDBG_BUFINIT) {
panic("create_buffers_triage shouldn't be called once we have inited the triage system.");
panic("kdebug_triage: double-init");
}
events_per_storage_unit = kd_control_triage.kdebug_events_per_storage_unit;
min_storage_units_per_cpu = kd_control_triage.kdebug_min_storage_units_per_cpu;
uint32_t cpu_count = kdbg_cpu_count();
kd_control_triage.kdebug_cpus = cpu_count;
kd_control_triage.alloc_cpus = cpu_count;
uint32_t storage_count = cpu_count * kd_control_triage.kdebug_min_storage_units_per_cpu;
kd_control_triage.kdebug_cpus = kdbg_cpu_count();
kd_control_triage.alloc_cpus = kd_control_triage.kdebug_cpus;
kd_control_triage.kdc_coprocs = NULL;
kd_buffer_triage.kdb_storage_count = storage_count;
kd_buffer_triage.kdb_event_count = storage_count * kd_control_triage.kdebug_events_per_storage_unit;
if (kd_buffer_triage.kdb_event_count < (kd_control_triage.kdebug_cpus * events_per_storage_unit * min_storage_units_per_cpu)) {
kd_buffer_triage.kdb_storage_count = kd_control_triage.kdebug_cpus * min_storage_units_per_cpu;
} else {
kd_buffer_triage.kdb_storage_count = kd_buffer_triage.kdb_event_count / events_per_storage_unit;
int error = create_buffers(&kd_control_triage, &kd_buffer_triage, VM_KERN_MEMORY_TRIAGE);
if (error != 0) {
panic("kdebug_triage: failed to create buffers, error = %d", error);
}
kd_buffer_triage.kdb_event_count = kd_buffer_triage.kdb_storage_count * events_per_storage_unit;
kd_buffer_triage.kd_bufs = NULL;
error = create_buffers(&kd_control_triage, &kd_buffer_triage, VM_KERN_MEMORY_TRIAGE);
if (!error) {
kd_control_triage.kdc_oldest_time = mach_continuous_time();
kd_control_triage.enabled = 1;
kd_buffer_triage.kdb_storage_threshold = kd_buffer_triage.kdb_storage_count / 2;
}
return error;
}
__attribute__((noreturn))
void
delete_buffers_triage(void)
{
/*
* If create_buffers() for triage mode fails, it will call the generic delete_buffers() to
* free the resources. This specific call should never be invoked because we expect the
* triage system to always be ON.
*/
panic("delete_buffers_triage shouldn't be invoked");
// Immediately enable triage recording.
kd_control_triage.enabled = 1;
}
ktriage_strings_t ktriage_subsystems_strings[KDBG_TRIAGE_SUBSYS_MAX + 1];
@ -150,38 +122,191 @@ ktriage_convert_to_string(uint64_t debugid, uintptr_t arg, char *buf, uint32_t b
return;
}
static void
_write_triage_record_nopreempt(uintptr_t debugid, uintptr_t arg, uintptr_t thread_id)
{
uint64_t now = 0;
uint32_t bindx;
kd_buf *kd;
struct kd_storage *kdsp_actual;
union kds_ptr kds_raw;
if (!kd_control_triage.enabled) {
return;
}
int cpu = cpu_number();
struct kd_bufinfo *info = &kd_buffer_triage.kdb_info[cpu];
const uint32_t events_per_storage = kd_control_triage.kdebug_events_per_storage_unit;
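/*
 * Claim a record slot: reuse this CPU's tail storage unit while it has
 * room, otherwise allocate a fresh unit and retry.
 */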
while (true) {
kds_raw = info->kd_list_tail;
if (kds_raw.raw != KDS_PTR_NULL) {
kdsp_actual = POINTER_FROM_KDS_PTR(kd_buffer_triage.kd_bufs, kds_raw);
bindx = kdsp_actual->kds_bufindx;
} else {
kdsp_actual = NULL;
bindx = events_per_storage;
}
if (kdsp_actual == NULL || bindx >= events_per_storage) {
if (kdebug_storage_alloc(&kd_control_triage, &kd_buffer_triage, cpu) == false) {
break;
}
continue;
}
now = mach_continuous_time() & KDBG_TIMESTAMP_MASK;
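/* Atomically claim slot bindx; if the swap loses a race, retry the loop. */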
if (OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) {
kd = &kdsp_actual->kds_records[bindx];
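/*
 * Triage records carry the 64-bit debugid in arg4 and the thread ID in
 * arg5 (per KDBG_DEBUGID_64); the 32-bit debugid field stays zero.
 */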
kd->debugid = 0;
kd->arg1 = arg;
kd->arg2 = 0;
kd->arg3 = 0;
kd->arg4 = debugid;
kd->arg5 = thread_id;
kd->timestamp = now;
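/*
 * Publish the record: readers load kds_bufcnt with acquire, so this
 * release increment makes the fields above visible before the count.
 */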
os_atomic_inc(&kdsp_actual->kds_bufcnt, release);
break;
}
}
}
void
ktriage_record(
uint64_t thread_id,
uint64_t debugid,
uintptr_t arg)
{
struct kd_record kd_rec;
if (thread_id == 0) {
thread_id = thread_tid(current_thread());
}
disable_preemption();
_write_triage_record_nopreempt(debugid, arg, thread_id);
enable_preemption();
}
kd_rec.cpu = -1;
kd_rec.timestamp = -1;
static struct kd_storage *
_find_triage_min_storage(uint64_t thread_id)
{
uint64_t earliest_time = UINT64_MAX;
struct kd_storage *min_store = NULL;
/*
* use 64-bit debugid per our flag KDBG_DEBUGID_64
* that is set in kd_control_triage (on LP64 only).
*/
assert(kd_control_triage.kdc_flags & KDBG_DEBUGID_64);
// Find the earliest record from all CPUs.
for (unsigned int cpu = 0; cpu < kd_control_triage.kdebug_cpus; cpu++) {
struct kd_bufinfo *info = &kd_buffer_triage.kdb_info[cpu];
union kds_ptr store_ptr = info->kd_list_head;
if (store_ptr.raw == KDS_PTR_NULL) {
continue;
}
struct kd_storage *store = POINTER_FROM_KDS_PTR(kd_buffer_triage.kd_bufs, store_ptr);
kd_buf *found_rec = NULL;
kd_rec.debugid = 0;
kd_rec.arg4 = (uintptr_t)debugid;
while (store) {
unsigned int last_read = store->kds_readlast;
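/* Pairs with the release increment in _write_triage_record_nopreempt(). */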
unsigned int const limit = os_atomic_load(&store->kds_bufcnt, acquire);
while (last_read < limit) {
// Skip any records that didn't come from the target thread.
kd_buf *rec = &store->kds_records[last_read];
if (rec->arg5 == thread_id) {
found_rec = rec;
break;
}
last_read++;
}
if (found_rec) {
store->kds_readlast = last_read;
break;
}
kd_rec.arg1 = arg;
kd_rec.arg2 = 0;
kd_rec.arg3 = 0;
kd_rec.arg5 = (uintptr_t)thread_id;
store_ptr = store->kds_next;
if (store_ptr.raw == KDS_PTR_NULL) {
break;
}
store = POINTER_FROM_KDS_PTR(kd_buffer_triage.kd_bufs, store_ptr);
}
kernel_debug_write(&kd_control_triage,
&kd_buffer_triage,
kd_rec);
if (found_rec) {
uint64_t t = found_rec->timestamp;
if (t < earliest_time) {
earliest_time = t;
min_store = store;
}
}
}
return min_store;
}
/// Copy a time-ordered series of records pertaining to the given thread to a
/// buffer. Returns the number of records written into the buffer.
///
/// Mutual exclusion must be provided by the caller.
///
/// This is similar to `_read_trace_records`, except for a few triage-specific
/// additions and the removal of significant complexity for handling lost
/// events, coprocessors, and direct file writing.
static size_t
_read_triage_records(kd_buf *read_buffer,
size_t max_count,
uint64_t thread_id)
{
struct kd_bufinfo *bufinfos = kd_buffer_triage.kdb_info;
struct kd_region *region = kd_buffer_triage.kd_bufs;
size_t avail_count = MIN(max_count, kd_buffer_triage.kdb_event_count);
size_t read_count = 0;
if (avail_count == 0 ||
!(kd_control_triage.kdc_flags & KDBG_BUFINIT)) {
return 0;
}
// `thread_call` threads created due to corpse creation may already have the
// eager preemption bit set, so don't over-do it.
bool set_preempt = !(thread_is_eager_preempt(current_thread()));
if (set_preempt) {
thread_set_eager_preempt(current_thread());
}
// Prevent any writers from stealing storage units -- just drop their logs
// on the floor instead.
int intrs_en = kdebug_storage_lock(&kd_control_triage);
kd_control_triage.kdc_flags |= KDBG_NOWRAP;
kdebug_storage_unlock(&kd_control_triage, intrs_en);
// Clear out any previous accumulated state from earlier reads, as triage
// wants to reconsider all available data.
for (unsigned int cpu = 0; cpu < kd_control_triage.kdebug_cpus; cpu++) {
struct kd_bufinfo *info = &bufinfos[cpu];
info->kd_prev_timebase = 0;
union kds_ptr kdsp = info->kd_list_head;
while (kdsp.raw != KDS_PTR_NULL) {
struct kd_storage *store = POINTER_FROM_KDS_PTR(region, kdsp);
store->kds_readlast = 0;
kdsp = store->kds_next;
}
}
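/*
 * Merge: repeatedly pull the earliest remaining record for the target
 * thread across all CPUs until the buffer fills or no records remain.
 */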
while (avail_count) {
struct kd_storage *min_store = _find_triage_min_storage(thread_id);
if (min_store == NULL) {
break;
}
*read_buffer++ = min_store->kds_records[min_store->kds_readlast++];
avail_count--;
read_count++;
}
intrs_en = kdebug_storage_lock(&kd_control_triage);
kd_control_triage.kdc_flags &= ~KDBG_NOWRAP;
kdebug_storage_unlock(&kd_control_triage, intrs_en);
if (set_preempt) {
thread_clear_eager_preempt(current_thread());
}
return read_count;
}
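/*
 * Illustrative sketch (hypothetical caller, not part of this file):
 * extract up to 64 records for the current thread into a local scratch
 * buffer, holding the triage lock as _read_triage_records() requires.
 *
 *	kd_buf records[64];
 *	ktriage_lock();
 *	size_t n = _read_triage_records(records, 64,
 *	    thread_tid(current_thread()));
 *	ktriage_unlock();
 */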
void
@ -190,11 +315,9 @@ ktriage_extract(
void *buf,
uint32_t bufsz)
{
size_t i, record_bytes, record_cnt, record_bufsz;
size_t record_cnt = 0, record_bufsz;
void *record_buf;
void *local_buf;
int ret;
if (thread_id == 0 || buf == NULL || bufsz < KDBG_TRIAGE_MAX_STRLEN) {
return;
@ -203,43 +326,28 @@ ktriage_extract(
local_buf = buf;
bzero(local_buf, bufsz);
record_bytes = record_bufsz = kd_buffer_triage.kdb_event_count * sizeof(kd_buf);
record_bufsz = kd_buffer_triage.kdb_event_count * sizeof(kd_buf);
record_buf = kalloc_data(record_bufsz, Z_WAITOK);
if (record_buf == NULL) {
ret = ENOMEM;
printf("kdebug_triage: failed to allocate %lu bytes for record\n",
record_bufsz);
return;
} else {
ktriage_lock();
ret = kernel_debug_read(&kd_control_triage,
&kd_buffer_triage,
(user_addr_t) record_buf, &record_bytes, NULL, NULL, 0);
record_cnt = _read_triage_records(record_buf,
kd_buffer_triage.kdb_event_count, thread_id);
ktriage_unlock();
}
if (ret) {
printf("ktriage_extract: kernel_debug_read failed with %d\n", ret);
kfree_data(record_buf, record_bufsz);
return;
}
kd_buf *kd = (kd_buf*) record_buf;
i = 0;
record_cnt = record_bytes; /* kernel_debug_read() takes number of bytes that it
* converts to kd_bufs. It processes a max of those and
* returns number of kd_buf read/processed. We use a
* different variable here to make our units clear.
*/
while (i < record_cnt) {
if (kd->arg5 == (uintptr_t)thread_id) {
ktriage_convert_to_string(kd->arg4, kd->arg1, local_buf, KDBG_TRIAGE_MAX_STRLEN);
local_buf = (void *)((uintptr_t)local_buf + KDBG_TRIAGE_MAX_STRLEN);
bufsz -= KDBG_TRIAGE_MAX_STRLEN;
if (bufsz < KDBG_TRIAGE_MAX_STRLEN) {
break;
}
kd_buf *kd = (kd_buf *)record_buf;
for (size_t i = 0; i < record_cnt; i++) {
assert3u(kd->arg5, ==, thread_id);
ktriage_convert_to_string(kd->arg4, kd->arg1, local_buf, KDBG_TRIAGE_MAX_STRLEN);
local_buf = (void *)((uintptr_t)local_buf + KDBG_TRIAGE_MAX_STRLEN);
bufsz -= KDBG_TRIAGE_MAX_STRLEN;
if (bufsz < KDBG_TRIAGE_MAX_STRLEN) {
break;
}
i++;
kd++;
}
@ -346,6 +454,9 @@ const char *vm_triage_strings[] =
[KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADMAP_ERROR] = "mach_vm_allocate_kernel failed due to bad map\n",
[KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADSIZE_ERROR] = "mach_vm_allocate_kernel failed due to bad size\n",
[KDBG_TRIAGE_VM_ALLOCATE_KERNEL_VMMAPENTER_ERROR] = "mach_vm_allocate_kernel failed within call to vm_map_enter\n",
[KDBG_TRIAGE_VM_IOPL_ON_EXEC_PAGE] = "Attempted I/O wiring of page with executable mapping\n",
[KDBG_TRIAGE_VM_EXEC_ON_IOPL_PAGE] = "Attempted executable mapping of page already wired for I/O\n",
[KDBG_TRIAGE_VM_UPL_WRITE_ON_EXEC_REGION] = "Attempted writable UPL against executable VM region\n",
};
/* VM end */

File diff suppressed because it is too large

View file

@ -132,7 +132,7 @@ static int kauth_authorize_generic_callback(kauth_cred_t _credential, void
uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
kauth_scope_t kauth_scope_fileop;
extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int);
extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int);
extern char * get_pathbuff(void);
extern void release_pathbuff(char *path);

View file

@ -232,6 +232,7 @@ code_signing_configuration(
int cs_enforcement_disabled = 0;
int cs_integrity_skip = 0;
int amfi_relax_profile_trust = 0;
int amfi_dev_mode_policy = 0;
/* Parse the AMFI mask */
PE_parse_boot_argn("amfi", &amfi_mask, sizeof(amfi_mask));
@ -272,6 +273,12 @@ code_signing_configuration(
&amfi_relax_profile_trust,
sizeof(amfi_relax_profile_trust));
/* Parse the AMFI customer developer mode policy */
PE_parse_boot_argn(
"amfi_dev_mode_policy",
&amfi_dev_mode_policy,
sizeof(amfi_dev_mode_policy));
/* CS_CONFIG_UNRESTRICTED_DEBUGGING */
if (amfi_mask & CS_AMFI_MASK_UNRESTRICT_TASK_FOR_PID) {
config |= CS_CONFIG_UNRESTRICTED_DEBUGGING;
@ -316,6 +323,11 @@ code_signing_configuration(
config |= CS_CONFIG_RELAX_PROFILE_TRUST;
}
/* CS_CONFIG_DEV_MODE_POLICY */
if (amfi_dev_mode_policy) {
config |= CS_CONFIG_DEV_MODE_POLICY;
}
#if CONFIG_SPTM
if (csm_enabled() == true) {
@ -503,6 +515,29 @@ developer_mode_state(void)
return os_atomic_load(developer_mode_enabled, relaxed);
}
#pragma mark Research Mode
SECURITY_READ_ONLY_LATE(bool) research_mode_enabled = false;
SECURITY_READ_ONLY_LATE(bool) extended_research_mode_enabled = false;
bool
research_mode_state(void)
{
if (allow_research_modes() == true) {
return research_mode_enabled;
}
return false;
}
bool
extended_research_mode_state(void)
{
if (allow_research_modes() == true) {
return extended_research_mode_enabled;
}
return false;
}
#pragma mark Restricted Execution Mode
kern_return_t
@ -982,7 +1017,7 @@ csm_resolve_os_entitlements_from_proc(
}
kern_return_t
address_space_debugged(
address_space_debugged_state(
const proc_t process)
{
/* Must pass in a valid proc_t */
@ -1049,6 +1084,12 @@ address_space_debugged(
return KERN_DENIED;
}
bool
is_address_space_debugged(const proc_t process)
{
return address_space_debugged_state(process) == KERN_SUCCESS;
}
#if CODE_SIGNING_MONITOR
bool
@ -1207,6 +1248,18 @@ csm_reconstitute_code_signature(
unneeded_size);
}
kern_return_t
csm_setup_nested_address_space(
pmap_t pmap,
const vm_address_t region_addr,
const vm_size_t region_size)
{
return CSM_PREFIX(setup_nested_address_space)(
pmap,
region_addr,
region_size);
}
kern_return_t
csm_associate_code_signature(
pmap_t monitor_pmap,

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000-2021 Apple Computer, Inc. All rights reserved.
* Copyright (c) 2000-2025 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@ -32,7 +32,7 @@
* This file contains machine independent code for performing core dumps.
*
*/
#if CONFIG_COREDUMP
#if CONFIG_COREDUMP || CONFIG_UCOREDUMP
#include <mach/vm_param.h>
#include <mach/thread_status.h>
@ -73,13 +73,85 @@
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */
#include <kdp/core_notes.h>
extern int freespace_mb(vnode_t vp);
/* XXX not in a Mach header anywhere */
kern_return_t thread_getstatus(thread_t act, int flavor,
thread_state_t tstate, mach_msg_type_number_t *count);
void task_act_iterate_wth_args(task_t, void (*)(thread_t, void *), void *);
#ifdef SECURE_KERNEL
__XNU_PRIVATE_EXTERN int do_coredump = 0; /* default: don't dump cores */
#else
__XNU_PRIVATE_EXTERN int do_coredump = 1; /* default: dump cores */
#endif /* SECURE_KERNEL */
__XNU_PRIVATE_EXTERN int sugid_coredump = 0; /* default: but not SGUID binaries */
#if CONFIG_UCOREDUMP
__XNU_PRIVATE_EXTERN int do_ucoredump = 0; /* default: kernel does dumps */
#endif
/*
* is_coredump_eligible
*
* Determine whether a core should be dumped at all (by any mechanism)
*
* Does NOT include disk permission or space constraints
*
* core_proc Process to dump core [*] must be current proc!
*
* Return: 0 Success
* !0 Failure errno
*/
int
is_coredump_eligible(proc_t core_proc)
{
if (current_proc() != core_proc && (
core_proc->p_exit_reason &&
core_proc->p_exit_reason->osr_namespace == OS_REASON_JETSAM)) {
return EPERM;
}
if (current_proc() != core_proc) {
panic("coredump for proc that is not current: %p)", core_proc);
}
vfs_context_t ctx = vfs_context_current();
kauth_cred_t cred = vfs_context_ucred(ctx);
if (do_coredump == 0 || /* Not dumping at all */
((sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */
((kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) ||
(kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) {
return EPERM;
}
#if CONFIG_MACF
const int error = mac_proc_check_dump_core(core_proc);
if (error != 0) {
return error;
}
#endif
return 0;
}
#else /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */
/* When core dumps aren't needed, no need to compile this file at all */
#error assertion failed: this section is not compiled
#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */
#if CONFIG_COREDUMP
#define COREDUMP_CUSTOM_LOCATION_ENTITLEMENT "com.apple.private.custom-coredump-location"
typedef struct {
int flavor; /* the number for this flavor */
mach_msg_type_number_t count; /* count of ints in this flavor */
mach_msg_type_number_t count; /* count of ints in this flavor */
} mythread_state_flavor_t;
#if defined (__i386__) || defined (__x86_64__)
@ -109,21 +181,6 @@ typedef struct {
size_t flavor_count;
} tir_t;
extern int freespace_mb(vnode_t vp);
/* XXX not in a Mach header anywhere */
kern_return_t thread_getstatus(thread_t act, int flavor,
thread_state_t tstate, mach_msg_type_number_t *count);
void task_act_iterate_wth_args(task_t, void (*)(thread_t, void *), void *);
#ifdef SECURE_KERNEL
__XNU_PRIVATE_EXTERN int do_coredump = 0; /* default: don't dump cores */
#else
__XNU_PRIVATE_EXTERN int do_coredump = 1; /* default: dump cores */
#endif /* SECURE_KERNEL */
__XNU_PRIVATE_EXTERN int sugid_coredump = 0; /* default: but not SGUID binaries */
/* cpu_type returns only the most generic indication of the current CPU. */
/* in a core we want to know the kind of process. */
@ -299,9 +356,9 @@ dump_notes(proc_t __unused core_proc, vm_offset_t header, size_t hoffset, struct
* indicated
*
* Parameters: core_proc Process to dump core [*]
* reserve_mb If non-zero, leave filesystem with
* at least this much free space.
* coredump_flags Extra options (ignore rlimit, run fsync)
* reserve_mb If non-zero, leave filesystem with
* at least this much free space.
* coredump_flags Extra options (ignore rlimit, run fsync)
*
* Returns: 0 Success
* !0 Failure errno
@ -344,8 +401,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
mach_msg_type_number_t vbrcount = 0;
tir_t tir1;
struct vnode * vp;
struct mach_header *mh = NULL; /* protected by is_64 */
struct mach_header_64 *mh64 = NULL; /* protected by is_64 */
struct mach_header *mh = NULL; /* protected by is_64 */
struct mach_header_64 *mh64 = NULL; /* protected by is_64 */
int is_64 = 0;
size_t mach_header_sz = sizeof(struct mach_header);
size_t segment_command_sz = sizeof(struct segment_command);
@ -358,27 +415,10 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
bool include_iokit_memory = task_is_driver(task);
bool coredump_attempted = false;
if (current_proc() != core_proc) {
COREDUMPLOG("Skipping coredump (called against proc that is not current_proc: %p)", core_proc);
error = EFAULT;
if ((error = is_coredump_eligible(core_proc)) != 0) {
goto out2;
}
if (do_coredump == 0 || /* Not dumping at all */
((sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */
((kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) ||
(kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) {
error = EFAULT;
goto out2;
}
#if CONFIG_MACF
error = mac_proc_check_dump_core(core_proc);
if (error != 0) {
goto out2;
}
#endif
if (IS_64BIT_PROCESS(core_proc)) {
is_64 = 1;
mach_header_sz = sizeof(struct mach_header_64);
@ -425,6 +465,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
(void) task_suspend_internal(task);
alloced_name = zalloc_flags(ZV_NAMEI, Z_NOWAIT | Z_ZERO);
/* create name according to sysctl'able format string */
@ -456,7 +497,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
goto out;
}
VATTR_INIT(vap); /* better to do it here than waste more stack in vnode_setsize */
VATTR_INIT(vap); /* better to do it here than waste more stack in vnode_setsize */
VATTR_SET(vap, va_data_size, 0);
if (core_proc == initproc) {
VATTR_SET(vap, va_dataprotect_class, PROTECTION_CLASS_D);
@ -479,7 +520,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
*/
thread_count = get_task_numacts(task);
segment_count = get_vmmap_entries(map); /* XXX */
segment_count = get_vmmap_entries(map); /* XXX */
tir1.flavor_count = sizeof(thread_flavor_array) / sizeof(mythread_state_flavor_t);
bcopy(thread_flavor_array, flavors, sizeof(thread_flavor_array));
tstate_size = 0;
@ -561,8 +602,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
mh->sizeofcmds = (uint32_t)command_size;
}
hoffset = mach_header_sz; /* offset into header */
foffset = round_page(header_size); /* offset into file */
hoffset = mach_header_sz; /* offset into header */
foffset = round_page(header_size); /* offset into file */
vmoffset = MACH_VM_MIN_ADDRESS; /* offset into VM */
COREDUMPLOG("mach header size: %zu", header_size);
@ -694,8 +735,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
sc->segname[0] = 0;
sc->vmaddr = CAST_DOWN_EXPLICIT(uint32_t, vmoffset);
sc->vmsize = CAST_DOWN_EXPLICIT(uint32_t, vmsize);
sc->fileoff = CAST_DOWN_EXPLICIT(uint32_t, foffset); /* will never truncate */
sc->filesize = CAST_DOWN_EXPLICIT(uint32_t, fsize); /* will never truncate */
sc->fileoff = CAST_DOWN_EXPLICIT(uint32_t, foffset); /* will never truncate */
sc->filesize = CAST_DOWN_EXPLICIT(uint32_t, fsize); /* will never truncate */
sc->maxprot = maxprot;
sc->initprot = prot;
sc->nsects = 0;
@ -791,10 +832,4 @@ out2:
return error;
}
#else /* CONFIG_COREDUMP */
/* When core dumps aren't needed, no need to compile this file at all */
#error assertion failed: this section is not compiled
#endif /* CONFIG_COREDUMP */

View file

@ -1918,7 +1918,7 @@ kauth_cred_change_egid(kauth_cred_t cred, gid_t new_egid)
}
uid_t
__mockable uid_t
kauth_cred_getuid(kauth_cred_t cred)
{
return posix_cred_get(cred)->cr_uid;
@ -3414,7 +3414,9 @@ kauth_cred_init(void)
smr_shash_init(&kauth_cred_hash, SMRSH_BALANCED, maxproc / 4);
vfs_context0.vc_ucred = posix_cred_create(&kernel_cred_template);
}
#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* smr not supported in user-mode */
STARTUP(ZALLOC, STARTUP_RANK_LAST, kauth_cred_init);
#endif /* __BUILDING_XNU_LIB_UNITTEST__ */
uid_t
kauth_getuid(void)

View file

@ -43,7 +43,7 @@
#include <libkern/section_keywords.h>
#include <pexpert/device_tree.h>
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
#include <arm64/amcc_rorgn.h>
#endif
@ -120,6 +120,15 @@ _csr_is_restore_environment(void)
return PE_parse_boot_argn("-restore", &notused, sizeof(notused));
}
static bool
_csr_is_darwinos_ramdisk(void)
{
DTEntry chosen;
return SecureDTLookupEntry(0, "/chosen", &chosen) == kSuccess &&
_csr_dt_string_is_equal(&chosen, "osenvironment", "darwinos-ramdisk");
}
static bool
_csr_is_iuou_or_iuos_device(void)
{
@ -201,7 +210,8 @@ csr_bootstrap(void)
// This is required so the MSU stack can mount/unmount the update volume
// during paired recovery.
if (_csr_is_recovery_environment() ||
_csr_is_restore_environment()) {
_csr_is_restore_environment() ||
_csr_is_darwinos_ramdisk()) {
csr_config |= CSR_ALLOW_UNRESTRICTED_FS;
}
@ -216,13 +226,6 @@ csr_bootstrap(void)
} else {
csr_config &= ~CSR_ALLOW_UNAUTHENTICATED_ROOT;
}
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
// Check whether we have to disable CTRR.
// lp-sip2 in the local boot policy is the bit driving this,
// which csrutil also sets implicitly when e.g. requesting kernel debugging.
csr_unsafe_kernel_text = _csr_get_dt_bool(&entry, "lp-sip2", &bool_value) && bool_value;
#endif
}
STARTUP(TUNABLES, STARTUP_RANK_FIRST, csr_bootstrap);

View file

@ -1731,6 +1731,8 @@ fp_close_and_unlock(proc_t p, kauth_cred_t cred, int fd, struct fileproc *fp, in
proc_fdunlock(p);
if (FILEGLOB_DTYPE(fg) == DTYPE_VNODE) {
vnode_t vp = (vnode_t)fg_get_data(fg);
/*
* call out to allow 3rd party notification of close.
* Ignore result of kauth_authorize_fileop call.
@ -1742,15 +1744,15 @@ fp_close_and_unlock(proc_t p, kauth_cred_t cred, int fd, struct fileproc *fp, in
#endif
if (kauth_authorize_fileop_has_listeners() &&
vnode_getwithref((vnode_t)fg_get_data(fg)) == 0) {
vnode_getwithref(vp) == 0) {
u_int fileop_flags = 0;
if (fg->fg_flag & FWASWRITTEN) {
fileop_flags |= KAUTH_FILEOP_CLOSE_MODIFIED;
}
kauth_authorize_fileop(fg->fg_cred, KAUTH_FILEOP_CLOSE,
(uintptr_t)fg_get_data(fg), (uintptr_t)fileop_flags);
(uintptr_t)vp, (uintptr_t)fileop_flags);
vnode_put((vnode_t)fg_get_data(fg));
vnode_put(vp);
}
}
@ -1861,6 +1863,16 @@ dupfdopen(proc_t p, int indx, int dfd, int flags, int error)
return EPERM;
}
if (wfp->f_type == DTYPE_VNODE) {
vnode_t vp = (vnode_t)fp_get_data(wfp);
/* Don't allow opening symlink if O_SYMLINK was not specified. */
if (vp && (vp->v_type == VLNK) && ((flags & O_SYMLINK) == 0)) {
proc_fdunlock(p);
return ELOOP;
}
}
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
@ -2764,6 +2776,7 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
int i, tmp, error, error2, flg = 0;
struct flock fl = {};
struct flocktimeout fltimeout;
struct user32_flocktimeout user32_fltimeout;
struct timespec *timeout = NULL;
off_t offset;
int newmin;
@ -3025,9 +3038,20 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
/* Copy in the lock structure */
if (F_SETLKWTIMEOUT == cmd || F_OFD_SETLKWTIMEOUT == cmd) {
error = copyin(argp, (caddr_t) &fltimeout, sizeof(fltimeout));
if (error) {
goto outdrop;
/* timespec uses long, so munge when we're dealing with 32-bit userspace */
if (is64bit) {
error = copyin(argp, (caddr_t) &fltimeout, sizeof(fltimeout));
if (error) {
goto outdrop;
}
} else {
error = copyin(argp, (caddr_t) &user32_fltimeout, sizeof(user32_fltimeout));
if (error) {
goto outdrop;
}
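/* Widen the 32-bit time fields into the native flocktimeout. */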
fltimeout.fl = user32_fltimeout.fl;
fltimeout.timeout.tv_sec = user32_fltimeout.timeout.tv_sec;
fltimeout.timeout.tv_nsec = user32_fltimeout.timeout.tv_nsec;
}
fl = fltimeout.fl;
timeout = &fltimeout.timeout;
@ -4401,6 +4425,16 @@ dropboth:
struct vnode_attr va;
#if CONFIG_MACF
// tmp has already explicitly downcast to uint32_t above.
uint32_t dataprotect_class = (uint32_t)tmp;
if ((error = mac_vnode_check_dataprotect_set(ctx, vp, &dataprotect_class))) {
vnode_put(vp);
goto outdrop;
}
tmp = (int)dataprotect_class;
#endif
VATTR_INIT(&va);
VATTR_SET(&va, va_dataprotect_class, tmp);
@ -5479,11 +5513,11 @@ fstat(proc_t p, int fd, user_addr_t ub, user_addr_t xsecurity,
case DTYPE_VNODE:
if ((error = vnode_getwithref((vnode_t)data)) == 0) {
/*
* If the caller has the file open, and is not
* requesting extended security information, we are
* If the caller has the file open for reading, and is
* not requesting extended security information, we are
* going to let them get the basic stat information.
*/
if (xsecurity == USER_ADDR_NULL) {
if ((fp->f_flag & FREAD) && (xsecurity == USER_ADDR_NULL)) {
error = vn_stat_noauth((vnode_t)data, sbptr, NULL, isstat64, 0, ctx,
fp->fp_glob->fg_cred);
} else {

View file

@ -148,7 +148,7 @@ SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
@ -310,15 +310,16 @@ extern const struct filterops fsevent_filtops;
extern const struct filterops vnode_filtops;
extern const struct filterops tty_filtops;
const static struct filterops file_filtops;
const static struct filterops kqread_filtops;
const static struct filterops proc_filtops;
const static struct filterops timer_filtops;
const static struct filterops user_filtops;
const static struct filterops workloop_filtops;
__security_const_early static struct filterops file_filtops;
__security_const_early static struct filterops kqread_filtops;
__security_const_early static struct filterops proc_filtops;
__security_const_early static struct filterops timer_filtops;
__security_const_early static struct filterops user_filtops;
__security_const_early static struct filterops workloop_filtops;
#if CONFIG_EXCLAVES
extern const struct filterops exclaves_notification_filtops;
#endif /* CONFIG_EXCLAVES */
extern const struct filterops aio_filtops;
/*
*
@ -340,7 +341,7 @@ static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
/* Public Filters */
[~EVFILT_READ] = &file_filtops,
[~EVFILT_WRITE] = &file_filtops,
[~EVFILT_AIO] = &bad_filtops,
[~EVFILT_AIO] = &aio_filtops,
[~EVFILT_VNODE] = &file_filtops,
[~EVFILT_PROC] = &proc_filtops,
[~EVFILT_SIGNAL] = &sig_filtops,

View file

@ -102,6 +102,7 @@
#include <sys/kdebug.h>
#include <sys/signal.h>
#include <sys/aio_kern.h>
#include <sys/lockdown_mode.h>
#include <sys/sysproto.h>
#include <sys/sysctl.h>
#include <sys/persona.h>
@ -175,12 +176,14 @@
#include <machine/pal_routines.h>
#include <pexpert/pexpert.h>
#include <pexpert/device_tree.h>
#if CONFIG_MEMORYSTATUS
#include <sys/kern_memorystatus.h>
#endif
#include <IOKit/IOBSD.h>
#include <IOKit/IOKitKeys.h> /* kIODriverKitEntitlementKey */
#include "kern_exec_internal.h"
@ -204,7 +207,17 @@ static TUNABLE(bool, unentitled_ios_sim_launch, "unentitled_ios_sim_launch", fal
#endif /* DEBUG || DEVELOPMENT */
#endif /* XNU_TARGET_OS_OSX */
#if DEVELOPMENT || DEBUG
os_log_t exec_log_handle = NULL;
#define EXEC_LOG(fmt, ...) \
do { \
if (exec_log_handle) { \
os_log_with_type(exec_log_handle, OS_LOG_TYPE_INFO, "exec - %s:%d " fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__); \
} \
} while (0)
#else /* DEVELOPMENT || DEBUG */
#define EXEC_LOG(fmt, ...) do { } while (0)
#endif /* DEVELOPMENT || DEBUG */
#if CONFIG_DTRACE
/* Do not include dtrace.h, it redefines kmem_[alloc/free] */
@ -290,7 +303,6 @@ int task_add_conclave(task_t task, void *vnode, int64_t off, const char *task_co
kern_return_t task_inherit_conclave(task_t old_task, task_t new_task, void *vnode, int64_t off);
#endif /* CONFIG_EXCLAVES */
/*
* Mach things for which prototypes are unavailable from Mach headers
*/
@ -351,40 +363,6 @@ extern int nextpidversion;
*/
#define SPAWN_SET_PANIC_CRASH_BEHAVIOR "com.apple.private.spawn-panic-crash-behavior"
/*
* This entitlement marks security critical binaries for which the spawned
* process should be hardened. Implies enable-by-default for enablement
* of security features. These defaults can be overridden with the control
* entitlements for the sub-features below.
*/
#define SPAWN_ENABLE_HARDENED_PROCESS "com.apple.developer.hardened-process"
#if DEVELOPMENT || DEBUG
/*
* The following boot-arg defines the behavior for the case
* where a binary entitled as hardened-process but doesn't
* have a specific sub-feature entitlement, which is still
* under adoption.
*/
typedef enum {
HARDENED_PROCESS_CONFIG_SILENT = 0,
HARDENED_PROCESS_CONFIG_LOG = 1,
HARDENED_PROCESS_CONFIG_FATAL = 2,
HARDENED_PROCESS_CONFIG_MAX = 3
} hardened_process_config_policy;
TUNABLE(hardened_process_config_policy,
hardened_process_config,
"hardened_process_config",
HARDENED_PROCESS_CONFIG_SILENT);
#endif /* DEVELOPMENT || DEBUG */
/*
* Control entitlement to enable/disable hardened-heap in the process.
*/
#define SPAWN_ENABLE_HARDENED_HEAP "com.apple.developer.hardened-process.hardened-heap"
/* Platform Code Exec Logging */
static int platform_exec_logging = 0;
@ -395,6 +373,7 @@ SYSCTL_INT(_security_mac, OID_AUTO, platform_exec_logging, CTLFLAG_RW, &platform
static os_log_t peLog = OS_LOG_DEFAULT;
struct exception_port_action_t {
ipc_port_t port;
_ps_port_action_t *port_action;
@ -417,7 +396,7 @@ static int execargs_alloc(struct image_params *imgp);
static int execargs_free(struct image_params *imgp);
static int exec_check_permissions(struct image_params *imgp);
static int exec_extract_strings(struct image_params *imgp);
static int exec_add_apple_strings(struct image_params *imgp, const load_result_t *load_result);
static int exec_add_apple_strings(struct image_params *imgp, const load_result_t *load_result, task_t task);
static int exec_handle_sugid(struct image_params *imgp);
static int sugid_scripts = 0;
SYSCTL_INT(_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW | CTLFLAG_LOCKED, &sugid_scripts, 0, "");
@ -434,8 +413,6 @@ static errno_t exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_a
task_role_t psa_darwin_role, struct exec_port_actions *port_actions);
static void exec_port_actions_destroy(struct exec_port_actions *port_actions);
/*
* exec_add_user_string
*
@ -973,23 +950,24 @@ set_crash_behavior_from_bootarg(proc_t p)
void
set_proc_name(struct image_params *imgp, proc_t p)
{
int p_name_len = sizeof(p->p_name) - 1;
uint64_t buflen = imgp->ip_ndp->ni_cnd.cn_namelen;
const int p_name_len = sizeof(p->p_name) - 1;
const int p_comm_len = sizeof(p->p_comm) - 1;
if (imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len) {
imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len;
if (buflen > p_name_len) {
buflen = p_name_len;
}
bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name,
(unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name, buflen);
p->p_name[buflen] = '\0';
if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN) {
imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN;
if (buflen > p_comm_len) {
static_assert(MAXCOMLEN + 1 == sizeof(p->p_comm));
buflen = p_comm_len;
}
bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm,
(unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm, buflen);
p->p_comm[buflen] = '\0';
#if (DEVELOPMENT || DEBUG)
/*
@ -1042,8 +1020,10 @@ get_teamid_for_shared_region(struct image_params *imgp)
static inline bool
arm64_cpusubtype_uses_ptrauth(cpu_subtype_t cpusubtype)
{
int ptrauth_abi_version = (int)CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(cpusubtype);
return (cpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E &&
CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(cpusubtype) == CPU_SUBTYPE_ARM64_PTR_AUTH_CURRENT_VERSION;
(ptrauth_abi_version >= CPU_SUBTYPE_ARM64_PTR_AUTHV0_VERSION &&
ptrauth_abi_version <= CPU_SUBTYPE_ARM64_PTR_AUTH_MAX_PREFERRED_VERSION);
}
#endif /* __has_feature(ptrauth_calls) */
@ -1078,54 +1058,19 @@ binary_match(cpu_type_t mask, cpu_type_t req_cpu,
}
#define MIN_IOS_TPRO_SDK_VERSION 0x00100000
#define MIN_OSX_TPRO_SDK_VERSION 0x000D0000
#define MIN_TVOS_TPRO_SDK_VERSION 0x000D0000
#define MIN_WATCHOS_TPRO_SDK_VERSION 0x00090000
#define MIN_DRIVERKIT_TPRO_SDK_VERSION 0x00600000
static void
exec_setup_tpro(struct image_params *imgp, load_result_t *load_result)
/*
* Check entitlements to see if this is a platform restrictions binary.
* Save this in load_result until later for two purposes:
* 1. We can mark the task at a certain security level once it's been created
* 2. We can propagate which entitlements are present to the apple array
*/
static inline void
encode_HR_entitlement(const char *entitlement, hardened_browser_flags_t mask,
const struct image_params *imgp, load_result_t *load_result)
{
extern boolean_t xprr_tpro_enabled;
extern boolean_t enable_user_modifiable_perms;
uint32_t min_sdk_version = 0;
/* x86-64 translated code cannot take advantage of TPRO */
if (imgp->ip_flags & IMGPF_ROSETTA) {
return;
if (IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset, entitlement)) {
load_result->hardened_browser |= mask;
}
/* Do not enable on 32-bit VA targets */
if (!(imgp->ip_flags & IMGPF_IS_64BIT_ADDR)) {
return;
}
switch (load_result->ip_platform) {
case PLATFORM_IOS:
case PLATFORM_IOSSIMULATOR:
case PLATFORM_MACCATALYST:
min_sdk_version = MIN_IOS_TPRO_SDK_VERSION;
break;
case PLATFORM_MACOS:
min_sdk_version = MIN_OSX_TPRO_SDK_VERSION;
break;
case PLATFORM_TVOS:
case PLATFORM_TVOSSIMULATOR:
min_sdk_version = MIN_TVOS_TPRO_SDK_VERSION;
break;
case PLATFORM_WATCHOS:
case PLATFORM_WATCHOSSIMULATOR:
min_sdk_version = MIN_WATCHOS_TPRO_SDK_VERSION;
break;
case PLATFORM_DRIVERKIT:
min_sdk_version = MIN_DRIVERKIT_TPRO_SDK_VERSION;
break;
default:
/* TPRO is on by default for newer platforms */
break;
}
}
/*
@ -1148,157 +1093,164 @@ vnode_is_rsr(vnode_t vp)
return FALSE;
}
static struct {
char *legacy;
char *security;
} exec_security_mitigation_entitlement[] = {
/* The following entries must match the enum declaration in kern_exec_internal.h */
[HARDENED_PROCESS] = {
"com.apple.developer.hardened-process",
"com.apple.security.hardened-process"
},
[HARDENED_HEAP] = {
"com.apple.developer.hardened-process.hardened-heap",
"com.apple.security.hardened-process.hardened-heap"
},
[TPRO] = {
NULL,
"com.apple.security.hardened-process.dyld-ro",
},
};
// Check entitlements to see if this is a hardened runtime binary.
// Save this in load_result until later for two purposes:
// 1. Once the task is created, we can mark it as hardened runtime if needed
// 2. we can propagate which entitlements are present to the apple array
/*
* Platform Restrictions
*
* This mitigation opts the process into a collection of kernel
* mitigations, including IPC security restrictions.
* The presence of the entitlement opts the binary into the feature.
* The entitlement is an <integer> entitlement containing a version number
* for the platform restrictions you are opting into.
*/
#define SPAWN_ENABLE_PLATFORM_RESTRICTIONS "com.apple.security.hardened-process.platform-restrictions"
/*
* Version number for enhanced security
* Currently stored with 3 bits in `hardened_process_version`
*/
#define HARDENED_PROCESS_VERSION "com.apple.security.hardened-process.enhanced-security-version"
/* See kern_exec_internal.h for the extensive documentation. */
exec_security_err_t
exec_check_security_entitlement(struct image_params *imgp,
exec_security_mitigation_entitlement_t entitlement)
{
bool has_legacy_entitlement = false, has_security_entitlement = false;
assert(exec_security_mitigation_entitlement[entitlement].security != NULL);
if (exec_security_mitigation_entitlement[entitlement].legacy != NULL) {
has_legacy_entitlement =
IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset,
exec_security_mitigation_entitlement[entitlement].legacy);
}
has_security_entitlement =
IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset,
exec_security_mitigation_entitlement[entitlement].security);
/* If both entitlements are present, this is an invalid configuration. */
if (has_legacy_entitlement && has_security_entitlement) {
EXEC_LOG("Binary has both legacy (%s) and security (%s) entitlements\n",
exec_security_mitigation_entitlement[entitlement].legacy,
exec_security_mitigation_entitlement[entitlement].security);
return EXEC_SECURITY_INVALID_CONFIG;
}
if (has_legacy_entitlement || has_security_entitlement) {
return EXEC_SECURITY_ENTITLED;
}
return EXEC_SECURITY_NOT_ENTITLED;
}
/*
* Entitled binaries get hardened_heap
*/
static inline errno_t
imgact_setup_hardened_heap(struct image_params *imgp, task_t task)
{
exec_security_err_t ret = exec_check_security_entitlement(imgp, HARDENED_HEAP);
if (ret == EXEC_SECURITY_ENTITLED) {
task_set_hardened_heap(task);
} else {
task_clear_hardened_heap(task);
}
switch (ret) {
case EXEC_SECURITY_INVALID_CONFIG:
return EINVAL;
case EXEC_SECURITY_ENTITLED:
case EXEC_SECURITY_NOT_ENTITLED:
return 0;
}
}
/*
* Configure the platform restrictions security features on the task.
* This must be done before `ipc_task_enable` so that the bits
* can be propagated to the ipc space.
*
* Requires `exec_resettextvp` to be called on `task` previously so
* that we can use the `IOTaskGetEntitlement` API.
*/
static inline void
encode_HR_entitlement(const char *entitlement, HR_flags_t mask,
const struct image_params *imgp, load_result_t *load_result)
exec_setup_platform_restrictions(task_t task)
{
if (IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset, entitlement)) {
load_result->hardened_runtime_binary |= mask;
}
}
#if DEVELOPMENT || DEBUG
/*
* This function handles the case where the hardened-process entitlement
* is set without a specific sub-feature entitlement, which is still under
* adoption.
*
* For in-adoption features, the fallout of having certain
* security sensitive components enabled but not qualified
* is potentially too large. Therefore, we allow for a
* "forcing period" in which every binary entitled as
* hardened-process is required to have an explicit setting
* (true or false) for the security feature or otherwise
* gets killed or at least traced at launch.
*
* return value: true if all policies restrictions met,
* false otherwise.
*/
static inline bool
handle_missing_subfeature_entitlement(
const struct image_params *imgp,
const char *subfeature_entitlement)
{
switch (hardened_process_config) {
case HARDENED_PROCESS_CONFIG_SILENT:
break;
case HARDENED_PROCESS_CONFIG_LOG:
/*
* Use the name directly from imgp since we haven't
* set_proc_name() yet.
*/
printf("[WARNING] %s has hardened-process but not %s\n",
imgp->ip_ndp->ni_cnd.cn_nameptr,
subfeature_entitlement);
break;
case HARDENED_PROCESS_CONFIG_FATAL:
/*
* When the policy defined as FATAL, we SIGKILL
* the process.
*/
printf("[ERROR] %s has hardened-process but not %s\n",
imgp->ip_ndp->ni_cnd.cn_nameptr,
subfeature_entitlement);
return false;
default:
panic("invalid hardened-process policy");
uint64_t value = 0;
/* Set platform restrictions version */
if (task_get_platform_binary(task)) {
task_set_platform_restrictions_version(task, 2);
} else if (IOTaskGetIntegerEntitlement(task, SPAWN_ENABLE_PLATFORM_RESTRICTIONS, &value) &&
value > 1) {
task_set_platform_restrictions_version(task, value);
}
return true;
}
#endif /* DEVELOPMENT || DEBUG */
/*
* Handle the hardened-process.hardened-heap entitlement.
*
* Note: hardened-heap is not inherited via spawn/exec;
* It is inherited (only) on fork, which is done
* via Apple strings.
*/
static inline bool
apply_hardened_heap_policy(
struct image_params *imgp,
bool is_hardened_process)
{
bool result = true;
bool set_hardened_heap = false;
bool hardened_heap_ent = false;
if (IOVnodeGetBooleanEntitlement(imgp->ip_vp,
(int64_t)imgp->ip_arch_offset,
SPAWN_ENABLE_HARDENED_HEAP,
&hardened_heap_ent)) {
/*
* The hardened-heap entitlement exists, use that
* to decide about enablement.
*/
set_hardened_heap = hardened_heap_ent;
} else if (is_hardened_process) {
#if DEVELOPMENT || DEBUG
/*
* We should imply default from hardened-process. However,
* bringup will take time and could be sensitive. We want
* to allow teams to adopt incrementally.
*
* We will link hardened-heap to hardened-process when
* adoption will be more stable.
*/
if (!handle_missing_subfeature_entitlement(imgp,
SPAWN_ENABLE_HARDENED_HEAP)) {
result = false;
}
#endif /* DEVELOPMENT || DEBUG */
/* Set hardened process version */
if (IOTaskGetIntegerEntitlement(task, HARDENED_PROCESS_VERSION, &value)) {
task_set_hardened_process_version(task, value);
}
if (set_hardened_heap) {
imgp->ip_flags |= IMGPF_HARDENED_HEAP;
}
return result;
}
/*
* This function handles all the hardened-process related
* mitigations, parses their entitlements, and applies policies.
* This routine configures the various runtime mitigations we can apply to a process
* during image activation. This occurs before `imgact_setup_runtime_mitigations`
*
* For feature-ready mitigations, having hardened-process=true
* implies enablement. Sub-features specific entitlements can
* override this, which means that even if hardened-process
* exists and is set to true, a sub-feature entitlement that
* exists and is set to false keeps the sub-feature disabled.
*
* return value: true if all policies restrictions met,
* false otherwise.
* Returns 0 on success, an errno on failure. Failure will be fatal in exec_mach_imgact().
*/
static bool
apply_hardened_process_policy(
struct image_params *imgp,
__unused proc_t proc,
__unused bool is_platform_binary)
static inline errno_t
imgact_setup_runtime_mitigations(struct image_params *imgp, __unused load_result_t *load_result,
__unused task_t old_task, task_t new_task, __unused vm_map_t map, __unused proc_t proc)
{
bool result = true;
/*
* It's safe to check entitlements anytime after `load_machfile` if you check
* based on the vnode in imgp. We must perform this entitlement check
* before we start using load_result->hardened_browser further down
*/
load_result->hardened_browser = 0;
encode_HR_entitlement(kCSWebBrowserHostEntitlement, BrowserHostEntitlementMask, imgp, load_result);
encode_HR_entitlement(kCSWebBrowserGPUEntitlement, BrowserGPUEntitlementMask, imgp, load_result);
encode_HR_entitlement(kCSWebBrowserNetworkEntitlement, BrowserNetworkEntitlementMask, imgp, load_result);
encode_HR_entitlement(kCSWebBrowserWebContentEntitlement, BrowserWebContentEntitlementMask, imgp, load_result);
if (load_result->hardened_browser) {
task_set_platform_restrictions_version(new_task, 1);
}
errno_t retval = 0;
/*
* Check if the binary has hardened-process entitlement.
* Hardened-heap enables a set of extra security features in our system memory allocator.
*/
bool is_hardened_process = false;
if (IOVnodeHasEntitlement(imgp->ip_vp,
(int64_t)imgp->ip_arch_offset, SPAWN_ENABLE_HARDENED_PROCESS)) {
is_hardened_process = true;
}
if (!apply_hardened_heap_policy(imgp, is_hardened_process)) {
result = false;
if ((retval = imgact_setup_hardened_heap(imgp, new_task)) != 0) {
EXEC_LOG("Invalid configuration detected for hardened-heap");
return retval;
}
return result;
return retval;
}
uint32_t
@ -1372,7 +1324,7 @@ exec_mach_imgact(struct image_params *imgp)
proc_t p = vfs_context_proc(imgp->ip_vfs_context);
int error = 0;
task_t task;
task_t new_task = NULL; /* protected by vfexec */
task_t new_task = NULL; /* protected by vfexec */
thread_t thread;
struct uthread *uthread;
vm_map_switch_context_t switch_ctx;
@ -1547,16 +1499,6 @@ grade:
assert(imgp->ip_free_map == NULL);
// It's safe to check entitlements anytime after `load_machfile` if you check
// based on the vnode in imgp. We must perform this entitlement check
// before we start using load_result->hardened_runtime_binary further down
load_result.hardened_runtime_binary = 0;
encode_HR_entitlement(kCSWebBrowserHostEntitlement, BrowserHostEntitlementMask, imgp, &load_result);
encode_HR_entitlement(kCSWebBrowserGPUEntitlement, BrowserGPUEntitlementMask, imgp, &load_result);
encode_HR_entitlement(kCSWebBrowserNetworkEntitlement, BrowserNetworkEntitlementMask, imgp, &load_result);
encode_HR_entitlement(kCSWebBrowserWebContentEntitlement, BrowserWebContentEntitlementMask, imgp, &load_result);
/*
* ERROR RECOVERY
*
@ -1585,7 +1527,6 @@ grade:
p->p_cputype = imgp->ip_origcputype;
p->p_cpusubtype = imgp->ip_origcpusubtype;
proc_setplatformdata(p, load_result.ip_platform, load_result.lr_min_sdk, load_result.lr_sdk);
exec_setup_tpro(imgp, &load_result);
vm_map_set_size_limit(map, proc_limitgetcur(p, RLIMIT_AS));
vm_map_set_data_limit(map, proc_limitgetcur(p, RLIMIT_DATA));
@ -1601,11 +1542,9 @@ grade:
proc_unlock(p);
/*
* Handle hardened-process mitigations, parse entitlements
* and apply enablements.
* Setup runtime mitigations.
*/
if (!apply_hardened_process_policy(imgp, p, load_result.platform_binary)) {
#if DEVELOPMENT || DEBUG
if ((error = imgact_setup_runtime_mitigations(imgp, &load_result, current_task(), new_task, map, p)) != 0) {
set_proc_name(imgp, p);
exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO);
if (bootarg_execfailurereports) {
@ -1616,13 +1555,8 @@ grade:
imgp->ip_free_map = map;
map = VM_MAP_NULL;
goto badtoolate;
#endif /* DEVELOPMENT || DEBUG */
}
/*
* Set TPRO flags if enabled
*/
/*
* Set code-signing flags if this binary is signed, or if parent has
* requested them on exec.
@ -1720,7 +1654,7 @@ grade:
* for system processes that need to match and be able to inspect
* a pre-existing task.
*/
int cpu_subtype = 0; /* all cpu_subtypes use the same shared region */
int cpu_subtype = 0; /* all cpu_subtypes use the same shared region */
#if __has_feature(ptrauth_calls)
char *shared_region_id = NULL;
size_t len;
@ -1749,7 +1683,7 @@ grade:
* Determine which shared cache to select based on being told,
* matching a team-id or matching an entitlement.
*/
if (load_result.hardened_runtime_binary & BrowserWebContentEntitlementMask) {
if (load_result.hardened_browser & BrowserWebContentEntitlementMask) {
len = sizeof(HARDENED_RUNTIME_CONTENT_ID);
shared_region_id = kalloc_data(len, Z_WAITOK | Z_NOFAIL);
strlcpy(shared_region_id, HARDENED_RUNTIME_CONTENT_ID, len);
@ -1929,16 +1863,6 @@ grade:
goto badtoolate;
}
if (load_result.hardened_runtime_binary) {
if (cs_debug) {
printf("setting hardened runtime with entitlement mask= "
"0x%x on task: pid = %d\n",
load_result.hardened_runtime_binary,
proc_getpid(p));
}
task_set_hardened_runtime(task, true);
}
/*
* The load result will have already been munged by AMFI to include the
* platform binary flag if boot-args dictated it (AMFI will mark anything
@ -1984,22 +1908,7 @@ grade:
#endif /* DEVELOPMENT || DEBUG */
#endif /* XNU_TARGET_OS_OSX */
/*
* Set starting EXC_GUARD and control port behavior for task now that
* platform and hardened runtime is set. Use the name directly from imgp since we haven't
* set_proc_name() yet. Also make control port for the task and main thread
* immovable/pinned based on task's option.
*
* Must happen before main thread port copyout in exc_add_apple_strings.
*/
task_set_exc_guard_ctrl_port_default(task, thread,
imgp->ip_ndp->ni_cnd.cn_nameptr,
(unsigned)imgp->ip_ndp->ni_cnd.cn_namelen,
proc_is_simulated(p),
load_result.ip_platform,
load_result.lr_sdk);
error = exec_add_apple_strings(imgp, &load_result); /* copies out main thread port */
error = exec_add_apple_strings(imgp, &load_result, task); /* copies out main thread port */
if (error) {
KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
@ -2284,7 +2193,6 @@ cleanup_rosetta_fp:
dtrace_proc_exec(p);
#endif
if (kdebug_enable) {
long args[4] = {};
@ -2638,6 +2546,7 @@ bad_notrans:
return error;
}
/*
* exec_validate_spawnattr_policy
*
@ -2700,6 +2609,9 @@ exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_
case POSIX_SPAWN_PROC_TYPE_APP_DEFAULT:
apptype = TASK_APPTYPE_APP_DEFAULT;
break;
case POSIX_SPAWN_PROC_TYPE_APP_NONUI:
apptype = TASK_APPTYPE_APP_NONUI;
break;
case POSIX_SPAWN_PROC_TYPE_DRIVER:
apptype = TASK_APPTYPE_DRIVER;
break;
@ -2886,7 +2798,7 @@ exec_handle_port_actions(struct image_params *imgp,
if (MACH_PORT_VALID(act->new_port)) {
kr = ipc_typed_port_copyin_send(get_task_ipcspace(current_task()),
act->new_port, IKOT_UNKNOWN, &port);
act->new_port, IOT_ANY, &port);
if (kr != KERN_SUCCESS) {
ret = EINVAL;
@ -3684,12 +3596,14 @@ proc_apply_jit_and_vm_policies(struct image_params *imgp, proc_t p, task_t task)
}
#if CONFIG_MAP_RANGES
if (task_is_hardened_binary(task) && !proc_is_simulated(p)) {
if ((task_has_hardened_heap(task) ||
(task_get_platform_restrictions_version(task) == 1) ||
task_get_platform_binary(task)) && !proc_is_simulated(p)) {
/*
* This must be done last as it needs to observe
* any kind of VA space growth that was requested.
* This is used by the secure allocator, so
* must be applied to all hardened binaries
* must be applied to all platform restrictions binaries
*/
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
needs_extra_jumbo_va = IOTaskHasEntitlement(task,
@ -3713,14 +3627,14 @@ proc_apply_jit_and_vm_policies(struct image_params *imgp, proc_t p, task_t task)
const bool task_loads_3P_plugins = imgp->ip_flags & IMGPF_3P_PLUGINS;
#endif /* XNU_TARGET_OS_OSX */
if (task_is_hardened_binary(task)
if (task_has_tpro(task)
#if XNU_TARGET_OS_OSX
&& !task_loads_3P_plugins
#endif /* XNU_TARGET_OS_OSX */
) {
/*
* Pre-emptively disable TPRO remapping for
* hardened binaries (which do not load 3P plugins)
* platform restrictions binaries (which do not load 3P plugins)
*/
vm_map_set_tpro_enforcement(get_task_map(task));
}
@ -3764,7 +3678,6 @@ spawn_posix_cred_adopt(proc_t p,
return 0;
}
/*
* posix_spawn
*
@ -4109,11 +4022,13 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
if ((psa->psa_options & PSA_OPTION_PLUGIN_HOST_DISABLE_A_KEYS) == PSA_OPTION_PLUGIN_HOST_DISABLE_A_KEYS) {
imgp->ip_flags |= IMGPF_PLUGIN_HOST_DISABLE_A_KEYS;
}
#if (DEVELOPMENT || DEBUG)
if ((psa->psa_options & PSA_OPTION_ALT_ROSETTA) == PSA_OPTION_ALT_ROSETTA) {
imgp->ip_flags |= (IMGPF_ROSETTA | IMGPF_ALT_ROSETTA);
}
#endif
#endif /* (DEVELOPMENT || DEBUG) */
if ((error = exec_validate_spawnattr_policy(psa->psa_apptype)) != 0) {
goto bad;
@ -4501,8 +4416,6 @@ do_fork1:
}
}
#endif
/*
* Activate the image.
* Warning: If activation failed after point of no return, it returns error
@ -4693,13 +4606,27 @@ bad:
}
exec_resettextvp(p, imgp);
vm_map_setup(get_task_map(new_task), new_task);
exec_setup_platform_restrictions(new_task);
/*
* Set starting EXC_GUARD behavior for task now that platform
* and platform restrictions bits are set.
*/
task_set_exc_guard_default(new_task,
proc_best_name(p),
strlen(proc_best_name(p)),
proc_is_simulated(p),
proc_platform(p),
proc_sdk(p));
/*
* Enable new task IPC access if exec_activate_image() returned an
* active task. (Checks active bit in ipc_task_enable() under lock).
* Must enable after resettextvp so that task port policies are not evaluated
* until the csblob in the textvp is accurately reflected.
*/
vm_map_setup(get_task_map(new_task), new_task);
ipc_task_enable(new_task);
/* Set task exception ports now that we can check entitlements */
@ -4779,6 +4706,9 @@ bad:
if (imgp->ip_px_sa != NULL && px_sa.psa_thread_limit > 0) {
task_set_thread_limit(new_task, (uint16_t)px_sa.psa_thread_limit);
}
if (imgp->ip_px_sa != NULL && px_sa.psa_conclave_mem_limit > 0) {
task_set_conclave_mem_limit(new_task, px_sa.psa_conclave_mem_limit);
}
#if CONFIG_PROC_RESOURCE_LIMITS
if (imgp->ip_px_sa != NULL && (px_sa.psa_port_soft_limit > 0 || px_sa.psa_port_hard_limit > 0)) {
@ -4795,6 +4725,10 @@ bad:
(int)px_sa.psa_kqworkloop_hard_limit);
}
#endif /* CONFIG_PROC_RESOURCE_LIMITS */
if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_REALTIME_AUDIO)) {
task_set_jetsam_realtime_audio(new_task, TRUE);
}
}
@ -4952,8 +4886,9 @@ bad:
}
if (error == 0 && !spawn_no_exec) {
extern uint64_t kdp_task_exec_meta_flags(task_t task);
KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXEC),
proc_getpid(p));
proc_getpid(p), kdp_task_exec_meta_flags(proc_task(p)));
}
}
@ -5550,17 +5485,33 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval __unused)
assert(imgp->ip_new_thread != NULL);
exec_resettextvp(p, imgp);
vm_map_setup(get_task_map(new_task), new_task);
exec_setup_platform_restrictions(new_task);
/*
* Set starting EXC_GUARD behavior for task now that platform
* and platform restrictions bits are set.
*/
task_set_exc_guard_default(new_task,
proc_best_name(p),
strlen(proc_best_name(p)),
proc_is_simulated(p),
proc_platform(p),
proc_sdk(p));
/*
* Enable new task IPC access if exec_activate_image() returned an
* active task. (Checks active bit in ipc_task_enable() under lock).
* Must enable after resettextvp so that task port policies are not evaluated
* until the csblob in the textvp is accurately reflected.
*/
vm_map_setup(get_task_map(new_task), new_task);
ipc_task_enable(new_task);
error = process_signature(p, imgp);
}
#if defined(HAS_APPLE_PAC)
if (imgp->ip_new_thread && !error) {
ml_task_set_jop_pid_from_shared_region(new_task, imgp->ip_flags & IMGPF_NOJOP);
@ -6327,7 +6278,7 @@ bad:
#define LIBMALLOC_EXPERIMENT_FACTORS_KEY "MallocExperiment="
/*
* Passes information about hardened runtime entitlements to libsystem/libmalloc
* Passes information about hardened heap/"hardened runtime" entitlements to libsystem/libmalloc
*/
#define HARDENED_RUNTIME_KEY "HardenedRuntime="
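The key above is written into the apple[] string vector by exec_add_apple_strings() as "HardenedRuntime=0x<mask>" (see the snprintf further down in this file's hunks). For illustration, a minimal userland-side sketch of how a consumer such as libmalloc could locate and decode that entry follows; the helper name and the way the apple[] vector is obtained are assumptions, not part of this change.
/*
 * Illustrative sketch only (not xnu code): scan a NULL-terminated apple[]
 * vector for the HardenedRuntime key and decode the hex mask the kernel
 * printed with "0x%x".
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
static uint32_t
parse_hardened_runtime_flags(const char **apple)
{
	static const char key[] = "HardenedRuntime=";
	for (const char **p = apple; p != NULL && *p != NULL; p++) {
		if (strncmp(*p, key, sizeof(key) - 1) == 0) {
			return (uint32_t)strtoul(*p + (sizeof(key) - 1), NULL, 16);
		}
	}
	return 0; /* key absent: no hardened-runtime flags */
}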
@ -6423,7 +6374,7 @@ _Atomic uint64_t libmalloc_experiment_factors = 0;
static int
exec_add_apple_strings(struct image_params *imgp,
const load_result_t *load_result)
const load_result_t *load_result, task_t task)
{
int error;
int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4;
@ -6547,7 +6498,7 @@ exec_add_apple_strings(struct image_params *imgp,
}
uint8_t cdhash[SHA1_RESULTLEN];
int cdhash_errror = ubc_cs_getcdhash(imgp->ip_vp, imgp->ip_arch_offset, cdhash);
int cdhash_errror = ubc_cs_getcdhash(imgp->ip_vp, imgp->ip_arch_offset, cdhash, NULL);
if (cdhash_errror == 0) {
char hash_string[strlen(CDHASH_KEY) + 2 * SHA1_RESULTLEN + 1];
strncpy(hash_string, CDHASH_KEY, sizeof(hash_string));
@ -6645,9 +6596,9 @@ exec_add_apple_strings(struct image_params *imgp,
*/
if ((new_thread = imgp->ip_new_thread) != THREAD_NULL) {
thread_reference(new_thread);
sright = convert_thread_to_port_pinned(new_thread);
sright = convert_thread_to_port_immovable(new_thread);
task_t new_task = get_threadtask(new_thread);
mach_port_name_t name = ipc_port_copyout_send(sright, get_task_ipcspace(new_task));
mach_port_name_t name = ipc_port_copyout_send_pinned(sright, get_task_ipcspace(new_task));
char port_name_hex_str[strlen(MAIN_TH_PORT_KEY) + HEX_STR_LEN32 + 1];
snprintf(port_name_hex_str, sizeof(port_name_hex_str), MAIN_TH_PORT_KEY "0x%x", name);
@ -6694,35 +6645,32 @@ exec_add_apple_strings(struct image_params *imgp,
imgp->ip_applec++;
}
if (imgp->ip_flags & IMGPF_HARDENED_HEAP) {
const char *hardened_heap_shims = "hardened_heap=1";
error = exec_add_user_string(imgp, CAST_USER_ADDR_T(hardened_heap_shims), UIO_SYSSPACE, FALSE);
/*
* Push down the task security configuration. To reduce confusion when userland parses the
* information, still push an empty security configuration if nothing is active.
*/
{
#define SECURITY_CONFIG_KEY "security_config="
char security_config_str[strlen(SECURITY_CONFIG_KEY) + HEX_STR_LEN + 1];
snprintf(security_config_str, sizeof(security_config_str),
SECURITY_CONFIG_KEY "0x%x", task_get_security_config(task));
error = exec_add_user_string(imgp, CAST_USER_ADDR_T(security_config_str), UIO_SYSSPACE, FALSE);
if (error) {
printf("Failed to add hardened heap string with error %d\n", error);
printf("Failed to add the security config string with error %d\n", error);
goto bad;
}
imgp->ip_applec++;
}
/* tell dyld that it can leverage hardware for its read-only/read-write trusted path */
if (imgp->ip_flags & IMGPF_HW_TPRO) {
const char *dyld_hw_tpro = "dyld_hw_tpro=1";
error = exec_add_user_string(imgp, CAST_USER_ADDR_T(dyld_hw_tpro), UIO_SYSSPACE, FALSE);
if (error) {
printf("Failed to add dyld hw tpro setting with error %d\n", error);
goto bad;
}
imgp->ip_applec++;
}
if (load_result->hardened_runtime_binary) {
if (load_result->hardened_browser) {
const size_t HR_STRING_SIZE = sizeof(HARDENED_RUNTIME_KEY) + HR_FLAGS_NUM_NIBBLES + 2 + 1;
char hardened_runtime[HR_STRING_SIZE];
snprintf(hardened_runtime, HR_STRING_SIZE, HARDENED_RUNTIME_KEY"0x%x", load_result->hardened_runtime_binary);
snprintf(hardened_runtime, HR_STRING_SIZE, HARDENED_RUNTIME_KEY"0x%x", load_result->hardened_browser);
error = exec_add_user_string(imgp, CAST_USER_ADDR_T(hardened_runtime), UIO_SYSSPACE, FALSE);
if (error) {
printf("Failed to add hardened runtime flag with error %d\n", error);
@ -7434,6 +7382,10 @@ load_init_program(proc_t p)
mach_vm_offset_t scratch_addr = 0;
mach_vm_size_t map_page_size = vm_map_page_size(map);
#if DEVELOPMENT || DEBUG
/* Use the opportunity to initialize exec's debug log stream */
exec_log_handle = os_log_create("com.apple.xnu.bsd", "exec");
#endif /* DEVELOPMENT || DEBUG */
(void) mach_vm_allocate_kernel(map, &scratch_addr, map_page_size,
VM_MAP_KERNEL_FLAGS_ANYWHERE());
@ -7884,6 +7836,36 @@ proc_process_signature(proc_t p, os_reason_t *signature_failure_reason)
return error;
}
#define DT_UNRESTRICTED_SUBSYSTEM_ROOT "unrestricted-subsystem-root"
static bool
allow_unrestricted_subsystem_root(void)
{
#if !(DEVELOPMENT || DEBUG)
static bool allow_unrestricted_subsystem_root = false;
static bool has_been_set = false;
if (!has_been_set) {
DTEntry chosen;
const uint32_t *value;
unsigned size;
has_been_set = true;
if (SecureDTLookupEntry(0, "/chosen", &chosen) == kSuccess &&
SecureDTGetProperty(chosen, DT_UNRESTRICTED_SUBSYSTEM_ROOT, (const void**)&value, &size) == kSuccess &&
value != NULL &&
size == sizeof(uint32_t)) {
allow_unrestricted_subsystem_root = (bool)*value;
}
}
return allow_unrestricted_subsystem_root;
#else
return true;
#endif
}
static int
process_signature(proc_t p, struct image_params *imgp)
{
@ -7945,6 +7927,20 @@ process_signature(proc_t p, struct image_params *imgp)
goto done;
}
/*
* Reject when a subsystem root path is set but the image is restricted and doesn't require
* library validation. This prevents the subsystem root from being used to inject unsigned code.
*/
if (!allow_unrestricted_subsystem_root()) {
if ((imgp->ip_csflags & CS_RESTRICT || proc_issetugid(p)) &&
!(imgp->ip_csflags & CS_REQUIRE_LV) &&
(imgp->ip_subsystem_root_path != NULL)) {
signature_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SECURITY_POLICY);
error = EACCES;
goto done;
}
}
#if XNU_TARGET_OS_OSX
/* Check for platform passed in spawn attr if iOS binary is being spawned */
if (proc_platform(p) == PLATFORM_IOS) {
@ -8357,16 +8353,11 @@ sysctl_libmalloc_experiments SYSCTL_HANDLER_ARGS
return 0;
}
EXPERIMENT_FACTOR_PROC(_kern, libmalloc_experiments, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_libmalloc_experiments, "A", "");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern, libmalloc_experiments, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_libmalloc_experiments, "A", "");
SYSCTL_NODE(_kern, OID_AUTO, sec_transition,
CTLFLAG_RD | CTLFLAG_LOCKED, 0, "sec_transition");
SYSCTL_INT(_kern_sec_transition, OID_AUTO, available,
CTLFLAG_RD | CTLFLAG_LOCKED, (int *)NULL, 0, "");
#if DEBUG || DEVELOPMENT
static int
sysctl_setup_ensure_pidversion_changes_on_exec(__unused int64_t in, int64_t *out)

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020 Apple Computer, Inc. All rights reserved.
* Copyright (c) 2020-2025 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@ -31,6 +31,7 @@
#include <sys/imgact.h>
#include <sys/kernel_types.h>
#include <kern/mach_loader.h>
/*
* Set p->p_comm and p->p_name to the name passed to exec
@ -38,4 +39,60 @@
extern void
set_proc_name(struct image_params *imgp, proc_t p);
/*
* Runtime security mitigations in production are primarily controlled by
* entitlements. Third-party processes/daemons on macOS aren't allowed to use
* the com.apple.developer entitlement without a profile, whereas a special carve-out
* exists for com.apple.security.
*
* Over time we expect internal first-party software to shift towards the com.apple.security
* format, but until then we support both, with the strict rule that only one can
* be present.
*/
__enum_decl(exec_security_mitigation_entitlement_t, uint8_t, {
/*
* Hardened-process.
*
* Security mitigations follow the notion of "hardened-process": binaries that we
* have identified as being security critical. They are identified by the
* com.apple.{developer|security}.hardened-process entitlement, which is required to further
* configure the other security mitigations.
*/
HARDENED_PROCESS = 0,
/*
* Hardened-Heap.
*
* This mitigation extends libmalloc xzone with a number of security features,
* most notably increasing the number of buckets and adding guard pages.
* The presence of the entitlement opts the binary into the feature.
*/
HARDENED_HEAP,
/*
* TPRO - Trusted-Path Read-Only
*
* The TPRO mitigation allows creating memory regions that are read-only
* but that can be rapidly and locally modified by trusted paths to be temporarily
* read-write. TPRO is "enabled by default" (with the caveats in exec_setup_tpro())
* starting with the SDK versions below.
*/
TPRO,
});
/*
* exec_check_security_entitlement verifies whether a given entitlement is
* associated with the to-be-run process. It checks both the legacy and current
* formats and returns:
* EXEC_SECURITY_NOT_ENTITLED - if no entitlement is present
* EXEC_SECURITY_ENTITLED - if the entitlement is present
* EXEC_SECURITY_INVALID_CONFIG - if _both_ entitlements are present (fatal condition)
*/
__enum_decl(exec_security_err_t, uint8_t, {
EXEC_SECURITY_NOT_ENTITLED,
EXEC_SECURITY_ENTITLED,
EXEC_SECURITY_INVALID_CONFIG
});
extern exec_security_err_t exec_check_security_entitlement(struct image_params *imgp,
exec_security_mitigation_entitlement_t entitlement);
#endif /* _KERN_EXEC_INTERNAL_H_ */
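For illustration, a hedged sketch of how exec code could consume the API declared above; only the enum values and the function signature come from this header, while the call site, helper name, and return handling are assumptions.
/*
 * Illustrative sketch only: decide whether to enable the hardened heap for
 * the image being activated, based on the entitlement check above.
 */
static int
example_configure_hardened_heap(struct image_params *imgp)
{
	exec_security_err_t err =
	    exec_check_security_entitlement(imgp, HARDENED_HEAP);
	switch (err) {
	case EXEC_SECURITY_ENTITLED:
		/* opt the binary into the hardened heap */
		return 0;
	case EXEC_SECURITY_NOT_ENTITLED:
		/* not entitled: leave the default allocator configuration */
		return 0;
	case EXEC_SECURITY_INVALID_CONFIG:
	default:
		/* both legacy and current entitlements present: fatal config */
		return EINVAL;
	}
}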

View file

@ -824,6 +824,12 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset *
kcdata_memcpy(crash_info_ptr, uaddr, &trust, sizeof(trust));
}
if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_TASK_SECURITY_CONFIG, sizeof(uint32_t), &uaddr)) {
struct crashinfo_task_security_config task_security;
task_security.task_security_config = task_get_security_config(corpse_task);
kcdata_memcpy(crash_info_ptr, uaddr, &task_security, sizeof(task_security));
}
uint64_t jit_start_addr = 0;
uint64_t jit_end_addr = 0;
kern_return_t ret = get_jit_address_range_kdp(get_task_pmap(corpse_task), (uintptr_t*)&jit_start_addr, (uintptr_t*)&jit_end_addr);
@ -841,9 +847,24 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset *
kcdata_memcpy(crash_info_ptr, uaddr, &cs_auxiliary_info, sizeof(cs_auxiliary_info));
}
if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_RLIM_CORE, sizeof(rlim_t), &uaddr)) {
const rlim_t lim = proc_limitgetcur(p, RLIMIT_CORE);
kcdata_memcpy(crash_info_ptr, uaddr, &lim, sizeof(lim));
}
#if CONFIG_UCOREDUMP
if (do_ucoredump && !task_is_driver(proc_task(p)) &&
KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_CORE_ALLOWED, sizeof(uint8_t), &uaddr)) {
const uint8_t allow = is_coredump_eligible(p) == 0;
kcdata_memcpy(crash_info_ptr, uaddr, &allow, sizeof(allow));
}
#endif /* CONFIG_UCOREDUMP */
if (p->p_exit_reason != OS_REASON_NULL && reason == OS_REASON_NULL) {
reason = p->p_exit_reason;
}
if (reason != OS_REASON_NULL) {
if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, EXIT_REASON_SNAPSHOT, sizeof(struct exit_reason_snapshot), &uaddr)) {
struct exit_reason_snapshot ers = {
@ -863,8 +884,6 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset *
kcdata_memcpy(crash_info_ptr, uaddr, reason->osr_kcd_buf, reason_buf_size);
}
}
#if DEVELOPMENT || DEBUG
#endif /* DEVELOPMENT || DEBUG */
}
if (num_udata > 0) {
@ -3537,7 +3556,7 @@ exit_with_exception_internal(
}
if (!(flags & PX_DEBUG_NO_HONOR)
&& address_space_debugged(p) == KERN_SUCCESS) {
&& is_address_space_debugged(p)) {
return 0;
}
@ -3623,12 +3642,13 @@ exit_with_exclave_exception(
void
exit_with_mach_exception_using_ast(
exception_info_t exception,
uint32_t flags)
uint32_t flags,
bool fatal)
{
const uint32_t __assert_only supported_flags = PX_KTRIAGE;
assert((flags & ~supported_flags) == 0);
bool ktriage = flags & PX_KTRIAGE;
thread_ast_mach_exception(current_thread(), exception.os_reason, exception.exception_type,
exception.mx_code, exception.mx_subcode, false, ktriage);
exception.mx_code, exception.mx_subcode, fatal, ktriage);
}

View file

@ -407,8 +407,8 @@ bad:
* fork_create_child
*
* Description: Common operations associated with the creation of a child
* process. Return with new task and first thread's control port movable
* and not pinned.
* process. Return with new task and first thread's control
* port movable
*
* Parameters: parent_task parent task
* parent_coalitions parent's set of coalitions
@ -506,8 +506,7 @@ fork_create_child(task_t parent_task,
}
/*
* Create main thread for the child process. Its control port is not immovable/pinned
* until main_thread_set_immovable_pinned().
* Create main thread for the child process.
*
* The new thread is waiting on the event triggered by 'task_clear_return_wait'
*/
@ -588,14 +587,7 @@ fork(proc_t parent_proc, __unused struct fork_args *uap, int32_t *retval)
child_task = (task_t)get_threadtask(child_thread);
assert(child_task != TASK_NULL);
/* task_control_port_options has been inherited from parent, apply it */
task_set_immovable_pinned(child_task);
main_thread_set_immovable_pinned(child_thread);
/*
* Since the task ports for this new task are now set to be immovable,
* we can enable them.
*/
task_copyout_control_port(child_task);
vm_map_setup(get_task_map(child_task), child_task);
ipc_task_enable(child_task);

View file

@ -1268,7 +1268,8 @@ vng_guard_violation(const struct vng_info *vgi,
}
}
if (vng_policy_flags & (kVNG_POLICY_EXC | kVNG_POLICY_EXC_CORPSE)) {
if (vng_policy_flags &
(kVNG_POLICY_EXC | kVNG_POLICY_EXC_CORPSE | kVNG_POLICY_EXC_CORE)) {
/* EXC_GUARD exception */
const struct vng_owner *vgo = TAILQ_FIRST(&vgi->vgi_owners);
pid_t pid = vgo ? proc_pid(vgo->vgo_p) : 0;
@ -1283,7 +1284,8 @@ vng_guard_violation(const struct vng_info *vgi,
lck_rw_unlock_shared(&llock);
if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) {
if (vng_policy_flags &
(kVNG_POLICY_EXC_CORPSE | kVNG_POLICY_EXC_CORE)) {
char *path;
int len = MAXPATHLEN;
@ -1294,7 +1296,10 @@ vng_guard_violation(const struct vng_info *vgi,
if (*path && len) {
r = vng_reason_from_pathname(path, len);
}
task_violated_guard(code, subcode, r, TRUE); /* not fatal */
const bool backtrace_only =
!(vng_policy_flags & kVNG_POLICY_EXC_CORE);
/* not fatal */
task_violated_guard(code, subcode, r, backtrace_only);
if (NULL != r) {
os_reason_free(r);
}

View file

@ -263,7 +263,7 @@ OSMalloc_external(uint32_t size, OSMallocTag tag)
OSMalloc_Tagref(tag);
if ((tag->OSMT_attr & OSMT_PAGEABLE) && (size & ~PAGE_MASK)) {
if ((kr = kmem_alloc(kernel_map, (vm_offset_t *)&addr, size,
KMA_PAGEABLE | KMA_DATA, vm_tag_bt())) != KERN_SUCCESS) {
KMA_PAGEABLE | KMA_DATA_SHARED, vm_tag_bt())) != KERN_SUCCESS) {
addr = NULL;
}
} else {
@ -433,3 +433,50 @@ SYSCTL_PROC(_kern, OID_AUTO, zones_collectable_bytes,
CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED,
0, 0, &sysctl_zones_collectable_bytes, "Q",
"Collectable memory in zones");
#if DEVELOPMENT || DEBUG
static int
sysctl_zone_reset_peak SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
kern_return_t kr;
int ret;
const size_t name_len = MAX_ZONE_NAME + 1;
char zonename[name_len];
ret = sysctl_io_string(req, zonename, name_len, 0, NULL);
if (ret) {
return ret;
}
kr = zone_reset_peak(zonename);
return mach_to_bsd_errno(kr);
}
SYSCTL_PROC(_kern, OID_AUTO, zone_reset_peak,
CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
0, 0, &sysctl_zone_reset_peak, "-",
"Reset the peak size of a kernel zone by name.");
static int
sysctl_zone_reset_all_peaks SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
kern_return_t kr;
if (!req->newptr) {
/* Only reset on a write */
return EINVAL;
}
kr = zone_reset_all_peaks();
return mach_to_bsd_errno(kr);
}
SYSCTL_PROC(_kern, OID_AUTO, zone_reset_all_peaks,
CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
0, 0, &sysctl_zone_reset_all_peaks, "I",
"Reset the peak size of all kernel zones.");
#endif /* DEVELOPMENT || DEBUG */

File diff suppressed because it is too large

View file

@ -195,12 +195,12 @@ struct memorystatus_freezer_candidate_list memorystatus_global_demote_list = {NU
#define FREEZER_USE_ORDERED_LIST_DEFAULT 0
#endif
int memorystatus_freezer_use_ordered_list = FREEZER_USE_ORDERED_LIST_DEFAULT;
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freezer_use_ordered_list, &memorystatus_freezer_use_ordered_list, 0, 1, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freezer_use_ordered_list, &memorystatus_freezer_use_ordered_list, 0, 1, "");
/*
* When enabled, demotion candidates are chosen from memorystatus_global_demotion_list
*/
int memorystatus_freezer_use_demotion_list = 0;
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freezer_use_demotion_list, &memorystatus_freezer_use_demotion_list, 0, 1, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freezer_use_demotion_list, &memorystatus_freezer_use_demotion_list, 0, 1, "");
extern boolean_t vm_swap_max_budget(uint64_t *);
@ -411,13 +411,13 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freezer_thaw_percentage_fg_non_xpc_ser
#define FREEZER_ERROR_STRING_LENGTH 128
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_min, &memorystatus_freeze_pages_min, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_max, &memorystatus_freeze_pages_max, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_processes_max, &memorystatus_frozen_processes_max, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_jetsam_band, &memorystatus_freeze_jetsam_band, JETSAM_PRIORITY_BACKGROUND, JETSAM_PRIORITY_FOREGROUND, "");
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_private_shared_pages_ratio, &memorystatus_freeze_private_shared_pages_ratio, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_min_processes, &memorystatus_freeze_suspended_threshold, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_max_candidate_band, &memorystatus_freeze_max_candidate_band, JETSAM_PRIORITY_IDLE, JETSAM_PRIORITY_FOREGROUND, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_pages_min, &memorystatus_freeze_pages_min, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_pages_max, &memorystatus_freeze_pages_max, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_processes_max, &memorystatus_frozen_processes_max, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_jetsam_band, &memorystatus_freeze_jetsam_band, JETSAM_PRIORITY_BACKGROUND, JETSAM_PRIORITY_FOREGROUND, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_private_shared_pages_ratio, &memorystatus_freeze_private_shared_pages_ratio, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_min_processes, &memorystatus_freeze_suspended_threshold, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_max_candidate_band, &memorystatus_freeze_max_candidate_band, JETSAM_PRIORITY_IDLE, JETSAM_PRIORITY_FOREGROUND, "");
static int
sysctl_memorystatus_freeze_budget_multiplier SYSCTL_HANDLER_ARGS
{
@ -458,21 +458,21 @@ sysctl_memorystatus_freeze_budget_multiplier SYSCTL_HANDLER_ARGS
}
return 0;
}
EXPERIMENT_FACTOR_PROC(_kern, memorystatus_freeze_budget_multiplier, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_memorystatus_freeze_budget_multiplier, "Q", "");
EXPERIMENT_FACTOR_LEGACY_PROC(_kern, memorystatus_freeze_budget_multiplier, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_memorystatus_freeze_budget_multiplier, "Q", "");
/*
* max. # of frozen process demotions we will allow in our daily cycle.
*/
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_max_freeze_demotions_daily, &memorystatus_max_frozen_demotions_daily, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_max_freeze_demotions_daily, &memorystatus_max_frozen_demotions_daily, 0, UINT32_MAX, "");
/*
* min # of thaws needed by a process to protect it from getting demoted into the IDLE band.
*/
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_thaw_count_demotion_threshold, &memorystatus_thaw_count_demotion_threshold, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_thaw_count_demotion_threshold, &memorystatus_thaw_count_demotion_threshold, 0, UINT32_MAX, "");
/*
* min # of global thaws needed for us to consider refreezing these processes.
*/
EXPERIMENT_FACTOR_UINT(_kern, memorystatus_min_thaw_refreeze_threshold, &memorystatus_min_thaw_refreeze_threshold, 0, UINT32_MAX, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_min_thaw_refreeze_threshold, &memorystatus_min_thaw_refreeze_threshold, 0, UINT32_MAX, "");
#if DEVELOPMENT || DEBUG
@ -1441,7 +1441,7 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS
return 0;
}
EXPERIMENT_FACTOR_PROC(_vm, freeze_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, NULL, 0, sysctl_freeze_enabled, "I", "");
EXPERIMENT_FACTOR_LEGACY_PROC(_vm, freeze_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, NULL, 0, sysctl_freeze_enabled, "I", "");
static void
schedule_interval_reset(thread_call_t reset_thread_call, throttle_interval_t *interval)
@ -3086,36 +3086,41 @@ memorystatus_freeze_init_proc(proc_t p)
}
}
static int
sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS
sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
if (!req->newptr) {
return EINVAL;
}
/* Need to be root or have entitlement */
if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement( MEMORYSTATUS_ENTITLEMENT)) {
if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
return EPERM;
}
if (memorystatus_freeze_enabled == false) {
return ENOTSUP;
}
if (!VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
return ENOTSUP;
}
if (!memorystatus_freeze_enabled && !memorystatus_swap_all_apps) {
/* Nothing to do. Swap is not enabled on this system. */
assert3u(vm_compressor_get_swapped_segment_count(), ==, 0);
memorystatus_log("memorystatus: swap is disabled, bypassing fast-wake warmup");
return 0;
}
memorystatus_log("memorystatus: swapping-in all swapped-out compressor "
"segments\n");
do_fastwake_warmup_all();
return 0;
}
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", "");
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all,
CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I",
"Swap-in any compressed data that resides in swapfiles");
/*
* Takes in a candidate list from the user_addr, validates it, and copies it into the list pointer.

View file

@ -121,9 +121,17 @@ OS_CLOSED_ENUM(memorystatus_action, uint32_t,
MEMORYSTATUS_KILL_SWAPPABLE, // Kill a swap-eligible process (even if it's running) based on jetsam priority
MEMORYSTATUS_KILL_IDLE, // Kill an idle process
MEMORYSTATUS_KILL_LONG_IDLE, // Kill a long-idle process (reaper)
MEMORYSTATUS_NO_PAGING_SPACE, // Perform a no-paging-space-action
MEMORYSTATUS_PURGE_CACHES, // Purge system memory caches (e.g. corpses, deferred reclaim memory)
MEMORYSTATUS_KILL_NONE, // Do nothing
);
__options_closed_decl(memstat_kill_options_t, uint8_t, {
MEMSTAT_ONLY_SWAPPABBLE = 0x01,
MEMSTAT_ONLY_LONG_IDLE = 0x02,
MEMSTAT_SORT_BUCKET = 0x04,
});
/*
* Structure to hold state for a jetsam thread.
* Typically there should be a single jetsam thread
@ -136,6 +144,7 @@ typedef struct jetsam_state_s {
thread_t thread; /* jetsam thread pointer */
int jld_idle_kills; /* idle jetsam kill counter for this session */
uint32_t errors; /* Error accumulator */
bool errors_cleared; /* Have we tried clearing all errors this iteration? */
bool sort_flag; /* Sort the fg band (idle on macOS) before killing? */
bool corpse_list_purged; /* Has the corpse list been purged? */
bool post_snapshot; /* Do we need to post a jetsam snapshot after this session? */
@ -149,7 +158,7 @@ typedef struct jetsam_state_s {
* and will continue to act until the system is considered
* healthy.
*/
typedef struct memorystatus_system_health {
typedef struct memorystatus_system_health_s {
#if CONFIG_JETSAM
bool msh_available_pages_below_soft;
bool msh_available_pages_below_idle;
@ -163,16 +172,28 @@ typedef struct memorystatus_system_health {
bool msh_swapin_queue_over_limit;
bool msh_pageout_starved;
#endif /* CONFIG_JETSAM */
bool msh_vm_pressure_warning;
bool msh_vm_pressure_critical;
bool msh_compressor_low_on_space;
bool msh_compressor_exhausted;
bool msh_swap_exhausted;
bool msh_swap_low_on_space;
bool msh_zone_map_is_exhausted;
} memorystatus_system_health_t;
} *memorystatus_system_health_t;
void memorystatus_log_system_health(const memorystatus_system_health_t *health);
bool memorystatus_is_system_healthy(const memorystatus_system_health_t *status);
/* Picks a kill cause given an unhealthy system status */
uint32_t memorystatus_pick_kill_cause(const memorystatus_system_health_t *status);
/*
* @func memstat_check_system_health
*
* @brief Evaluate system memory conditions and return if the system is healthy.
*
* @discussion
* Evaluates various system memory conditions, including compressor size and
* available page quantities. If conditions indicate a kill should be
* performed, the system is considered "unhealthy".
*
* @returns @c true if the system is healthy, @c false otherwise.
*/
extern bool memstat_check_system_health(memorystatus_system_health_t status);
#pragma mark Locks
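For orientation, the typedef above makes memorystatus_system_health_t a pointer type, so a caller allocates struct memorystatus_system_health_s on its own stack and passes its address, as memorystatus_pick_action() does later in this diff. A minimal sketch of that calling pattern (the wrapper function is illustrative only):
/*
 * Illustrative sketch only: evaluate, log, and return overall system health
 * using the entry point documented above.
 */
static bool
example_system_is_healthy(void)
{
	struct memorystatus_system_health_s status;
	/* Fills in the condition bits, logs transitions, returns healthiness. */
	return memstat_check_system_health(&status);
}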
@ -193,6 +214,30 @@ extern int jld_idle_kill_candidates;
extern _Atomic uint64_t last_no_space_action_ts;
extern uint64_t no_paging_space_action_throttle_delay_ns;
#pragma mark Pressure Response Globals
extern uint64_t memstat_last_cache_purge_ts;
extern uint64_t memstat_cache_purge_backoff_ns;
__options_decl(memstat_pressure_options_t, uint32_t, {
/* Kill long idle processes at kVMPressureWarning */
MEMSTAT_WARNING_KILL_LONG_IDLE = 0x01,
/* Kill idle processes from the notify thread at kVMPressureWarning */
MEMSTAT_WARNING_KILL_IDLE_THROTTLED = 0x02,
/* Purge memory caches (e.g. corpses, deferred reclaim rings) at kVMPressureCritical */
MEMSTAT_CRITICAL_PURGE_CACHES = 0x04,
/* Kill all idle processes at kVMPressureCritical */
MEMSTAT_CRITICAL_KILL_IDLE = 0x08,
/* Kill when at kVMPressureWarning for a prolonged period */
MEMSTAT_WARNING_KILL_SUSTAINED = 0x10,
});
/* Maximum value for sysctl handler */
#define MEMSTAT_PRESSURE_CONFIG_MAX (0x18U)
extern memstat_pressure_options_t memstat_pressure_config;
#pragma mark Config Globals
extern boolean_t memstat_reaper_enabled;
#pragma mark VM globals read by the memorystatus subsystem
extern unsigned int vm_page_free_count;
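The memstat_pressure_options_t bits declared above are combined into the memorystatus_pressure_config tunable (see the TUNABLE_WRITEABLE and per-configuration defaults later in this diff). For illustration, a hedged sketch of how a configured value decomposes into behaviors; the helper function is hypothetical.
/*
 * Illustrative sketch only:
 *   CONFIG_JETSAM default:  MEMSTAT_WARNING_KILL_SUSTAINED              = 0x10
 *   !CONFIG_JETSAM default: MEMSTAT_WARNING_KILL_IDLE_THROTTLED |
 *                           MEMSTAT_CRITICAL_PURGE_CACHES               = 0x06
 *
 * A boot-arg of memorystatus_pressure_config=0xc would enable cache purging
 * and idle kills at the critical level (0x04 | 0x08) and nothing at warning.
 */
static inline bool
example_should_purge_caches(memstat_pressure_options_t config)
{
	return (config & MEMSTAT_CRITICAL_PURGE_CACHES) != 0;
}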
@ -302,6 +347,24 @@ _memstat_proc_is_dirty(proc_t p)
return p->p_memstat_dirty & P_DIRTY_IS_DIRTY;
}
/*
* Return true if this process is self-terminating via ActivityTracking.
*/
static inline bool
_memstat_proc_is_terminating(proc_t p)
{
return p->p_memstat_dirty & P_DIRTY_TERMINATED;
}
/*
* Return true if this process has been killed and is in the process of exiting.
*/
static inline bool
_memstat_proc_was_killed(proc_t p)
{
return p->p_memstat_state & P_MEMSTAT_TERMINATED;
}
static inline bool
_memstat_proc_is_internal(proc_t p)
{
@ -315,6 +378,13 @@ _memstat_proc_can_idle_exit(proc_t p)
(p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT);
}
static inline bool
_memstat_proc_shutdown_on_clean(proc_t p)
{
return _memstat_proc_is_tracked(p) &&
(p->p_memstat_dirty & P_DIRTY_SHUTDOWN_ON_CLEAN);
}
static inline bool
_memstat_proc_has_priority_assertion(proc_t p)
{
@ -485,6 +555,12 @@ uint32_t memstat_get_proccnt_upto_priority(uint32_t max_bucket_index);
*/
uint32_t memstat_get_idle_proccnt(void);
/*
* @func memstat_get_long_idle_proccnt
* @brief Return the number of idle, reapable processes which may be terminated.
*/
uint32_t memstat_get_long_idle_proccnt(void);
#pragma mark Freezer
#if CONFIG_FREEZE
/*

View file

@ -142,7 +142,7 @@ kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_proces
#define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */
#endif /* XNU_TARGET_OS_OSX */
static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
static TUNABLE_DEV_WRITEABLE(uint32_t, vm_pressure_task_footprint_min, "vm_pressure_notify_min_footprint_mb", VM_PRESSURE_MINIMUM_RSIZE);
#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
@ -421,34 +421,25 @@ memorystatus_knote_unregister(struct knote *kn __unused)
#if VM_PRESSURE_EVENTS
#if CONFIG_JETSAM
static thread_call_t sustained_pressure_handler_thread_call;
int memorystatus_should_kill_on_sustained_pressure = 1;
/* Count the number of sustained pressure kills we've done since boot. */
uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
#if DEVELOPMENT || DEBUG
SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
#endif /* DEVELOPMENT || DEBUG */
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
SYSCTL_QUAD(_kern_memorystatus, OID_AUTO, kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
SYSCTL_QUAD(_kern_memorystatus, OID_AUTO, kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
SYSCTL_QUAD(_kern_memorystatus, OID_AUTO, kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
static void sustained_pressure_handler(void*, void*);
#endif /* CONFIG_JETSAM */
static thread_call_t memorystatus_notify_update_telemetry_thread_call;
static void update_footprints_for_telemetry(void*, void*);
void
memorystatus_notify_init()
{
#if CONFIG_JETSAM
sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
#endif /* CONFIG_JETSAM */
memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
}
@ -708,17 +699,23 @@ memorystatus_is_foreground_locked(proc_t p)
* to access the p_memstat_dirty field.
*/
void
memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit, boolean_t *is_active, boolean_t *is_managed, boolean_t *has_assertion)
{
if (!v) {
*is_dirty = FALSE;
*is_dirty_tracked = FALSE;
*allow_idle_exit = FALSE;
*is_active = FALSE;
*is_managed = FALSE;
*has_assertion = FALSE;
} else {
proc_t p = (proc_t)v;
*is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
*is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
*allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
*is_active = (p->p_memstat_memlimit == p->p_memstat_memlimit_active);
*is_managed = (p->p_memstat_state & P_MEMSTAT_MANAGED) != 0;
*has_assertion = (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) != 0;
}
}
@ -783,7 +780,16 @@ CA_EVENT(memorystatus_pressure_interval,
CA_INT, num_transitions,
CA_INT, num_kills,
CA_INT, duration);
static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
/* Separate struct for tracking so that we have aligned members for atomics */
struct memstat_cur_interval {
int64_t num_procs;
int64_t num_notifs;
int64_t num_transitions;
uint64_t start_mt;
_Atomic uint32_t num_kills;
vm_pressure_level_t max_level;
} memstat_cur_interval;
CA_EVENT(memorystatus_proc_notification,
CA_INT, footprint_before_notification,
@ -915,19 +921,15 @@ update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
}
extern char *proc_name_address(void *p);
/*
* Attempt to send the given level telemetry event.
* Finalizes the duration.
* Clears the src_event struct.
* Send pressure interval telemetry.
*/
static void
memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
memorystatus_pressure_interval_send(void)
{
uint64_t duration_nanoseconds = 0;
uint64_t curr_ts = mach_absolute_time();
src_event->duration = curr_ts - src_event->duration;
absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
uint64_t duration_nanoseconds;
CA_EVENT_TYPE(memorystatus_pressure_interval) * evt_data;
/*
* Drop the event rather than block for memory. We should be in a normal pressure level now,
@ -935,17 +937,23 @@ memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval
*/
ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
if (event_wrapper) {
memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
CA_EVENT_SEND(event_wrapper);
}
src_event->num_processes_registered = 0;
src_event->num_notifications_sent = 0;
src_event->max_level = 0;
src_event->num_transitions = 0;
src_event->num_kills = 0;
src_event->duration = 0;
}
absolutetime_to_nanoseconds(
mach_absolute_time() - memstat_cur_interval.start_mt,
&duration_nanoseconds);
evt_data = event_wrapper->data;
evt_data->num_processes_registered = memstat_cur_interval.num_procs;
evt_data->num_notifications_sent = memstat_cur_interval.num_notifs;
evt_data->max_level = memstat_cur_interval.max_level;
evt_data->num_transitions = memstat_cur_interval.num_transitions;
evt_data->num_kills = os_atomic_load(&memstat_cur_interval.num_kills, relaxed);
evt_data->duration = duration_nanoseconds / NSEC_PER_SEC;
CA_EVENT_SEND(event_wrapper);
} else {
memorystatus_log_error("memorystatus: Dropping interval telemetry event\n");
}
}
/*
* Attempt to send the per-proc telemetry events.
@ -955,7 +963,6 @@ static void
memorystatus_pressure_proc_telemetry_send(void)
{
struct knote *kn = NULL;
memorystatus_klist_lock();
SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
proc_t p = PROC_NULL;
struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
@ -1000,21 +1007,8 @@ memorystatus_pressure_proc_telemetry_send(void)
timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
}
memorystatus_klist_unlock();
}
/*
* Send all telemetry associated with the increased pressure interval.
*/
static void
memorystatus_pressure_telemetry_send(void)
{
LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
memorystatus_pressure_proc_telemetry_send();
}
/*
* kn_max - knote
*
@ -1286,12 +1280,49 @@ uint64_t next_critical_notification_sent_at_ts = 0;
boolean_t memorystatus_manual_testing_on = FALSE;
vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
TUNABLE_DEV_WRITEABLE(unsigned int, memstat_sustained_pressure_max_pri, "memstat_sustained_pressure_max_pri", JETSAM_PRIORITY_IDLE);
#if DEVELOPMENT || DEBUG
SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
SYSCTL_UINT(_kern_memorystatus, OID_AUTO, sustained_pressure_max_pri, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memstat_sustained_pressure_max_pri, 0, "");
#endif /* DEVELOPMENT || DEBUG */
#if CONFIG_JETSAM
#define MEMSTAT_PRESSURE_CONFIG_DEFAULT (MEMSTAT_WARNING_KILL_SUSTAINED)
#else
#define MEMSTAT_PRESSURE_CONFIG_DEFAULT (MEMSTAT_WARNING_KILL_IDLE_THROTTLED | MEMSTAT_CRITICAL_PURGE_CACHES)
#endif
TUNABLE_WRITEABLE(memstat_pressure_options_t, memstat_pressure_config,
"memorystatus_pressure_config", MEMSTAT_PRESSURE_CONFIG_DEFAULT);
EXPERIMENT_FACTOR_UINT(memorystatus_pressure_config, &memstat_pressure_config,
0, MEMSTAT_PRESSURE_CONFIG_MAX,
"Which actions to take in response to rising VM pressure");
#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_kern_memorystatus, OID_AUTO, pressure_config,
CTLFLAG_RW | CTLFLAG_LOCKED, &memstat_pressure_config, 0,
"How to respond to VM pressure");
static int
sysctl_memstat_should_kill_sustained SYSCTL_HANDLER_ARGS
{
int old = !!(memstat_pressure_config & MEMSTAT_WARNING_KILL_SUSTAINED);
int new, changed;
int ret = sysctl_io_number(req, old, sizeof(old), &new, &changed);
if (changed) {
if (new) {
memstat_pressure_config |= MEMSTAT_WARNING_KILL_SUSTAINED;
} else {
memstat_pressure_config &= ~MEMSTAT_WARNING_KILL_SUSTAINED;
}
}
return ret;
}
SYSCTL_PROC(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure,
CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, sysctl_memstat_should_kill_sustained, "IU",
"Whether to kill idle processes under sustained pressure");
#endif
/*
* TODO(jason): The memorystatus thread should be responsible for this
@ -1312,7 +1343,7 @@ sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
* If the pressure hasn't been relieved by then, the problem is memory
* consumption in a higher band and this churn is probably doing more harm than good.
*/
max_kills = memstat_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;
max_kills = memstat_get_proccnt_upto_priority(memstat_sustained_pressure_max_pri) * 2;
memorystatus_log("memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes\n", max_kills);
while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) {
bool killed = memorystatus_kill_on_sustained_pressure();
@ -1323,8 +1354,7 @@ sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
kill_count++;
memorystatus_kill_on_sustained_pressure_count++;
/* TODO(jason): Should use os_atomic but requires rdar://76310894. */
memorystatus_pressure_interval_telemetry.num_kills++;
os_atomic_inc(&memstat_cur_interval.num_kills, relaxed);
} else {
/* Nothing left to kill */
break;
@ -1335,8 +1365,6 @@ sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
}
}
#endif /* CONFIG_JETSAM */
/*
* Returns the number of processes registered for notifications at this level.
*/
@ -1355,6 +1383,48 @@ memorystatus_klist_length(int level)
return count;
}
/*
* Starts a pressure interval, setting up tracking for it
*/
static void
memstat_pressure_interval_start(uint64_t curr_ts)
{
LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
memstat_cur_interval.num_procs = 0;
memstat_cur_interval.num_notifs = 0;
memstat_cur_interval.num_transitions = 0;
memstat_cur_interval.start_mt = curr_ts;
os_atomic_store(&memstat_cur_interval.num_kills, 0, relaxed);
memstat_cur_interval.max_level = kVMPressureNormal;
}
/*
* Ends a pressure interval, sending all telemetry associated with it
*/
static void
memstat_pressure_interval_end(void)
{
LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
memorystatus_pressure_interval_send();
memorystatus_pressure_proc_telemetry_send();
}
/*
* Updates the pressure interval when the pressure level changes
*/
static void
memstat_pressure_interval_update(vm_pressure_level_t new_level)
{
LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
memstat_cur_interval.num_transitions++;
if (new_level <= memstat_cur_interval.max_level) {
return;
}
memstat_cur_interval.num_procs = memorystatus_klist_length(new_level);
memstat_cur_interval.max_level = new_level;
}
/*
* Updates the footprint telemetry for procs that have received notifications.
*/
@ -1421,14 +1491,12 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process)
* by immediately killing idle exitable processes. We use a delay
* to avoid overkill. And we impose a max counter as a fail safe
* in case daemons re-launch too fast.
*
* TODO: These jetsams should be performed on the memorystatus thread. We can
* provide the similar false-idle mitigation by skipping processes with med/high
* relaunch probability and/or using the sustained-pressure mechanism.
* (rdar://134075608)
*/
while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
if (!memstat_kill_idle_process(kMemorystatusKilledIdleExit, NULL)) {
while (memstat_pressure_config & MEMSTAT_WARNING_KILL_IDLE_THROTTLED &&
memorystatus_vm_pressure_level != kVMPressureNormal &&
idle_kill_counter < MAX_IDLE_KILLS) {
uint64_t footprint;
if (!memstat_kill_idle_process(kMemorystatusKilledIdleExit, &footprint)) {
/* No idle exitable processes left to kill */
break;
}
@ -1440,7 +1508,7 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process)
* the pressure notification scheme.
*/
} else {
delay(1000000); /* 1 second */
delay(1 * USEC_PER_SEC);
}
}
#endif /* !CONFIG_JETSAM */
@ -1476,26 +1544,24 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process)
}
}
#if CONFIG_JETSAM
if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
if (memorystatus_should_kill_on_sustained_pressure) {
if (memstat_pressure_config & MEMSTAT_WARNING_KILL_SUSTAINED) {
if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
memorystatus_log("memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam\n", memorystatus_vm_pressure_level);
thread_call_cancel(sustained_pressure_handler_thread_call);
} else if (memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
/*
* Pressure has increased from normal.
* Hopefully the notifications will relieve it,
* but as a fail-safe we'll trigger jetsam
* after a configurable amount of time.
*/
memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level);
uint64_t kill_time;
nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
kill_time += mach_absolute_time();
thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
}
} else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
/*
* Pressure has increased from normal.
* Hopefully the notifications will relieve it,
* but as a fail-safe we'll trigger jetsam
* after a configurable amount of time.
*/
memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level);
uint64_t kill_time;
nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
kill_time += mach_absolute_time();
thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
}
#endif /* CONFIG_JETSAM */
while (1) {
/*
@ -1523,21 +1589,41 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process)
continue;
}
}
if (level_snapshot == kVMPressureNormal) {
memorystatus_pressure_telemetry_send();
}
prev_level_snapshot = level_snapshot;
smoothing_window_started = FALSE;
if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE &&
level_snapshot >= kVMPressureWarning &&
memstat_get_long_idle_proccnt() > 0) {
/* There are long-idle daemons to kill */
memorystatus_thread_wake();
} else if (level_snapshot == kVMPressureCritical) {
if (memstat_pressure_config & MEMSTAT_CRITICAL_PURGE_CACHES) {
uint64_t now = mach_absolute_time();
uint64_t delta_ns;
absolutetime_to_nanoseconds(now - memstat_last_cache_purge_ts, &delta_ns);
if (delta_ns >= memstat_cache_purge_backoff_ns) {
/* Wake up the jetsam thread to purge caches */
memorystatus_thread_wake();
}
} else if (memstat_pressure_config & MEMSTAT_CRITICAL_KILL_IDLE &&
memstat_get_idle_proccnt() > 0) {
memorystatus_thread_wake();
}
}
memorystatus_klist_lock();
if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
memorystatus_pressure_interval_telemetry.num_transitions++;
if (memorystatus_pressure_interval_telemetry.duration == 0) {
/* Set the start timestamp. Duration will be finalized when we send the event. */
memorystatus_pressure_interval_telemetry.duration = curr_ts;
/* Interval tracking & telemetry */
if (prev_level_snapshot != level_snapshot) {
if (level_snapshot == kVMPressureNormal) {
memstat_pressure_interval_end();
} else if (prev_level_snapshot == kVMPressureNormal) {
memstat_pressure_interval_start(curr_ts);
}
memstat_pressure_interval_update(level_snapshot);
}
kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);
@ -1624,10 +1710,16 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process)
}
}
}
if (level_snapshot != kVMPressureNormal) {
mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
(uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
memorystatus_pressure_interval_telemetry.num_notifications_sent++;
uint16_t num_notifications;
if (os_convert_overflow(memstat_cur_interval.num_notifs, &num_notifications)) {
num_notifications = UINT16_MAX;
}
mark_knote_send_time(kn_max, task,
convert_internal_pressure_level_to_dispatch_level(level_snapshot),
num_notifications);
memstat_cur_interval.num_notifs++;
}
KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);

View file

@ -80,12 +80,15 @@ extern uint64_t memstat_reaper_min_age_secs;
extern uint64_t memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu;
extern bool memstat_reaper_is_currently_sweeping;
extern vm_pressure_level_t memorystatus_vm_pressure_level;
static void
memorystatus_health_check(memorystatus_system_health_t *status)
memstat_evaluate_health_conditions(memorystatus_system_health_t status)
{
memset(status, 0, sizeof(memorystatus_system_health_t));
status->msh_compressor_exhausted = vm_compressor_low_on_space() ||
status->msh_compressor_low_on_space = vm_compressor_low_on_space() ||
os_atomic_load(&memorystatus_compressor_space_shortage, relaxed);
status->msh_compressor_exhausted = vm_compressor_out_of_space();
status->msh_swap_low_on_space = vm_swap_low_on_space();
status->msh_swap_exhausted = vm_swap_out_of_space();
#if CONFIG_JETSAM
@ -108,27 +111,158 @@ memorystatus_health_check(memorystatus_system_health_t *status)
status->msh_pageout_starved = os_atomic_load(&memorystatus_pageout_starved, relaxed);
status->msh_swappable_compressor_segments_over_limit = memorystatus_swap_over_trigger(100);
status->msh_swapin_queue_over_limit = memorystatus_swapin_over_trigger();
#else /* !CONFIG_JETSAM */
vm_pressure_level_t pressure_level = memorystatus_vm_pressure_level;
status->msh_vm_pressure_critical = (pressure_level == kVMPressureCritical);
status->msh_vm_pressure_warning = (pressure_level >= kVMPressureWarning);
#endif /* CONFIG_JETSAM */
status->msh_zone_map_is_exhausted = os_atomic_load(&memorystatus_zone_map_is_exhausted, relaxed);
}
bool
memorystatus_is_system_healthy(const memorystatus_system_health_t *status)
static bool
memstat_is_system_healthy(const memorystatus_system_health_t status)
{
#if CONFIG_JETSAM
return !(status->msh_available_pages_below_critical ||
status->msh_compressor_is_thrashing ||
status->msh_compressor_exhausted ||
status->msh_compressor_low_on_space ||
status->msh_filecache_is_thrashing ||
status->msh_zone_map_is_exhausted ||
status->msh_pageout_starved);
#else /* CONFIG_JETSAM */
return !(status->msh_zone_map_is_exhausted ||
status->msh_compressor_exhausted ||
status->msh_swap_exhausted);
status->msh_compressor_low_on_space ||
status->msh_swap_exhausted ||
status->msh_swap_low_on_space ||
status->msh_vm_pressure_critical ||
status->msh_vm_pressure_warning);
#endif /* CONFIG_JETSAM */
}
static void
memstat_log_system_health(const memorystatus_system_health_t status)
{
static struct memorystatus_system_health_s prev_status = {0};
bool healthy = memstat_is_system_healthy(status);
/*
* Avoid spamming logs by only logging when the system status has changed.
*/
if (prev_status.msh_zone_map_is_exhausted == status->msh_zone_map_is_exhausted &&
prev_status.msh_compressor_exhausted == status->msh_compressor_exhausted &&
prev_status.msh_swap_low_on_space == status->msh_swap_low_on_space &&
prev_status.msh_swap_exhausted == status->msh_swap_exhausted
#if CONFIG_JETSAM
&&
prev_status.msh_available_pages_below_idle == status->msh_available_pages_below_idle &&
prev_status.msh_available_pages_below_soft == status->msh_available_pages_below_soft &&
prev_status.msh_available_pages_below_critical == status->msh_available_pages_below_critical &&
prev_status.msh_available_pages_below_reaper == status->msh_available_pages_below_reaper &&
prev_status.msh_compressor_needs_to_swap == status->msh_compressor_needs_to_swap &&
prev_status.msh_compressor_is_thrashing == status->msh_compressor_is_thrashing &&
prev_status.msh_filecache_is_thrashing == status->msh_filecache_is_thrashing &&
prev_status.msh_phantom_cache_pressure == status->msh_phantom_cache_pressure &&
prev_status.msh_swapin_queue_over_limit == status->msh_swapin_queue_over_limit &&
prev_status.msh_pageout_starved == status->msh_pageout_starved
#endif /* CONFIG_JETSAM */
) {
/* No change */
return;
}
#if CONFIG_JETSAM
if (healthy) {
if (status->msh_available_pages_below_soft) {
memorystatus_log(
"memorystatus: System will begin enforcing "
"soft memory limits. "
"memorystatus_available_pages: %llu compressor_size: %u\n",
(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
} else if (status->msh_available_pages_below_idle) {
memorystatus_log(
"memorystatus: System will begin enacting "
"idle-exits. "
"memorystatus_available_pages: %llu compressor_size: %u\n",
(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
} else if (status->msh_available_pages_below_reaper) {
memorystatus_log(
"memorystatus: System will begin reaping "
"long-idle processes. "
"memorystatus_available_pages: %llu compressor_size: %u\n",
(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
} else {
memorystatus_log(
"memorystatus: System is healthy. "
"memorystatus_available_pages: %llu compressor_size:%u\n",
(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
}
} else {
/* Unhealthy */
memorystatus_log("memorystatus: System is unhealthy! memorystatus_available_pages: %llu compressor_size:%u\n",
(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
memorystatus_log(
"memorystatus: {"
"\"available_pages_below_critical\": %d, "
"\"available_pages_below_idle\": %d, "
"\"available_pages_below_soft\": %d, "
"\"available_pages_below_reaper\": %d, "
"\"compressor_needs_to_swap\": %d, "
"\"compressor_exhausted\": %d, "
"\"compressor_is_thrashing\": %d, "
"\"filecache_is_thrashing\": %d, "
"\"zone_map_is_exhausted\": %d, "
"\"phantom_cache_pressure\": %d, "
"\"swappable_compressor_segments_over_limit\": %d, "
"\"swapin_queue_over_limit\": %d, "
"\"swap_low\": %d, "
"\"swap_exhausted\": %d"
"}\n",
status->msh_available_pages_below_critical,
status->msh_available_pages_below_idle,
status->msh_available_pages_below_soft,
status->msh_available_pages_below_reaper,
status->msh_compressor_needs_to_swap,
status->msh_compressor_exhausted,
status->msh_compressor_is_thrashing,
status->msh_filecache_is_thrashing,
status->msh_zone_map_is_exhausted,
status->msh_phantom_cache_pressure,
status->msh_swappable_compressor_segments_over_limit,
status->msh_swapin_queue_over_limit,
status->msh_swap_low_on_space,
status->msh_swap_exhausted);
}
#else /* CONFIG_JETSAM */
memorystatus_log("memorystatus: System is %s. memorystatus_available_pages: %llu compressor_size:%u\n",
healthy ? "healthy" : "unhealthy",
(uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size());
if (!healthy) {
memorystatus_log(
"memorystatus: {"
"\"compressor_exhausted\": %d, "
"\"zone_map_is_exhausted\": %d, "
"\"swap_low\": %d, "
"\"swap_exhausted\": %d"
"}\n",
status->msh_compressor_exhausted,
status->msh_zone_map_is_exhausted,
status->msh_swap_low_on_space,
status->msh_swap_exhausted);
}
#endif /* CONFIG_JETSAM */
prev_status = *status;
}
bool
memstat_check_system_health(memorystatus_system_health_t status)
{
memstat_evaluate_health_conditions(status);
memstat_log_system_health(status);
return memstat_is_system_healthy(status);
}
#pragma mark Memorystatus Thread Actions
@ -136,6 +270,45 @@ memorystatus_is_system_healthy(const memorystatus_system_health_t *status)
* This section picks the appropriate memorystatus_action & deploys it.
*/
uint64_t memstat_last_cache_purge_ts;
/* Purge caches under critical pressure up to every 1 min */
TUNABLE(uint64_t, memstat_cache_purge_backoff_ns,
"memorystatus_cache_purge_backoff_ns", 1 * 60 * NSEC_PER_SEC);
static uint32_t
memorystatus_pick_kill_cause(const memorystatus_system_health_t status)
{
assert(!memstat_is_system_healthy(status));
#if CONFIG_JETSAM
if (status->msh_compressor_is_thrashing) {
return kMemorystatusKilledVMCompressorThrashing;
} else if (status->msh_compressor_exhausted) {
return kMemorystatusKilledVMCompressorSpaceShortage;
} else if (status->msh_swap_low_on_space) {
return kMemorystatusKilledLowSwap;
} else if (status->msh_filecache_is_thrashing) {
return kMemorystatusKilledFCThrashing;
} else if (status->msh_zone_map_is_exhausted) {
return kMemorystatusKilledZoneMapExhaustion;
} else if (status->msh_pageout_starved) {
return kMemorystatusKilledVMPageoutStarvation;
} else {
assert(status->msh_available_pages_below_critical);
return kMemorystatusKilledVMPageShortage;
}
#else /* CONFIG_JETSAM */
if (status->msh_zone_map_is_exhausted) {
return kMemorystatusKilledZoneMapExhaustion;
} else if (status->msh_compressor_exhausted) {
return kMemorystatusKilledVMCompressorSpaceShortage;
} else if (status->msh_swap_exhausted) {
return kMemorystatusKilledLowSwap;
} else {
return kMemorystatusKilled;
}
#endif /* CONFIG_JETSAM */
}
/*
* Inspects the state of various resources in the system to see if
* the system is healthy. If the system is not healthy, picks a
@ -153,10 +326,8 @@ memorystatus_pick_action(jetsam_state_t state,
bool swappable_apps_remaining,
int *jld_idle_kills)
{
memorystatus_system_health_t status;
memorystatus_health_check(&status);
memorystatus_log_system_health(&status);
bool is_system_healthy = memorystatus_is_system_healthy(&status);
struct memorystatus_system_health_s status;
bool is_system_healthy = memstat_check_system_health(&status);
#if CONFIG_JETSAM
if (status.msh_available_pages_below_soft || !is_system_healthy) {
@ -195,7 +366,7 @@ memorystatus_pick_action(jetsam_state_t state,
}
}
if (status.msh_compressor_exhausted) {
if (status.msh_compressor_exhausted || status.msh_compressor_low_on_space) {
*kill_cause = kMemorystatusKilledVMCompressorSpaceShortage;
return MEMORYSTATUS_KILL_TOP_PROCESS;
}
@ -255,6 +426,7 @@ memorystatus_pick_action(jetsam_state_t state,
(void) jld_idle_kills;
(void) suspended_swappable_apps_remaining;
(void) swappable_apps_remaining;
(void) highwater_remaining;
/*
* Without CONFIG_JETSAM, we only kill if the system is unhealthy.
@ -265,29 +437,75 @@ memorystatus_pick_action(jetsam_state_t state,
*kill_cause = 0;
return MEMORYSTATUS_KILL_NONE;
}
if (highwater_remaining) {
*kill_cause = kMemorystatusKilledHiwat;
return MEMORYSTATUS_KILL_HIWATER;
}
*kill_cause = memorystatus_pick_kill_cause(&status);
if (status.msh_zone_map_is_exhausted) {
return MEMORYSTATUS_KILL_TOP_PROCESS;
} else if (status.msh_compressor_exhausted || status.msh_swap_exhausted) {
}
if (status.msh_compressor_exhausted || status.msh_swap_exhausted) {
if (kill_on_no_paging_space) {
return MEMORYSTATUS_KILL_TOP_PROCESS;
} else if (memstat_get_idle_proccnt() > 0) {
}
}
if (status.msh_compressor_low_on_space || status.msh_swap_low_on_space) {
if (memstat_get_idle_proccnt() > 0) {
/* Kill all idle processes before invoking the no paging space action */
return MEMORYSTATUS_KILL_IDLE;
}
/*
* Throttle how often the no-paging-space action is performed.
*/
uint64_t now = mach_absolute_time();
uint64_t delta_since_last_no_space_ns;
uint64_t last_action_ts = os_atomic_load(&last_no_space_action_ts, relaxed);
assert3u(now, >=, last_action_ts);
absolutetime_to_nanoseconds(now - last_action_ts, &delta_since_last_no_space_ns);
if (delta_since_last_no_space_ns > no_paging_space_action_throttle_delay_ns) {
return MEMORYSTATUS_NO_PAGING_SPACE;
} else {
/*
* The no paging space action will be performed synchronously by the
* thread performing the compression/swap.
*/
return MEMORYSTATUS_KILL_NONE;
}
} else {
panic("System is unhealthy but compressor, swap, and zone map are not exhausted");
}
if (status.msh_vm_pressure_critical) {
/*
* The system is under critical memory pressure. First terminate any low-risk
* idle processes. When they are exhausted, purge system memory caches.
*/
if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE &&
memstat_get_long_idle_proccnt() > 0) {
*kill_cause = kMemorystatusKilledLongIdleExit;
return MEMORYSTATUS_KILL_LONG_IDLE;
}
if (memstat_pressure_config & MEMSTAT_CRITICAL_KILL_IDLE &&
memstat_get_idle_proccnt() > 0) {
*kill_cause = kMemorystatusKilledIdleExit;
return MEMORYSTATUS_KILL_IDLE;
}
if (memstat_pressure_config & MEMSTAT_CRITICAL_PURGE_CACHES) {
uint64_t now = mach_absolute_time();
uint64_t delta_ns;
uint64_t last_purge_ts = os_atomic_load(&memstat_last_cache_purge_ts, relaxed);
assert3u(now, >=, last_purge_ts);
absolutetime_to_nanoseconds(now - last_purge_ts, &delta_ns);
if (delta_ns > memstat_cache_purge_backoff_ns) {
memstat_last_cache_purge_ts = now;
return MEMORYSTATUS_PURGE_CACHES;
}
}
return MEMORYSTATUS_KILL_NONE;
} else if (status.msh_vm_pressure_warning) {
/*
* The system is under pressure and is likely to start swapping soon. Reap
* any long-idle daemons.
*/
if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE &&
memstat_get_long_idle_proccnt() > 0) {
*kill_cause = kMemorystatusKilledLongIdleExit;
return MEMORYSTATUS_KILL_LONG_IDLE;
}
return MEMORYSTATUS_KILL_NONE;
}
#endif /* CONFIG_JETSAM */
panic("System is unhealthy but no action has been chosen");
}
#pragma mark Aggressive Jetsam

View file

@ -66,8 +66,10 @@
* @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
*/
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/syslimits.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/proc_internal.h>
@ -112,6 +114,7 @@ extern vm_map_t bsd_pageable_map;
#include <sys/kdebug.h>
#include <IOKit/IOPlatformExpert.h>
#include <IOKit/IOBSD.h>
#include <pexpert/pexpert.h>
#include <machine/config.h>
@ -165,6 +168,9 @@ static int osenvironment_initialized = 0;
static uint32_t ephemeral_storage = 0;
static uint32_t use_recovery_securityd = 0;
static char *mempath = NULL;
static size_t mempath_size = 0;
static struct {
uint32_t ephemeral_storage:1;
uint32_t use_recovery_securityd:1;
@ -575,7 +581,7 @@ sysctl_hw_generic(__unused struct sysctl_oid *oidp, void *arg1,
#endif
case HW_USERMEM:
{
int usermem = (int)(mem_size - vm_page_wire_count * page_size);
int usermem = (int)(max_mem - vm_page_wire_count * page_size);
return SYSCTL_RETURN(req, usermem);
}
@ -876,6 +882,55 @@ sysctl_serialdebugmode
return sysctl_io_number(req, serialdebugmode, sizeof(serialdebugmode), NULL, NULL);
}
/*
* This sysctl is a string that contains the jetsam properties path used by launchd to apply
* jetsam properties to services. This sysctl is set once by launchd at boot and after userspace reboots,
* before it spawns any services.
*/
#define kReadOnlyMempathEntitlement "com.apple.private.kernel.mempath-read-only"
static int
sysctl_mempath
(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
int error = EINVAL;
if (req->newptr != 0) {
/* initproc is the only process that can write to this sysctl */
if (proc_getpid(req->p) != 1) {
return EPERM;
}
if (req->newlen > PATH_MAX) {
return EOVERFLOW;
}
size_t mempath_new_size = req->newlen + 1;
char *mempath_new = kalloc_data(mempath_new_size, Z_WAITOK);
if (!mempath_new) {
return ENOMEM;
}
mempath_new[mempath_new_size - 1] = '\0';
error = SYSCTL_IN(req, mempath_new, mempath_new_size - 1);
if (0 != error) {
kfree_data(mempath_new, mempath_new_size);
return error;
}
/* copy in was successful; swap out old/new buffers */
if (NULL != mempath) {
kfree_data(mempath, mempath_size);
}
mempath = mempath_new;
mempath_size = mempath_new_size;
} else {
/* A read entitlement is required to read this sysctl */
if (!IOCurrentTaskHasEntitlement(kReadOnlyMempathEntitlement)) {
return EPERM;
}
error = EIO;
if (mempath && mempath_size) {
error = SYSCTL_OUT(req, mempath, mempath_size);
}
}
return error;
}
/*
* hw.* MIB variables.
*/
@ -937,6 +992,7 @@ SYSCTL_PROC(_hw, OID_AUTO, ephemeral_storage, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG
SYSCTL_PROC(_hw, OID_AUTO, use_recovery_securityd, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_use_recovery_securityd, "I", "");
SYSCTL_PROC(_hw, OID_AUTO, use_kernelmanagerd, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_use_kernelmanagerd, "I", "");
SYSCTL_PROC(_hw, OID_AUTO, serialdebugmode, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_serialdebugmode, "I", "");
SYSCTL_PROC(_hw, OID_AUTO, mempath, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_mempath, "A", "");
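A hedged userspace sketch of how hw.mempath might be exercised (not part of this change): launchd, as pid 1, publishes the jetsam-properties path, and a reader holding com.apple.private.kernel.mempath-read-only fetches it back. sysctlbyname(3) is the standard libc interface; the helper names and error handling are illustrative assumptions.

#include <string.h>
#include <sys/sysctl.h>

/* Writer: only pid 1 is accepted; the kernel NUL-terminates its own copy. */
static int
publish_mempath(char *path)
{
	return sysctlbyname("hw.mempath", NULL, NULL, path, strlen(path));
}

/* Reader: needs the com.apple.private.kernel.mempath-read-only entitlement. */
static int
fetch_mempath(char *buf, size_t buflen)
{
	size_t len = buflen;

	return sysctlbyname("hw.mempath", buf, &len, NULL, 0);
}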
/*
* hw.perflevelN.* variables.

View file

@ -1168,10 +1168,12 @@ mprotect_sanitize(
* check unaligned start due to UNIX SPEC: user address is not page-aligned,
* return EINVAL
*/
vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_CHECK_ALIGNED_START |
VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH;
result = vm_sanitize_addr_size(user_addr_u, user_size_u,
VM_SANITIZE_CALLER_MPROTECT, user_map,
VM_SANITIZE_FLAGS_CHECK_ALIGNED_START |
VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH,
VM_SANITIZE_CALLER_MPROTECT, user_map, flags,
user_addr, user_end_aligned, user_size);
if (__improbable(result != KERN_SUCCESS)) {
return result;
@ -1325,10 +1327,11 @@ minherit_sanitize(
kern_return_t result;
mach_vm_offset_t addr_end;
vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH;
result = vm_sanitize_addr_size(addr_u, size_u, VM_SANITIZE_CALLER_MINHERIT,
user_map,
VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, addr,
&addr_end, size);
user_map, flags, addr, &addr_end, size);
if (__improbable(result != KERN_SUCCESS)) {
return result;
}
@ -1397,10 +1400,11 @@ madvise_sanitize(
mach_vm_offset_t *end,
mach_vm_size_t *size)
{
vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH;
return vm_sanitize_addr_size(addr_u, len_u, VM_SANITIZE_CALLER_MADVISE,
user_map,
VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH,
start, end, size);
user_map, flags, start, end, size);
}
int
@ -1510,8 +1514,10 @@ mincore_sanitize(
mach_vm_offset_t *end,
mach_vm_size_t *size)
{
vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
return vm_sanitize_addr_size(addr_u, len_u, VM_SANITIZE_CALLER_MINCORE,
map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, addr, end, size);
map, flags, addr, end, size);
}
int

View file

@ -77,6 +77,7 @@
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/variant_internal.h>
#include <vm/vm_pageout_xnu.h>
@ -1676,16 +1677,36 @@ sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
return error;
}
#define WRITE_EXPERIMENT_FACTORS_ENTITLEMENT "com.apple.private.write-kr-experiment-factors"
const char *trial_experiment_factors_entitlement = "com.apple.private.kernel.read-write-trial-experiment-factors";
/*
* Is the current task allowed to read/write trial experiment factors?
* Requires either:
* - trial_experiment_factors_entitlement
* - root user (internal-diagnostics only)
*/
STATIC bool
can_rw_trial_experiment_factors(struct sysctl_req *req)
{
if (IOTaskHasEntitlement(proc_task(req->p), trial_experiment_factors_entitlement)) {
return true;
}
if (os_variant_has_internal_diagnostics("com.apple.xnu")) {
return !proc_suser(req->p);
}
return false;
}
#define WRITE_LEGACY_EXPERIMENT_FACTORS_ENTITLEMENT "com.apple.private.write-kr-experiment-factors"
/*
* Is the current task allowed to write to experiment factors?
* tasks with the WRITE_EXPERIMENT_FACTORS_ENTITLEMENT are always allowed to write these.
* In the development / debug kernel we also allow root to write them.
*/
STATIC bool
can_write_experiment_factors(__unused struct sysctl_req *req)
can_write_legacy_experiment_factors(__unused struct sysctl_req *req)
{
if (IOCurrentTaskHasEntitlement(WRITE_EXPERIMENT_FACTORS_ENTITLEMENT)) {
if (IOCurrentTaskHasEntitlement(WRITE_LEGACY_EXPERIMENT_FACTORS_ENTITLEMENT)) {
return true;
}
#if DEBUG || DEVELOPMENT
@ -1832,13 +1853,20 @@ found:
goto err;
}
if (oid->oid_kind & CTLFLAG_EXPERIMENT && req->p) {
if (!can_rw_trial_experiment_factors(req)) {
error = (EPERM);
goto err;
}
}
if (req->newptr && req->p) {
if (oid->oid_kind & CTLFLAG_EXPERIMENT) {
if (oid->oid_kind & CTLFLAG_LEGACY_EXPERIMENT) {
/*
* Experiment factors have different permissions since they need to be
* writable by procs with WRITE_LEGACY_EXPERIMENT_FACTORS_ENTITLEMENT.
*/
if (!can_write_experiment_factors(req)) {
if (!can_write_legacy_experiment_factors(req)) {
error = (EPERM);
goto err;
}
@ -2223,6 +2251,9 @@ scalable_counter_sysctl_handler SYSCTL_HANDLER_ARGS
return SYSCTL_OUT(req, &value, sizeof(value));
}
SYSCTL_NODE(_kern, OID_AUTO, trial, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
"trial experiment factors");
#define X(name, T) \
int \
experiment_factor_##name##_handler SYSCTL_HANDLER_ARGS \
@ -2285,7 +2316,12 @@ sysctl_register_test_startup(struct sysctl_test_setup_spec *spec)
.oid_parent = &sysctl__debug_test_children,
.oid_number = OID_AUTO,
.oid_kind = CTLTYPE_QUAD | CTLFLAG_OID2 | CTLFLAG_WR |
CTLFLAG_PERMANENT | CTLFLAG_LOCKED | CTLFLAG_MASKED,
CTLFLAG_PERMANENT | CTLFLAG_LOCKED | CTLFLAG_MASKED
#ifdef __BUILDING_XNU_LIB_UNITTEST__
| CTLFLAG_KERN, /* allow calls from unit tests that use kernel_sysctlbyname() */
#else /* __BUILDING_XNU_LIB_UNITTEST__ */
,
#endif /* __BUILDING_XNU_LIB_UNITTEST__ */
.oid_arg1 = (void *)(uintptr_t)spec->st_func,
.oid_name = spec->st_name,
.oid_handler = sysctl_test_handler,
@ -2457,3 +2493,35 @@ SYSCTL_OID(_debug_test_sysctl_node_test_l2, OID_AUTO, hanging_oid,
CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, NULL, "", "rdar://138698424 L2 hanging OID");
#endif /* DEBUG || DEVELOPMENT */
static int
sysctl_static_if_modified_keys SYSCTL_HANDLER_ARGS
{
extern char __static_if_segment_start[] __SEGMENT_START_SYM(STATIC_IF_SEGMENT);
uint64_t addr;
int err;
for (static_if_key_t key = static_if_modified_keys;
key; key = key->sik_modified_next) {
if ((key->sik_enable_count >= 0) == key->sik_init_value) {
continue;
}
addr = (vm_offset_t)key->sik_entries_head - (vm_offset_t)__static_if_segment_start;
err = SYSCTL_OUT(req, &addr, sizeof(addr));
if (err) {
return err;
}
}
return 0;
}
SYSCTL_PROC(_kern, OID_AUTO, static_if_modified_keys,
CTLFLAG_RD | CTLFLAG_LOCKED | CTLTYPE_OPAQUE,
0, 0, sysctl_static_if_modified_keys, "-",
"List of unslid addresses of modified keys");
SYSCTL_UINT(_kern, OID_AUTO, static_if_abi, CTLFLAG_RD | CTLFLAG_LOCKED,
&static_if_abi, 0, "static_if ABI");
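A hedged sketch of reading the new kern.static_if_modified_keys OID from userspace (not part of this change). The size-then-read double call is the usual pattern for opaque sysctls; interpreting each 8-byte entry as an offset from the static_if segment start follows the handler above.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/sysctl.h>

static void
dump_modified_static_if_keys(void)
{
	size_t len = 0;

	/* First call sizes the buffer; an empty result means no keys were flipped. */
	if (sysctlbyname("kern.static_if_modified_keys", NULL, &len, NULL, 0) != 0 || len == 0) {
		return;
	}
	uint64_t *offsets = malloc(len);
	if (offsets == NULL) {
		return;
	}
	if (sysctlbyname("kern.static_if_modified_keys", offsets, &len, NULL, 0) == 0) {
		for (size_t i = 0; i < len / sizeof(offsets[0]); i++) {
			/* Unslid, segment-relative offset of the modified key. */
			printf("modified static_if key at offset 0x%llx\n",
			    (unsigned long long)offsets[i]);
		}
	}
	free(offsets);
}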

View file

@ -103,6 +103,7 @@
#include <kern/smr_hash.h>
#include <kern/task.h>
#include <kern/coalition.h>
#include <kern/cs_blobs.h>
#include <sys/coalition.h>
#include <kern/assert.h>
#include <kern/sched_prim.h>
@ -193,7 +194,7 @@ static TUNABLE(bool, syscallfilter_disable, "-disable_syscallfilter", false);
#if DEBUG
#define __PROC_INTERNAL_DEBUG 1
#endif
#if CONFIG_COREDUMP
#if CONFIG_COREDUMP || CONFIG_UCOREDUMP
/* Name to give to core files */
#if defined(XNU_TARGET_OS_BRIDGE)
__XNU_PRIVATE_EXTERN const char * defaultcorefiledir = "/private/var/internal";
@ -421,9 +422,11 @@ proc_isinferior(int pid1, int pid2)
* racy for a current process or if a reference to the process is held.
*/
struct proc_ident
proc_ident(proc_t p)
proc_ident_with_policy(proc_t p, proc_ident_validation_policy_t policy)
{
struct proc_ident ident = {
.may_exit = (policy & IDENT_VALIDATION_PROC_MAY_EXIT) != 0,
.may_exec = (policy & IDENT_VALIDATION_PROC_MAY_EXEC) != 0,
.p_pid = proc_pid(p),
.p_uniqueid = proc_uniqueid(p),
.p_idversion = proc_pidversion(p),
@ -432,6 +435,12 @@ proc_ident(proc_t p)
return ident;
}
/*
* Function: proc_find_audit_token
*
* Description: Look up a process with the provided audit_token_t;
* validates that the embedded pidversion matches.
*/
proc_t
proc_find_audit_token(const audit_token_t token)
{
@ -456,23 +465,200 @@ proc_find_audit_token(const audit_token_t token)
return proc;
}
proc_t
proc_find_ident(struct proc_ident const *ident)
/*
* Function: proc_find_ident_validated
*
* Description: Obtain a proc ref from the provided proc_ident.
*
* Returns:
* - 0 on Success
* - EINVAL: When the provided arguments are invalid (NULL)
* - ESTALE: The process exists but is currently a zombie and
* has not been reaped via wait(). Callers may choose to handle
* this edge case as a non-error.
* - ESRCH: When the lookup or validation fails otherwise. The process
* described by the identifier no longer exists.
*
* Note: Caller must proc_rele() the out param when this function returns 0
*/
errno_t
proc_find_ident_validated(const proc_ident_t ident, proc_t *out)
{
proc_t proc = PROC_NULL;
proc = proc_find(ident->p_pid);
if (proc == PROC_NULL) {
return PROC_NULL;
if (ident == NULL || out == NULL) {
return EINVAL;
}
if (proc_uniqueid(proc) != ident->p_uniqueid ||
proc_t proc = proc_find(ident->p_pid);
if (proc == PROC_NULL) {
// If the policy indicates the process may exit, we should also check
// the zombie list, and return ESTALE to indicate that the process is
// a zombie waiting to be reaped.
if (proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXIT)
&& pzfind_unique(ident->p_pid, ident->p_uniqueid)) {
return ESTALE;
}
return ESRCH;
}
// If the policy indicates that the process shouldn't exec, fail the
// lookup if the pidversion doesn't match
if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) &&
proc_pidversion(proc) != ident->p_idversion) {
proc_rele(proc);
return PROC_NULL;
return ESRCH;
}
return proc;
// Check the uniqueid which is always verified
if (proc_uniqueid(proc) != ident->p_uniqueid) {
proc_rele(proc);
return ESRCH;
}
*out = proc;
return 0;
}
/*
* Function: proc_find_ident
*
* Description: Obtain a proc ref from the provided proc_ident.
* Discards the errno result from proc_find_ident_validated
* for callers using the old interface.
*/
inline proc_t
proc_find_ident(const proc_ident_t ident)
{
proc_t p = PROC_NULL;
if (proc_find_ident_validated(ident, &p) != 0) {
return PROC_NULL;
}
return p;
}
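An in-kernel caller sketch for the validated lookup path (not part of this change; the helper below is hypothetical): capture an identity that tolerates exit, re-resolve it later, and treat the ESTALE zombie case as a non-error, as the function documentation above suggests.

static errno_t
notify_identified_proc(proc_t p, int signum)
{
	struct proc_ident ident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_MAY_EXIT);
	proc_t target = PROC_NULL;
	errno_t err;

	/* ... time passes; the original proc ref may have been dropped ... */

	err = proc_find_ident_validated(&ident, &target);
	if (err == ESTALE) {
		/* Exited but unreaped: nothing left to signal, not an error here. */
		return 0;
	}
	if (err != 0) {
		return err;
	}
	psignal(target, signum);
	proc_rele(target);
	return 0;
}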
/*
* Function: proc_ident_equal_token
*
* Description: Compare a proc_ident_t to an audit token. The
* process described by the audit token must still exist (which
* includes a pidver check during the lookup). But the comparison
* with the proc_ident_t will respect IDENT_VALIDATION_PROC_MAY_EXEC
* and only compare PID and unique ID when it is set.
*/
bool
proc_ident_equal_token(proc_ident_t ident, audit_token_t token)
{
if (ident == NULL) {
return false;
}
// If the PIDs don't match, early return
if (ident->p_pid != get_audit_token_pid(&token)) {
return false;
}
// Compare pidversion if IDENT_VALIDATION_PROC_MAY_EXEC is not set
if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) &&
ident->p_idversion != token.val[7]) {
return false;
}
// Lookup the process described by the provided audit token
proc_t proc = proc_find_audit_token(token);
if (proc == PROC_NULL) {
return false;
}
// Always validate that the uniqueid matches
if (proc_uniqueid(proc) != ident->p_uniqueid) {
proc_rele(proc);
return false;
}
proc_rele(proc);
return true;
}
/*
* Function: proc_ident_equal_ref
*
* Description: Compare a proc_ident_t to a proc_t. Will
* respect IDENT_VALIDATION_PROC_MAY_EXEC and only compare
* PID and unique ID when set.
*/
bool
proc_ident_equal_ref(proc_ident_t ident, proc_t proc)
{
if (ident == NULL || proc == PROC_NULL) {
return false;
}
// Always compare PID and p_uniqueid
if (proc_pid(proc) != ident->p_pid ||
proc_uniqueid(proc) != ident->p_uniqueid) {
return false;
}
// Compare pidversion if IDENT_VALIDATION_PROC_MAY_EXEC is not set
if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) &&
proc_pidversion(proc) != ident->p_idversion) {
return false;
}
return true;
}
/*
* Function: proc_ident_equal
*
* Description: Compare two proc_ident_t identifiers. Will
* respect IDENT_VALIDATION_PROC_MAY_EXEC and only compare
* PID and unique ID when set.
*/
bool
proc_ident_equal(proc_ident_t ident, proc_ident_t other)
{
if (ident == NULL || other == NULL) {
return false;
}
// Always compare PID and p_uniqueid
if (ident->p_pid != other->p_pid ||
ident->p_uniqueid != other->p_uniqueid) {
return false;
}
// Compare pidversion if IDENT_VALIDATION_PROC_MAY_EXEC is not set
if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) &&
ident->p_idversion != other->p_idversion) {
return false;
}
return true;
}
/*
* Function: proc_ident_has_policy
*
* Description: Validate that a particular policy is set.
*
* Stored in the upper 4 bits of the 32-bit
* p_pid field.
*/
inline bool
proc_ident_has_policy(const proc_ident_t ident, enum proc_ident_validation_policy policy)
{
if (ident == NULL) {
return false;
}
switch (policy) {
case IDENT_VALIDATION_PROC_MAY_EXIT:
return ident->may_exit;
case IDENT_VALIDATION_PROC_MAY_EXEC:
return ident->may_exec;
case IDENT_VALIDATION_PROC_EXACT:
return ident->may_exec == 0 && ident->may_exit == 0;
}
}
void
@ -1467,6 +1653,21 @@ proc_archinfo_kdp(void* p, cpu_type_t* cputype, cpu_subtype_t* cpusubtype)
}
}
void
proc_memstat_data_kdp(void *p, int32_t *current_memlimit, int32_t *prio_effective, int32_t *prio_requested, int32_t *prio_assertion);
void
proc_memstat_data_kdp(void *p, int32_t *current_memlimit, int32_t *prio_effective, int32_t *prio_requested, int32_t *prio_assertion)
{
proc_t pp = (proc_t)p;
if (pp != PROC_NULL) {
*current_memlimit = pp->p_memstat_memlimit;
*prio_effective = pp->p_memstat_effectivepriority;
*prio_assertion = pp->p_memstat_assertionpriority;
*prio_requested = pp->p_memstat_requestedpriority;
}
}
const char *
proc_name_address(void *p)
{
@ -1840,7 +2041,7 @@ proc_getcdhash(proc_t p, unsigned char *cdhash)
if (p == kernproc) {
return EINVAL;
}
return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash);
return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash, NULL);
}
uint64_t
@ -2264,27 +2465,59 @@ proc_findthread(thread_t thread)
return p;
}
/*
* Locate a zombie by PID
* Determine if the process described by the provided
* PID is a zombie
*/
__private_extern__ proc_t
__private_extern__ bool
pzfind(pid_t pid)
{
proc_t p;
bool found = false;
/* Enter critical section */
proc_list_lock();
LIST_FOREACH(p, &zombproc, p_list) {
if (proc_getpid(p) == pid && !proc_is_shadow(p)) {
break;
}
/* Ensure the proc exists and is a zombie */
proc_t p = phash_find_locked(pid);
if ((p == PROC_NULL) || !proc_list_exited(p)) {
goto out;
}
found = true;
out:
/* Exit critical section */
proc_list_unlock();
return found;
}
return p;
/*
* Determine if the process described by the provided
* uniqueid is a zombie. The same as pzfind but with an
* additional uniqueid check.
*/
__private_extern__ bool
pzfind_unique(pid_t pid, uint64_t uniqueid)
{
bool found = false;
/* Enter critical section */
proc_list_lock();
/* Ensure the proc exists and is a zombie */
proc_t p = phash_find_locked(pid);
if ((p == PROC_NULL) || !proc_list_exited(p)) {
goto out;
}
if (proc_uniqueid(p) != uniqueid) {
goto out;
}
found = true;
out:
/* Exit critical section */
proc_list_unlock();
return found;
}
/*
@ -3163,7 +3396,7 @@ proc_is_rsr(proc_t p)
return os_atomic_load(&p->p_ladvflag, relaxed) & P_RSR;
}
#if CONFIG_COREDUMP
#if CONFIG_COREDUMP || CONFIG_UCOREDUMP
/*
* proc_core_name(format, name, uid, pid)
* Expand the name described in format, using name, uid, and pid.
@ -3253,7 +3486,7 @@ endofstring:
(long)pid, name, (uint32_t)uid);
return 1;
}
#endif /* CONFIG_COREDUMP */
#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */
/* Code Signing related routines */
@ -3311,9 +3544,10 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
int error;
vnode_t tvp;
off_t toff;
unsigned char cdhash[SHA1_RESULTLEN];
csops_cdhash_t cdhash_info = {0};
audit_token_t token;
unsigned int upid = 0, uidversion = 0;
bool mark_invalid_allowed = false;
forself = error = 0;
@ -3322,12 +3556,13 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
}
if (pid == proc_selfpid()) {
forself = 1;
mark_invalid_allowed = true;
}
switch (ops) {
case CS_OPS_STATUS:
case CS_OPS_CDHASH:
case CS_OPS_CDHASH_WITH_INFO:
case CS_OPS_PIDOFFSET:
case CS_OPS_ENTITLEMENTS_BLOB:
case CS_OPS_DER_ENTITLEMENTS_BLOB:
@ -3411,6 +3646,10 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
break;
}
case CS_OPS_MARKINVALID:
if (mark_invalid_allowed == false) {
error = EPERM;
goto out;
}
proc_lock(pt);
if ((proc_getcsflags(pt) & CS_VALID) == CS_VALID) { /* is currently valid */
proc_csflags_clear(pt, CS_VALID); /* set invalid */
@ -3470,16 +3709,36 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
tvp = pt->p_textvp;
toff = pt->p_textoff;
if (tvp == NULLVP || usize != SHA1_RESULTLEN) {
if (tvp == NULLVP || usize != sizeof(cdhash_info.hash)) {
proc_rele(pt);
return EINVAL;
}
error = vn_getcdhash(tvp, toff, cdhash);
error = vn_getcdhash(tvp, toff, cdhash_info.hash, &cdhash_info.type);
proc_rele(pt);
if (error == 0) {
error = copyout(cdhash, uaddr, sizeof(cdhash));
error = copyout(cdhash_info.hash, uaddr, sizeof(cdhash_info.hash));
}
return error;
case CS_OPS_CDHASH_WITH_INFO:
/* pt already holds a reference on its p_textvp */
tvp = pt->p_textvp;
toff = pt->p_textoff;
if (tvp == NULLVP || usize != sizeof(csops_cdhash_t)) {
proc_rele(pt);
return EINVAL;
}
error = vn_getcdhash(tvp, toff, cdhash_info.hash, &cdhash_info.type);
proc_rele(pt);
if (error == 0) {
error = copyout(&cdhash_info, uaddr, sizeof(cdhash_info));
}
return error;
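A hedged userspace sketch of the new CS_OPS_CDHASH_WITH_INFO operation (not part of this change). csops(2) is the existing syscall; the csops_cdhash_t layout and field names mirror the kernel-side usage above, and usize must match the struct size exactly.

#include <stdio.h>
#include <sys/codesign.h>
#include <unistd.h>

static void
print_own_cdhash_with_info(void)
{
	csops_cdhash_t info = {0};

	/* A usize other than sizeof(csops_cdhash_t) is rejected with EINVAL. */
	if (csops(getpid(), CS_OPS_CDHASH_WITH_INFO, &info, sizeof(info)) == 0) {
		printf("cdhash type %u, leading byte %02x\n",
		    (unsigned int)info.type, (unsigned int)info.hash[0]);
	}
}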
@ -3641,7 +3900,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
*/
if (forself == 1 && IOTaskHasEntitlement(proc_task(pt), CLEAR_LV_ENTITLEMENT)) {
proc_lock(pt);
if (!(proc_getcsflags(pt) & CS_INSTALLER)) {
if (!(proc_getcsflags(pt) & CS_INSTALLER) && (pt->p_subsystem_root_path == NULL)) {
proc_csflags_clear(pt, CS_REQUIRE_LV | CS_FORCED_LV);
error = 0;
} else {
@ -3742,11 +4001,8 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
break;
}
#endif /* CONFIG_CSR */
task_t task = proc_task(pt);
proc_lock(pt);
proc_csflags_clear(pt, CS_PLATFORM_BINARY | CS_PLATFORM_PATH);
task_set_hardened_runtime(task, false);
csproc_clear_platform_binary(pt);
proc_unlock(pt);
break;
@ -4648,7 +4904,7 @@ proc_pcontrol_null(__unused proc_t p, __unused void *arg)
extern int32_t max_kill_priority;
bool
no_paging_space_action(void)
no_paging_space_action(memorystatus_kill_cause_t cause)
{
proc_t p;
struct no_paging_space nps;
@ -4691,7 +4947,7 @@ no_paging_space_action(void)
memorystatus_log("memorystatus: killing largest compressed process %s [%d] "
"%llu MB\n",
proc_best_name(p), proc_getpid(p), (nps.npcs_max_size / MB_SIZE));
kill_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_LOWSWAP);
kill_reason = os_reason_create(OS_REASON_JETSAM, cause);
psignal_with_reason(p, SIGKILL, kill_reason);
proc_rele(p);
@ -4703,13 +4959,6 @@ no_paging_space_action(void)
}
}
if (memstat_get_idle_proccnt() > 0) {
/*
* There are still idle processes to kill.
*/
return false;
}
if (nps.pcs_max_size > 0) {
memorystatus_log("memorystatus: attempting pcontrol on "
"[%d]\n", nps.pcs_pid);
@ -4723,10 +4972,9 @@ no_paging_space_action(void)
memorystatus_log("memorystatus: doing "
"pcontrol on %s [%d]\n",
proc_best_name(p), proc_getpid(p));
proc_dopcontrol(p, JETSAM_REASON_LOWSWAP);
proc_dopcontrol(p, cause);
proc_rele(p);
return true;
} else {
memorystatus_log("memorystatus: cannot "
@ -5217,6 +5465,19 @@ proc_get_ro(proc_t p)
return ro;
}
#ifdef __BUILDING_XNU_LIB_UNITTEST__
/* this is here since unittest Makefile can't build BSD sources yet */
void mock_init_proc(proc_t p, void* (*calloc_call)(size_t, size_t));
void
mock_init_proc(proc_t p, void* (*calloc_call)(size_t, size_t))
{
proc_ro_t ro = calloc_call(1, sizeof(struct proc_ro));
ro->pr_proc = p;
p->p_proc_ro = ro;
}
#endif /* __BUILDING_XNU_LIB_UNITTEST__ */
task_t
proc_ro_task(proc_ro_t pr)
{
@ -5471,7 +5732,7 @@ task_for_pid(
error = KERN_FAILURE;
goto tfpout;
}
pident = proc_ident(p);
pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT);
is_current_proc = (p == current_proc());
#if CONFIG_AUDIT
@ -5544,12 +5805,7 @@ task_for_pid(
/* this reference will be consumed during conversion */
task_reference(task);
if (task == current_task()) {
/* return pinned self if current_task() so equality check with mach_task_self_ passes */
sright = (void *)convert_task_to_port_pinned(task);
} else {
sright = (void *)convert_task_to_port(task);
}
sright = (void *)convert_task_to_port(task);
/* extra task ref consumed */
/*
@ -5638,7 +5894,7 @@ task_name_for_pid(
|| IOCurrentTaskHasEntitlement("com.apple.system-task-ports.name.safe")
)) {
if (proc_task(p) != TASK_NULL) {
struct proc_ident pident = proc_ident(p);
struct proc_ident pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT);
task_t task = proc_task(p);
@ -5726,7 +5982,7 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args *
error = ESRCH;
goto tifpout;
}
pident = proc_ident(proc);
pident = proc_ident_with_policy(proc, IDENT_VALIDATION_PROC_EXACT);
is_current_proc = (proc == current_proc());
if (!(task_for_pid_posix_check(proc))) {
@ -5848,7 +6104,7 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args,
error = ESRCH;
goto trfpout;
}
pident = proc_ident(proc);
pident = proc_ident_with_policy(proc, IDENT_VALIDATION_PROC_EXACT);
is_current_proc = (proc == current_proc());
if (!(task_for_pid_posix_check(proc))) {
@ -6061,7 +6317,7 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args)
error = KERN_FAILURE;
goto tfpout;
}
pident = proc_ident(p);
pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT);
is_current_proc = (p == current_proc());
#if CONFIG_AUDIT

View file

@ -104,6 +104,7 @@
#if CONFIG_FREEZE
#include <sys/kern_memorystatus_freeze.h> /* for memorystatus_freeze_mark_ui_transition */
#endif /* CONFIG_FREEZE */
#include <sys/kern_memorystatus_xnu.h> /* for memorystatus_get_proc_is_managed */
#include <sys/socketvar.h> /* for struct socket */
#if NECP
#include <net/necp.h>
@ -131,13 +132,16 @@ static int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp);
static void do_background_socket(struct proc *p, thread_t thread);
static int do_background_thread(thread_t thread, int priority);
static int do_background_proc(struct proc *curp, struct proc *targetp, int priority);
static int set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority);
static int proc_set_gpurole(struct proc *curp, struct proc *targetp, int priority);
static int proc_get_gpurole(proc_t targetp, int *priority);
static int proc_set_darwin_role(proc_t curp, proc_t targetp, int priority);
static int proc_get_darwin_role(proc_t curp, proc_t targetp, int *priority);
static int proc_set_game_mode(proc_t targetp, int priority);
static int proc_get_game_mode(proc_t targetp, int *priority);
static int proc_set_carplay_mode(proc_t targetp, int priority);
static int proc_get_carplay_mode(proc_t targetp, int *priority);
static int proc_set_runaway_mitigation(proc_t targetp, int priority);
static int proc_get_runaway_mitigation(proc_t targetp, int *priority);
static int get_background_proc(struct proc *curp, struct proc *targetp, int *priority);
int fill_task_rusage(task_t task, rusage_info_current *ri);
@ -357,6 +361,50 @@ getpriority(struct proc *curp, struct getpriority_args *uap, int32_t *retval)
}
break;
case PRIO_DARWIN_GPU:
if (uap->who == 0) {
p = curp;
} else {
p = proc_find(uap->who);
if (p == PROC_NULL) {
break;
}
refheld = 1;
}
error = proc_get_gpurole(p, &low);
if (refheld) {
proc_rele(p);
}
if (error) {
return error;
}
break;
case PRIO_DARWIN_RUNAWAY_MITIGATION:
if (uap->who == 0) {
p = curp;
} else {
p = proc_find(uap->who);
if (p == PROC_NULL) {
break;
}
refheld = 1;
}
error = proc_get_runaway_mitigation(p, &low);
if (refheld) {
proc_rele(p);
}
if (error) {
return error;
}
break;
default:
return EINVAL;
}
@ -533,7 +581,7 @@ setpriority(struct proc *curp, struct setpriority_args *uap, int32_t *retval)
break;
}
error = set_gpudeny_proc(curp, p, uap->prio);
error = proc_set_gpurole(curp, p, uap->prio);
found++;
proc_rele(p);
@ -601,6 +649,26 @@ setpriority(struct proc *curp, struct setpriority_args *uap, int32_t *retval)
break;
}
case PRIO_DARWIN_RUNAWAY_MITIGATION: {
if (uap->who == 0) {
p = curp;
} else {
p = proc_find(uap->who);
if (p == PROC_NULL) {
break;
}
refheld = 1;
}
error = proc_set_runaway_mitigation(p, uap->prio);
found++;
if (refheld != 0) {
proc_rele(p);
}
break;
}
default:
return EINVAL;
}
@ -663,8 +731,10 @@ out:
return error;
}
#define SET_GPU_ROLE_ENTITLEMENT "com.apple.private.set-gpu-role"
static int
set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority)
proc_set_gpurole(struct proc *curp, struct proc *targetp, int priority)
{
int error = 0;
kauth_cred_t ucred;
@ -673,7 +743,12 @@ set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority)
ucred = kauth_cred_get();
target_cred = kauth_cred_proc_ref(targetp);
/* TODO: Entitlement instead of uid check */
boolean_t entitled = FALSE;
entitled = IOCurrentTaskHasEntitlement(SET_GPU_ROLE_ENTITLEMENT);
if (!entitled) {
error = EPERM;
goto out;
}
if (!kauth_cred_issuser(ucred) && kauth_cred_getruid(ucred) &&
kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred) &&
@ -695,11 +770,16 @@ set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority)
#endif
switch (priority) {
case PRIO_DARWIN_GPU_DENY:
task_set_gpu_denied(proc_task(targetp), TRUE);
break;
case PRIO_DARWIN_GPU_UNKNOWN:
case PRIO_DARWIN_GPU_ALLOW:
task_set_gpu_denied(proc_task(targetp), FALSE);
case PRIO_DARWIN_GPU_DENY:
case PRIO_DARWIN_GPU_BACKGROUND:
case PRIO_DARWIN_GPU_UTILITY:
case PRIO_DARWIN_GPU_UI_NON_FOCAL:
case PRIO_DARWIN_GPU_UI:
case PRIO_DARWIN_GPU_UI_FOCAL:
task_set_gpu_role(proc_task(targetp),
(darwin_gpu_role_t)priority);
break;
default:
error = EINVAL;
@ -711,6 +791,42 @@ out:
return error;
}
static int
proc_get_gpurole(proc_t targetp, int *priority)
{
int error = 0;
kauth_cred_t ucred, target_cred;
ucred = kauth_cred_get();
target_cred = kauth_cred_proc_ref(targetp);
boolean_t entitled = FALSE;
entitled = IOCurrentTaskHasEntitlement(SET_GPU_ROLE_ENTITLEMENT);
/* Root is allowed to get without entitlement */
if (!kauth_cred_issuser(ucred) && !entitled) {
error = EPERM;
goto out;
}
/* Even with entitlement, non-root is only allowed to see same-user */
if (!kauth_cred_issuser(ucred) &&
kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred)) {
error = EPERM;
goto out;
}
darwin_gpu_role_t gpurole = task_get_gpu_role(proc_task(targetp));
*priority = gpurole;
out:
kauth_cred_unref(&target_cred);
return error;
}
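A hedged userspace sketch of the reworked PRIO_DARWIN_GPU interface (not part of this change). setpriority(2)/getpriority(2) are the standard calls; the caller is assumed to hold com.apple.private.set-gpu-role, and the role value would be one of the darwin GPU role constants handled above.

#include <errno.h>
#include <sys/resource.h>
#include <sys/types.h>

static int
set_and_read_gpu_role(pid_t pid, int role)
{
	if (setpriority(PRIO_DARWIN_GPU, pid, role) != 0) {
		return -1;
	}
	/* getpriority() may legitimately return -1, so errno disambiguates failure. */
	errno = 0;
	int current = getpriority(PRIO_DARWIN_GPU, pid);
	if (current == -1 && errno != 0) {
		return -1;
	}
	return current;
}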
static int
proc_set_darwin_role(proc_t curp, proc_t targetp, int priority)
{
@ -960,6 +1076,116 @@ out:
return error;
}
#define RUNAWAY_MITIGATION_ENTITLEMENT "com.apple.private.runaway-mitigation"
/* Boot arg to allow RunningBoard-managed processes to be mitigated */
static TUNABLE(bool, allow_managed_mitigation, "allow_managed_mitigation", false);
static int
proc_set_runaway_mitigation(proc_t targetp, int priority)
{
int error = 0;
kauth_cred_t ucred, target_cred;
ucred = kauth_cred_get();
target_cred = kauth_cred_proc_ref(targetp);
boolean_t entitled = FALSE;
entitled = IOCurrentTaskHasEntitlement(RUNAWAY_MITIGATION_ENTITLEMENT);
if (!entitled) {
error = EPERM;
goto out;
}
/* Even with entitlement, non-root is only allowed to set same-user */
if (!kauth_cred_issuser(ucred) &&
kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred)) {
error = EPERM;
goto out;
}
switch (priority) {
case PRIO_DARWIN_RUNAWAY_MITIGATION_OFF:
printf("%s[%d] disabling runaway mitigation on %s[%d]\n",
proc_best_name(current_proc()), proc_selfpid(),
proc_best_name(targetp), proc_getpid(targetp));
proc_set_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE,
TASK_POLICY_RUNAWAY_MITIGATION, TASK_POLICY_DISABLE);
break;
case PRIO_DARWIN_RUNAWAY_MITIGATION_ON:
/*
* RunningBoard-managed processes are not mitigatable - they should be
* managed through RunningBoard-level interfaces instead.
* Set the boot arg allow_managed_mitigation=1 to allow this.
*/
if (memorystatus_get_proc_is_managed(targetp) && !allow_managed_mitigation) {
printf("%s[%d] blocked from disabling runaway mitigation on RunningBoard managed process %s[%d]\n",
proc_best_name(current_proc()), proc_selfpid(),
proc_best_name(targetp), proc_getpid(targetp));
error = ENOTSUP;
goto out;
}
proc_set_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE,
TASK_POLICY_RUNAWAY_MITIGATION, TASK_POLICY_ENABLE);
printf("%s[%d] enabling runaway mitigation on %s[%d]\n",
proc_best_name(current_proc()), proc_selfpid(),
proc_best_name(targetp), proc_getpid(targetp));
break;
default:
error = EINVAL;
goto out;
}
out:
kauth_cred_unref(&target_cred);
return error;
}
static int
proc_get_runaway_mitigation(proc_t targetp, int *priority)
{
int error = 0;
kauth_cred_t ucred, target_cred;
ucred = kauth_cred_get();
target_cred = kauth_cred_proc_ref(targetp);
boolean_t entitled = FALSE;
entitled = IOCurrentTaskHasEntitlement(RUNAWAY_MITIGATION_ENTITLEMENT);
/* Root is allowed to get without entitlement */
if (!kauth_cred_issuser(ucred) && !entitled) {
error = EPERM;
goto out;
}
/* Even with entitlement, non-root is only allowed to see same-user */
if (!kauth_cred_issuser(ucred) &&
kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred)) {
error = EPERM;
goto out;
}
if (proc_get_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE, TASK_POLICY_RUNAWAY_MITIGATION)) {
*priority = PRIO_DARWIN_RUNAWAY_MITIGATION_ON;
} else {
*priority = PRIO_DARWIN_RUNAWAY_MITIGATION_OFF;
}
out:
kauth_cred_unref(&target_cred);
return error;
}
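A hedged userspace sketch of the new runaway-mitigation knob (not part of this change). The PRIO_DARWIN_RUNAWAY_MITIGATION constants are the kernel-private values handled above; the caller is assumed to hold com.apple.private.runaway-mitigation, and RunningBoard-managed targets are rejected with ENOTSUP unless the allow_managed_mitigation boot-arg is set.

#include <errno.h>
#include <stdbool.h>
#include <sys/resource.h>
#include <sys/types.h>

static int
set_runaway_mitigation(pid_t pid, bool enable)
{
	int prio = enable ? PRIO_DARWIN_RUNAWAY_MITIGATION_ON
	    : PRIO_DARWIN_RUNAWAY_MITIGATION_OFF;

	if (setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, pid, prio) != 0) {
		return errno; /* EPERM without the entitlement, ENOTSUP for managed procs */
	}
	return 0;
}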
static int
get_background_proc(struct proc *curp, struct proc *targetp, int *priority)
{
@ -1501,22 +1727,30 @@ getrlimit(struct proc *p, struct getrlimit_args *uap, __unused int32_t *retval)
return EINVAL;
}
lim = proc_limitget(p, uap->which);
return copyout((caddr_t)&lim,
uap->rlp, sizeof(struct rlimit));
return copyout((caddr_t)&lim, uap->rlp, sizeof(struct rlimit));
}
static struct timeval
_absolutetime_to_timeval(uint64_t abstime)
{
clock_sec_t sec;
clock_usec_t usec;
absolutetime_to_microtime(abstime, &sec, &usec);
return (struct timeval){
.tv_sec = sec,
.tv_usec = usec,
};
}
/*
* Transform the running time and tick information in proc p into user,
* system, and interrupt time usage.
*/
/* No lock on proc is held for this.. */
void
calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *ip)
{
task_t task;
task_t task;
timerclear(up);
timerclear(sp);
if (ip != NULL) {
timerclear(ip);
}
@ -1524,51 +1758,39 @@ calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *i
task = proc_task(p);
if (task) {
mach_task_basic_info_data_t tinfo;
task_thread_times_info_data_t ttimesinfo;
task_events_info_data_t teventsinfo;
mach_msg_type_number_t task_info_count, task_ttimes_count;
mach_msg_type_number_t task_info_count;
mach_msg_type_number_t task_events_count;
struct timeval ut, st;
task_events_info_data_t teventsinfo;
struct recount_times_mach times;
task_info_count = MACH_TASK_BASIC_INFO_COUNT;
task_info(task, MACH_TASK_BASIC_INFO,
(task_info_t)&tinfo, &task_info_count);
ut.tv_sec = tinfo.user_time.seconds;
ut.tv_usec = tinfo.user_time.microseconds;
st.tv_sec = tinfo.system_time.seconds;
st.tv_usec = tinfo.system_time.microseconds;
timeradd(&ut, up, up);
timeradd(&st, sp, sp);
task_ttimes_count = TASK_THREAD_TIMES_INFO_COUNT;
task_info(task, TASK_THREAD_TIMES_INFO,
(task_info_t)&ttimesinfo, &task_ttimes_count);
ut.tv_sec = ttimesinfo.user_time.seconds;
ut.tv_usec = ttimesinfo.user_time.microseconds;
st.tv_sec = ttimesinfo.system_time.seconds;
st.tv_usec = ttimesinfo.system_time.microseconds;
timeradd(&ut, up, up);
timeradd(&st, sp, sp);
task_events_count = TASK_EVENTS_INFO_COUNT;
task_info(task, TASK_EVENTS_INFO,
(task_info_t)&teventsinfo, &task_events_count);
times = recount_task_times(task);
*up = _absolutetime_to_timeval(times.rtm_user);
*sp = _absolutetime_to_timeval(times.rtm_system);
/*
* No need to lock "p": this does not need to be
* completely consistent, right ?
* No lock is held here, but it's only a consistency issue for non-
* getrusage(2) callers of this function.
*/
p->p_stats->p_ru.ru_minflt = (teventsinfo.faults -
teventsinfo.pageins);
p->p_stats->p_ru.ru_minflt = teventsinfo.faults -
teventsinfo.pageins;
p->p_stats->p_ru.ru_majflt = teventsinfo.pageins;
p->p_stats->p_ru.ru_nivcsw = (teventsinfo.csw -
p->p_stats->p_ru.ru_nvcsw);
p->p_stats->p_ru.ru_nivcsw = teventsinfo.csw -
p->p_stats->p_ru.ru_nvcsw;
if (p->p_stats->p_ru.ru_nivcsw < 0) {
p->p_stats->p_ru.ru_nivcsw = 0;
}
p->p_stats->p_ru.ru_maxrss = (long)tinfo.resident_size_max;
} else {
timerclear(up);
timerclear(sp);
}
}
@ -1587,7 +1809,6 @@ getrusage(struct proc *p, struct getrusage_args *uap, __unused int32_t *retval)
struct timeval utime;
struct timeval stime;
switch (uap->who) {
case RUSAGE_SELF:
calcru(p, &utime, &stime, NULL);
@ -1857,6 +2078,8 @@ static int iopolicysys_vfs_altlink(struct proc *p, int cmd, int scope, int polic
static int iopolicysys_vfs_nocache_write_fs_blksize(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
static int
iopolicysys_vfs_support_long_paths(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
static int
iopolicysys_vfs_entitled_reserve_access(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
/*
* iopolicysys
@ -1880,6 +2103,17 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval)
goto out;
}
#if CONFIG_MACF
error = mac_proc_check_iopolicysys(p, kauth_cred_get(),
uap->cmd,
iop_param.iop_iotype,
iop_param.iop_scope,
iop_param.iop_policy);
if (error) {
return error;
}
#endif
switch (iop_param.iop_iotype) {
case IOPOL_TYPE_DISK:
error = iopolicysys_disk(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
@ -1969,6 +2203,12 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval)
goto out;
}
break;
case IOPOL_TYPE_VFS_ENTITLED_RESERVE_ACCESS:
error = iopolicysys_vfs_entitled_reserve_access(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
if (error) {
goto out;
}
break;
default:
error = EINVAL;
@ -2575,6 +2815,54 @@ out:
return error;
}
static int
get_proc_vfs_ignore_permissions_policy(struct proc *p)
{
return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS ?
IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF;
}
static int
get_thread_vfs_ignore_permissions_policy(thread_t thread)
{
struct uthread *ut = get_bsdthread_info(thread);
return (ut->uu_flag & UT_IGNORE_NODE_PERMISSIONS) ?
IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF;
}
static void
set_proc_vfs_ignore_permissions_policy(struct proc *p, int policy)
{
switch (policy) {
case IOPOL_VFS_IGNORE_PERMISSIONS_OFF:
os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed);
break;
case IOPOL_VFS_IGNORE_PERMISSIONS_ON:
os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed);
break;
default:
break;
}
}
static void
set_thread_vfs_ignore_permissions_policy(thread_t thread, int policy)
{
struct uthread *ut = get_bsdthread_info(thread);
switch (policy) {
case IOPOL_VFS_IGNORE_PERMISSIONS_OFF:
ut->uu_flag &= ~UT_IGNORE_NODE_PERMISSIONS;
break;
case IOPOL_VFS_IGNORE_PERMISSIONS_ON:
ut->uu_flag |= UT_IGNORE_NODE_PERMISSIONS;
break;
default:
break;
}
}
#define AUTHORIZED_ACCESS_ENTITLEMENT \
"com.apple.private.vfs.authorized-access"
int
@ -2582,8 +2870,12 @@ iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope,
int policy, __unused struct _iopol_param_t *iop_param)
{
int error = EINVAL;
thread_t thread = THREAD_NULL;
switch (scope) {
case IOPOL_SCOPE_THREAD:
thread = current_thread();
break;
case IOPOL_SCOPE_PROCESS:
break;
default:
@ -2592,8 +2884,11 @@ iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope,
switch (cmd) {
case IOPOL_CMD_GET:
policy = os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS ?
IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF;
if (thread != THREAD_NULL) {
policy = get_thread_vfs_ignore_permissions_policy(thread);
} else {
policy = get_proc_vfs_ignore_permissions_policy(p);
}
iop_param->iop_policy = policy;
goto out_ok;
case IOPOL_CMD_SET:
@ -2608,15 +2903,10 @@ iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope,
goto out;
}
switch (policy) {
case IOPOL_VFS_IGNORE_PERMISSIONS_OFF:
os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed);
break;
case IOPOL_VFS_IGNORE_PERMISSIONS_ON:
os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed);
break;
default:
break;
if (thread != THREAD_NULL) {
set_thread_vfs_ignore_permissions_policy(thread, policy);
} else {
set_proc_vfs_ignore_permissions_policy(p, policy);
}
out_ok:
@ -2863,40 +3153,20 @@ static int
iopolicysys_vfs_nocache_write_fs_blksize(struct proc *p, int cmd, int scope, int policy,
struct _iopol_param_t *iop_param)
{
thread_t thread;
switch (scope) {
case IOPOL_SCOPE_THREAD:
thread = current_thread();
break;
case IOPOL_SCOPE_PROCESS:
thread = THREAD_NULL;
break;
default:
if (scope != IOPOL_SCOPE_PROCESS) {
return EINVAL;
}
if (cmd == IOPOL_CMD_GET) {
if (thread != THREAD_NULL) {
struct uthread *ut = get_bsdthread_info(thread);
policy = ut->uu_flag & UT_FS_BLKSIZE_NOCACHE_WRITES ?
IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON : IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_DEFAULT;
} else {
policy = (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE) ?
IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON : IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_DEFAULT;
}
policy = (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE) ?
IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON : IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_DEFAULT;
iop_param->iop_policy = policy;
return 0;
}
/* Once set, we don't allow the process or thread to clear it. */
if ((cmd == IOPOL_CMD_SET) && (policy == IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON)) {
if (thread != THREAD_NULL) {
struct uthread *ut = get_bsdthread_info(thread);
ut->uu_flag |= UT_FS_BLKSIZE_NOCACHE_WRITES;
} else {
os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE, relaxed);
}
/* Once set, we don't allow the process to clear it. */
if (policy == IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON) {
os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE, relaxed);
return 0;
}
@ -3002,6 +3272,67 @@ out:
return error;
}
#define ENTITLED_RESERVE_ACCESS_ENTITLEMENT \
"com.apple.private.vfs.entitled-reserve-access"
static int
iopolicysys_vfs_entitled_reserve_access(struct proc *p, int cmd, int scope,
int policy, struct _iopol_param_t *iop_param)
{
struct uthread *ut;
switch (scope) {
case IOPOL_SCOPE_THREAD:
ut = get_bsdthread_info(current_thread());
break;
case IOPOL_SCOPE_PROCESS:
ut = NULL;
break;
default:
return EINVAL;
}
if (cmd == IOPOL_CMD_GET) {
if (scope == IOPOL_SCOPE_THREAD) {
policy = (os_atomic_load(&ut->uu_flag, relaxed) & UT_FS_ENTITLED_RESERVE_ACCESS) ?
IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON : IOPOL_VFS_ENTITLED_RESERVE_ACCESS_OFF;
} else {
policy = (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS) ?
IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON : IOPOL_VFS_ENTITLED_RESERVE_ACCESS_OFF;
}
iop_param->iop_policy = policy;
return 0;
}
if (cmd != IOPOL_CMD_SET) {
return EINVAL;
}
if (!IOCurrentTaskHasEntitlement(ENTITLED_RESERVE_ACCESS_ENTITLEMENT)) {
return EPERM;
}
switch (policy) {
case IOPOL_VFS_ENTITLED_RESERVE_ACCESS_OFF:
if (scope == IOPOL_SCOPE_THREAD) {
os_atomic_andnot(&ut->uu_flag, UT_FS_ENTITLED_RESERVE_ACCESS, relaxed);
} else {
os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS, relaxed);
}
break;
case IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON:
if (scope == IOPOL_SCOPE_THREAD) {
os_atomic_or(&ut->uu_flag, UT_FS_ENTITLED_RESERVE_ACCESS, relaxed);
} else {
os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS, relaxed);
}
break;
default:
return EINVAL;
}
return 0;
}
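A hedged userspace sketch of opting a thread into the new entitled-reserve-access iopolicy (not part of this change). setiopolicy_np(3)/getiopolicy_np(3) are the existing libc calls; the IOPOL_* names are the new constants handled above, and the set path fails with EPERM without com.apple.private.vfs.entitled-reserve-access.

#include <errno.h>
#include <sys/resource.h>

static int
enable_reserve_access_for_this_thread(void)
{
	if (setiopolicy_np(IOPOL_TYPE_VFS_ENTITLED_RESERVE_ACCESS,
	    IOPOL_SCOPE_THREAD, IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON) != 0) {
		return errno;
	}
	/* Read back the effective policy for this thread. */
	return getiopolicy_np(IOPOL_TYPE_VFS_ENTITLED_RESERVE_ACCESS,
	           IOPOL_SCOPE_THREAD) == IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON ? 0 : -1;
}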
void
proc_apply_task_networkbg(int pid, thread_t thread)
{

View file

@ -150,6 +150,15 @@ get_system_inshutdown()
return system_inshutdown;
}
extern int OSKextIsInUserspaceReboot(void);
int
get_system_inuserspacereboot()
{
/* set by launchd before performing a userspace reboot */
return OSKextIsInUserspaceReboot();
}
__abortlike
static void
panic_kernel(int howto, char *message)
@ -268,6 +277,11 @@ reboot_kernel(int howto, char *message)
if (!(howto & RB_PANIC) || !kdp_has_polled_corefile())
#endif /* DEVELOPMENT || DEBUG */
{
#if CONFIG_COREDUMP || CONFIG_UCOREDUMP
/* Disable user space core dumps before unmounting non-system volumes so
* that dext cores are not written to the system volume */
do_coredump = 0;
#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */
startTime = mach_absolute_time();
vfs_unmountall(TRUE);
halt_log_enter("vfs_unmountall", 0, mach_absolute_time() - startTime);

View file

@ -145,8 +145,8 @@ extern void doexception(int exc, mach_exception_code_t code,
mach_exception_subcode_t sub);
static void stop(proc_t, proc_t);
int cansignal_nomac(proc_t, kauth_cred_t, proc_t, int);
int cansignal(proc_t, kauth_cred_t, proc_t, int);
bool cansignal_nomac(proc_t, kauth_cred_t, proc_t, int);
bool cansignal(proc_t, kauth_cred_t, proc_t, int);
int killpg1(proc_t, int, int, int, int);
kern_return_t do_bsdexception(int, int, int);
void __posix_sem_syscall_return(kern_return_t);
@ -297,39 +297,45 @@ signal_setast(thread_t sig_actthread)
act_set_astbsd(sig_actthread);
}
int
bool
cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum)
{
/* you can signal yourself */
if (src == dst) {
return 1;
return true;
}
/* you can't send the init proc SIGKILL, even if root */
if (signum == SIGKILL && dst == initproc) {
return 0;
/*
* You can't signal the initproc, even if root.
* Note that this still permits the kernel itself to signal initproc directly,
* e.g. SIGCHLD when reparenting or SIGTERM at shutdown, because those are
* not considered to originate from a user process, so the cansignal()
* check isn't performed.
*/
if (dst == initproc) {
return false;
}
/* otherwise, root can always signal */
if (kauth_cred_issuser(uc_src)) {
return 1;
return true;
}
/* processes in the same session can send SIGCONT to each other */
if (signum == SIGCONT && proc_sessionid(src) == proc_sessionid(dst)) {
return 1;
return true;
}
#if XNU_TARGET_OS_IOS
// Allow debugging of third party drivers on iOS
if (proc_is_third_party_debuggable_driver(dst)) {
return 1;
return true;
}
#endif /* XNU_TARGET_OS_IOS */
/* the source process must be authorized to signal the target */
{
int allowed = 0;
bool allowed = false;
kauth_cred_t uc_dst = NOCRED, uc_ref = NOCRED;
uc_dst = uc_ref = kauth_cred_proc_ref(dst);
@ -342,7 +348,7 @@ cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum)
kauth_cred_getruid(uc_src) == kauth_cred_getsvuid(uc_dst) ||
kauth_cred_getuid(uc_src) == kauth_cred_getruid(uc_dst) ||
kauth_cred_getuid(uc_src) == kauth_cred_getsvuid(uc_dst)) {
allowed = 1;
allowed = true;
}
if (uc_ref != NOCRED) {
@ -359,13 +365,13 @@ cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum)
* `dst`? The ucred is referenced by the caller so internal fields can be used
* safely.
*/
int
bool
cansignal(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum)
{
#if CONFIG_MACF
struct proc_ident dst_ident = proc_ident(dst);
struct proc_ident dst_ident = proc_ident_with_policy(dst, IDENT_VALIDATION_PROC_MAY_EXEC | IDENT_VALIDATION_PROC_MAY_EXIT);
if (mac_proc_check_signal(src, NULL, &dst_ident, signum)) {
return 0;
return false;
}
#endif
@ -399,8 +405,7 @@ static int
signal_is_restricted(proc_t p, int signum)
{
if (sigmask(signum) & sigrestrictmask()) {
if (sigrestrict_arg == 0 &&
task_get_apptype(proc_task(p)) == TASK_APPTYPE_APP_DEFAULT) {
if (sigrestrict_arg == 0 && task_is_app(proc_task(p))) {
return ENOTSUP;
} else {
return EINVAL;
@ -1125,8 +1130,9 @@ __pthread_kill(__unused proc_t p, struct __pthread_kill_args *uap,
* workq threads must have kills enabled through either
* BSDTHREAD_CTL_WORKQ_ALLOW_KILL or BSDTHREAD_CTL_WORKQ_ALLOW_SIGMASK
*/
if ((thread_get_tag(target_act) & THREAD_TAG_WORKQUEUE) &&
!(uth->uu_workq_pthread_kill_allowed || p->p_workq_allow_sigmask)) {
if (((thread_get_tag(target_act) & THREAD_TAG_WORKQUEUE) &&
!(uth->uu_workq_pthread_kill_allowed || p->p_workq_allow_sigmask)) ||
(thread_get_tag(target_act) & THREAD_TAG_AIO_WORKQUEUE)) {
error = ENOTSUP;
goto out;
}
@ -1386,7 +1392,7 @@ kill(proc_t cp, struct kill_args *uap, __unused int32_t *retval)
if (uap->pid > 0) {
/* kill single process */
if ((p = proc_find(uap->pid)) == NULL) {
if ((p = pzfind(uap->pid)) != NULL) {
if (pzfind(uap->pid)) {
/*
* POSIX 1003.1-2001 requires returning success when killing a
* zombie; see Rationale for kill(2).
@ -1862,7 +1868,8 @@ set_thread_extra_flags(task_t task, struct uthread *uth, os_reason_t reason)
reason->osr_flags |= OS_REASON_FLAG_SHAREDREGION_FAULT;
#if __has_feature(ptrauth_calls)
if (!vm_shared_region_reslide_restrict || task_is_hardened_binary(current_task())) {
if (!vm_shared_region_reslide_restrict ||
(task_get_platform_restrictions_version(current_task()) >= 1)) {
reslide_shared_region = TRUE;
}
#endif /* __has_feature(ptrauth_calls) */
@ -1944,7 +1951,8 @@ again:
if (((uth->uu_flag & UT_NO_SIGMASK) == 0) &&
(((uth->uu_sigmask & mask) == 0) || (uth->uu_sigwait & mask))) {
thread_t th = get_machthread(uth);
if (skip_wqthreads && (thread_get_tag(th) & THREAD_TAG_WORKQUEUE)) {
if ((skip_wqthreads && (thread_get_tag(th) & THREAD_TAG_WORKQUEUE)) ||
(thread_get_tag(th) & THREAD_TAG_AIO_WORKQUEUE)) {
/* Workqueue threads may be parked in the kernel unable to
* deliver signals for an extended period of time, so skip them
* in favor of pthreads in a first pass. (rdar://50054475). */
@ -3057,7 +3065,6 @@ postsig_locked(int signum)
int mask, returnmask;
struct uthread * ut;
os_reason_t ut_exit_reason = OS_REASON_NULL;
int coredump_flags = 0;
#if DIAGNOSTIC
if (signum == 0) {
@ -3097,29 +3104,70 @@ postsig_locked(int signum)
p->p_sigacts.ps_sig = signum;
proc_signalend(p, 1);
proc_unlock(p);
if (task_is_driver(proc_task(p))) {
coredump_flags |= COREDUMP_FULLFSYNC;
}
#if CONFIG_COREDUMP || CONFIG_UCOREDUMP
/*
* For now, driver dumps are only performed by xnu.
* Regular processes can be configured to use xnu
* (synchronously generating very large core files),
* or xnu can generate a specially tagged corpse which
* (depending on other configuration) will cause
* ReportCrash to dump a core file asynchronously.
*
* The userland dumping path must operate
* asynchronously to avoid deadlocks, yet may have
* unexpected failures => indicate dump *initiation*
* via WCOREFLAG (or CLD_DUMPED).
*/
do {
if (task_is_driver(proc_task(p))) {
#if CONFIG_COREDUMP
if (coredump(p, 0, coredump_flags) == 0) {
signum |= WCOREFLAG;
}
#endif
if (coredump(p, 0, COREDUMP_FULLFSYNC) == 0) {
signum |= WCOREFLAG;
}
#endif /* CONFIG_COREDUMP */
break;
}
#if CONFIG_UCOREDUMP
if (do_ucoredump) {
/*
* A compatibility nod to existing
* coredump behavior: only set
* WCOREFLAG here if the user has
* implicitly asked for a core
* file and it passes security
* checks. (A core file might still
* be dumped because of other policy.)
*/
if (proc_limitgetcur(p, RLIMIT_CORE) != 0 &&
is_coredump_eligible(p) == 0) {
signum |= WCOREFLAG;
}
break;
}
#endif /* CONFIG_UCOREDUMP */
#if CONFIG_COREDUMP
if (coredump(p, 0, 0) == 0) {
signum |= WCOREFLAG;
}
#endif /* CONFIG_COREDUMP */
} while (0);
#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */
} else {
proc_signalend(p, 1);
proc_unlock(p);
}
#if CONFIG_DTRACE
bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo));
bzero(&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo));
ut->t_dtrace_siginfo.si_signo = signum;
const int signo = signum & ~WCOREFLAG;
ut->t_dtrace_siginfo.si_signo = signo;
ut->t_dtrace_siginfo.si_pid = p->si_pid;
ut->t_dtrace_siginfo.si_uid = p->si_uid;
ut->t_dtrace_siginfo.si_status = WEXITSTATUS(p->si_status);
/* Fire DTrace proc:::fault probe when signal is generated by hardware. */
switch (signum) {
switch (signo) {
case SIGILL: case SIGBUS: case SIGSEGV: case SIGFPE: case SIGTRAP:
DTRACE_PROC2(fault, int, (int)(ut->uu_code), siginfo_t *, &(ut->t_dtrace_siginfo));
break;
@ -3128,7 +3176,7 @@ postsig_locked(int signum)
}
DTRACE_PROC3(signal__handle, int, signum, siginfo_t *, &(ut->t_dtrace_siginfo),
DTRACE_PROC3(signal__handle, int, signo, siginfo_t *, &(ut->t_dtrace_siginfo),
void (*)(void), SIG_DFL);
#endif

View file

@ -235,7 +235,7 @@ kern_open_file_for_direct_io(const char * name,
int isssd = 0;
uint32_t flags = 0;
uint32_t blksize;
off_t maxiocount, count, segcount, wbctotal;
off_t maxiocount, count, segcount, wbctotal, set_file_size;
boolean_t locked = FALSE;
int fmode;
mode_t cmode;
@ -341,9 +341,10 @@ kern_open_file_for_direct_io(const char * name,
}
}
if (set_file_size_max) {
if ((set_file_size = set_file_size_max)) {
// set file size
if (wbctotal) {
// only hibernate
if (wbctotal >= set_file_size_min) {
set_file_size_min = HIBERNATE_MIN_FILE_SIZE;
} else {
@ -352,32 +353,41 @@ kern_open_file_for_direct_io(const char * name,
set_file_size_min = HIBERNATE_MIN_FILE_SIZE;
}
}
set_file_size_max = set_file_size_min;
set_file_size = set_file_size_min;
}
if (fs_free_size) {
mpFree += va.va_data_alloc;
if ((mpFree < set_file_size_max) || ((mpFree - set_file_size_max) < fs_free_size)) {
set_file_size_max = mpFree - fs_free_size;
if ((mpFree < set_file_size) || ((mpFree - set_file_size) < fs_free_size)) {
set_file_size = mpFree - fs_free_size;
if (0 == set_file_size_min) {
// passing zero for set_file_size_min (coredumps)
// means caller only accepts set_file_size_max
error = ENOSPC;
goto out;
}
if (set_file_size_max < set_file_size_min) {
set_file_size_max = set_file_size_min;
}
printf("kern_direct_file(%s): using reduced size %qd\n",
ref->name, set_file_size_max);
// if set_file_size_min is passed (hibernation),
// free space on disk is not checked
}
}
error = vnode_setsize(ref->vp, set_file_size_max, IO_NOZEROFILL | IO_NOAUTH, ref->ctx);
while (TRUE) {
if (set_file_size < set_file_size_min) {
set_file_size = set_file_size_min;
}
if (set_file_size < set_file_size_max) {
printf("kern_direct_file(%s): using reduced size %qd\n",
ref->name, set_file_size);
}
error = vnode_setsize(ref->vp, set_file_size, IO_NOZEROFILL | IO_NOAUTH, ref->ctx);
if ((ENOSPC == error) && set_file_size_min && (set_file_size > set_file_size_min) && (set_file_size > fs_free_size)) {
set_file_size -= fs_free_size;
continue;
}
break;
}
if (error) {
goto out;
}
ref->filelength = set_file_size_max;
ref->filelength = set_file_size;
}
} else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) {
/* Partition. */
@ -684,10 +694,10 @@ kern_file_mount(struct kern_direct_file_io_ref_t * ref)
void
kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref,
off_t write_offset, void * addr, size_t write_length,
off_t discard_offset, off_t discard_end, bool unlink)
off_t discard_offset, off_t discard_end, off_t set_file_size, bool unlink)
{
int error;
printf("kern_close_file_for_direct_io(%p)\n", ref);
printf("kern_close_file_for_direct_io(%p) %qd\n", ref, set_file_size);
if (!ref) {
return;
@ -737,7 +747,9 @@ kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref,
if (addr && write_length) {
(void) kern_write_file(ref, write_offset, addr, write_length, IO_SKIP_ENCRYPTION);
}
if (set_file_size) {
error = vnode_setsize(ref->vp, set_file_size, IO_NOZEROFILL | IO_NOAUTH, ref->ctx);
}
error = vnode_close(ref->vp, FWRITE, ref->ctx);
ref->vp = NULLVP;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000-2024 Apple Inc. All rights reserved.
* Copyright (c) 2000-2025 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@ -156,6 +156,7 @@
#include <nfs/nfs_conf.h>
#include <vm/vm_protos.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout_xnu.h>
#include <vm/vm_compressor_algorithms_xnu.h>
#include <vm/vm_compressor_xnu.h>
@ -219,6 +220,9 @@ extern unsigned int vm_page_free_target;
extern unsigned int vm_page_free_reserved;
extern unsigned int vm_page_max_speculative_age_q;
static uint64_t userspacereboottime = 0;
static unsigned int userspacerebootpurpose = 0;
#if (DEVELOPMENT || DEBUG)
extern uint32_t vm_page_creation_throttled_hard;
extern uint32_t vm_page_creation_throttled_soft;
@ -318,9 +322,12 @@ STATIC int sysctl_imgsrcdev(struct sysctl_oid *oidp, void *arg1, int arg2, struc
#endif
STATIC int sysctl_usrstack(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
STATIC int sysctl_usrstack64(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
#if CONFIG_COREDUMP
#if CONFIG_COREDUMP || CONFIG_UCOREDUMP
STATIC int sysctl_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
STATIC int sysctl_suid_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
#if CONFIG_UCOREDUMP
STATIC int sysctl_ucoredump(struct sysctl_oid *, void *, int, struct sysctl_req *);
#endif
#endif
STATIC int sysctl_delayterm(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
STATIC int sysctl_rage_vnode(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
@ -1510,7 +1517,8 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where,
if (vm_map_copy_overwrite(kernel_map,
(vm_map_address_t)copy_start,
tmp, (vm_map_size_t) arg_size, FALSE) != KERN_SUCCESS) {
tmp, (vm_map_size_t) arg_size,
FALSE) != KERN_SUCCESS) {
error = EIO;
goto finish;
}
@ -2006,7 +2014,9 @@ sysctl_system_version_compat
SYSCTL_PROC(_kern, OID_AUTO, system_version_compat,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
0, 0, sysctl_system_version_compat, "A", "");
#endif /* XNU_TARGET_OS_OSX */
#if XNU_TARGET_OS_OSX || defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT)
char osproductversioncompat[48] = { '\0' };
static int
@ -2023,12 +2033,42 @@ SYSCTL_PROC(_kern, OID_AUTO, osproductversioncompat,
CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED,
osproductversioncompat, sizeof(osproductversioncompat),
sysctl_osproductversioncompat, "A", "The ProductVersion from SystemVersionCompat.plist");
#endif
#endif /* XNU_TARGET_OS_OSX || defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) */
char osproductversion[48] = { '\0' };
static char iossupportversion_string[48] = { '\0' };
#if defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT)
/*
* Equivalent to dyld_program_sdk_at_least(dyld_fall_2025_os_versions).
*/
static bool
proc_2025_fall_os_sdk_or_later(struct proc *p)
{
const uint32_t proc_sdk_ver = proc_sdk(p);
switch (proc_platform(p)) {
case PLATFORM_MACOS:
return proc_sdk_ver >= 0x00100000; // DYLD_MACOSX_VERSION_16_0
case PLATFORM_IOS:
case PLATFORM_IOSSIMULATOR:
case PLATFORM_MACCATALYST:
return proc_sdk_ver >= 0x00130000; // DYLD_IOS_VERSION_19_0
case PLATFORM_BRIDGEOS:
return proc_sdk_ver >= 0x000a0000; // DYLD_BRIDGEOS_VERSION_10_0
case PLATFORM_TVOS:
case PLATFORM_TVOSSIMULATOR:
return proc_sdk_ver >= 0x00130000; // DYLD_TVOS_VERSION_19_0
case PLATFORM_WATCHOS:
case PLATFORM_WATCHOSSIMULATOR:
return proc_sdk_ver >= 0x000c0000; // DYLD_WATCHOS_VERSION_12_0
default:
return true;
}
}
#endif /* defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) */
static int
sysctl_osproductversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
{
@ -2039,18 +2079,22 @@ sysctl_osproductversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
#if XNU_TARGET_OS_OSX
if (task_has_system_version_compat_enabled(current_task()) && (osproductversioncompat[0] != '\0')) {
return sysctl_handle_string(oidp, osproductversioncompat, arg2, req);
} else {
return sysctl_handle_string(oidp, arg1, arg2, req);
}
#elif defined(XNU_TARGET_OS_XR)
#endif /* XNU_TARGET_OS_OSX */
#if defined(XNU_TARGET_OS_XR)
if (proc_platform(req->p) == PLATFORM_IOS && (iossupportversion_string[0] != '\0')) {
return sysctl_handle_string(oidp, iossupportversion_string, arg2, req);
} else {
return sysctl_handle_string(oidp, arg1, arg2, req);
}
#else
#endif /* defined(XNU_TARGET_OS_XR) */
#if defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT)
if (!proc_2025_fall_os_sdk_or_later(req->p) && (osproductversioncompat[0] != '\0')) {
return sysctl_handle_string(oidp, osproductversioncompat, arg2, req);
}
#endif /* defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) */
return sysctl_handle_string(oidp, arg1, arg2, req);
#endif
}
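A minimal userspace sketch (illustrative, not part of this change) of exercising the handler above through the usual sysctlbyname(3) interface; the compat and SDK-gated fallbacks are transparent to the caller:

/* Illustrative sketch: reading kern.osproductversion from userspace. */
#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
	char version[48] = { 0 };
	size_t len = sizeof(version);

	if (sysctlbyname("kern.osproductversion", version, &len, NULL, 0) != 0) {
		perror("kern.osproductversion");
		return 1;
	}
	printf("%s\n", version);
	return 0;
}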
#if XNU_TARGET_OS_OSX
@ -2487,10 +2531,6 @@ extern int sched_allow_rt_smt;
SYSCTL_INT(_kern, OID_AUTO, sched_allow_rt_smt,
CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
&sched_allow_rt_smt, 0, "");
extern int sched_allow_rt_steal;
SYSCTL_INT(_kern, OID_AUTO, sched_allow_rt_steal,
CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
&sched_allow_rt_steal, 0, "");
extern int sched_backup_cpu_timeout_count;
SYSCTL_INT(_kern, OID_AUTO, sched_backup_cpu_timeout_count,
CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
@ -3352,8 +3392,7 @@ SYSCTL_UINT(_kern, OID_AUTO, secure_coredump, CTLFLAG_RD, &sc_dump_mode, 0, "sec
#endif /* EXCLAVES_COREDUMP */
#if CONFIG_COREDUMP
#if CONFIG_COREDUMP || CONFIG_UCOREDUMP
SYSCTL_STRING(_kern, KERN_COREFILE, corefile,
CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
@ -3413,7 +3452,34 @@ SYSCTL_PROC(_kern, KERN_SUGID_COREDUMP, sugid_coredump,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
0, 0, sysctl_suid_coredump, "I", "");
#endif /* CONFIG_COREDUMP */
#if CONFIG_UCOREDUMP
STATIC int
sysctl_ucoredump
(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
#ifdef SECURE_KERNEL
(void)req;
return ENOTSUP;
#else
int new_value, changed;
int error = sysctl_io_number(req, do_ucoredump, sizeof(int), &new_value, &changed);
if (changed) {
if (new_value == 0 || new_value == 1) {
do_ucoredump = new_value;
} else {
error = EINVAL;
}
}
return error;
#endif
}
SYSCTL_PROC(_kern, OID_AUTO, ucoredump,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
0, 0, sysctl_ucoredump, "I", "");
#endif /* CONFIG_UCOREDUMP */
#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */
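A hypothetical userspace sketch of reading and toggling the kern.ucoredump knob defined above; the handler accepts only 0 or 1 and returns ENOTSUP on SECURE_KERNEL builds:

/* Illustrative sketch: toggling userland core dump generation. */
#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
	int current = 0, enable = 1;
	size_t len = sizeof(current);

	if (sysctlbyname("kern.ucoredump", &current, &len, NULL, 0) != 0) {
		perror("read kern.ucoredump");
		return 1;
	}
	printf("kern.ucoredump = %d\n", current);

	/* Writing any value other than 0 or 1 returns EINVAL per the handler. */
	if (sysctlbyname("kern.ucoredump", NULL, NULL, &enable, sizeof(enable)) != 0) {
		perror("write kern.ucoredump");
		return 1;
	}
	return 0;
}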
#if CONFIG_KDP_INTERACTIVE_DEBUGGING
@ -4394,12 +4460,14 @@ SYSCTL_PROC(_vm, OID_AUTO, add_wire_count_over_user_limit, CTLTYPE_QUAD | CTLFLA
#if DEVELOPMENT || DEBUG
/* These sysctls are used to test the wired limit. */
extern unsigned int vm_page_wire_count;
extern uint32_t vm_lopage_free_count;
extern unsigned int vm_page_stolen_count;
SYSCTL_INT(_vm, OID_AUTO, page_wire_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_wire_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, page_wire_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_wire_count, 0,
"The number of physical pages which are pinned and cannot be evicted");
#if XNU_VM_HAS_LOPAGE
SYSCTL_INT(_vm, OID_AUTO, lopage_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_lopage_free_count, 0, "");
#endif
SYSCTL_INT(_vm, OID_AUTO, page_stolen_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stolen_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_swapped_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_swapped_count, 0,
"The number of virtual pages whose contents are currently compressed and swapped to disk");
/*
* Setting the per task variable exclude_physfootprint_ledger to 1 will allow the calling task to exclude memory entries that are
@ -4929,6 +4997,7 @@ SCALABLE_COUNTER_DECLARE(oslog_e_metadata_count);
SCALABLE_COUNTER_DECLARE(oslog_e_metadata_dropped_count);
SCALABLE_COUNTER_DECLARE(oslog_e_signpost_count);
SCALABLE_COUNTER_DECLARE(oslog_e_signpost_dropped_count);
SCALABLE_COUNTER_DECLARE(oslog_e_replay_failure_count);
SCALABLE_COUNTER_DECLARE(oslog_e_query_count);
SCALABLE_COUNTER_DECLARE(oslog_e_query_error_count);
SCALABLE_COUNTER_DECLARE(oslog_e_trace_mode_set_count);
@ -4989,6 +5058,8 @@ SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_signpost_count, oslog_e_signpost_count,
"Number of signposts retrieved from the exclaves log server");
SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_signpost_dropped_count, oslog_e_signpost_dropped_count,
"Number of dropped signposts retrieved from the exclaves log server");
SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_replay_failure_count, oslog_e_replay_failure_count,
"Number of dropped messages that couldn't be replayed and failed generically");
SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_query_count, oslog_e_query_count,
"Number of sucessful queries to the exclaves log server");
SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_query_error_count, oslog_e_query_error_count,
@ -5545,6 +5616,31 @@ sysctl_get_thread_group_id SYSCTL_HANDLER_ARGS
SYSCTL_PROC(_kern, OID_AUTO, thread_group_id, CTLFLAG_RD | CTLFLAG_LOCKED | CTLTYPE_QUAD,
0, 0, &sysctl_get_thread_group_id, "I", "thread group id of the thread");
extern kern_return_t sysctl_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats);
static int
sysctl_get_clutch_bucket_group_cpu_stats SYSCTL_HANDLER_ARGS
{
int error;
kern_return_t kr;
int sched_bucket = -1;
error = SYSCTL_IN(req, &sched_bucket, sizeof(sched_bucket));
if (error) {
return error;
}
uint64_t cpu_stats[2];
kr = sysctl_clutch_thread_group_cpu_time_for_thread(current_thread(), sched_bucket, cpu_stats);
error = mach_to_bsd_errno(kr);
if (error) {
return error;
}
return SYSCTL_OUT(req, cpu_stats, sizeof(cpu_stats));
}
SYSCTL_PROC(_kern, OID_AUTO, clutch_bucket_group_cpu_stats, CTLFLAG_RW | CTLFLAG_LOCKED | CTLTYPE_OPAQUE,
0, 0, &sysctl_get_clutch_bucket_group_cpu_stats, "I",
"CPU used and blocked time for the current thread group at a specified scheduling bucket");
STATIC int
sysctl_thread_group_count(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
@ -5601,6 +5697,77 @@ SYSCTL_PROC(_kern, OID_AUTO, grade_cputype,
0, 0, &sysctl_grade_cputype, "S",
"grade value of cpu_type_t+cpu_sub_type_t");
#if DEVELOPMENT || DEBUG
STATIC int
sysctl_binary_grade_override( __unused struct sysctl_oid *oidp, __unused void *arg1,
__unused int arg2, struct sysctl_req *req)
{
int error;
user_addr_t oldp = 0, newp = 0;
size_t *oldlenp = NULL;
size_t newlen = 0;
oldp = req->oldptr;
oldlenp = &(req->oldlen);
newp = req->newptr;
newlen = req->newlen;
/* We want the current length, and maybe the string itself */
if (oldlenp) {
char existing_overrides[256] = { 0 };
size_t currlen = bingrade_get_override_string(existing_overrides, sizeof(existing_overrides));
if (oldp && currlen > 0) {
if (*oldlenp < currlen) {
return ENOMEM;
}
/* NOTE - we do not copy the NULL terminator */
error = copyout(existing_overrides, oldp, currlen);
if (error) {
return error;
}
}
/* return length of overrides minus the NULL terminator (just like strlen) */
req->oldidx = currlen;
}
/* We want to set the override string to something */
if (newp) {
char *tmp_override = (char *)kalloc_data(newlen + 1, Z_WAITOK | Z_ZERO);
if (!tmp_override) {
return ENOMEM;
}
error = copyin(newp, tmp_override, newlen);
if (error) {
kfree_data(tmp_override, newlen + 1);
return error;
}
tmp_override[newlen] = 0; /* Terminate string */
/* Set the binary grading overrides */
if (binary_grade_overrides_update(tmp_override) == 0) {
/* Nothing got set. */
kfree_data(tmp_override, newlen + 1);
return EINVAL;
}
kfree_data(tmp_override, newlen + 1);
}
return 0;
}
SYSCTL_PROC(_kern, OID_AUTO, grade_override,
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
0, 0, &sysctl_binary_grade_override, "A",
"");
#endif /* DEVELOPMENT || DEBUG */
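A hypothetical sketch of driving kern.grade_override from userspace on DEVELOPMENT/DEBUG kernels; the override string format mirrors the bingrade boot-arg parsed in bsd/kern/mach_loader.c later in this change, and the cputype/cpusubtype values here are placeholders:

/* Illustrative sketch: reading and setting binary grading overrides. */
#include <stdio.h>
#include <string.h>
#include <sys/sysctl.h>

int
main(void)
{
	char current[256] = { 0 };
	size_t len = sizeof(current);

	/* The handler copies out the override string without a NUL terminator. */
	if (sysctlbyname("kern.grade_override", current, &len, NULL, 0) == 0 && len > 0) {
		printf("current overrides: %.*s\n", (int)len, current);
	}

	/* Hypothetical override: cputype 0x100000c, cpusubtype 0x2. */
	const char *new_overrides = "0x100000c:0x2";
	if (sysctlbyname("kern.grade_override", NULL, NULL,
	    (void *)new_overrides, strlen(new_overrides)) != 0) {
		perror("kern.grade_override");
		return 1;
	}
	return 0;
}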
extern boolean_t allow_direct_handoff;
SYSCTL_INT(_kern, OID_AUTO, direct_handoff,
CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
@ -6135,14 +6302,23 @@ uuid_string_t trial_treatment_id;
uuid_string_t trial_experiment_id;
int trial_deployment_id = -1;
SYSCTL_STRING(_kern, OID_AUTO, trial_treatment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_treatment_id, sizeof(trial_treatment_id), "");
SYSCTL_STRING(_kern, OID_AUTO, trial_experiment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_experiment_id, sizeof(trial_experiment_id), "");
SYSCTL_INT(_kern, OID_AUTO, trial_deployment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &trial_deployment_id, 0, "");
SYSCTL_STRING(_kern, OID_AUTO, trial_treatment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, trial_treatment_id, sizeof(trial_treatment_id), "");
SYSCTL_STRING(_kern, OID_AUTO, trial_experiment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, trial_experiment_id, sizeof(trial_experiment_id), "");
SYSCTL_INT(_kern, OID_AUTO, trial_deployment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &trial_deployment_id, 0, "");
#if (DEVELOPMENT || DEBUG)
/* For unit testing setting factors & limits. */
unsigned int testing_experiment_factor;
EXPERIMENT_FACTOR_UINT(_kern, testing_experiment_factor, &testing_experiment_factor, 5, 10, "");
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, testing_experiment_factor, &testing_experiment_factor, 5, 10, "");
static int32_t experiment_factor_test;
EXPERIMENT_FACTOR_INT(test, &experiment_factor_test, 0, 32, "test factor");
#if MACH_ASSERT && __arm64__
/* rdar://149041040 */
extern unsigned int panic_on_jit_guard;
EXPERIMENT_FACTOR_UINT(jitguard, &panic_on_jit_guard, 0, 7, "Panic on JIT guard failure");
#endif /* MACH_ASSERT && __arm64__ */
extern int exception_log_max_pid;
SYSCTL_INT(_debug, OID_AUTO, exception_log_max_pid, CTLFLAG_RW | CTLFLAG_LOCKED, &exception_log_max_pid, 0, "Log exceptions for all processes up to this pid");
@ -6186,6 +6362,212 @@ SYSCTL_PROC(_kern, OID_AUTO, page_protection_type,
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, sysctl_page_protection_type, "I", "Type of page protection that the system supports");
#if CONFIG_SPTM && HAS_SPTM_SYSCTL
extern bool disarm_protected_io;
static int sysctl_sptm_disarm_protected_io SYSCTL_HANDLER_ARGS
{
int error = 0;
uint64_t old_disarm_protected_io = (uint64_t) disarm_protected_io;
error = SYSCTL_OUT(req, &old_disarm_protected_io, sizeof(old_disarm_protected_io));
if (error) {
return error;
}
uint64_t new_disarm_protected_io = old_disarm_protected_io;
if (req->newptr) {
error = SYSCTL_IN(req, &new_disarm_protected_io, sizeof(new_disarm_protected_io));
if (!disarm_protected_io && new_disarm_protected_io) {
sptm_sysctl(SPTM_SYSCTL_DISARM_PROTECTED_IO, SPTM_SYSCTL_SET, 1);
os_atomic_thread_fence(release);
disarm_protected_io = true;
}
}
return error;
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_disarm_protected_io, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_sptm_disarm_protected_io, "Q", "");
/**
* Usage of kern.sptm_sysctl_poke
*
* This sysctl provides a convenient way to trigger the "getter" handler of a
* specified SPTM sysctl. With this sysctl, you can trigger arbitrary SPTM
* code without modifying xnu source code. All you need to do is define a
* new SPTM sysctl and implement its "getter". After that, you can write
* the SPTM sysctl number to this sysctl to trigger it.
*/
static int sysctl_sptm_sysctl_poke SYSCTL_HANDLER_ARGS
{
int error = 0;
/* Always read-as-zero. */
const uint64_t out = 0;
error = SYSCTL_OUT(req, &out, sizeof(out));
if (error) {
return error;
}
uint64_t selector;
if (req->newptr) {
error = SYSCTL_IN(req, &selector, sizeof(selector));
sptm_sysctl(selector, SPTM_SYSCTL_GET, 0);
}
return error;
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_sysctl_poke, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_sptm_sysctl_poke, "Q", "");
#endif /* CONFIG_SPTM && HAS_SPTM_SYSCTL */
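A hypothetical userspace sketch of triggering an SPTM sysctl "getter" by writing its number to kern.sptm_sysctl_poke, per the usage comment above; the selector value is a placeholder:

/* Illustrative sketch: poking an SPTM sysctl by number. */
#include <stdio.h>
#include <stdint.h>
#include <sys/sysctl.h>

int
main(void)
{
	uint64_t selector = 0;  /* hypothetical SPTM sysctl number */

	if (sysctlbyname("kern.sptm_sysctl_poke", NULL, NULL,
	    &selector, sizeof(selector)) != 0) {
		perror("kern.sptm_sysctl_poke");
		return 1;
	}
	return 0;
}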
#if CONFIG_SPTM && (DEVELOPMENT || DEBUG)
/**
* Sysctls to get SPTM allowed I/O ranges, pmap I/O ranges and I/O ranges by index.
* Used by SEAR/LASER tools.
*/
static int
sysctl_sptm_allowed_io_ranges SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
sptm_io_range_t io_range = { 0 };
unsigned int index = 0;
int error = SYSCTL_IN(req, &index, sizeof(index));
if (error) {
return error;
}
libsptm_error_t ret = sptm_get_info(INFO_SPTM_ALLOWED_IO_RANGES, index, &io_range);
if (__improbable(ret != LIBSPTM_SUCCESS)) {
return EINVAL;
}
return SYSCTL_OUT(req, &io_range, sizeof(io_range));
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_allowed_io_ranges, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
0, 0, sysctl_sptm_allowed_io_ranges, "S,sptm_io_range_t", "SPTM allowed I/O ranges by index");
static int
sysctl_sptm_allowed_io_ranges_count SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
unsigned int count = 0;
libsptm_error_t ret = sptm_get_info(INFO_SPTM_ALLOWED_IO_RANGES_COUNT, 0, &count);
if (__improbable(ret != LIBSPTM_SUCCESS)) {
return EINVAL;
}
return SYSCTL_OUT(req, &count, sizeof(count));
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_allowed_io_ranges_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, sysctl_sptm_allowed_io_ranges_count, "I", "SPTM allowed I/O ranges count");
static int
sysctl_sptm_pmap_io_ranges SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
sptm_io_range_t io_range = { 0 };
unsigned int index = 0;
int error = SYSCTL_IN(req, &index, sizeof(index));
if (error) {
return error;
}
libsptm_error_t ret = sptm_get_info(INFO_SPTM_PMAP_IO_RANGES, index, &io_range);
if (__improbable(ret != LIBSPTM_SUCCESS)) {
return EINVAL;
}
return SYSCTL_OUT(req, &io_range, sizeof(io_range));
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_pmap_io_ranges, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
0, 0, sysctl_sptm_pmap_io_ranges, "S,sptm_io_range_t", "SPTM pmap I/O ranges by index");
static int
sysctl_sptm_pmap_io_ranges_count SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
unsigned int count = 0;
libsptm_error_t ret = sptm_get_info(INFO_SPTM_PMAP_IO_RANGES_COUNT, 0, &count);
if (__improbable(ret != LIBSPTM_SUCCESS)) {
return EINVAL;
}
return SYSCTL_OUT(req, &count, sizeof(count));
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_pmap_io_ranges_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, sysctl_sptm_pmap_io_ranges_count, "I", "SPTM pmap I/O ranges count");
static int
sysctl_sptm_io_ranges SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
sptm_io_range_t io_range = { 0 };
unsigned int index = 0;
int error = SYSCTL_IN(req, &index, sizeof(index));
if (error) {
return error;
}
libsptm_error_t ret = sptm_get_info(INFO_SPTM_IO_RANGES, index, &io_range);
if (__improbable(ret != LIBSPTM_SUCCESS)) {
return EINVAL;
}
return SYSCTL_OUT(req, &io_range, sizeof(io_range));
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_io_ranges, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
0, 0, sysctl_sptm_io_ranges, "S,sptm_io_range_t", "SPTM I/O ranges by index");
static int
sysctl_sptm_io_ranges_count SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
unsigned int count = 0;
libsptm_error_t ret = sptm_get_info(INFO_SPTM_IO_RANGES_COUNT, 0, &count);
if (__improbable(ret != LIBSPTM_SUCCESS)) {
return EINVAL;
}
return SYSCTL_OUT(req, &count, sizeof(count));
}
SYSCTL_PROC(_kern, OID_AUTO, sptm_io_ranges_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, sysctl_sptm_io_ranges_count, "I", "SPTM I/O ranges count");
#endif /* CONFIG_SPTM && (DEVELOPMENT || DEBUG) */
#if __ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM
extern bool surt_ready;
static int
sysctl_surt_ready SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
unsigned int surt_ready_uint = (unsigned int)surt_ready;
return SYSCTL_OUT(req, &surt_ready_uint, sizeof(surt_ready_uint));
}
SYSCTL_PROC(_kern, OID_AUTO, surt_ready, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, sysctl_surt_ready, "I", "SURT system readiness");
#endif /* __ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM */
#if __arm64__ && (DEBUG || DEVELOPMENT)
extern unsigned int pmap_wcrt_on_non_dram_count_get(void);
static int
sysctl_pmap_wcrt_on_non_dram_count SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
unsigned int count = pmap_wcrt_on_non_dram_count_get();
return SYSCTL_OUT(req, &count, sizeof(count));
}
SYSCTL_PROC(_kern, OID_AUTO, pmap_wcrt_on_non_dram_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, sysctl_pmap_wcrt_on_non_dram_count, "I", "pmap WC/RT mapping request on non-DRAM count");
#endif /* __arm64__ && (DEBUG || DEVELOPMENT) */
TUNABLE_DT(int, gpu_pmem_selector, "defaults", "kern.gpu_pmem_selector", "gpu-pmem-selector", 0, TUNABLE_DT_NONE);
#if CONFIG_EXCLAVES
@ -6299,3 +6681,175 @@ SYSCTL_PROC(_kern, OID_AUTO, exclaves_inspection_status,
extern uint32_t disable_vm_sanitize_telemetry;
SYSCTL_UINT(_debug, OID_AUTO, disable_vm_sanitize_telemetry, CTLFLAG_RW | CTLFLAG_LOCKED /*| CTLFLAG_MASKED*/, &disable_vm_sanitize_telemetry, 0, "disable VM API sanitization telemetry");
#endif
#define kReadUserspaceRebootInfoEntitlement "com.apple.private.kernel.userspacereboot-info-read-only"
static int
_sysctl_userspacereboot_info(struct sysctl_req *req, void *ptr, size_t ptr_size)
{
if (req->newptr != 0) {
/* initproc is the only process that can write to these sysctls */
if (proc_getpid(req->p) != 1) {
return EPERM;
}
return SYSCTL_IN(req, ptr, ptr_size);
} else {
/* A read entitlement is required to read these sysctls */
if (!IOCurrentTaskHasEntitlement(kReadUserspaceRebootInfoEntitlement)) {
return EPERM;
}
return SYSCTL_OUT(req, ptr, ptr_size);
}
}
static int
sysctl_userspacereboottime(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
return _sysctl_userspacereboot_info(req, &userspacereboottime, sizeof(userspacereboottime));
}
static int
sysctl_userspacerebootpurpose(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
return _sysctl_userspacereboot_info(req, &userspacerebootpurpose, sizeof(userspacerebootpurpose));
}
SYSCTL_PROC(_kern, OID_AUTO, userspacereboottime, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_userspacereboottime, "Q", "");
SYSCTL_PROC(_kern, OID_AUTO, userspacerebootpurpose, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_userspacerebootpurpose, "I", "");
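A hypothetical userspace sketch of reading kern.userspacereboottime; per the handler above, reads require the read entitlement named earlier and writes are restricted to initproc (pid 1):

/* Illustrative sketch: reading the userspace reboot timestamp. */
#include <stdio.h>
#include <stdint.h>
#include <sys/sysctl.h>

int
main(void)
{
	uint64_t reboot_time = 0;
	size_t len = sizeof(reboot_time);

	if (sysctlbyname("kern.userspacereboottime", &reboot_time, &len, NULL, 0) != 0) {
		perror("kern.userspacereboottime");  /* EPERM without the read entitlement */
		return 1;
	}
	printf("userspace reboot time: %llu\n", (unsigned long long)reboot_time);
	return 0;
}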
#if XNU_TARGET_OS_IOS
static LCK_GRP_DECLARE(erm_config_lock_grp, "ERM sysctl");
static LCK_RW_DECLARE(erm_config_lock, &erm_config_lock_grp);
#define ERM_CONFIG_SYSCTL_WRITE_ENTITLEMENT "com.apple.private.security-research-device.extended-research-mode"
#define ERM_CONFIG_SYSCTL_MAX_SIZE PAGE_SIZE
// This sysctl handler is only registered when Extended Research Mode (ERM) is active.
static int
sysctl_user_extended_research_mode_config_handler(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
// Pointer for the dynamically allocated buffer
static void *extended_research_mode_config_data = NULL;
// Current size of the valid data stored in the buffer
static size_t extended_research_mode_config_current_size = 0;
// Handle Read request (user wants to read the current config, before it is overwritten)
if (req->oldptr != USER_ADDR_NULL) {
int error = 0;
lck_rw_lock_shared(&erm_config_lock);
if (req->oldlen < extended_research_mode_config_current_size) {
error = ENOMEM;
} else {
if (extended_research_mode_config_current_size > 0) {
error = copyout(extended_research_mode_config_data,
req->oldptr,
extended_research_mode_config_current_size);
}
}
// In all cases, report the total size of the currently stored config back to the user,
req->oldlen = extended_research_mode_config_current_size;
req->oldidx = req->oldlen;
lck_rw_unlock_shared(&erm_config_lock);
if (error != 0) {
return error;
}
} else {
// The user just wants to know the current buffer size.
// All accesses to extended_research_mode_config* variables are expected
// to be done under erm_config_lock.
lck_rw_lock_shared(&erm_config_lock);
req->oldidx = extended_research_mode_config_current_size;
lck_rw_unlock_shared(&erm_config_lock);
}
// Handle Write request (new data provided by user)
if (req->newptr != USER_ADDR_NULL) {
if (!IOTaskHasEntitlement(proc_task(req->p), ERM_CONFIG_SYSCTL_WRITE_ENTITLEMENT)) {
return EPERM;
}
size_t requested_len = req->newlen;
if (requested_len > ERM_CONFIG_SYSCTL_MAX_SIZE) {
// We ensure the config provided by user-space is not too big
return EINVAL;
}
// Allocate a new buffer for the incoming data
void *new_buffer = (void *)kalloc_data(requested_len, Z_WAITOK | Z_ZERO);
if (new_buffer == NULL) {
return ENOMEM; // Allocation failed
}
// Copy data from user space into the newly allocated buffer
int error = copyin(req->newptr, new_buffer, requested_len);
if (error == 0) {
// Success: Replace the old buffer with the new one
lck_rw_lock_exclusive(&erm_config_lock);
// Back up the old buffer info so it can be freed in a second step
void *old_buffer_to_free = extended_research_mode_config_data;
size_t old_buffer_size = extended_research_mode_config_current_size;
// Point to the new buffer and update size
extended_research_mode_config_data = new_buffer;
extended_research_mode_config_current_size = requested_len;
lck_rw_unlock_exclusive(&erm_config_lock);
new_buffer = NULL; // transferred to the static pointer
// The previous buffer is no longer referenced and can be freed.
kfree_data(old_buffer_to_free, old_buffer_size);
} else {
// Copyin failed, free the buffer we just allocated and keep the old data and size intact
kfree_data(new_buffer, requested_len);
return error;
}
}
return 0;
}
// We don't register this sysctl handler automatically, but rather register it only if Extended
// Research Mode is active.
SYSCTL_PROC(_user, // Parent node structure (_user)
OID_AUTO, // Automatically assign OID
extended_research_mode_config, // Name of the node
CTLFLAG_NOAUTO | // We will register this sysctl on our own
CTLTYPE_OPAQUE | // Type: Opaque binary data
CTLFLAG_WR | // Allow both read and write
CTLFLAG_ANYBODY | // No user filtering
CTLFLAG_LOCKED, // The handler manages its own locking.
NULL, // arg1 (not used)
0, // arg2 (not used)
&sysctl_user_extended_research_mode_config_handler,
"-", // don't print the content (as it is a blob)
"Configuration blob for Extended Research Mode");
// This function is defined in kern_codesigning.c, but it isn't worth including the whole header just for it.
bool extended_research_mode_state(void);
// Only register the research_mode_config sysctl if Extended Research Mode is active
__startup_func
static void
extended_research_mode_config_sysctl_startup(void)
{
if (__improbable(extended_research_mode_state())) {
// Register the sysctl handler
sysctl_register_oid_early(&sysctl__user_extended_research_mode_config);
}
}
STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, extended_research_mode_config_sysctl_startup);
#endif /* XNU_TARGET_OS_IOS */
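A hypothetical userspace sketch of reading user.extended_research_mode_config; a NULL old buffer makes the handler above report the stored blob size, so a size query followed by a read works, while writes additionally require the ERM entitlement:

/* Illustrative sketch: two-step read of the ERM config blob. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/sysctl.h>

int
main(void)
{
	size_t len = 0;

	/* Step 1: size of the currently stored config (0 if nothing stored). */
	if (sysctlbyname("user.extended_research_mode_config", NULL, &len, NULL, 0) != 0) {
		perror("size query");  /* fails if ERM is inactive and the OID was never registered */
		return 1;
	}
	if (len == 0) {
		printf("no ERM config stored\n");
		return 0;
	}

	/* Step 2: read the blob itself. */
	void *buf = malloc(len);
	if (buf == NULL ||
	    sysctlbyname("user.extended_research_mode_config", buf, &len, NULL, 0) != 0) {
		perror("read");
		free(buf);
		return 1;
	}
	printf("read %zu bytes of ERM config\n", len);
	free(buf);
	return 0;
}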
#if DEBUG || DEVELOPMENT
SCALABLE_COUNTER_DEFINE(mach_eventlink_handoff_success_count);
SYSCTL_SCALABLE_COUNTER(_kern, mach_eventlink_handoff_success_count,
mach_eventlink_handoff_success_count, "Number of successful handoffs");
#endif /* DEBUG || DEVELOPMENT*/

View file

@ -85,6 +85,7 @@
#include <kern/clock.h>
#include <kern/task.h>
#include <kern/thread_call.h>
#include <kern/uipc_domain.h>
#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

View file

@ -29,6 +29,7 @@
#define __KPI__
#include <sys/param.h>
#include <sys/cdefs.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/socket.h>
@ -78,12 +79,35 @@ SYSCTL_QUAD(_kern_ipc_mbtxcf, OID_AUTO, aborted,
CTLFLAG_RD | CTLFLAG_LOCKED, &mbuf_tx_compl_aborted, "");
#endif /* (DEBUG || DEVELOPMENT) */
void *
void * __unsafe_indexable
mbuf_data(mbuf_t mbuf)
{
return m_mtod_current(mbuf);
}
errno_t
mbuf_data_len(mbuf_t mbuf, void *__sized_by(*out_len) *out_buf, size_t *out_len)
{
size_t len;
void *buf;
if (out_len == NULL || out_buf == NULL) {
return EINVAL;
}
len = mbuf_len(mbuf);
buf = m_mtod_current(mbuf);
if (len == 0 || buf == NULL) {
return ENOENT;
}
*out_len = len;
*out_buf = buf;
return 0;
}
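A hypothetical kext-side sketch of using the new mbuf_data_len() KPI above to obtain an mbuf's data pointer and length in one call; the helper name and destination-buffer handling are illustrative:

/* Illustrative sketch: copying the first mbuf segment via mbuf_data_len(). */
#include <string.h>
#include <sys/kpi_mbuf.h>

static errno_t
copy_first_segment(mbuf_t m, void *dst, size_t dst_len)
{
	void *buf = NULL;
	size_t len = 0;
	errno_t err;

	err = mbuf_data_len(m, &buf, &len);
	if (err != 0) {
		return err;      /* EINVAL for bad arguments, ENOENT for an empty mbuf */
	}
	if (len > dst_len) {
		len = dst_len;   /* copy only what fits in the caller's buffer */
	}
	memcpy(dst, buf, len);
	return 0;
}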
void *
mbuf_datastart(mbuf_t mbuf)
{
@ -249,11 +273,6 @@ mbuf_alloccluster(mbuf_how_t how, size_t *size, char * __sized_by_or_null(*size)
caddr_t _addr = NULL;
size_t _size = *size;
/* Jumbo cluster pool not available? */
if (_size > MBIGCLBYTES && njcl == 0) {
return ENOTSUP;
}
if (_size <= MCLBYTES && (_addr = m_mclalloc(how)) != NULL) {
_size = MCLBYTES;
} else if (_size > MCLBYTES && _size <= MBIGCLBYTES &&
@ -288,10 +307,8 @@ mbuf_freecluster(caddr_t addr, size_t size)
m_mclfree(addr);
} else if (size == MBIGCLBYTES) {
m_bigfree(addr, MBIGCLBYTES, NULL);
} else if (njcl > 0) {
m_16kfree(addr, M16KCLBYTES, NULL);
} else {
panic("%s: freeing jumbo cluster to an empty pool", __func__);
m_16kfree(addr, M16KCLBYTES, NULL);
}
}
@ -321,13 +338,7 @@ mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t *mbuf)
} else if (size == MBIGCLBYTES) {
*mbuf = m_mbigget(*mbuf, how);
} else if (size == M16KCLBYTES) {
if (njcl > 0) {
*mbuf = m_m16kget(*mbuf, how);
} else {
/* Jumbo cluster pool not available? */
error = ENOTSUP;
goto out;
}
*mbuf = m_m16kget(*mbuf, how);
} else {
error = EINVAL;
goto out;
@ -513,7 +524,7 @@ mbuf_adjustlen(mbuf_t m, int amount)
{
/* Verify m_len will be valid after adding amount */
if (amount > 0) {
size_t used = (size_t)mbuf_data(m) - (size_t)mbuf_datastart(m) +
size_t used = (size_t)mtod(m, void*) - (size_t)mbuf_datastart(m) +
m->m_len;
if ((size_t)(amount + used) > mbuf_maxlen(m)) {

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
* Copyright (c) 2000-2024 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@ -93,8 +93,63 @@
#include <os/log.h>
#include <os/overflow.h>
#include <pexpert/pexpert.h>
#include <libkern/libkern.h>
#include "kern_exec_internal.h"
#if APPLEVIRTUALPLATFORM
#define ALLOW_FORCING_ARM64_32 1
#endif /* APPLEVIRTUALPLATFORM */
#if ALLOW_FORCING_ARM64_32
#if DEVELOPMENT || DEBUG
TUNABLE_DT(uint32_t, force_arm64_32, "/defaults", "force-arm64-32", "force-arm64-32", 0, TUNABLE_DT_NONE);
#else
TUNABLE_DT(uint32_t, force_arm64_32, "/defaults", "force-arm64-32", "force-arm64-32", 0, TUNABLE_DT_NO_BOOTARG);
#endif
#endif /* ALLOW_FORCING_ARM64_32 */
#if ALLOW_FORCING_ARM64_32 || DEVELOPMENT || DEBUG
/*
* The binary grading priority for the highest priority override. Each progressive override
* receives a priority 1 less than its neighbor.
*/
#define BINGRADE_OVERRIDE_MAX 200
#endif /* ALLOW_FORCING_ARM64_32 || DEVELOPMENT || DEBUG */
#if DEVELOPMENT || DEBUG
/*
* Maximum number of overrides that can be passed via the bingrade boot-arg property.
*/
#define MAX_BINGRADE_OVERRIDES 4
/*
* Max size of one bingrade override + 1 comma
* (technically, sizeof will also include the terminating NUL here, but an overestimation of
* buffer space is fine).
*/
#define BINGRADE_MAXSTRINGLEN sizeof("0x12345678:0x12345678:0x12345678,")
/*
* Each binary grading override has a cpu type and cpu subtype to match against the values in
* the Mach-o header.
*/
typedef struct bingrade {
uint32_t cputype;
uint32_t cpusubtype;
uint32_t execfeatures;
#define EXECFEATURES_OVERRIDE_WILDCARD (~(uint32_t)0)
} bingrade_t;
/* The number of binary grading overrides that are active */
static int num_bingrade_overrides = -1;
/*
* The bingrade_overrides array is an ordered list of binary grading overrides. The first element in the array
* has the highest priority. When parsing the `bingrade' boot-arg, elements are added to this array in order.
*/
static bingrade_t bingrade_overrides[MAX_BINGRADE_OVERRIDES] = { 0 };
#endif /* DEVELOPMENT || DEBUG */
/* An empty load_result_t */
@ -304,6 +359,158 @@ get_macho_vnode(
struct image_params *imgp
);
#if DEVELOPMENT || DEBUG
/*
* Parse the bingrade boot-arg, adding cputype/cpusubtype/execfeatures tuples to the global binary grading
* override array. The bingrade boot-arg must be of the form:
*
* NUM := '0x' <HEXDIGITS> | '0' <OCTALDIGITS> | <DECIMALDIGITS>
* OVERRIDESPEC := <NUM> | <NUM> ':' <NUM> | <NUM> ':' <NUM> ':' <NUM>
* BINSPEC_BOOTARG := <OVERRIDESPEC> ',' <BINSPEC_BOOTARG> | <OVERRIDESPEC>
*
* Returns the number of overrides specified in the boot-arg, or 0 if there were no overrides or the
* syntax of the overrides was found to be invalid.
*/
static int
parse_bingrade_override_bootarg(bingrade_t *overrides, int max_overrides, char *overrides_arg_string)
{
char bingrade_arg[BINGRADE_MAXSTRINGLEN * MAX_BINGRADE_OVERRIDES + 1];
int cputypespec_count = 0;
/* Look for the bingrade boot-arg */
if (overrides_arg_string != NULL || PE_parse_boot_arg_str("bingrade", bingrade_arg, sizeof(bingrade_arg))) {
char *bingrade_str = (overrides_arg_string != NULL) ? overrides_arg_string : &bingrade_arg[0];
char *cputypespec;
/* Skip leading whitespace */
while (*bingrade_str == ' ' || *bingrade_str == '\t') {
bingrade_str++;
}
if (*bingrade_str == 0) {
/* empty string, so just return 0 */
return 0;
}
/* If we found the boot-arg, iterate on each OVERRIDESPEC in the BOOTSPEC_BOOTARG */
while ((cputypespec_count < max_overrides) && ((cputypespec = strsep(&bingrade_str, ",")) != NULL)) {
char *colon = strchr(cputypespec, ':');
char *end;
char *cputypeptr;
char cputypestr[16] = { 0 };
unsigned long cputype, cpusubtype, execfeatures;
/* If there's a colon present, process the cpu subtype and possibly the execfeatures */
if (colon != NULL) {
colon++; /* Move past the colon before parsing */
char execfeat_buf[16] = { 0 }; /* This *MUST* be preinitialized to zeroes */
char *second_colon = strchr(colon, ':');
ptrdiff_t amt_to_copy = 0;
if (second_colon != NULL) {
strlcpy(execfeat_buf, second_colon + 1, MIN(strlen(second_colon + 1) + 1, sizeof(execfeat_buf)));
execfeatures = strtoul(execfeat_buf, &end, 0);
if (execfeat_buf == end || execfeatures > UINT_MAX) {
printf("Invalid bingrade boot-arg (`%s').\n", cputypespec);
return 0;
}
overrides[cputypespec_count].execfeatures = (uint32_t)execfeatures;
/*
* Note there is no "+ 1" here because we are only copying up to but not
* including the second colon. Since cputypestr was initialized to all 0s
* above, the terminating NUL will already be there.
*/
amt_to_copy = second_colon - colon;
} else {
/* No second colon, so use the wildcard for execfeatures */
overrides[cputypespec_count].execfeatures = EXECFEATURES_OVERRIDE_WILDCARD;
/*
* There is no "+ 1" here because colon was already moved forward by 1 (above).
* which allows this computation to include the terminating NUL in the length
* computed.
*/
amt_to_copy = colon - cputypespec;
}
/* Now determine the cpu subtype */
cpusubtype = strtoul(colon, &end, 0);
if (colon == end || cpusubtype > UINT_MAX) {
printf("Invalid bingrade boot-arg (`%s').\n", cputypespec);
return 0;
}
overrides[cputypespec_count].cpusubtype = (uint32_t)cpusubtype;
/* Copy the cputype string into a temp buffer */
strlcpy(cputypestr, cputypespec, MIN(sizeof(cputypestr), amt_to_copy));
cputypeptr = &cputypestr[0];
} else {
/*
* No colon present, set the cpu subtype to 0, the execfeatures to EXECFEATURES_OVERRIDE_WILDCARD
* and use the whole string as the cpu type
*/
overrides[cputypespec_count].cpusubtype = 0;
overrides[cputypespec_count].execfeatures = EXECFEATURES_OVERRIDE_WILDCARD;
cputypeptr = cputypespec;
}
cputype = strtoul(cputypeptr, &end, 0);
if (cputypeptr == end || cputype > UINT_MAX) {
printf("Invalid bingrade boot-arg (`%s').\n", cputypespec);
return 0;
}
overrides[cputypespec_count].cputype = (uint32_t)cputype;
cputypespec_count++;
}
} else {
/* No bingrade boot-arg; return 0 overrides */
return 0;
}
return cputypespec_count;
}
size_t
bingrade_get_override_string(char *existing_overrides, size_t existing_overrides_bufsize)
{
if (num_bingrade_overrides <= 0) {
return 0; /* No overrides set */
}
/* Init the empty string for strlcat */
existing_overrides[0] = 0;
for (int i = 0; i < num_bingrade_overrides; i++) {
char next_override[33]; /* 10char + ':' + 10char + ([future] ':' + 10char) */
snprintf(next_override, sizeof(next_override), "0x%x:0x%x", bingrade_overrides[i].cputype, bingrade_overrides[i].cpusubtype);
if (i > 0) {
strlcat(existing_overrides, ",", existing_overrides_bufsize);
}
strlcat(existing_overrides, next_override, existing_overrides_bufsize);
}
return strlen(existing_overrides);
}
int
binary_grade_overrides_update(char *overrides_arg)
{
#if ALLOW_FORCING_ARM64_32
if (force_arm64_32) {
/* If forcing arm64_32, don't allow bingrade override. */
return 0;
}
#endif /* ALLOW_FORCING_ARM64_32 */
num_bingrade_overrides = parse_bingrade_override_bootarg(bingrade_overrides, MAX_BINGRADE_OVERRIDES, overrides_arg);
return num_bingrade_overrides;
}
#endif /* DEVELOPMENT || DEBUG */
static inline void
widen_segment_command(const struct segment_command *scp32,
struct segment_command_64 *scp)
@ -420,6 +627,7 @@ process_is_plugin_host(struct image_params *imgp, load_result_t *result)
"com.apple.bash", /* Required for the 'enable' command */
"com.apple.zsh", /* Required for the 'zmodload' command */
"com.apple.ksh", /* Required for 'builtin' command */
"com.apple.sh", /* rdar://138353488: sh re-execs into zsh or bash, which are exempted */
};
for (size_t i = 0; i < ARRAY_COUNT(hardening_exceptions); i++) {
if (strncmp(hardening_exceptions[i], identity, strlen(hardening_exceptions[i])) == 0) {
@ -434,6 +642,43 @@ process_is_plugin_host(struct image_params *imgp, load_result_t *result)
}
#endif /* XNU_TARGET_OS_OSX */
static int
grade_binary_override(cpu_type_t __unused exectype, cpu_subtype_t __unused execsubtype, cpu_subtype_t execfeatures __unused,
bool allow_simulator_binary __unused)
{
#if ALLOW_FORCING_ARM64_32
if (force_arm64_32) {
/* Forcing ARM64_32 takes precedence over 'bingrade' boot-arg. */
if (exectype == CPU_TYPE_ARM64_32 && execsubtype == CPU_SUBTYPE_ARM64_32_V8) {
return BINGRADE_OVERRIDE_MAX;
} else {
/* Stop trying to match. */
return 0;
}
}
#endif /* ALLOW_FORCING_ARM64_32 */
#if DEVELOPMENT || DEBUG
if (num_bingrade_overrides == -1) {
num_bingrade_overrides = parse_bingrade_override_bootarg(bingrade_overrides, MAX_BINGRADE_OVERRIDES, NULL);
}
if (num_bingrade_overrides == 0) {
return -1;
}
for (int i = 0; i < num_bingrade_overrides; i++) {
if (bingrade_overrides[i].cputype == exectype && bingrade_overrides[i].cpusubtype == execsubtype &&
(bingrade_overrides[i].execfeatures == EXECFEATURES_OVERRIDE_WILDCARD ||
bingrade_overrides[i].execfeatures == execfeatures)) {
return BINGRADE_OVERRIDE_MAX - i;
}
}
#endif /* DEVELOPMENT || DEBUG */
/* exectype/execsubtype not found in the override list */
return -1;
}
load_return_t
load_machfile(
struct image_params *imgp,
@ -580,6 +825,22 @@ load_machfile(
return lret;
}
/*
* From now on it's safe to query entitlements via the vnode interface. Let's get figuring
* out whether we're a security relevant binary out of the way immediately.
*/
switch (exec_check_security_entitlement(imgp, HARDENED_PROCESS)) {
case EXEC_SECURITY_INVALID_CONFIG:
imgp->ip_free_map = map;
return LOAD_BADMACHO;
case EXEC_SECURITY_ENTITLED:
result->is_hardened_process = true;
break;
case EXEC_SECURITY_NOT_ENTITLED:
result->is_hardened_process = false;
break;
}
#if __x86_64__
/*
* On x86, for compatibility, don't enforce the hard page-zero restriction for 32-bit binaries.
@ -750,6 +1011,27 @@ pie_required(
return FALSE;
}
/*
* Grades the specified CPU type, CPU subtype, and CPU features to determine an absolute weight that is used
* when deciding whether to run the associated binary on this machine.
*
* If an override boot-arg is specified, the boot-arg is parsed and its values are stored for later use in overriding
* the system's hard-coded binary grading values.
*/
int
grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures, bool allow_simulator_binary)
{
extern int ml_grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool);
int binary_grade;
if ((binary_grade = grade_binary_override(exectype, execsubtype, execfeatures, allow_simulator_binary)) < 0) {
return ml_grade_binary(exectype, execsubtype, execfeatures, allow_simulator_binary);
}
return binary_grade;
}
/*
* The file size of a mach-o file is limited to 32 bits; this is because
* this is the limit on the kalloc() of enough bytes for a mach_header and

View file

@ -47,13 +47,13 @@ typedef int load_return_t;
/* libmalloc relies on these values not changing. If they change,
* you need to update the values in that project as well */
__options_decl(HR_flags_t, uint32_t, {
__options_decl(hardened_browser_flags_t, uint32_t, {
BrowserHostEntitlementMask = 0x01,
BrowserGPUEntitlementMask = 0x02,
BrowserNetworkEntitlementMask = 0x04,
BrowserWebContentEntitlementMask = 0x08,
});
#define HR_FLAGS_NUM_NIBBLES (sizeof(HR_flags_t) / 2)
#define HR_FLAGS_NUM_NIBBLES (sizeof(hardened_browser_flags_t) / 2)
/*
* Structure describing the result from calling load_machfile(), if that
@ -88,7 +88,9 @@ typedef struct _load_result {
is_64bit_addr : 1,
is_64bit_data : 1,
custom_stack : 1,
is_rosetta : 1;
is_rosetta : 1,
hardened_heap : 1,
is_hardened_process : 1;
unsigned int csflags;
unsigned char uuid[16];
mach_vm_address_t min_vm_addr;
@ -97,8 +99,9 @@ typedef struct _load_result {
mach_vm_address_t ro_vm_end;
unsigned int platform_binary;
/* Flags denoting which type of hardened runtime binary this is*/
HR_flags_t hardened_runtime_binary;
/* Flags denoting which type of platform restrictions binary this is */
hardened_browser_flags_t hardened_browser;
off_t cs_end_offset;
void *threadstate;
size_t threadstate_sz;

View file

@ -181,8 +181,8 @@ retry_trace_me: ;
* when, in this case, it is the current process's parent.
* Most of the other checks in cantrace() don't apply either.
*/
struct proc_ident p_ident = proc_ident(p);
struct proc_ident pproc_ident = proc_ident(pproc);
struct proc_ident p_ident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT);
struct proc_ident pproc_ident = proc_ident_with_policy(pproc, IDENT_VALIDATION_PROC_EXACT);
kauth_cred_t pproc_cred = kauth_cred_proc_ref(pproc);
/* Release pproc and find it again after MAC call to avoid deadlock */
@ -253,7 +253,7 @@ retry_proc_find:
AUDIT_ARG(process, t);
task = proc_task(t);
tident = proc_ident(t);
tident = proc_ident_with_policy(t, IDENT_VALIDATION_PROC_EXACT);
if (uap->req == PT_ATTACHEXC) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
@ -571,8 +571,8 @@ cantrace(proc_t cur_procp, kauth_cred_t creds, proc_t traced_procp, int *errp)
}
#if CONFIG_MACF
struct proc_ident cur_ident = proc_ident(cur_procp);
struct proc_ident traced_ident = proc_ident(traced_procp);
struct proc_ident cur_ident = proc_ident_with_policy(cur_procp, IDENT_VALIDATION_PROC_EXACT);
struct proc_ident traced_ident = proc_ident_with_policy(traced_procp, IDENT_VALIDATION_PROC_EXACT);
kauth_cred_t cur_cred = kauth_cred_proc_ref(cur_procp);
/*

View file

@ -200,7 +200,7 @@ mcache_init(void)
}
mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE,
ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
ZC_DESTRUCTIBLE);
LIST_INIT(&mcache_head);
@ -354,7 +354,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align,
chunksize += sizeof(uint64_t) + align;
chunksize = P2ROUNDUP(chunksize, align);
cp->mc_slab_zone = zone_create(cp->mc_name, chunksize,
ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
ZC_DESTRUCTIBLE);
}
cp->mc_chunksize = chunksize;

bsd/kern/mem_acct.c (new file, 496 lines)
View file

@ -0,0 +1,496 @@
/*
* Copyright (c) 2024 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#include <kern/cpu_data.h>
#include <kern/kalloc.h>
#include <kern/locks.h>
#include <kern/mem_acct.h>
#include <kern/percpu.h>
#include <os/atomic_private.h>
#include <os/log.h>
#include <os/ptrtools.h>
#include <sys/mem_acct_private.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <net/net_sysctl.h>
struct mem_acct {
int64_t _Atomic ma_allocated; /* Amount of memory accounted towards this subsystem (ignore temporary per-CPU accounting from below) */
int32_t *__zpercpu ma_percpu; /* Per-CPU "bounce-buffer" of accounting that will be folded in to `ma_allocated` */
uint64_t ma_hardlimit; /* hard limit that will not be exceeded */
uint8_t ma_percent; /* Percent of hard-limit we should start soft-limiting (if != 100 && != 0) */
uint64_t _Atomic ma_peak;
char ma_name[MEM_ACCT_NAME_LENGTH]; /* Name of the subsystem using this instance of memory-accounting module */
};
#define MEM_ACCT_PCPU_MAX 1024 * 1024 /* Update global var after 1MB in the per-cpu var */
static struct mem_acct *memacct[MEM_ACCT_MAX];
static uint64_t
mem_acct_softlimit(uint64_t hardlimit, uint8_t percent)
{
return (hardlimit * percent) / 100;
}
static uint64_t
mem_acct_presoftlimit(uint64_t hardlimit, uint8_t percent)
{
return (mem_acct_softlimit(hardlimit, percent) * percent) / 100;
}
int
mem_acct_limited(const struct mem_acct *macct)
{
uint64_t hardlimit;
int64_t allocated;
uint8_t percent;
allocated = os_atomic_load(&macct->ma_allocated, relaxed);
if (allocated < 0) {
return 0;
}
hardlimit = os_access_once(macct->ma_hardlimit);
if (hardlimit && allocated > hardlimit) {
return MEMACCT_HARDLIMIT;
}
percent = os_access_once(macct->ma_percent);
if (percent) {
if (allocated > mem_acct_softlimit(hardlimit, percent)) {
return MEMACCT_SOFTLIMIT;
}
if (allocated > mem_acct_presoftlimit(hardlimit, percent)) {
return MEMACCT_PRESOFTLIMIT;
}
}
return 0;
}
void
_mem_acct_add(struct mem_acct *macct, int size)
{
int *pcpu;
/*
* Yes, the accounting is not 100% accurate with the per-cpu
* "bounce-buffer" storing intermediate results. For example, we may
* report "hard-limit" even though all the per-cpu counters may bring us
* below the limit. But honestly, we don't care... If we hit hard-limit
* the system is gonna be in a bad state anyways until we have given
* away enough memory.
*
* The same counts for softlimit, but softlimit still allows us to
* account memory and just makes us a bit more aggressive at freeing
* stuff.
*/
/* Now, add the size to the per-cpu variable */
disable_preemption();
pcpu = zpercpu_get(macct->ma_percpu);
*pcpu += size;
/* If we added enough to the pcpu variable, fold it into the global variable */
if (*pcpu > MEM_ACCT_PCPU_MAX || *pcpu < -MEM_ACCT_PCPU_MAX) {
int limited, newlimited;
int64_t allocated;
limited = mem_acct_limited(macct);
allocated = os_atomic_add(&macct->ma_allocated, *pcpu, relaxed);
/*
* Can be temporarily < 0 if the CPU freeing memory hits
* MEM_ACCT_PCPU_MAX first.
*/
if (allocated > 0) {
os_atomic_max(&macct->ma_peak, allocated, relaxed);
}
newlimited = mem_acct_limited(macct);
if (limited != newlimited) {
os_log(OS_LOG_DEFAULT,
"memacct: %s goes from %u to %u for its limit",
macct->ma_name, limited, newlimited);
}
*pcpu = 0;
}
enable_preemption();
}
static LCK_GRP_DECLARE(mem_acct_mtx_grp, "mem_acct");
static LCK_MTX_DECLARE(mem_acct_mtx, &mem_acct_mtx_grp);
struct mem_acct *
mem_acct_register(const char *__null_terminated name,
uint64_t hardlimit, uint8_t percent)
{
struct mem_acct *acct = NULL;
int i, index = -1;
if (percent > 100) {
os_log(OS_LOG_DEFAULT,
"memacct: percentage for softlimit is out-of-bounds\n");
return NULL;
}
lck_mtx_lock(&mem_acct_mtx);
/* Find an empty slot in the accounting array and check for name uniqueness */
for (i = 0; i < MEM_ACCT_MAX; i++) {
if (memacct[i] == NULL) {
if (index == -1) {
index = i;
}
continue;
}
if (strlcmp(memacct[i]->ma_name, name, MEM_ACCT_NAME_LENGTH - 1) == 0) {
os_log(OS_LOG_DEFAULT,
"memacct: subsystem %s already exists", name);
goto exit;
}
}
if (index == -1) {
os_log(OS_LOG_DEFAULT, "memacct: No space for additional subsystem");
goto exit;
}
memacct[index] = kalloc_type(struct mem_acct, Z_WAITOK_ZERO_NOFAIL);
acct = memacct[index];
strlcpy(acct->ma_name, name, MEM_ACCT_NAME_LENGTH);
acct->ma_hardlimit = hardlimit;
if (percent >= 100) {
os_log(OS_LOG_DEFAULT,
"memacct: percent is > 100");
memacct[index] = NULL;
kfree_type(struct mem_acct, acct);
acct = NULL;
goto exit;
}
acct->ma_percent = percent;
acct->ma_percpu = zalloc_percpu_permanent_type(int32_t);
exit:
lck_mtx_unlock(&mem_acct_mtx);
return acct;
}
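A hypothetical kernel-side sketch of how a subsystem might use the accounting API defined above; the subsystem name, limits, and hook functions are illustrative:

/* Illustrative sketch: registering and feeding a mem_acct instance. */
#include <kern/mem_acct.h>

static struct mem_acct *example_acct;

static void
example_subsystem_init(void)
{
	/* 64 MB hard limit; start soft-limiting at 75% of it. */
	example_acct = mem_acct_register("example_subsystem", 64 * 1024 * 1024, 75);
}

static void
example_subsystem_account(int size)
{
	if (example_acct == NULL) {
		return;
	}
	/* Positive sizes account allocations, negative sizes account frees. */
	_mem_acct_add(example_acct, size);

	if (mem_acct_limited(example_acct) != 0) {
		/* At or past a limit: start shedding cached memory in this subsystem. */
	}
}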
/*
* Memory Accounting sysctl handlers
*/
struct walkarg {
int w_op, w_sub;
struct sysctl_req *w_req;
};
/* sysctls on a per-subsystem basis */
static int sysctl_subsystem_peak(struct walkarg *w);
static int sysctl_subsystem_soft_limit(struct walkarg *w);
static int sysctl_subsystem_hard_limit(struct walkarg *w);
static int sysctl_subsystem_allocated(struct walkarg *w);
static int sysctl_all_subsystem_statistics(struct walkarg *w);
/* sysctls for all active subsystems */
static int sysctl_all_statistics(struct sysctl_req *);
static int sysctl_mem_acct_subsystems(struct sysctl_req *);
/* Handler function for all Memory Accounting sysctls */
static int sysctl_mem_acct SYSCTL_HANDLER_ARGS;
/* Helper functions */
static void memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a);
SYSCTL_NODE(_kern, OID_AUTO, memacct,
CTLFLAG_RW | CTLFLAG_LOCKED, sysctl_mem_acct, "Memory Accounting");
static int
sysctl_mem_acct SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
DECLARE_SYSCTL_HANDLER_ARG_ARRAY(int, 2, name, namelen);
int error = EINVAL;
struct walkarg w;
/* Verify the specified subsystem index is valid */
if (name[1] >= MEM_ACCT_MAX || name[1] < 0) {
return EINVAL;
}
bzero(&w, sizeof(w));
w.w_req = req;
w.w_op = name[0];
w.w_sub = name[1];
switch (w.w_op) {
case MEM_ACCT_PEAK:
error = sysctl_subsystem_peak(&w);
break;
case MEM_ACCT_SOFT_LIMIT:
error = sysctl_subsystem_soft_limit(&w);
break;
case MEM_ACCT_HARD_LIMIT:
error = sysctl_subsystem_hard_limit(&w);
break;
case MEM_ACCT_ALLOCATED:
error = sysctl_subsystem_allocated(&w);
break;
case MEM_ACCT_SUBSYSTEMS:
error = sysctl_mem_acct_subsystems(req);
break;
case MEM_ACCT_ALL_SUBSYSTEM_STATISTICS:
error = sysctl_all_subsystem_statistics(&w);
break;
case MEM_ACCT_ALL_STATISTICS:
error = sysctl_all_statistics(req);
break;
}
return error;
}
static int
sysctl_subsystem_peak(struct walkarg *w)
{
int error;
uint64_t value;
int changed = 0;
struct mem_acct *acct = memacct[w->w_sub];
if (acct == NULL) {
return ENOENT;
}
value = os_atomic_load(&acct->ma_peak, relaxed);
error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
if (error || !changed) {
return error;
}
os_atomic_store(&acct->ma_peak, value, relaxed);
return 0;
}
static int
sysctl_subsystem_soft_limit(struct walkarg *w)
{
int error;
uint64_t hardlimit, value;
int changed = 0;
struct mem_acct *acct = memacct[w->w_sub];
if (acct == NULL) {
return ENOENT;
}
hardlimit = os_atomic_load(&acct->ma_hardlimit, relaxed);
if (acct->ma_percent) {
value = mem_acct_softlimit(hardlimit, acct->ma_percent);
} else {
value = hardlimit;
}
error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
if (error || !changed) {
return error;
}
return EPERM;
}
static int
sysctl_subsystem_hard_limit(struct walkarg *w)
{
int error;
uint64_t value;
int changed = 0;
struct mem_acct *acct = memacct[w->w_sub];
if (acct == NULL) {
return ENOENT;
}
value = os_atomic_load(&acct->ma_hardlimit, relaxed);
error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
if (error || !changed) {
return error;
}
acct->ma_hardlimit = value;
return 0;
}
static int
sysctl_subsystem_allocated(struct walkarg *w)
{
int64_t value;
struct mem_acct *acct = memacct[w->w_sub];
lck_mtx_lock(&mem_acct_mtx);
if (acct == NULL) {
	lck_mtx_unlock(&mem_acct_mtx);
	return ENOENT;
}
value = os_atomic_load(&acct->ma_allocated, relaxed);
zpercpu_foreach(v, acct->ma_percpu) {
value += *v;
}
lck_mtx_unlock(&mem_acct_mtx);
return sysctl_io_number(w->w_req, value, sizeof(value), NULL, NULL);
}
static int
sysctl_all_subsystem_statistics(struct walkarg *w)
{
/* Returns a single memacct_statistics struct for the specified subsystem */
struct memacct_statistics stats = {};
struct mem_acct *acct = memacct[w->w_sub];
lck_mtx_lock(&mem_acct_mtx);
if (acct == NULL) {
	lck_mtx_unlock(&mem_acct_mtx);
	return ENOENT;
}
memacct_copy_stats(&stats, acct);
lck_mtx_unlock(&mem_acct_mtx);
return sysctl_io_opaque(w->w_req, &stats, sizeof(stats), NULL);
}
static int
sysctl_all_statistics(struct sysctl_req *req)
{
/* Returns an array of memacct_statistics structs for all active subsystems */
int i, error;
int count = 0;
lck_mtx_lock(&mem_acct_mtx);
for (i = 0; i < MEM_ACCT_MAX; i++) {
if (memacct[i] == NULL) {
break;
}
count++;
}
struct memacct_statistics *memstats = kalloc_data(sizeof(struct memacct_statistics) * count, Z_WAITOK_ZERO_NOFAIL);
for (i = 0; i < count; i++) {
struct mem_acct *acct;
struct memacct_statistics *stats;
acct = memacct[i];
stats = &memstats[i];
memacct_copy_stats(stats, acct);
}
lck_mtx_unlock(&mem_acct_mtx);
error = sysctl_io_opaque(req, memstats, sizeof(struct memacct_statistics) * count, NULL);
if (error) {
kfree_data(memstats, sizeof(struct memacct_statistics) * count);
return error;
}
kfree_data(memstats, sizeof(struct memacct_statistics) * count);
return 0;
}
static int
sysctl_mem_acct_subsystems(struct sysctl_req *req)
{
/* Returns an array of names for all active subsystems */
int i, j, error;
int count = 0;
int totalCharCount = 0;
lck_mtx_lock(&mem_acct_mtx);
for (i = 0; i < MEM_ACCT_MAX; i++) {
if (memacct[i] == NULL) {
break;
}
count++;
}
char *names = kalloc_data(count * MEM_ACCT_NAME_LENGTH, Z_WAITOK_ZERO_NOFAIL);
for (i = 0; i < count; i++) {
struct mem_acct *acct = memacct[i];
char acct_name[MEM_ACCT_NAME_LENGTH];
strbufcpy(acct_name, acct->ma_name);
for (j = 0; j < MEM_ACCT_NAME_LENGTH; j++) {
names[totalCharCount++] = acct_name[j];
}
}
lck_mtx_unlock(&mem_acct_mtx);
error = sysctl_io_opaque(req, names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH, NULL);
if (error) {
kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH);
return error;
}
kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH);
return 0;
}
static void
memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a)
{
s->peak = os_atomic_load(&a->ma_peak, relaxed);
s->allocated = os_atomic_load(&a->ma_allocated, relaxed);
zpercpu_foreach(v, a->ma_percpu) {
s->allocated += *v;
}
if (a->ma_percent) {
s->softlimit = mem_acct_softlimit(a->ma_hardlimit, a->ma_percent);
} else {
s->softlimit = a->ma_hardlimit;
}
s->hardlimit = a->ma_hardlimit;
strbufcpy(s->ma_name, a->ma_name);
}
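/*
 * Hypothetical userspace sketch of driving the kern.memacct node above.
 * It assumes the MEM_ACCT_* operation codes and struct memacct_statistics
 * are visible to the caller (e.g. through a private header); the helper
 * name and that assumption are illustrative only, not a documented API.
 */
#include <stddef.h>
#include <sys/sysctl.h>

static int
memacct_query(int op, int subsystem, void *buf, size_t *len)
{
	int mib[8];
	size_t miblen = 6;

	/* Resolve "kern.memacct", then append the two handler arguments. */
	if (sysctlnametomib("kern.memacct", mib, &miblen) != 0) {
		return -1;
	}
	mib[miblen++] = op;        /* name[0]: MEM_ACCT_* operation */
	mib[miblen++] = subsystem; /* name[1]: subsystem index */
	return sysctl(mib, (unsigned int)miblen, buf, len, NULL, 0);
}

/*
 * e.g.:
 *	struct memacct_statistics st;
 *	size_t len = sizeof(st);
 *	memacct_query(MEM_ACCT_ALL_SUBSYSTEM_STATISTICS, 0, &st, &len);
 */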

71
bsd/kern/mem_acct.h Normal file
View file

@ -0,0 +1,71 @@
/*
* Copyright (c) 2024 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#ifndef _KERN_MEM_ACCT_H
#define _KERN_MEM_ACCT_H
#ifdef XNU_KERNEL_PRIVATE
#include <sys/mem_acct_private.h>
struct mem_acct;
/*
* Add "size" bytes to the memory accounting subsystem "macct".
*/
__private_extern__ void _mem_acct_add(struct mem_acct *macct, int size);
__private_extern__ struct mem_acct *mem_acct_register(
const char *__null_terminated name, uint64_t hardlimit, uint8_t percent);
/*
* pre-softlimit means we are getting close to the softlimit (about 80% of it).
* The subsystem should start taking preventive actions.
*/
#define MEMACCT_PRESOFTLIMIT 1
/*
* We are at the softlimit. Take actions to reduce memory usage, but don't take
* fully destructive actions yet.
*/
#define MEMACCT_SOFTLIMIT 2
/*
* We are above the hardlimit. Prevent holding on to memory in this subsystem.
*/
#define MEMACCT_HARDLIMIT 3
extern int mem_acct_limited(const struct mem_acct *macct);
static inline void
mem_acct_add(struct mem_acct *macct, unsigned int size)
{
_mem_acct_add(macct, size);
}
static inline void
mem_acct_sub(struct mem_acct *macct, unsigned int size)
{
_mem_acct_add(macct, -size);
}
#endif /* XNU_KERNEL_PRIVATE */
#endif /*_KERN_MEM_ACCT_H */
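/*
 * Hypothetical kernel-side usage sketch (not part of this header): a
 * subsystem registers an accounting bucket once, charges and credits its
 * allocations, and backs off when the accountant reports pressure. The
 * "example" subsystem name and the 16 MB hard limit are illustrative only.
 */
#include <kern/mem_acct.h>

static struct mem_acct *example_acct;

static void
example_init(void)
{
	/* 16 MB hard limit, soft limit at 75% of it. */
	example_acct = mem_acct_register("example", 16 * 1024 * 1024, 75);
}

static void *
example_alloc(unsigned int size)
{
	if (mem_acct_limited(example_acct) == MEMACCT_HARDLIMIT) {
		return NULL; /* over the hard limit: refuse to grow */
	}
	void *buf = kalloc_data(size, Z_WAITOK | Z_ZERO);
	if (buf != NULL) {
		mem_acct_add(example_acct, size);
	}
	return buf;
}

static void
example_free(void *buf, unsigned int size)
{
	kfree_data(buf, size);
	mem_acct_sub(example_acct, size);
}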

View file

@ -121,7 +121,7 @@ common_hook(void)
return rv;
}
#if (MAC_POLICY_OPS_VERSION != 87)
#if (MAC_POLICY_OPS_VERSION != 91)
# error "struct mac_policy_ops doesn't match definition in mac_policy.h"
#endif
/*
@ -134,10 +134,10 @@ const static struct mac_policy_ops policy_ops = {
CHECK_SET_HOOK(audit_check_postselect)
CHECK_SET_HOOK(audit_check_preselect)
.mpo_reserved01 = (mpo_reserved_hook_t *)common_hook,
.mpo_reserved02 = (mpo_reserved_hook_t *)common_hook,
.mpo_reserved03 = (mpo_reserved_hook_t *)common_hook,
.mpo_reserved04 = (mpo_reserved_hook_t *)common_hook,
CHECK_SET_HOOK(graft_check_graft)
CHECK_SET_HOOK(graft_check_ungraft)
CHECK_SET_HOOK(graft_notify_graft)
CHECK_SET_HOOK(graft_notify_ungraft)
CHECK_SET_HOOK(cred_check_label_update_execve)
CHECK_SET_HOOK(cred_check_label_update)
@ -221,8 +221,8 @@ const static struct mac_policy_ops policy_ops = {
CHECK_SET_HOOK(vnode_notify_unlink)
CHECK_SET_HOOK(vnode_check_swap)
.mpo_reserved33 = (mpo_reserved_hook_t *)common_hook,
.mpo_reserved34 = (mpo_reserved_hook_t *)common_hook,
CHECK_SET_HOOK(vnode_check_dataprotect_set)
CHECK_SET_HOOK(mount_check_remount_with_flags)
CHECK_SET_HOOK(mount_notify_mount)
CHECK_SET_HOOK(vnode_check_copyfile)
@ -319,7 +319,7 @@ const static struct mac_policy_ops policy_ops = {
CHECK_SET_HOOK(proc_check_sched)
CHECK_SET_HOOK(proc_check_setaudit)
CHECK_SET_HOOK(proc_check_setauid)
.mpo_reserved64 = (mpo_reserved_hook_t *)common_hook,
CHECK_SET_HOOK(proc_check_iopolicysys)
CHECK_SET_HOOK(proc_check_signal)
CHECK_SET_HOOK(proc_check_wait)
CHECK_SET_HOOK(proc_check_dump_core)

View file

@ -65,6 +65,8 @@
#include <sys/semaphore.h>
#include <sys/sysproto.h>
#include <sys/proc_info.h>
#include <sys/random.h>
#include <net/siphash.h>
#if CONFIG_MACF
#include <sys/vnode_internal.h>
@ -87,6 +89,7 @@
#define f_ops fp_glob->fg_ops
#define PSEMNAMLEN 31 /* maximum name segment length we bother with */
#define PSEMTEAMIDLEN 31 /* maximum length of team ID we consider */
struct pseminfo {
unsigned int psem_flags;
@ -114,8 +117,10 @@ struct pseminfo {
struct psemcache {
LIST_ENTRY(psemcache) psem_hash; /* hash chain */
struct pseminfo *pseminfo; /* vnode the name refers to */
size_t psem_nlen; /* length of name */
size_t psem_nlen; /* length of name */
size_t psem_teamidlen; /* length of team ID */
char psem_name[PSEMNAMLEN + 1]; /* segment name */
char psem_teamid[PSEMTEAMIDLEN + 1]; /* team ID of users, if any */
};
#define PSEMCACHE_NULL (struct psemcache *)0
@ -124,18 +129,27 @@ struct psemcache {
#define PSEMCACHE_NEGATIVE (ENOENT)
struct psemstats {
long goodhits; /* hits that we can really use */
long neghits; /* negative hits that we can use */
long badhits; /* hits we must drop */
long falsehits; /* hits with id mismatch */
long miss; /* misses */
long longnames; /* long names that ignore cache */
long pstats_hits;
long pstats_miss;
long pstats_local_hits;
long pstats_global_hits;
long pstats_local_miss;
long pstats_global_miss;
long pstats_local_collisions;
long pstats_global_collisions;
long pstats_fallback_hits; /* lookups that missed the local table but hit the global */
long pstats_fallback_miss; /* lookups that missed both the local and global tables */
long pstats_neghits; /* hits to 'negative entries' (return ENOENT) */
long pstats_longnames; /* semaphore or team ID ENAMETOOLONG */
};
struct psemname {
char *psem_nameptr; /* pointer to looked up name */
size_t psem_namelen; /* length of looked up component */
u_int32_t psem_hash; /* hash value of looked up name */
char *psem_nameptr; /* pointer to looked up name */
size_t psem_namelen; /* length of looked up component */
uint64_t psem_hash_local; /* hash value of looked up name and team */
uint64_t psem_hash_global; /* hash value of looked up name, without team */
const char *psem_teamidptr;
size_t psem_teamidlen;
};
struct psemnode {
@ -147,20 +161,45 @@ struct psemnode {
};
#define PSEMNODE_NULL (struct psemnode *)0
LIST_HEAD(psemhashhead, psemcache);
struct psemhashtable {
struct psemhashhead *psem_table;
/* Hash table mask, i.e size - 1 */
u_long psem_table_mask;
/* SipHash key, randomly assigned at boot */
uint8_t psem_siphash_key[SIPHASH_KEY_LENGTH];
};
#define PSEMHASH(table, hash) (&(table).psem_table[(hash) & (table).psem_table_mask])
struct psemhashtable psem_global, psem_local;
long posix_sem_num; /* number of POSIX semaphores on the system */
long posix_sem_max = 10000; /* max number of POSIX semaphores on the system */
#define PSEMHASH(pnp) \
(&psemhashtbl[(pnp)->psem_hash & psemhash])
LIST_HEAD(psemhashhead, psemcache) * psemhashtbl; /* Hash Table */
u_long psemhash; /* size of hash table - 1 */
long psemnument; /* number of cache entries allocated */
long posix_sem_max = 10000; /* tunable for max POSIX semaphores */
/* 10000 limits to ~1M of memory */
SYSCTL_NODE(_kern, KERN_POSIX, posix, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Posix");
SYSCTL_NODE(_kern_posix, OID_AUTO, sem, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Semaphores");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, max, CTLFLAG_RW | CTLFLAG_LOCKED, &posix_sem_max, "max");
struct psemstats psemstats; /* cache effectiveness statistics */
#if DEBUG || DEVELOPMENT
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, hits, CTLFLAG_RD, &psemstats.pstats_hits, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, miss, CTLFLAG_RD, &psemstats.pstats_miss, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, local_hits, CTLFLAG_RD, &psemstats.pstats_local_hits, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, local_miss, CTLFLAG_RD, &psemstats.pstats_local_miss, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, global_hits, CTLFLAG_RD, &psemstats.pstats_global_hits, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, global_miss, CTLFLAG_RD, &psemstats.pstats_global_miss, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, fallback_hits, CTLFLAG_RD, &psemstats.pstats_fallback_hits, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, fallback_miss, CTLFLAG_RD, &psemstats.pstats_fallback_miss, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, local_collisions, CTLFLAG_RD, &psemstats.pstats_local_collisions, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, global_collisions, CTLFLAG_RD, &psemstats.pstats_global_collisions, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, neghits, CTLFLAG_RD, &psemstats.pstats_neghits, "");
SYSCTL_LONG(_kern_posix_sem, OID_AUTO, longnames, CTLFLAG_RD, &psemstats.pstats_longnames, "");
#endif
static int psem_access(struct pseminfo *pinfo, mode_t mode, kauth_cred_t cred);
static int psem_cache_search(struct pseminfo **,
struct psemname *, struct psemcache **);
@ -169,6 +208,8 @@ static int psem_delete(struct pseminfo * pinfo);
static int psem_closefile(struct fileglob *fp, vfs_context_t ctx);
static int psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache);
static const char *psem_get_teamid(proc_t p);
static const struct fileops psemops = {
.fo_type = DTYPE_PSXSEM,
.fo_read = fo_no_read,
@ -192,6 +233,61 @@ static int psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct p
static void psem_cache_delete(struct psemcache *pcp);
int psem_cache_purge_all(void);
static struct psemname
psem_cache_hash(char *name, size_t len, const char *teamid, size_t teamidlen)
{
SIPHASH_CTX ctx;
struct psemname nd;
nd.psem_nameptr = name;
nd.psem_namelen = len;
nd.psem_teamidptr = teamid;
nd.psem_teamidlen = teamidlen;
nd.psem_hash_local = 0;
nd.psem_hash_global = 0;
_Static_assert(sizeof(nd.psem_hash_local) == SIPHASH_DIGEST_LENGTH, "hash field is wrong size for SipHash");
_Static_assert(sizeof(nd.psem_hash_global) == SIPHASH_DIGEST_LENGTH, "hash field is wrong size for SipHash");
/*
* This routine is called before taking the subsystem lock, so we'll prepare hashes
* for both global and local tables up front.
*/
SipHash24_Init(&ctx);
SipHash_SetKey(&ctx, psem_global.psem_siphash_key);
SipHash_Update(&ctx, name, len);
SipHash_Final((u_int8_t *)&nd.psem_hash_global, &ctx);
if (teamidlen > 0) {
SipHash24_Init(&ctx);
SipHash_SetKey(&ctx, psem_local.psem_siphash_key);
SipHash_Update(&ctx, name, len);
SipHash_Update(&ctx, teamid, teamidlen);
SipHash_Final((u_int8_t *)&nd.psem_hash_local, &ctx);
}
return nd;
}
/*
* Returns 1 if the semaphore name matches what we're looking for, otherwise 0.
* When searching the local table, the team ID must match too.
*/
static int
psem_cache_is_match(struct psemcache *sem, struct psemname *target, bool local)
{
bool name_matches = target->psem_namelen == sem->psem_nlen &&
!bcmp(target->psem_nameptr, sem->psem_name, target->psem_namelen);
if (local) {
bool teamid_matches = target->psem_teamidlen == sem->psem_teamidlen &&
!bcmp(target->psem_teamidptr, sem->psem_teamid, target->psem_teamidlen);
return name_matches && teamid_matches;
}
return name_matches;
}
/*
* Lookup an entry in the cache
*
@ -206,31 +302,66 @@ static int
psem_cache_search(struct pseminfo **psemp, struct psemname *pnp,
struct psemcache **pcache)
{
struct psemcache *pcp, *nnp;
struct psemcache *pcp = NULL, *nnp;
struct psemhashhead *pcpp;
if (pnp->psem_namelen > PSEMNAMLEN) {
psemstats.longnames++;
if (pnp->psem_namelen > PSEMNAMLEN || pnp->psem_teamidlen > PSEMTEAMIDLEN) {
os_atomic_inc(&psemstats.pstats_longnames, relaxed);
return PSEMCACHE_NOTFOUND;
}
pcpp = PSEMHASH(pnp);
for (pcp = pcpp->lh_first; pcp != 0; pcp = nnp) {
nnp = pcp->psem_hash.le_next;
if (pcp->psem_nlen == pnp->psem_namelen &&
!bcmp(pcp->psem_name, pnp->psem_nameptr, pcp->psem_nlen)) {
break;
/* If Team ID is present, try to look up in the local table first. */
if (pnp->psem_teamidlen > 0) {
pcpp = PSEMHASH(psem_local, pnp->psem_hash_local);
for (pcp = pcpp->lh_first; pcp != 0; pcp = nnp) {
nnp = pcp->psem_hash.le_next;
if (psem_cache_is_match(pcp, pnp, true)) {
break;
}
os_atomic_inc(&psemstats.pstats_local_collisions, relaxed);
}
if (pcp == 0) {
os_atomic_inc(&psemstats.pstats_local_miss, relaxed);
} else {
os_atomic_inc(&psemstats.pstats_local_hits, relaxed);
}
}
/* If there is no Team ID, or the local lookup failed, search the global table. */
if (pcp == 0) {
pcpp = PSEMHASH(psem_global, pnp->psem_hash_global);
for (pcp = pcpp->lh_first; pcp != 0; pcp = nnp) {
nnp = pcp->psem_hash.le_next;
if (psem_cache_is_match(pcp, pnp, false)) {
break;
}
os_atomic_inc(&psemstats.pstats_global_collisions, relaxed);
}
if (pcp == 0) {
os_atomic_inc(&psemstats.pstats_global_miss, relaxed);
if (pnp->psem_teamidlen > 0) {
os_atomic_inc(&psemstats.pstats_fallback_miss, relaxed);
}
} else {
os_atomic_inc(&psemstats.pstats_global_hits, relaxed);
if (pnp->psem_teamidlen > 0) {
os_atomic_inc(&psemstats.pstats_fallback_hits, relaxed);
}
}
}
if (pcp == 0) {
psemstats.miss++;
os_atomic_inc(&psemstats.pstats_miss, relaxed);
return PSEMCACHE_NOTFOUND;
}
/* We found a "positive" match, return the pseminfo */
if (pcp->pseminfo) {
psemstats.goodhits++;
os_atomic_inc(&psemstats.pstats_hits, relaxed);
/* TOUCH(ncp); */
*psemp = pcp->pseminfo;
*pcache = pcp;
@ -241,7 +372,7 @@ psem_cache_search(struct pseminfo **psemp, struct psemname *pnp,
* We found a "negative" match, ENOENT notifies client of this match.
* The nc_vpid field records whether this is a whiteout.
*/
psemstats.neghits++;
os_atomic_inc(&psemstats.pstats_neghits, relaxed);
return PSEMCACHE_NEGATIVE;
}
@ -252,24 +383,20 @@ static int
psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *pcp)
{
struct psemhashhead *pcpp;
struct pseminfo *dpinfo;
struct psemcache *dpcp;
#if DIAGNOSTIC
if (pnp->psem_namelen > PSEMNAMLEN) {
panic("cache_enter: name too long");
}
if (pnp->psem_teamidlen > PSEMTEAMIDLEN) {
panic("cache_enter: teamid too long");
}
#endif
/* if the entry has already been added by someone else, return */
if (psem_cache_search(&dpinfo, pnp, &dpcp) == PSEMCACHE_FOUND) {
return EEXIST;
}
if (psemnument >= posix_sem_max) {
if (posix_sem_num >= posix_sem_max) {
return ENOSPC;
}
psemnument++;
posix_sem_num++;
/*
* Fill in cache info, if vp is NULL this is a "negative" cache entry.
* For negative entries, we have to record whether it is a whiteout.
@ -279,7 +406,16 @@ psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *p
pcp->pseminfo = psemp;
pcp->psem_nlen = pnp->psem_namelen;
bcopy(pnp->psem_nameptr, pcp->psem_name, pcp->psem_nlen);
pcpp = PSEMHASH(pnp);
pcp->psem_teamidlen = pnp->psem_teamidlen;
bcopy(pnp->psem_teamidptr, pcp->psem_teamid, pcp->psem_teamidlen);
/* Insert into the right table based on Team ID. */
if (pcp->psem_teamidlen > 0) {
pcpp = PSEMHASH(psem_local, pnp->psem_hash_local);
} else {
pcpp = PSEMHASH(psem_global, pnp->psem_hash_global);
}
#if DIAGNOSTIC
{
struct psemcache *p;
@ -301,7 +437,15 @@ psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *p
void
psem_cache_init(void)
{
psemhashtbl = hashinit((int)(posix_sem_max / 2), M_SHM, &psemhash);
/*
* The global table stores semaphores created by processes without a Team
* ID (such as platform binaries). The local table stores all other semaphores.
*/
psem_global.psem_table = hashinit((int)(posix_sem_max / 2), M_SHM, &psem_global.psem_table_mask);
psem_local.psem_table = hashinit((int)(posix_sem_max / 2), M_SHM, &psem_local.psem_table_mask);
read_frandom(psem_global.psem_siphash_key, sizeof(psem_global.psem_siphash_key));
read_frandom(psem_local.psem_siphash_key, sizeof(psem_local.psem_siphash_key));
}
static void
@ -317,7 +461,29 @@ psem_cache_delete(struct psemcache *pcp)
#endif /* DIAGNOSTIC */
LIST_REMOVE(pcp, psem_hash);
pcp->psem_hash.le_prev = NULL;
psemnument--;
posix_sem_num--;
}
static int
psem_cache_purge_table(struct psemhashtable *table)
{
struct psemcache *pcp, *tmppcp;
struct psemhashhead *pcpp;
for (pcpp = &table->psem_table[table->psem_table_mask]; pcpp >= table->psem_table; pcpp--) {
LIST_FOREACH_SAFE(pcp, pcpp, psem_hash, tmppcp) {
assert(pcp->psem_nlen);
/*
* unconditionally unlink the cache entry
*/
int error = psem_unlink_internal(pcp->pseminfo, pcp);
if (error) {
return error;
}
}
}
return 0;
}
/*
@ -328,8 +494,6 @@ psem_cache_delete(struct psemcache *pcp)
int
psem_cache_purge_all(void)
{
struct psemcache *pcp, *tmppcp;
struct psemhashhead *pcpp;
int error = 0;
if (kauth_cred_issuser(kauth_cred_get()) == 0) {
@ -337,26 +501,22 @@ psem_cache_purge_all(void)
}
PSEM_SUBSYS_LOCK();
for (pcpp = &psemhashtbl[psemhash]; pcpp >= psemhashtbl; pcpp--) {
LIST_FOREACH_SAFE(pcp, pcpp, psem_hash, tmppcp) {
assert(pcp->psem_nlen);
/*
* unconditionally unlink the cache entry
*/
error = psem_unlink_internal(pcp->pseminfo, pcp);
if (error) {
goto out;
}
}
error = psem_cache_purge_table(&psem_global);
if (error) {
goto out;
}
assert(psemnument == 0);
error = psem_cache_purge_table(&psem_local);
if (error) {
goto out;
}
assert(posix_sem_num == 0);
out:
PSEM_SUBSYS_UNLOCK();
if (error) {
printf("%s: Error %d removing all semaphores: %ld remain!\n",
__func__, error, psemnument);
__func__, error, posix_sem_num);
}
return error;
}
@ -374,18 +534,17 @@ out:
int
sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval)
{
size_t i;
int indx, error;
struct psemname nd;
struct pseminfo *pinfo;
struct fileproc *fp = NULL;
char *pnbuf = NULL;
const char *teamid = NULL;
struct pseminfo *new_pinfo = PSEMINFO_NULL;
struct psemnode *new_pnode = PSEMNODE_NULL;
struct psemcache *pcache = PSEMCACHE_NULL;
char * nameptr;
char * cp;
size_t pathlen, plen;
size_t pathlen, plen, teamidlen;
mode_t fmode;
mode_t cmode = (mode_t)uap->mode;
int value = uap->value;
@ -432,13 +591,13 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval)
plen = pathlen;
nameptr = pnbuf;
nd.psem_nameptr = nameptr;
nd.psem_namelen = plen;
nd.psem_hash = 0;
for (cp = nameptr, i = 1; *cp != 0 && i <= plen; i++, cp++) {
nd.psem_hash += (unsigned char)*cp * i;
teamid = psem_get_teamid(p);
teamidlen = teamid ? strlen(teamid) : 0;
if (teamidlen > PSEMTEAMIDLEN) {
error = ENAMETOOLONG;
goto bad;
}
nd = psem_cache_hash(nameptr, plen, teamid, teamidlen);
/*
* attempt to allocate a new fp; if unsuccessful, the fp will be
@ -697,18 +856,28 @@ psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache)
return 0;
}
static const char *
psem_get_teamid(proc_t p)
{
#if XNU_TARGET_OS_OSX
#pragma unused(p)
return NULL;
#else
return csproc_get_teamid(p);
#endif
}
int
sem_unlink(__unused proc_t p, struct sem_unlink_args *uap, __unused int32_t *retval)
{
size_t i;
int error = 0;
struct psemname nd;
struct pseminfo *pinfo;
char * nameptr;
char * cp;
char * pnbuf;
size_t pathlen;
const char *teamid;
size_t pathlen, teamidlen;
struct psemcache *pcache = PSEMCACHE_NULL;
pinfo = PSEMINFO_NULL;
@ -741,13 +910,13 @@ sem_unlink(__unused proc_t p, struct sem_unlink_args *uap, __unused int32_t *ret
}
#endif /* PSXSEM_NAME_RESTRICT */
nd.psem_nameptr = nameptr;
nd.psem_namelen = pathlen;
nd.psem_hash = 0;
for (cp = nameptr, i = 1; *cp != 0 && i <= pathlen; i++, cp++) {
nd.psem_hash += (unsigned char)*cp * i;
teamid = psem_get_teamid(p);
teamidlen = teamid ? strlen(teamid) : 0;
if (teamidlen > PSEMTEAMIDLEN) {
error = ENAMETOOLONG;
goto bad;
}
nd = psem_cache_hash(nameptr, pathlen, teamid, teamidlen);
PSEM_SUBSYS_LOCK();
error = psem_cache_search(&pinfo, &nd, &pcache);

View file

@ -219,8 +219,8 @@ static void munge_vinfo_stat(struct stat64 *sbp, struct vinfo_stat *vsbp);
static int proc_piduuidinfo(pid_t pid, uuid_t uuid_buf, uint32_t buffersize);
extern int proc_pidpathinfo_internal(proc_t p, __unused uint64_t arg, char *buf, uint32_t buffersize, __unused int32_t *retval);
extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int);
extern int cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum);
extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int);
extern bool cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum);
extern int proc_get_rusage(proc_t proc, int flavor, user_addr_t buffer, int is_zombie);
#define CHECK_SAME_USER TRUE
@ -754,6 +754,19 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie)
}
#endif /* CONFIG_DELAY_IDLE_SLEEP */
task_t task = proc_task(p);
if (task) {
if (task_has_hardened_heap(task)) {
pbsd->pbi_flags |= PROC_FLAG_HARDENED_HEAP_ENABLED;
}
if (task_has_tpro(task)) {
pbsd->pbi_flags |= PROC_FLAG_TPRO_ENABLED;
}
}
switch (PROC_CONTROL_STATE(p)) {
case P_PCTHROTTLE:
pbsd->pbi_flags |= PROC_FLAG_PC_THROTTLE;
@ -851,6 +864,17 @@ proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo * pbsd_shortp, int zombi
pbsd_shortp->pbsi_flags |= PROC_FLAG_DELAYIDLESLEEP;
}
#endif /* CONFIG_DELAY_IDLE_SLEEP */
task_t task = proc_task(p);
if (task) {
if (task_has_hardened_heap(task)) {
pbsd_shortp->pbsi_flags |= PROC_FLAG_HARDENED_HEAP_ENABLED;
}
if (task_has_tpro(task)) {
pbsd_shortp->pbsi_flags |= PROC_FLAG_TPRO_ENABLED;
}
}
switch (PROC_CONTROL_STATE(p)) {
case P_PCTHROTTLE:
@ -3515,7 +3539,7 @@ proc_ident_for_audit_token(proc_ident_t out, audit_token_t token)
goto out;
}
*out = proc_ident(p);
*out = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT);
out:
if (p != PROC_NULL) {
proc_rele(p);

View file

@ -64,12 +64,8 @@
#include <sys/types.h>
//#include <stdlib.h>
#include <sys/kpi_private.h>
__private_extern__
void
qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
#include <kern/qsort.h>
static inline char *med3(char *, char *, char *, int (*)(const void *, const void *));
static inline void swapfunc(char *, char *, long, int);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2022 Apple Inc. All rights reserved.
* Copyright (c) 2024 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@ -26,25 +26,27 @@
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#ifndef _BTI_TELEMETRY_H_
#define _BTI_TELEMETRY_H_
#ifdef CONFIG_BTI_TELEMETRY
#include <mach/exception.h>
#include <mach/vm_types.h>
#include <mach/machine/thread_status.h>
#ifndef _KERN_QSORT_H_
#define _KERN_QSORT_H_
/**
* Wakes up the BTI exception telemetry subsystem. Call once per boot.
#include <stddef.h>
__BEGIN_DECLS
/*
* The `cmpfunc_t` type is a pointer to a function that should return the
* following:
*
* return < 0 for a < b
* 0 for a == b
* > 0 for a > b
*/
typedef int (*cmpfunc_t)(const void *a, const void *b);
__private_extern__
void
bti_telemetry_init(void);
qsort(void *array, size_t num_elements, size_t element_size, cmpfunc_t compare);
/**
* Handle a BTI exception. Returns TRUE if handled and OK to return from the
* exception, false otherwise.
*/
bool
bti_telemetry_handle_exception(arm_saved_state_t *state);
__END_DECLS
#endif /* CONFIG_BTI_TELEMETRY */
#endif /* _BTI_TELEMETRY_H_ */
#endif /* _KERN_QSORT_H_ */
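/*
 * Hypothetical usage sketch (not part of this header): sorting an array of
 * integers with a cmpfunc_t comparator that follows the contract above.
 */
#include <kern/qsort.h>

static int
cmp_int(const void *a, const void *b)
{
	int lhs = *(const int *)a;
	int rhs = *(const int *)b;

	/* < 0, 0 or > 0 per the cmpfunc_t contract; avoids overflow of lhs - rhs */
	return (lhs > rhs) - (lhs < rhs);
}

static void
sort_example(int *values, size_t count)
{
	qsort(values, count, sizeof(values[0]), cmp_int);
}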

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, 2023, 2024 Apple Inc. All rights reserved.
* Copyright (c) 2021, 2023-2025, Apple Inc. All rights reserved.
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
@ -319,7 +319,11 @@ soflow_fill_hash_entry_from_address(struct soflow_hash_entry *entry, bool isLoca
in6_verify_ifscope(&sin6->sin6_addr, sin6->sin6_scope_id);
}
}
entry->soflow_family = AF_INET6;
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
entry->soflow_family = AF_INET;
} else {
entry->soflow_family = AF_INET6;
}
return TRUE;
default:
return FALSE;
@ -334,6 +338,7 @@ soflow_fill_hash_entry_from_inp(struct soflow_hash_entry *entry, bool isLocal, s
}
if (inp->inp_vflag & INP_IPV6) {
entry->soflow_family = AF_INET6;
if (isLocal == TRUE) {
if (inp->inp_lport) {
entry->soflow_lport = inp->inp_lport;
@ -348,6 +353,9 @@ soflow_fill_hash_entry_from_inp(struct soflow_hash_entry *entry, bool isLocal, s
if (islocalUpdate) {
entry->soflow_laddr_updated = TRUE;
}
if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
entry->soflow_family = AF_INET;
}
}
} else {
if (inp->inp_fport) {
@ -357,9 +365,11 @@ soflow_fill_hash_entry_from_inp(struct soflow_hash_entry *entry, bool isLocal, s
entry->soflow_faddr.addr6 = inp->in6p_faddr;
entry->soflow_faddr6_ifscope = inp->inp_fifscope;
in6_verify_ifscope(&entry->soflow_faddr.addr6, inp->inp_fifscope);
if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
entry->soflow_family = AF_INET;
}
}
}
entry->soflow_family = AF_INET6;
return TRUE;
} else if (inp->inp_vflag & INP_IPV4) {
if (isLocal == TRUE) {
@ -572,7 +582,7 @@ soflow_db_lookup_entry_internal(struct soflow_db *db, struct sockaddr *local, st
matchentry.soflow_debug = SOFLOW_ENABLE_DEBUG(db->soflow_db_so, (&matchentry));
SOFLOW_ENTRY_LOG(LOG_DEBUG, db->soflow_db_so, &matchentry, true, "Looking for entry");
if (inp->inp_vflag & INP_IPV6) {
if (matchentry.soflow_family == AF_INET6) {
hashkey_faddr = matchentry.soflow_faddr.addr6.s6_addr32[3];
hashkey_laddr = (remoteOnly == false) ? matchentry.soflow_laddr.addr6.s6_addr32[3] : 0;
} else {
@ -588,12 +598,12 @@ soflow_db_lookup_entry_internal(struct soflow_db *db, struct sockaddr *local, st
flowhash = &db->soflow_db_hashbase[inp_hash_element];
LIST_FOREACH(nextentry, flowhash, soflow_entry_link) {
if (inp->inp_vflag & INP_IPV6) {
if (matchentry.soflow_family == AF_INET6) {
if (soflow_match_entries_v6(nextentry, &matchentry, remoteOnly)) {
SOFLOW_ENTRY_LOG(LOG_DEBUG, db->soflow_db_so, nextentry, nextentry->soflow_debug, "Found entry v6");
break;
}
} else if (inp->inp_vflag & INP_IPV4) {
} else if (matchentry.soflow_family == AF_INET) {
if (soflow_match_entries_v4(nextentry, &matchentry, remoteOnly)) {
SOFLOW_ENTRY_LOG(LOG_DEBUG, db->soflow_db_so, nextentry, nextentry->soflow_debug, "Found entry v4");
break;
@ -746,7 +756,7 @@ soflow_db_add_entry(struct soflow_db *db, struct sockaddr *local, struct sockadd
entry->soflow_debug = SOFLOW_ENABLE_DEBUG(db->soflow_db_so, entry);
microuptime(&entry->soflow_timestamp);
if (inp->inp_vflag & INP_IPV6) {
if (entry->soflow_family == AF_INET6) {
hashkey_faddr = entry->soflow_faddr.addr6.s6_addr32[3];
hashkey_laddr = entry->soflow_laddr.addr6.s6_addr32[3];
} else {
@ -778,15 +788,15 @@ done:
return entry;
}
static boolean_t
soflow_udp_get_address_from_control(sa_family_t family, struct mbuf *control, uint8_t *__counted_by(*count) *address_ptr, int *count)
static sa_family_t
soflow_udp_get_address_from_control(struct mbuf *control, uint8_t *__counted_by(*count) *address_ptr, int *count)
{
struct cmsghdr *cm;
struct in6_pktinfo *pi6;
struct socket *so = NULL;
if (control == NULL || address_ptr == NULL) {
return false;
return AF_UNSPEC;
}
for (; control != NULL; control = control->m_next) {
@ -801,23 +811,21 @@ soflow_udp_get_address_from_control(sa_family_t family, struct mbuf *control, ui
switch (cm->cmsg_type) {
case IP_RECVDSTADDR:
if (family == AF_INET &&
cm->cmsg_level == IPPROTO_IP &&
if (cm->cmsg_level == IPPROTO_IP &&
cm->cmsg_len == CMSG_LEN(sizeof(struct in_addr))) {
*address_ptr = CMSG_DATA(cm);
*count = sizeof(struct in_addr);
return true;
return AF_INET;
}
break;
case IPV6_PKTINFO:
case IPV6_2292PKTINFO:
if (family == AF_INET6 &&
cm->cmsg_level == IPPROTO_IPV6 &&
if (cm->cmsg_level == IPPROTO_IPV6 &&
cm->cmsg_len == CMSG_LEN(sizeof(struct in6_pktinfo))) {
pi6 = (struct in6_pktinfo *)(void *)CMSG_DATA(cm);
*address_ptr = (uint8_t *)&pi6->ipi6_addr;
*count = sizeof(struct in6_addr);
return true;
return AF_INET6;
}
break;
default:
@ -825,7 +833,7 @@ soflow_udp_get_address_from_control(sa_family_t family, struct mbuf *control, ui
}
}
}
return false;
return AF_UNSPEC;
}
static boolean_t
@ -869,10 +877,10 @@ soflow_entry_update_local(struct soflow_db *db, struct soflow_hash_entry *entry,
if (local == NULL && control != NULL) {
int size = 0;
uint8_t * __counted_by(size) addr_ptr = NULL;
boolean_t result = soflow_udp_get_address_from_control(entry->soflow_family, control, &addr_ptr, &size);
sa_family_t family = soflow_udp_get_address_from_control(control, &addr_ptr, &size);
if (result && size && addr_ptr) {
switch (entry->soflow_family) {
if (family != AF_UNSPEC && size && addr_ptr) {
switch (family) {
case AF_INET:
if (size == sizeof(struct in_addr)) {
address_buf.sin.sin_port = 0;
@ -941,6 +949,7 @@ static bool
soflow_nstat_provider_request_vals(nstat_provider_context ctx,
u_int32_t *ifflagsp,
nstat_counts *countsp,
nstat_detailed_counts *detailsp,
void *metadatap)
{
struct soflow_hash_entry *hash_entry = (struct soflow_hash_entry *) ctx;
@ -977,6 +986,18 @@ soflow_nstat_provider_request_vals(nstat_provider_context ctx,
countsp->nstat_rxpackets, countsp->nstat_rxbytes, countsp->nstat_txpackets, countsp->nstat_txbytes);
}
if (detailsp) {
bzero(detailsp, sizeof(*detailsp));
detailsp->nstat_media_stats.ms_total.ts_rxbytes = hash_entry->soflow_rxbytes;
detailsp->nstat_media_stats.ms_total.ts_txbytes = hash_entry->soflow_txbytes;
detailsp->nstat_media_stats.ms_total.ts_rxpackets = hash_entry->soflow_rxpackets;
detailsp->nstat_media_stats.ms_total.ts_txpackets = hash_entry->soflow_txpackets;
SOFLOW_LOG(LOG_DEBUG, so, hash_entry->soflow_debug,
"Collected NSTAT detailed counts: rxpackets %llu rxbytes %llu txpackets %llu txbytes %llu",
detailsp->nstat_media_stats.ms_total.ts_rxpackets, detailsp->nstat_media_stats.ms_total.ts_rxbytes,
detailsp->nstat_media_stats.ms_total.ts_txpackets, detailsp->nstat_media_stats.ms_total.ts_txbytes);
}
if (metadatap) {
nstat_udp_descriptor *desc = (nstat_udp_descriptor *)metadatap;
bzero(desc, sizeof(*desc));

View file

@ -36,7 +36,7 @@
#include <IOKit/IOBSD.h>
extern uint32_t stackshot_estimate_adj;
EXPERIMENT_FACTOR_UINT(_kern, stackshot_estimate_adj, &stackshot_estimate_adj, 0, 100,
EXPERIMENT_FACTOR_LEGACY_UINT(_kern, stackshot_estimate_adj, &stackshot_estimate_adj, 0, 100,
"adjust stackshot estimates up by this percentage");
extern unsigned int stackshot_single_thread;
@ -646,7 +646,7 @@ stackshot_dirty_buffer_test(__unused int64_t in, int64_t *out)
kern_return_t kr;
// 8MB buffer
kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG);
kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG);
if (kr != KERN_SUCCESS) {
printf("stackshot_dirty_buffer_test: kmem_alloc returned %d\n", kr);
goto err;
@ -686,7 +686,7 @@ stackshot_kernel_initiator_test(int64_t in, int64_t *out)
uint64_t ss_flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_NO_IO_STATS | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY | STACKSHOT_THREAD_WAITINFO | STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL;
unsigned ss_bytes = 0;
if (in == 1) {
kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG);
kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG);
if (kr != KERN_SUCCESS) {
printf("stackshot_kernel_initiator_test: kmem_alloc returned %d\n", kr);
goto err;

View file

@ -522,7 +522,7 @@ oslog_init_firehose(void)
kmem_alloc(kernel_map, &kernel_firehose_addr, size + ptoa(2),
KMA_NOFAIL | KMA_PERMANENT | KMA_GUARD_FIRST | KMA_GUARD_LAST |
KMA_DATA | KMA_ZERO, VM_KERN_MEMORY_LOG);
KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_LOG);
kernel_firehose_addr += PAGE_SIZE;
/* register buffer with firehose */

View file

@ -653,7 +653,7 @@ log_stream_teardown(log_stream_t *ls)
kfree_data(ls->ls_buf, buf_size);
}
if (ls->ls_blk) {
kfree_type(uint8_t, ls->ls_blk_count, ls->ls_blk);
kfree_data(ls->ls_blk, ls->ls_blk_count);
}
bzero(ls, sizeof(*ls));
}

View file

@ -232,6 +232,7 @@ tprintf_impl(tpr_t tpr, const char *fmt, va_list ap)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wformat-nonliteral"
#pragma clang diagnostic ignored "-Wformat"
os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, ap, __builtin_return_address(0));
#pragma clang diagnostic pop
}
@ -421,13 +422,15 @@ printf_log_locked(bool addcr, const char *fmt, ...)
return retval;
}
extern bool IOSystemStateAOT(void);
bool
vprintf_log_locked(const char *fmt, va_list ap, bool driverkit)
{
struct putchar_args pca;
pca.flags = TOLOGLOCKED;
if (driverkit && enable_dklog_serial_output) {
if (driverkit && (enable_dklog_serial_output || IOSystemStateAOT())) {
pca.flags |= TOCONS;
}
pca.tty = NULL;

View file

@ -2264,7 +2264,7 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
/* Finish copying in the necessary args before taking the proc lock */
error = 0;
len = 0;
if (args->cmd == LEDGER_ENTRY_INFO) {
if (args->cmd == LEDGER_ENTRY_INFO || args->cmd == LEDGER_ENTRY_INFO_V2) {
error = copyin(args->arg3, (char *)&len, sizeof(len));
} else if (args->cmd == LEDGER_TEMPLATE_INFO) {
error = copyin(args->arg2, (char *)&len, sizeof(len));
@ -2327,17 +2327,20 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
break;
}
case LEDGER_ENTRY_INFO: {
case LEDGER_ENTRY_INFO:
case LEDGER_ENTRY_INFO_V2: {
bool v2 = (args->cmd == LEDGER_ENTRY_INFO_V2);
int entry_size = (v2) ? sizeof(struct ledger_entry_info_v2) : sizeof(struct ledger_entry_info);
void *buf;
int sz;
/* Settle ledger entries for memorystatus and pages grabbed */
task_ledger_settle(task);
rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
rval = ledger_get_task_entry_info_multiple(task, &buf, &len, v2);
proc_rele(proc);
if ((rval == 0) && (len >= 0)) {
sz = len * sizeof(struct ledger_entry_info);
sz = len * entry_size;
rval = copyout(buf, args->arg2, sz);
kfree_data(buf, sz);
}
@ -2804,6 +2807,8 @@ SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | C
#if __AMP__
errno_t mach_to_bsd_errno(kern_return_t mach_err);
extern char sysctl_get_bound_cluster_type(void);
static int
sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS
@ -2825,15 +2830,11 @@ sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS
goto out;
}
if (cluster_type != 'P' &&
cluster_type != 'p' &&
cluster_type != 'E' &&
cluster_type != 'e') {
return EINVAL;
kern_return_t kr = thread_soft_bind_cluster_type(current_thread(), cluster_type);
if (kr != KERN_SUCCESS) {
return mach_to_bsd_errno(kr);
}
thread_soft_bind_cluster_type(current_thread(), cluster_type);
out:
buff[0] = sysctl_get_bound_cluster_type();
@ -2844,7 +2845,7 @@ SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CT
0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", "");
extern char sysctl_get_task_cluster_type(void);
extern void sysctl_task_set_cluster_type(char cluster_type);
extern kern_return_t sysctl_task_set_cluster_type(char cluster_type);
static int
sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS
{
@ -2865,14 +2866,11 @@ sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS
goto out;
}
if (cluster_type != 'E' &&
cluster_type != 'e' &&
cluster_type != 'P' &&
cluster_type != 'p') {
return EINVAL;
kern_return_t kr = sysctl_task_set_cluster_type(cluster_type);
if (kr != KERN_SUCCESS) {
return mach_to_bsd_errno(kr);
}
sysctl_task_set_cluster_type(cluster_type);
out:
cluster_type = sysctl_get_task_cluster_type();
buff[0] = cluster_type;
@ -2934,9 +2932,15 @@ SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFL
#endif /* __AMP__ */
#if DEVELOPMENT || DEBUG
extern int timeouts_are_fatal;
EXPERIMENT_FACTOR_INT(timeouts_are_fatal, &timeouts_are_fatal, 0, 1,
"Do timeouts panic or emit telemetry (0: telemetry, 1: panic)");
#endif
#if SCHED_HYGIENE_DEBUG
SYSCTL_QUAD(_kern, OID_AUTO, interrupt_masked_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
SYSCTL_QUAD(_kern, OID_AUTO, interrupt_masked_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_LEGACY_EXPERIMENT,
&interrupt_masked_timeout,
"Interrupt masked duration after which a tracepoint is emitted or the device panics (in mach timebase units)");
@ -2944,7 +2948,7 @@ SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_debug_mode, CTLFLAG_RW | CTLFLAG_LO
&interrupt_masked_debug_mode, 0,
"Enable interrupt masked tracing or panic (0: off, 1: trace, 2: panic)");
SYSCTL_QUAD(_kern, OID_AUTO, sched_preemption_disable_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
SYSCTL_QUAD(_kern, OID_AUTO, sched_preemption_disable_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_LEGACY_EXPERIMENT,
&sched_preemption_disable_threshold_mt,
"Preemption disablement duration after which a tracepoint is emitted or the device panics (in mach timebase units)");

View file

@ -453,7 +453,7 @@ static __attribute__((unused)) void
soioctl_cassert(void)
{
/*
* This is equivalent to _CASSERT() and the compiler wouldn't
* This is equivalent to static_assert() and the compiler wouldn't
* generate any instructions, thus for compile time only.
*/
switch ((u_long)0) {

View file

@ -434,6 +434,8 @@ uaddr_findobj(user_addr_t uaddr, uint64_t *objectp, uint64_t *offsetp)
kern_return_t ret;
vm_page_info_basic_data_t info;
mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
ret = vm_map_page_info(current_map(), uaddr, VM_PAGE_INFO_BASIC, (vm_page_info_t)&info, &count);
if (ret != KERN_SUCCESS) {
return EINVAL;

View file

@ -257,7 +257,7 @@
162 AUE_NULL ALL { int nosys(void); } { old getdomainname }
163 AUE_NULL ALL { int nosys(void); } { old setdomainname }
164 AUE_NULL ALL { int nosys(void); }
164 AUE_FUNMOUNT ALL { int funmount(int fd, int flags); }
165 AUE_QUOTACTL ALL { int quotactl(const char *path, int cmd, int uid, caddr_t arg); }
166 AUE_NULL ALL { int nosys(void); } { old exportfs }
167 AUE_MOUNT ALL { int mount(char *type, char *path, int flags, caddr_t data); }

View file

@ -515,6 +515,10 @@
0x1400188 MACH_SCHED_AST_CHECK
0x140018C MACH_SCHED_PREEMPT_TIMER_ACTIVE
0x1400190 MACH_PROCESSOR_SHUTDOWN
0x1400194 MACH_SCHED_PSET_BITMASKS
0x1400198 MACH_SUSPEND_DRIVERKIT_USERSPACE
0x140019c MACH_SCHED_PREFERRED_PSET
0x14001a0 MACH_SCHED_ONCORE_PREEMPT
0x1500000 MACH_MSGID_INVALID
0x1600000 MTX_SLEEP
0x1600004 MTX_SLEEP_DEADLINE
@ -631,6 +635,142 @@
0x1b00020 VM_RECLAIM_SAMPLE
0x1b00028 VM_RECLAIM_RESIZE
0x1b0002c VM_RECLAIM_FLUSH
0x1b10804 VMLP_EVENT_API_FILL_PROCREGIONINFO
0x1b10808 VMLP_EVENT_API_FILL_PROCREGIONINFO_ONLYMAPPEDVNODES
0x1b1080c VMLP_EVENT_API_FIND_MAPPING_TO_SLIDE
0x1b10810 VMLP_EVENT_API_GET_VMMAP_ENTRIES
0x1b10814 VMLP_EVENT_API_GET_VMSUBMAP_ENTRIES
0x1b10818 VMLP_EVENT_API_KDP_LIGHTWEIGHT_FAULT
0x1b1081c VMLP_EVENT_API_KMEM_ALLOC_GUARD_INTERNAL
0x1b10820 VMLP_EVENT_API_KMEM_FREE_GUARD
0x1b10824 VMLP_EVENT_API_KMEM_GET_GOBJ_STATS
0x1b10828 VMLP_EVENT_API_KMEM_POPULATE_META_LOCKED
0x1b1082c VMLP_EVENT_API_KMEM_REALLOC_GUARD
0x1b10830 VMLP_EVENT_API_KMEM_SIZE_GUARD
0x1b10834 VMLP_EVENT_API_MACH_MAKE_MEMORY_ENTRY_SHARE
0x1b10838 VMLP_EVENT_API_MACH_VM_RANGE_CREATE_V1
0x1b1083c VMLP_EVENT_API_MOVE_PAGES_TO_QUEUE
0x1b10840 VMLP_EVENT_API_TASK_FIND_REGION_DETAILS
0x1b10844 VMLP_EVENT_API_TASK_INFO
0x1b10848 VMLP_EVENT_API_VM32_REGION_INFO
0x1b1084c VMLP_EVENT_API_VM32_REGION_INFO_64
0x1b10850 VMLP_EVENT_API_VM32__MAP_EXEC_LOCKDOWN
0x1b10854 VMLP_EVENT_API_VMTC_REVALIDATE_LOOKUP
0x1b10858 VMLP_EVENT_API_VM_FAULT_COPY
0x1b1085c VMLP_EVENT_API_VM_FAULT_INTERNAL
0x1b10860 VMLP_EVENT_API_VM_KERN_ALLOCATION_INFO
0x1b10864 VMLP_EVENT_API_VM_MAP_APPLE_PROTECTED
0x1b10868 VMLP_EVENT_API_VM_MAP_BEHAVIOR_SET
0x1b1086c VMLP_EVENT_API_VM_MAP_CAN_REUSE
0x1b10870 VMLP_EVENT_API_VM_MAP_CHECK_PROTECTION
0x1b10874 VMLP_EVENT_API_VM_MAP_COPYIN_INTERNAL
0x1b10878 VMLP_EVENT_API_VM_MAP_COPYOUT_INTERNAL
0x1b1087c VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE
0x1b10880 VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE_ALIGNED
0x1b10884 VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE_NESTED
0x1b10888 VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE_UNALIGNED
0x1b1088c VMLP_EVENT_API_VM_MAP_CREATE_UPL
0x1b10890 VMLP_EVENT_API_VM_MAP_CS_DEBUGGED_SET
0x1b10894 VMLP_EVENT_API_VM_MAP_CS_ENFORCEMENT_SET
0x1b10898 VMLP_EVENT_API_VM_MAP_DELETE
0x1b1089c VMLP_EVENT_API_VM_MAP_DELETE_SUBMAP_RECURSE
0x1b108a0 VMLP_EVENT_API_VM_MAP_DESTROY
0x1b108a4 VMLP_EVENT_API_VM_MAP_DISCONNECT_PAGE_MAPPINGS
0x1b108a8 VMLP_EVENT_API_VM_MAP_ENTER
0x1b108ac VMLP_EVENT_API_VM_MAP_ENTER_MEM_OBJECT
0x1b108b0 VMLP_EVENT_API_VM_MAP_ENTRY_HAS_DEVICE_PAGER
0x1b108b4 VMLP_EVENT_API_VM_MAP_EXEC_LOCKDOWN
0x1b108b8 VMLP_EVENT_API_VM_MAP_FIND_SPACE
0x1b108bc VMLP_EVENT_API_VM_MAP_FORK
0x1b108c0 VMLP_EVENT_API_VM_MAP_FORK_COPY
0x1b108c4 VMLP_EVENT_API_VM_MAP_FREEZE
0x1b108c8 VMLP_EVENT_API_VM_MAP_GET_PHYS_PAGE
0x1b108cc VMLP_EVENT_API_VM_MAP_INHERIT
0x1b108d0 VMLP_EVENT_API_VM_MAP_INJECT_ERROR
0x1b108d4 VMLP_EVENT_API_VM_MAP_IS_CORPSE_SOURCE
0x1b108d8 VMLP_EVENT_API_VM_MAP_LOOKUP_AND_LOCK_OBJECT
0x1b108dc VMLP_EVENT_API_VM_MAP_MACHINE_ATTRIBUTE
0x1b108e0 VMLP_EVENT_API_VM_MAP_MARK_ALIEN
0x1b108e4 VMLP_EVENT_API_VM_MAP_MSYNC
0x1b108e8 VMLP_EVENT_API_VM_MAP_NON_ALIGNED_TEST
0x1b108ec VMLP_EVENT_API_VM_MAP_OVERWRITE_SUBMAP_RECURSE
0x1b108f0 VMLP_EVENT_API_VM_MAP_PAGEOUT
0x1b108f4 VMLP_EVENT_API_VM_MAP_PAGE_RANGE_INFO_INTERNAL
0x1b108f8 VMLP_EVENT_API_VM_MAP_PARTIAL_REAP
0x1b108fc VMLP_EVENT_API_VM_MAP_PROTECT
0x1b10900 VMLP_EVENT_API_VM_MAP_PURGABLE_CONTROL
0x1b10904 VMLP_EVENT_API_VM_MAP_RAISE_MAX_OFFSET
0x1b10908 VMLP_EVENT_API_VM_MAP_RAISE_MIN_OFFSET
0x1b1090c VMLP_EVENT_API_VM_MAP_RANGE_CONFIGURE
0x1b10910 VMLP_EVENT_API_VM_MAP_REGION
0x1b10914 VMLP_EVENT_API_VM_MAP_REGION_RECURSE_64
0x1b10918 VMLP_EVENT_API_VM_MAP_REMAP
0x1b1091c VMLP_EVENT_API_VM_MAP_REMAP_EXTRACT
0x1b10920 VMLP_EVENT_API_VM_MAP_REMOVE_AND_UNLOCK
0x1b10924 VMLP_EVENT_API_VM_MAP_REMOVE_GUARD
0x1b10928 VMLP_EVENT_API_VM_MAP_REUSABLE_PAGES
0x1b1092c VMLP_EVENT_API_VM_MAP_REUSE_PAGES
0x1b10930 VMLP_EVENT_API_VM_MAP_SET_CACHE_ATTR
0x1b10934 VMLP_EVENT_API_VM_MAP_SET_CORPSE_SOURCE
0x1b10938 VMLP_EVENT_API_VM_MAP_SET_DATA_LIMIT
0x1b1093c VMLP_EVENT_API_VM_MAP_SET_MAX_ADDR
0x1b10940 VMLP_EVENT_API_VM_MAP_SET_SIZE_LIMIT
0x1b10944 VMLP_EVENT_API_VM_MAP_SET_TPRO_ENFORCEMENT
0x1b10948 VMLP_EVENT_API_VM_MAP_SET_TPRO_RANGE
0x1b1094c VMLP_EVENT_API_VM_MAP_SET_USER_WIRE_LIMIT
0x1b10950 VMLP_EVENT_API_VM_MAP_SHADOW_MAX
0x1b10954 VMLP_EVENT_API_VM_MAP_SIGN
0x1b10958 VMLP_EVENT_API_VM_MAP_SIMPLIFY
0x1b1095c VMLP_EVENT_API_VM_MAP_SINGLE_JIT
0x1b10960 VMLP_EVENT_API_VM_MAP_SIZES
0x1b10964 VMLP_EVENT_API_VM_MAP_SUBMAP_PMAP_CLEAN
0x1b10968 VMLP_EVENT_API_VM_MAP_SWITCH_PROTECT
0x1b1096c VMLP_EVENT_API_VM_MAP_TERMINATE
0x1b10970 VMLP_EVENT_API_VM_MAP_UNSET_CORPSE_SOURCE
0x1b10974 VMLP_EVENT_API_VM_MAP_UNWIRE_NESTED
0x1b10978 VMLP_EVENT_API_VM_MAP_WILLNEED
0x1b1097c VMLP_EVENT_API_VM_MAP_WIRE_NESTED
0x1b10980 VMLP_EVENT_API_VM_MAP_ZERO
0x1b10984 VMLP_EVENT_API_VM_PAGE_DIAGNOSE
0x1b10988 VMLP_EVENT_API_VM_SHARED_REGION_MAP_FILE
0x1b1098c VMLP_EVENT_API_VM_TOGGLE_ENTRY_REUSE
0x1b10990 VMLP_EVENT_API_ZONE_METADATA_INIT
0x1b10994 VMLP_EVENT_API_ZONE_SUBMAP_ALLOC_SEQUESTERED_VA
0x1b11004 VMLP_EVENT_LOCK_TRY_EXCL
0x1b11008 VMLP_EVENT_LOCK_FAIL_EXCL
0x1b1100c VMLP_EVENT_LOCK_REQ_EXCL
0x1b11010 VMLP_EVENT_LOCK_GOT_EXCL
0x1b11014 VMLP_EVENT_LOCK_UNLOCK_EXCL
0x1b11018 VMLP_EVENT_LOCK_DOWNGRADE
0x1b1101c VMLP_EVENT_LOCK_TRY_SH
0x1b11020 VMLP_EVENT_LOCK_FAIL_SH
0x1b11024 VMLP_EVENT_LOCK_REQ_SH
0x1b11028 VMLP_EVENT_LOCK_GOT_SH
0x1b1102c VMLP_EVENT_LOCK_UNLOCK_SH
0x1b11030 VMLP_EVENT_LOCK_TRY_UPGRADE
0x1b11034 VMLP_EVENT_LOCK_GOT_UPGRADE
0x1b11038 VMLP_EVENT_LOCK_FAIL_UPGRADE
0x1b1103c VMLP_EVENT_LOCK_SLEEP_BEGIN
0x1b11040 VMLP_EVENT_LOCK_SLEEP_END
0x1b11044 VMLP_EVENT_LOCK_YIELD_BEGIN
0x1b11048 VMLP_EVENT_LOCK_YIELD_END
0x1b11804 VMLP_EVENT_RANGE
0x1b20004 MEMINFO_PGCNT1
0x1b20008 MEMINFO_PGCNT2
0x1b2000c MEMINFO_PGCNT3
0x1b20010 MEMINFO_PGCNT4
0x1b20014 MEMINFO_PGCNT5
0x1b20018 MEMINFO_PGCNT6
0x1b2001c MEMINFO_PGCNT7
0x1b20020 MEMINFO_PGCNT8
0x1b20044 MEMINFO_PGOUT1
0x1b20048 MEMINFO_PGOUT2
0x1b2004c MEMINFO_PGOUT3
0x1b20050 MEMINFO_PGOUT4
0x1b20054 MEMINFO_PGOUT5
0x1b20058 MEMINFO_PGOUT6
0x1b20084 MEMINFO_DEMAND1
0x1b20088 MEMINFO_DEMAND2
0x2010000 L_IP_In_Beg
0x2010004 L_IP_Out_Beg
0x2010008 L_IP_In_End
@ -1199,7 +1339,15 @@
0x3130180 VFS_check_getattrlistbulk
0x3130184 VFS_check_copyfile
0x3130188 VFS_notify_unlink
0x313018C VFS_notify_rename_swap
0x3130190 VFS_check_rename_swap
0x3130194 VFS_check_dataprotect_set
0x3130198 VFS_mount_notify_mount
0x313019C VFS_mount_check_remount_with_flags
0x31301A0 VFS_graft_check_graft
0x31301A4 VFS_graft_check_ungraft
0x31301A8 VFS_graft_notify_graft
0x31301AC VFS_graft_notify_ungraft
0x3134000 VFS_io_compression_stats
0x3CF0000 CP_OFFSET_IO
0x4010004 proc_exit
@ -1246,6 +1394,48 @@
0x4030050 KEVENT_kqwl_unbind
0x4030054 KEVENT_knote_enable
0x4030058 KEVENT_knote_vanished
0x40d0004 AIO_work_queued
0x40d0008 AIO_worker_wake
0x40d000c AIO_completion_sig
0x40d0010 AIO_completion_kevent
0x40d0014 AIO_completion_cleanup_wait
0x40d0018 AIO_completion_cleanup_wake
0x40d001c AIO_completion_suspend_wake
0x40d0028 AIO_cancel
0x40d002c AIO_cancel_async_workq
0x40d0030 AIO_cancel_sync_workq
0x40d0034 AIO_cancel_activeq
0x40d0038 AIO_cancel_doneq
0x40d0050 AIO_fsync
0x40d0054 AIO_fsync_delay
0x40d0078 AIO_read
0x40d00a0 AIO_write
0x40d00c8 AIO_listio
0x40d00f0 AIO_error
0x40d00f4 AIO_error_val
0x40d00f8 AIO_error_activeq
0x40d00fc AIO_error_workq
0x40d0118 AIO_return
0x40d011c AIO_return_val
0x40d0120 AIO_return_activeq
0x40d0124 AIO_return_workq
0x40d0140 AIO_exec
0x40d0168 AIO_exit
0x40d016c AIO_exit_sleep
0x40d0190 AIO_close
0x40d0194 AIO_close_sleep
0x40d01b8 AIO_suspend
0x40d01bc AIO_suspend_sleep
0x40d01e0 AIO_worker_thread
0x40d0208 AIO_register_kevent
0x40d0230 AIO_WQ_process_entry
0x40d0234 AIO_WQ_aio_thread_create
0x40d0238 AIO_WQ_aio_thread_terminate
0x40d023c AIO_WQ_aio_death_call
0x40d0240 AIO_WQ_aio_thread_park
0x40d0244 AIO_WQ_aio_select_req
0x40d0248 AIO_WQ_aio_thread_create_failed
0x40d024c AIO_WQ_aio_thread_wakeup
0x40e0104 BSC_msync_extended_info
0x40e0264 BSC_pread_extended_info
0x40e0268 BSC_pwrite_extended_info
@ -1692,6 +1882,8 @@
0x01a9002c MACH_SCHED_EDGE_LOAD_AVG
0x01a90030 MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD
0x01a90034 MACH_SCHED_EDGE_RSRC_HEAVY_THREAD
0x01a90038 MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE
0x01a9003c MACH_SCHED_EDGE_STIR_THE_POT
0x01ab0000 WORKGROUP_INTERVAL_CREATE
0x01ab0004 WORKGROUP_INTERVAL_DESTROY
0x01ab0008 WORKGROUP_INTERVAL_CHANGE
@ -1990,6 +2182,8 @@
0x26120004 imp_msg_send
0x26120008 imp_msg_delv
0x26130000 imp_watchport
0x26150000 imp_thread_promote_above_task
0x26160000 imp_runaway_mitigation
0x26170000 imp_suppression_inactive
0x26170004 imp_suppression_active
0x26180000 imp_apptype_none
@ -2006,10 +2200,13 @@
0x261a0004 imp_usynch_remove_override
0x261b0000 imp_donor_update_live_donor
0x261b0004 imp_donor_init_donor_state
0x261c0000 imp_main_thread_qos
0x261d0000 imp_sync_ipc_qos_applied
0x261d0004 imp_sync_ipc_qos_removed
0x261d0008 imp_sync_ipc_qos_overflow
0x261d000c imp_sync_ipc_qos_underflow
0x261e0000 imp_set_gpu_role
0x261f0000 imp_query_gpu_role
0x26210010 imp_task_int_bg
0x26210014 imp_task_ext_bg
0x26210020 imp_thread_int_bg
@ -2047,6 +2244,8 @@
0x263d0028 imp_thread_qos_ipc_override
0x263e0028 imp_thread_qos_servicer_override
0x263f0028 imp_thread_iotier_kevent_override
0x26400028 imp_thread_iotier_kevent_override
0x26410018 imp_task_runaway_mitigation
0x27000000 PERF_PCEVENT
0x27001000 PERF_CPU_IDLE
0x27001100 PERF_CPU_IDLE_TIMER

View file

@ -56,8 +56,10 @@ static os_log_t tracker_db_log_handle = NULL;
static struct thread *g_tracker_gc_thread;
#define TRACKER_GC_RUN_INTERVAL_NSEC (10 * NSEC_PER_SEC) // GC wakes up periodically
#define TRACKER_GC_IDLE_TO (10) // age out entries when not used for a while
#define TRACKER_GC_EXTENDED_IDLE_TO (120) // extended timeout for entries that are used for policy evaluation
static int tracker_db_idle_timeout = TRACKER_GC_IDLE_TO;
static int tracker_db_extended_idle_timeout = TRACKER_GC_EXTENDED_IDLE_TO;
/*
* Sysctls for debug logs control
@ -70,6 +72,9 @@ SYSCTL_INT(_net_tracker, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
SYSCTL_INT(_net_tracker, OID_AUTO, idle_timeout, CTLFLAG_RW | CTLFLAG_LOCKED,
&tracker_db_idle_timeout, 0, "");
SYSCTL_INT(_net_tracker, OID_AUTO, extended_idle_timeout, CTLFLAG_RW | CTLFLAG_LOCKED,
&tracker_db_extended_idle_timeout, 0, "");
#define TRACKER_LOG(level, fmt, ...) \
do { \
if (tracker_log_level >= level && tracker_db_log_handle) { \
@ -317,6 +322,12 @@ copy_metadata(tracker_metadata_t *dst_metadata, tracker_metadata_t *src_metadata
dst_domain_owner_buffer[0] = 0;
}
if (dst_metadata->flags & SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT) {
// If the client says this needs to extend the timeout, save that.
// This flag is passed in by the caller, and updates the saved metadata.
src_metadata->flags |= SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
}
is_short = (dst_metadata->flags & SO_TRACKER_ATTRIBUTE_FLAGS_DOMAIN_SHORT);
dst_metadata->flags = src_metadata->flags;
if (is_short) {
@ -456,6 +467,7 @@ tracker_search_and_insert(struct tracker_db *db, struct tracker_hash_entry *matc
if (insert) {
if (copy_metadata(&nextentry->metadata, &matchentry->metadata) == true) {
TRACKER_ENTRY_LOG(LOG_DEBUG, "Updated entry", nextentry, hash_element);
nextentry->lastused = net_uptime();
return nextentry;
} else {
// Failed to update found entry, delete it from db and allow insertion of new entry.
@ -1069,7 +1081,11 @@ tracker_entry_expire(void *v, wait_result_t w)
hash = &g_tracker_db.tracker_hashbase[i];
LIST_FOREACH_SAFE(entry, hash, entry_link, temp_entry) {
if (tracker_idle_timed_out(entry, tracker_db_idle_timeout, current_time)) {
int timeout_value = tracker_db_idle_timeout;
if (entry->metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT) {
timeout_value = tracker_db_extended_idle_timeout;
}
if (tracker_idle_timed_out(entry, timeout_value, current_time)) {
TRACKER_ENTRY_LOG(LOG_DEBUG, "Deleting entry - IDLE TO", entry, i);
g_tracker_db.tracker_count--;
if (entry->metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_DOMAIN_SHORT) {

View file

@ -222,11 +222,16 @@ static struct ptmx_ioctl *
pty_get_ioctl(dev_t dev, int open_flag, struct tty_dev_t **out_driver)
{
struct tty_dev_t *driver = pty_get_driver(dev);
struct ptmx_ioctl *out = NULL;
if (out_driver) {
*out_driver = driver;
}
if (driver && driver->open) {
return driver->open(minor(dev), open_flag);
out = driver->open(minor(dev), open_flag);
if (!out) {
printf("pty_get_ioctl: driver->open returned NULL\n");
}
return out;
}
return NULL;
}

View file

@ -286,6 +286,7 @@ ptmx_get_ioctl(int minor, int open_flag)
*/
if ((_state.pis_total - _state.pis_free) >= ptmx_max) {
DEVFS_UNLOCK();
printf("ptmx_get_ioctl failed due to ptmx_max limit %d\n", ptmx_max);
return NULL;
}
DEVFS_UNLOCK();
@ -314,6 +315,7 @@ ptmx_get_ioctl(int minor, int open_flag)
ttyfree(new_ptmx_ioctl->pt_tty);
DEVFS_UNLOCK();
kfree_type(struct ptmx_ioctl, new_ptmx_ioctl);
printf("ptmx_get_ioctl failed due to ptmx_max limit %d\n", ptmx_max);
return NULL;
}
@ -348,6 +350,7 @@ ptmx_get_ioctl(int minor, int open_flag)
ttyfree(new_ptmx_ioctl->pt_tty);
DEVFS_UNLOCK();
kfree_type(struct ptmx_ioctl, new_ptmx_ioctl);
printf("ptmx_get_ioctl failed because minor number %d was out of range\n", minor);
return NULL;
}
@ -357,6 +360,7 @@ ptmx_get_ioctl(int minor, int open_flag)
kfree_type(struct ptmx_ioctl, new_ptmx_ioctl);
/* Special error value so we know to redrive the open, we've been raced */
/* XXX Can this still occur? */
return (struct ptmx_ioctl*)-1;
}

View file

@ -117,6 +117,7 @@ static struct ptmx_ioctl *
pty_get_ioctl(int minor, int open_flag)
{
if (minor >= NPTY) {
printf("pty_get_ioctl failed because minor number %d exceeded %d\n", minor, NPTY);
return NULL;
}
struct ptmx_ioctl *pti = &pt_ioctl[minor];

View file

@ -534,9 +534,6 @@ cs_validate_csblob(
uint32_t n, count;
const CS_CodeDirectory *best_cd = NULL;
unsigned int best_rank = 0;
#if XNU_PLATFORM_WatchOS
const CS_CodeDirectory *sha1_cd = NULL;
#endif
if (length < sizeof(CS_SuperBlob)) {
return EBADEXEC;
@ -592,15 +589,6 @@ cs_validate_csblob(
printf("multiple hash=%d CodeDirectories in signature; rejecting\n", best_cd->hashType);
return EBADEXEC;
}
#if XNU_PLATFORM_WatchOS
if (candidate->hashType == CS_HASHTYPE_SHA1) {
if (sha1_cd != NULL) {
printf("multiple sha1 CodeDirectories in signature; rejecting\n");
return EBADEXEC;
}
sha1_cd = candidate;
}
#endif
} else if (type == CSSLOT_ENTITLEMENTS) {
if (ntohl(subBlob->magic) != CSMAGIC_EMBEDDED_ENTITLEMENTS) {
return EBADEXEC;
@ -657,37 +645,6 @@ cs_validate_csblob(
library_constraint = subBlob;
}
}
#if XNU_PLATFORM_WatchOS
/* To keep watchOS fast enough, we have to resort to sha1 for
* some code.
*
* At the time of writing this comment, known sha1 attacks are
* collision attacks (not preimage or second preimage
* attacks), which do not apply to platform binaries since
* they have a fixed hash in the trust cache. Given this
* property, we only prefer sha1 code directories for adhoc
* signatures, which always have to be in a trust cache to be
* valid (can-load-cdhash does not exist for watchOS). Those
* are, incidentally, also the platform binaries, for which we
* care about the performance hit that sha256 would bring us.
*
* Platform binaries may still contain a (not chosen) sha256
* code directory, which keeps software updates that switch to
* sha256-only small.
*/
if (*rcd != NULL && sha1_cd != NULL && (ntohl(sha1_cd->flags) & CS_ADHOC)) {
if (sha1_cd->flags != (*rcd)->flags) {
printf("mismatched flags between hash %d (flags: %#x) and sha1 (flags: %#x) cd.\n",
(int)(*rcd)->hashType, (*rcd)->flags, sha1_cd->flags);
*rcd = NULL;
return EBADEXEC;
}
*rcd = sha1_cd;
}
#endif
} else if (ntohl(blob->magic) == CSMAGIC_CODEDIRECTORY) {
if ((error = cs_validate_codedirectory((const CS_CodeDirectory *)(const void *)addr, length)) != 0) {
return error;
@ -6158,7 +6115,8 @@ int
ubc_cs_getcdhash(
vnode_t vp,
off_t offset,
unsigned char *cdhash)
unsigned char *cdhash,
uint8_t *type)
{
struct cs_blob *blobs, *blob;
off_t rel_offset;
@ -6183,8 +6141,14 @@ ubc_cs_getcdhash(
/* we didn't find a blob covering "offset" */
ret = EBADEXEC; /* XXX any better error ? */
} else {
/* get the SHA1 hash of that blob */
/* get the CDHash of that blob */
bcopy(blob->csb_cdhash, cdhash, sizeof(blob->csb_cdhash));
/* get the type of the CDHash */
if (type != NULL) {
*type = blob->csb_cd->hashType;
}
ret = 0;
}

View file

@ -75,6 +75,8 @@
#include <sys/syslog.h>
#include <sys/queue.h>
#include <kern/uipc_domain.h>
#include <net/dlil.h>
#include <net/nwk_wq.h>
#include <net/sockaddr_utils.h>
@ -535,7 +537,6 @@ net_add_proto_old(struct protosw_old *opp, struct domain_old *odp)
pp->pr_usrreqs = pru;
pp->pr_init = pr_init_old;
pp->pr_drain = opp->pr_drain;
pp->pr_sysctl = opp->pr_sysctl;
pp->pr_lock = opp->pr_lock;
pp->pr_unlock = opp->pr_unlock;
pp->pr_getlock = opp->pr_getlock;
@ -1024,41 +1025,6 @@ net_uptime2timeval(struct timeval *tv)
tv->tv_sec = (time_t)net_uptime();
}
/*
* An alternative way to obtain the coarse-grained uptime (in seconds)
* for networking code which do not require high-precision timestamp,
* as this is significantly cheaper than microuptime().
*/
uint64_t
net_uptime(void)
{
if (_net_uptime == 0) {
net_update_uptime();
}
return _net_uptime;
}
uint64_t
net_uptime_ms(void)
{
if (_net_uptime_ms == 0) {
net_update_uptime();
}
return _net_uptime_ms;
}
uint64_t
net_uptime_us(void)
{
if (_net_uptime_us == 0) {
net_update_uptime();
}
return _net_uptime_us;
}
void
domain_proto_mtx_lock_assert_held(void)
{

106
bsd/kern/uipc_domain.h Normal file
View file

@ -0,0 +1,106 @@
/*
* Copyright (c) 2024 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#ifndef _KERN_UIPC_DOMAIN_H
#define _KERN_UIPC_DOMAIN_H
#ifdef XNU_KERNEL_PRIVATE
#include <kern/mem_acct.h>
#include <kern/uipc_socket.h>
#include <sys/protosw.h>
static inline void
proto_memacct_add(struct protosw *proto, unsigned int size)
{
if (proto->pr_mem_acct) {
mem_acct_add(proto->pr_mem_acct, size);
} else {
socket_memacct_add(size);
}
}
static inline void
proto_memacct_sub(struct protosw *proto, unsigned int size)
{
if (proto->pr_mem_acct) {
mem_acct_sub(proto->pr_mem_acct, size);
} else {
socket_memacct_sub(size);
}
}
static inline bool
proto_memacct_hardlimit(const struct protosw *proto)
{
if (proto->pr_mem_acct) {
return mem_acct_limited(proto->pr_mem_acct) == MEMACCT_HARDLIMIT;
} else {
return socket_memacct_hardlimit();
}
}
static inline bool
proto_memacct_limited(const struct protosw *proto)
{
if (proto->pr_mem_acct) {
return mem_acct_limited(proto->pr_mem_acct) != 0;
} else {
return socket_memacct_limited();
}
}
extern uint64_t _net_uptime;
extern uint64_t _net_uptime_ms;
extern uint64_t _net_uptime_us;
extern void net_update_uptime(void);
extern void net_update_uptime_with_time(const struct timeval *);
/*
* ToDo - we could even replace all callers of net_uptime* by a direct access
* to _net_uptime*
*/
static inline uint64_t
net_uptime(void)
{
return _net_uptime;
}
static inline uint64_t
net_uptime_ms(void)
{
return _net_uptime_ms;
}
static inline uint64_t
net_uptime_us(void)
{
return _net_uptime_us;
}
extern void net_uptime2timeval(struct timeval *);
#endif /* XNU_KERNEL_PRIVATE */
#endif /*_KERN_UIPC_DOMAIN_H */
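A hedged sketch of how the per-protocol memory accounting helpers above are meant to be combined (the helper names and the kalloc_data sizing are illustrative, not part of this header): check the hard limit before taking on new work, charge the account on a successful allocation, and release the charge when freeing.
#include <kern/kalloc.h>
#include <kern/uipc_domain.h>
static void *
my_proto_alloc_buf(struct protosw *pr, unsigned int size)
{
	if (proto_memacct_hardlimit(pr)) {
		return NULL;                    /* refuse new allocations at the hard limit */
	}
	void *buf = kalloc_data(size, Z_WAITOK | Z_ZERO);
	if (buf != NULL) {
		proto_memacct_add(pr, size);    /* charge this protocol, or the socket-wide account */
	}
	return buf;
}
static void
my_proto_free_buf(struct protosw *pr, void *buf, unsigned int size)
{
	kfree_data(buf, size);
	proto_memacct_sub(pr, size);            /* release the charge */
}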

File diff suppressed because it is too large

View file

@ -453,25 +453,6 @@ m_tag_verify_cookie(struct m_tag *tag)
#endif /* defined(HAS_APPLE_PAC) */
struct m_tag *
m_tag_create(uint32_t id, uint16_t type, int len, int wait, struct mbuf *buf)
{
#ifdef MB_TAG_MBUF
/*
* Create and return an m_tag, either by re-using space in a previous tag
* or by allocating a new mbuf/cluster
*/
return m_tag_create_mbuf(id, type, (uint16_t)len, wait, buf);
#else /* MB_TAG_MBUF */
#pragma unused(buf)
/*
* Each packet tag has its own allocation
*/
return m_tag_alloc(id, type, (uint16_t)len, wait);
#endif /* MB_TAG_MBUF */
}
#ifdef MB_TAG_MBUF
/* Get a packet tag structure along with specified data following. */
static struct m_tag *
@ -517,7 +498,121 @@ m_tag_alloc_mbuf(u_int32_t id, u_int16_t type, uint16_t len, int wait)
}
return t;
}
#endif /* MB_TAG_MBUF */
static struct m_tag_type_entry *
get_m_tag_type_entry(uint32_t id, uint16_t type, struct m_tag_type_stats **pmtts)
{
m_tag_type_entry_ref_t mtte = &m_tag_type_table[KERNEL_TAG_TYPE_NONE];
if (pmtts != NULL) {
*pmtts = &m_tag_type_stats[KERNEL_TAG_TYPE_NONE];
}
if (id == KERNEL_MODULE_TAG_ID) {
switch (type) {
case KERNEL_TAG_TYPE_DUMMYNET:
case KERNEL_TAG_TYPE_IPFILT:
case KERNEL_TAG_TYPE_ENCAP:
case KERNEL_TAG_TYPE_INET6:
case KERNEL_TAG_TYPE_IPSEC:
case KERNEL_TAG_TYPE_CFIL_UDP:
case KERNEL_TAG_TYPE_PF_REASS:
case KERNEL_TAG_TYPE_AQM:
case KERNEL_TAG_TYPE_DRVAUX:
mtte = &m_tag_type_table[type];
if (pmtts != NULL) {
*pmtts = &m_tag_type_stats[type];
}
break;
default:
#if DEBUG || DEVELOPMENT
if (type > 0 && type < KERNEL_TAG_TYPE_COUNT) {
panic("get_m_tag_type_entry unexpected m_tag type %u",
type);
}
#endif /* DEBUG || DEVELOPMENT */
break;
}
}
return mtte;
}
#ifndef MB_TAG_MBUF
static struct m_tag *
m_tag_kalloc(uint32_t id, uint16_t type, uint16_t len, int wait, struct m_tag_type_entry *mtte)
{
struct m_tag *tag = NULL;
tag = mtte->mt_alloc_func(id, type, len, wait);
if (__probable(tag != NULL)) {
VERIFY(IS_P2ALIGNED(tag, sizeof(uint64_t)));
if (__improbable(tag->m_tag_data == NULL)) {
VERIFY(len == 0);
} else {
VERIFY(len != 0);
VERIFY(IS_P2ALIGNED(tag->m_tag_data, sizeof(uint64_t)));
}
}
return tag;
}
static void
m_tag_kfree(struct m_tag *tag, struct m_tag_type_entry *mtte)
{
mtte->mt_free_func(tag);
}
#endif /* MB_TAG_MBUF */
static struct m_tag *
m_tag_alloc(uint32_t id, uint16_t type, int len, int wait)
{
struct m_tag *tag = NULL;
m_tag_type_entry_ref_t mtte = NULL;
m_tag_type_stats_ref_t mtts = NULL;
mtte = get_m_tag_type_entry(id, type, &mtts);
if (__improbable(len < 0 || len >= MCLBYTES - sizeof(struct m_tag))) {
goto done;
}
#ifdef MB_TAG_MBUF
tag = m_tag_alloc_mbuf(id, type, (uint16_t)len, wait);
#else /* MB_TAG_MBUF */
/*
* Using Z_NOWAIT could cause retransmission delays when there aren't
* many other colocated types in the zone that would prime it. Use
* Z_NOPAGEWAIT instead which will only fail to allocate when zalloc
* needs to block on the VM for pages.
*/
if (wait & Z_NOWAIT) {
wait &= ~Z_NOWAIT;
wait |= Z_NOPAGEWAIT;
}
tag = m_tag_kalloc(id, type, (uint16_t)len, wait, mtte);
#endif /* MB_TAG_MBUF */
done:
if (__probable(tag != NULL)) {
m_tag_verify_cookie(tag);
assert3u(tag->m_tag_id, ==, id);
assert3u(tag->m_tag_type, ==, type);
assert3u(tag->m_tag_len, ==, len);
os_atomic_inc(&mtts->mt_alloc_count, relaxed);
} else {
os_atomic_inc(&mtts->mt_alloc_failed, relaxed);
}
return tag;
}
#ifdef MB_TAG_MBUF
static struct m_tag *
m_tag_create_mbuf(uint32_t id, uint16_t type, uint16_t len, int wait, struct mbuf *buf)
{
@ -610,6 +705,24 @@ m_tag_free_mbuf(struct m_tag *t)
}
#endif /* MB_TAG_MBUF */
struct m_tag *
m_tag_create(uint32_t id, uint16_t type, int len, int wait, struct mbuf *buf)
{
#ifdef MB_TAG_MBUF
/*
* Create and return an m_tag, either by re-using space in a previous tag
* or by allocating a new mbuf/cluster
*/
return m_tag_create_mbuf(id, type, (uint16_t)len, wait, buf);
#else /* MB_TAG_MBUF */
#pragma unused(buf)
/*
* Each packet tag has its own allocation
*/
return m_tag_alloc(id, type, (uint16_t)len, wait);
#endif /* MB_TAG_MBUF */
}
/*
* Allocations for external data are known to not have pointers for
* most platforms -- for macOS this is not guaranteed
@ -684,117 +797,6 @@ m_tag_kfree_external(struct m_tag *tag)
kfree_type(struct m_tag, tag);
}
static struct m_tag_type_entry *
get_m_tag_type_entry(uint32_t id, uint16_t type, struct m_tag_type_stats **pmtts)
{
m_tag_type_entry_ref_t mtte = &m_tag_type_table[KERNEL_TAG_TYPE_NONE];
if (pmtts != NULL) {
*pmtts = &m_tag_type_stats[KERNEL_TAG_TYPE_NONE];
}
if (id == KERNEL_MODULE_TAG_ID) {
switch (type) {
case KERNEL_TAG_TYPE_DUMMYNET:
case KERNEL_TAG_TYPE_IPFILT:
case KERNEL_TAG_TYPE_ENCAP:
case KERNEL_TAG_TYPE_INET6:
case KERNEL_TAG_TYPE_IPSEC:
case KERNEL_TAG_TYPE_CFIL_UDP:
case KERNEL_TAG_TYPE_PF_REASS:
case KERNEL_TAG_TYPE_AQM:
case KERNEL_TAG_TYPE_DRVAUX:
mtte = &m_tag_type_table[type];
if (pmtts != NULL) {
*pmtts = &m_tag_type_stats[type];
}
break;
default:
#if DEBUG || DEVELOPMENT
if (type > 0 && type < KERNEL_TAG_TYPE_COUNT) {
panic("get_m_tag_type_entry unexpected m_tag type %u",
type);
}
#endif /* DEBUG || DEVELOPMENT */
break;
}
}
return mtte;
}
#ifndef MB_TAG_MBUF
static struct m_tag *
m_tag_kalloc(uint32_t id, uint16_t type, uint16_t len, int wait, struct m_tag_type_entry *mtte)
{
struct m_tag *tag = NULL;
tag = mtte->mt_alloc_func(id, type, len, wait);
if (__probable(tag != NULL)) {
VERIFY(IS_P2ALIGNED(tag, sizeof(uint64_t)));
if (__improbable(tag->m_tag_data == NULL)) {
VERIFY(len == 0);
} else {
VERIFY(len != 0);
VERIFY(IS_P2ALIGNED(tag->m_tag_data, sizeof(uint64_t)));
}
}
return tag;
}
static void
m_tag_kfree(struct m_tag *tag, struct m_tag_type_entry *mtte)
{
mtte->mt_free_func(tag);
}
#endif /* MB_TAG_MBUF */
struct m_tag *
m_tag_alloc(uint32_t id, uint16_t type, int len, int wait)
{
struct m_tag *tag = NULL;
m_tag_type_entry_ref_t mtte = NULL;
m_tag_type_stats_ref_t mtts = NULL;
mtte = get_m_tag_type_entry(id, type, &mtts);
if (__improbable(len < 0 || len >= MCLBYTES - sizeof(struct m_tag))) {
goto done;
}
#ifdef MB_TAG_MBUF
tag = m_tag_alloc_mbuf(id, type, (uint16_t)len, wait);
#else /* MB_TAG_MBUF */
/*
* Using Z_NOWAIT could cause retransmission delays when there aren't
* many other colocated types in the zone that would prime it. Use
* Z_NOPAGEWAIT instead which will only fail to allocate when zalloc
* needs to block on the VM for pages.
*/
if (wait & Z_NOWAIT) {
wait &= ~Z_NOWAIT;
wait |= Z_NOPAGEWAIT;
}
tag = m_tag_kalloc(id, type, (uint16_t)len, wait, mtte);
#endif /* MB_TAG_MBUF */
done:
if (__probable(tag != NULL)) {
m_tag_verify_cookie(tag);
assert3u(tag->m_tag_id, ==, id);
assert3u(tag->m_tag_type, ==, type);
assert3u(tag->m_tag_len, ==, len);
os_atomic_inc(&mtts->mt_alloc_count, relaxed);
} else {
os_atomic_inc(&mtts->mt_alloc_failed, relaxed);
}
return tag;
}
/* Free a packet tag. */
void
m_tag_free(struct m_tag *tag)
@ -1262,6 +1264,22 @@ m_sum16(struct mbuf *m, uint32_t off, uint32_t len)
return (uint16_t)os_cpu_in_cksum_mbuf(m, len, off, 0);
}
/*
* Write packet tx_time to the mbuf's meta data.
*/
void
mbuf_set_tx_time(struct mbuf *m, uint64_t tx_time)
{
struct m_tag *tag = NULL;
tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
sizeof(uint64_t), M_WAITOK, m);
if (tag != NULL) {
m_tag_prepend(m, tag);
*(uint64_t *)tag->m_tag_data = tx_time;
}
}
static int
sysctl_mb_tag_stats(__unused struct sysctl_oid *oidp,
__unused void *arg1, __unused int arg2, struct sysctl_req *req)

6207
bsd/kern/uipc_mbuf_mcache.c Normal file

File diff suppressed because it is too large

View file

@ -111,14 +111,17 @@
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/assert.h>
#include <kern/locks.h>
#include <kern/mem_acct.h>
#include <kern/policy_internal.h>
#include <kern/uipc_domain.h>
#include <kern/uipc_socket.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>
#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
@ -147,19 +150,8 @@
/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);
static u_int32_t so_cache_hw; /* High water mark for socache */
static u_int32_t so_cache_timeouts; /* number of timeouts */
static u_int32_t so_cache_max_freed; /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static uint64_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;
ZONE_DECLARE(so_cache_zone, struct zone *);
static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
struct mem_acct *socket_memacct;
#include <machine/limits.h>
@ -245,8 +237,6 @@ SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt; /* generation count for sockets */
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
@ -257,8 +247,6 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
#define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
@ -272,29 +260,6 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
/*
* Set to enable jumbo clusters (if available) for large writes when
* the socket is marked with SOF_MULTIPAGES; see below.
*/
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
/*
* Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
* writes on the socket for all protocols on any network interfaces,
* depending upon sosendjcl above. Be extra careful when setting this
* to 1, because sending packets that cross physical pages down to
* broken drivers (those that falsely assume that the physical pages
* are contiguous) might lead to system panics or silent data corruption.
* When set to 0, the system will respect SOF_MULTIPAGES, which is set
* only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
* capable. Set this to 1 only for testing/debugging purposes.
*/
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
/*
* Set this to ignore SOF1_IF_2KCL and use big clusters for large
* writes on the socket for all protocols on any network interfaces.
@ -342,16 +307,8 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
extern struct inpcbinfo tcbinfo;
/* TODO: these should be in a header file somewhere */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);
vm_size_t so_cache_zone_element_size;
static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);
/*
* Maximum of extended background idle sockets per process
@ -395,23 +352,23 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
void
socketinit(void)
{
_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
static_assert(sizeof(so_gencnt) == sizeof(uint64_t));
VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
#ifdef __LP64__
_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
static_assert(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
static_assert(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif
if (socketinit_done) {
@ -426,92 +383,16 @@ socketinit(void)
PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
sizeof(sosend_assert_panic));
STAILQ_INIT(&so_cache_head);
so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
+ get_inpcb_str_size() + 4 + get_tcp_str_size());
so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);
bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
in_pcbinit();
}
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
caddr_t temp;
uintptr_t offset;
lck_mtx_lock(&so_cache_mtx);
if (!STAILQ_EMPTY(&so_cache_head)) {
VERIFY(cached_sock_count > 0);
*so = STAILQ_FIRST(&so_cache_head);
STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
STAILQ_NEXT((*so), so_cache_ent) = NULL;
cached_sock_count--;
lck_mtx_unlock(&so_cache_mtx);
temp = (*so)->so_saved_pcb;
bzero(*so, sizeof(struct socket));
(*so)->so_saved_pcb = temp;
} else {
lck_mtx_unlock(&so_cache_mtx);
uint8_t *so_mem = zalloc_flags_buf(so_cache_zone, how | Z_ZERO);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-align"
*so = (struct socket *)so_mem;
/*
* Define offsets for extra structures into our
* single block of memory. Align extra structures
* on longword boundaries.
*/
offset = (uintptr_t)so_mem;
offset += sizeof(struct socket);
offset = ALIGN(offset);
struct inpcb *pcb = (struct inpcb *)(so_mem + (offset - (uintptr_t)so_mem));
#pragma clang diagnostic pop
(*so)->so_saved_pcb = (caddr_t)pcb;
offset += get_inpcb_str_size();
offset = ALIGN(offset);
pcb->inp_saved_ppcb = (caddr_t)(so_mem + (offset - (uintptr_t)so_mem));
}
OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
static void
cached_sock_free(struct socket *so)
{
lck_mtx_lock(&so_cache_mtx);
so_cache_time = net_uptime();
if (++cached_sock_count > max_cached_sock_count) {
--cached_sock_count;
lck_mtx_unlock(&so_cache_mtx);
zfree(so_cache_zone, so);
} else {
if (so_cache_hw < cached_sock_count) {
so_cache_hw = cached_sock_count;
}
STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
so->cache_timestamp = so_cache_time;
lck_mtx_unlock(&so_cache_mtx);
socket_memacct = mem_acct_register("SOCKET", 0, 0);
if (socket_memacct == NULL) {
panic("mem_acct_register returned NULL");
}
}
@ -561,63 +442,19 @@ so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
}
#endif /* NECP */
boolean_t
so_cache_timer(void)
{
struct socket *p;
int n_freed = 0;
boolean_t rc = FALSE;
lck_mtx_lock(&so_cache_mtx);
so_cache_timeouts++;
so_cache_time = net_uptime();
while (!STAILQ_EMPTY(&so_cache_head)) {
VERIFY(cached_sock_count > 0);
p = STAILQ_FIRST(&so_cache_head);
if ((so_cache_time - p->cache_timestamp) <
SO_CACHE_TIME_LIMIT) {
break;
}
STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
--cached_sock_count;
zfree(so_cache_zone, p);
if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
so_cache_max_freed++;
break;
}
}
/* Schedule again if there is more to cleanup */
if (!STAILQ_EMPTY(&so_cache_head)) {
rc = TRUE;
}
lck_mtx_unlock(&so_cache_mtx);
return rc;
}
/*
* Get a socket structure from our zone, and initialize it.
* We don't implement `waitok' yet (see comments in uipc_domain.c).
*
* Note that it would probably be better to allocate socket
* and PCB at the same time, but I'm not convinced that all
* the protocols can be easily modified to do this.
*/
struct socket *
soalloc(int waitok, int dom, int type)
soalloc(void)
{
zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
struct socket *__single so;
if ((dom == PF_INET) && (type == SOCK_STREAM)) {
cached_sock_alloc(&so, how);
} else {
so = zalloc_flags(socket_zone, how | Z_ZERO);
}
so = zalloc_flags(socket_zone, Z_WAITOK_ZERO);
if (so != NULL) {
so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
@ -662,7 +499,10 @@ socreate_internal(int dom, struct socket **aso, int type, int proto,
if (prp->pr_type != type) {
return EPROTOTYPE;
}
so = soalloc(1, dom, type);
if (proto_memacct_hardlimit(prp)) {
return ENOBUFS;
}
so = soalloc();
if (so == NULL) {
return ENOBUFS;
}
@ -754,6 +594,8 @@ socreate_internal(int dom, struct socket **aso, int type, int proto,
so->next_lock_lr = 0;
so->next_unlock_lr = 0;
proto_memacct_add(so->so_proto, sizeof(struct socket));
/*
* Attachment will create the per pcb lock if necessary and
* increase refcount for creation, make sure it's done before
@ -952,6 +794,8 @@ out:
void
sodealloc(struct socket *so)
{
proto_memacct_sub(so->so_proto, sizeof(struct socket));
kauth_cred_unref(&so->so_cred);
/* Remove any filters */
@ -959,11 +803,7 @@ sodealloc(struct socket *so)
so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
cached_sock_free(so);
} else {
zfree(socket_zone, so);
}
zfree(socket_zone, so);
}
/*
@ -1695,6 +1535,9 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
* This allows user to disconnect by connecting to, e.g.,
* a null address.
*/
#if NECP
bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
(error = sodisconnectlocked(so)))) {
@ -1712,6 +1555,9 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
}
#if NECP
set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
necp_set_socket_domain_attributes(so,
__unsafe_null_terminated_from_indexable(metadata.domain),
__unsafe_null_terminated_from_indexable(metadata.domain_owner));
@ -1721,6 +1567,12 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
#if NECP
/* Update NECP evaluation after setting any domain via the tracker checks */
so_update_necp_policy(so, NULL, nam);
if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
// Mark extended timeout on tracker lookup to ensure that the entry stays around
tracker_metadata_t update_metadata = { };
update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &update_metadata);
}
#endif /* NECP */
/*
@ -1817,6 +1669,9 @@ soconnectxlocked(struct socket *so, struct sockaddr *src,
* try to disconnect first. This allows user to disconnect
* by connecting to, e.g., a null address.
*/
#if NECP
bool set_domain_from_tracker_lookup = false;
#endif /* NECP */
if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
!(so->so_proto->pr_flags & PR_MULTICONN) &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
@ -1836,6 +1691,9 @@ soconnectxlocked(struct socket *so, struct sockaddr *src,
if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
}
#if NECP
set_domain_from_tracker_lookup = (metadata.domain[0] != 0);
#endif /* NECP */
necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain),
__unsafe_null_terminated_from_indexable(metadata.domain_owner));
}
@ -1895,6 +1753,15 @@ soconnectxlocked(struct socket *so, struct sockaddr *src,
so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
}
}
#if NECP
if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) {
// Mark extended timeout on tracker lookup to ensure that the entry stays around
tracker_metadata_t update_metadata = { };
update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT;
(void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &update_metadata);
}
#endif /* NECP */
}
}
@ -1987,9 +1854,10 @@ int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
int32_t clen, int32_t atomic, int flags, int *sblocked)
{
int error = 0;
int assumelock = 0;
int error = 0;
int32_t space;
int assumelock = 0;
int ret;
restart:
if (*sblocked == 0) {
@ -2104,6 +1972,12 @@ defunct:
}
goto restart;
}
ret = proto_memacct_limited(so->so_proto);
if (ret == MEMACCT_HARDLIMIT ||
(ret == MEMACCT_SOFTLIMIT && so->so_snd.sb_cc > 0)) {
return ENOMEM;
}
return 0;
}
@ -2313,9 +2187,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
* a jumbo cluster pool and if the socket is
* marked accordingly.
*/
jumbocl = sosendjcl && njcl > 0 &&
((so->so_flags & SOF_MULTIPAGES) ||
sosendjcl_ignore_capab) &&
jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0 &&
bigcl;
socket_unlock(so, 0);
@ -4176,12 +4048,12 @@ restart:
if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
(SS_NOFDREF | SS_CANTRCVMORE)) {
error = 0;
goto out;
goto release;
}
error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
if (error) {
goto out;
goto release;
}
sblocked = 1;
@ -4379,7 +4251,6 @@ release:
socket_unlock(so, 1);
}
out:
*pktcntp = npkts;
/*
* Amortize the cost of freeing the mbufs
@ -5810,7 +5681,48 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT;
}
break;
case SO_MAX_PACING_RATE: {
uint64_t pacingrate;
if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
error = EINVAL;
goto out;
}
error = sooptcopyin(sopt, &pacingrate,
sizeof(pacingrate), sizeof(pacingrate));
if (error != 0) {
goto out;
}
if (pacingrate == 0) {
error = EINVAL;
goto out;
}
sotoinpcb(so)->inp_max_pacing_rate = pacingrate;
break;
}
case SO_CONNECTION_IDLE: {
int is_idle;
if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
error = EINVAL;
goto out;
}
error = sooptcopyin(sopt, &is_idle,
sizeof(is_idle), sizeof(is_idle));
if (error != 0) {
goto out;
}
if (is_idle != 0) {
sotoinpcb(so)->inp_flags2 |= INP2_CONNECTION_IDLE;
} else {
sotoinpcb(so)->inp_flags2 &= ~INP2_CONNECTION_IDLE;
}
break;
}
default:
error = ENOPROTOOPT;
break;
@ -6326,6 +6238,28 @@ integer:
optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0)
? 1 : 0;
goto integer;
case SO_MAX_PACING_RATE: {
uint64_t pacingrate;
if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
error = EINVAL;
goto out;
}
pacingrate = sotoinpcb(so)->inp_max_pacing_rate;
error = sooptcopyout(sopt, &pacingrate, sizeof(pacingrate));
break;
}
case SO_CONNECTION_IDLE: {
if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
error = EINVAL;
goto out;
}
optval = sotoinpcb(so)->inp_flags2 & INP2_CONNECTION_IDLE ?
1 : 0;
goto integer;
}
default:
error = ENOPROTOOPT;
break;
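A hedged userspace sketch of the two socket options added in the hunks above. Both are SOL_SOCKET-level options limited to PF_INET/PF_INET6 sockets: SO_MAX_PACING_RATE takes a non-zero uint64_t rate (unit assumed to be bytes per second) and SO_CONNECTION_IDLE takes an int flag. Whether the constants are exported to public userspace headers is not shown in this diff, so treat the example purely as an illustration; error handling is elided.
#include <sys/socket.h>
#include <stdint.h>
static void
configure_pacing_and_idle(int fd)
{
	uint64_t rate = 10ULL * 1000 * 1000;    /* 0 is rejected with EINVAL */
	int idle = 1;
	(void)setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, &rate, sizeof(rate));
	(void)setsockopt(fd, SOL_SOCKET, SO_CONNECTION_IDLE, &idle, sizeof(idle));
}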
@ -8228,6 +8162,66 @@ socket_post_kev_msg_closed(struct socket *so)
free_sockaddr(peersa);
}
void
sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo)
{
struct cmsghdr *cm;
for (cm = M_FIRST_CMSGHDR(control);
is_cmsg_valid(control, cm);
cm = M_NXT_CMSGHDR(control, cm)) {
int val;
if (cm->cmsg_level != SOL_SOCKET) {
continue;
}
if (cm->cmsg_len == CMSG_LEN(sizeof(int))) {
val = *(int *)(void *)CMSG_DATA(cm);
}
switch (cm->cmsg_type) {
case SO_TRAFFIC_CLASS:
if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
break;
}
if (SO_VALID_TC(val)) {
sockcminfo->sotc = val;
break;
} else if (val < SO_TC_NET_SERVICE_OFFSET) {
break;
}
/*
* Handle the case where SO_NET_SERVICE_TYPE values are
* passed using SO_TRAFFIC_CLASS
*/
val = val - SO_TC_NET_SERVICE_OFFSET;
OS_FALLTHROUGH;
case SO_NET_SERVICE_TYPE:
if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
break;
}
if (!IS_VALID_NET_SERVICE_TYPE(val)) {
break;
}
sockcminfo->netsvctype = val;
sockcminfo->sotc = sotc_by_netservicetype[val];
break;
case SCM_TXTIME:
if (cm->cmsg_len != CMSG_LEN(sizeof(uint64_t))) {
break;
}
sockcminfo->tx_time = *(uint64_t *)(void *)CMSG_DATA(cm);
break;
default:
break;
}
}
}
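A hedged userspace sketch of the ancillary-data layout that sock_parse_cm_info() above accepts: SOL_SOCKET-level control messages carrying SO_TRAFFIC_CLASS or SO_NET_SERVICE_TYPE as an int, or SCM_TXTIME as a uint64_t transmit time. The surrounding sendmsg() setup, buffer sizing and the visibility of SCM_TXTIME to userspace are assumed.
#include <sys/socket.h>
#include <stdint.h>
#include <string.h>
static void
attach_txtime(struct msghdr *msg, void *ctlbuf, socklen_t ctllen, uint64_t tx_time)
{
	msg->msg_control = ctlbuf;
	msg->msg_controllen = ctllen;   /* at least CMSG_SPACE(sizeof(uint64_t)) */
	struct cmsghdr *cm = CMSG_FIRSTHDR(msg);
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_TXTIME;     /* parsed only when len == CMSG_LEN(sizeof(uint64_t)) */
	cm->cmsg_len = CMSG_LEN(sizeof(uint64_t));
	memcpy(CMSG_DATA(cm), &tx_time, sizeof(tx_time));
}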
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)

77
bsd/kern/uipc_socket.h Normal file
View file

@ -0,0 +1,77 @@
/*
* Copyright (c) 2024 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#ifdef XNU_KERNEL_PRIVATE
#ifndef _KERN_UIPC_SOCKET_H
#define _KERN_UIPC_SOCKET_H
#include <kern/mem_acct.h>
#include <sys/socketvar.h>
extern struct mem_acct *socket_memacct;
static inline void
socket_memacct_add(unsigned int size)
{
mem_acct_add(socket_memacct, size);
}
static inline void
socket_memacct_sub(unsigned int size)
{
mem_acct_sub(socket_memacct, size);
}
static inline bool
socket_memacct_hardlimit()
{
return mem_acct_limited(socket_memacct) == MEMACCT_HARDLIMIT;
}
static inline bool
socket_memacct_limited()
{
return mem_acct_limited(socket_memacct) != 0;
}
struct sock_cm_info {
int sotc;
int netsvctype;
uint64_t tx_time;
};
static inline void
sock_init_cm_info(struct sock_cm_info *sockcminfo, const struct socket *so)
{
sockcminfo->sotc = so->so_traffic_class;
sockcminfo->netsvctype = so->so_netsvctype;
sockcminfo->tx_time = 0;
}
extern void sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo);
#endif /*_KERN_UIPC_SOCKET_H */
#endif /* XNU_KERNEL_PRIVATE */

View file

@ -86,6 +86,8 @@
#include <sys/unpcb.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <kern/uipc_domain.h>
#include <kern/uipc_socket.h>
#include <net/route.h>
#include <net/content_filter.h>
#include <netinet/in.h>
@ -130,7 +132,7 @@ static int sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop)
/*
* Primitive routines for operating on sockets and socket buffers
*/
static int soqlimitcompat = 1;
int soqlimitcompat = 1;
static int soqlencomp = 0;
/*
@ -357,16 +359,14 @@ sonewconn_internal(struct socket *head, int connstatus)
if (so_qlen >=
(soqlimitcompat ? head->so_qlimit : (3 * head->so_qlimit / 2))) {
return (struct socket *)0;
return NULL;
}
so = soalloc(1, SOCK_DOM(head), head->so_type);
if (proto_memacct_hardlimit(head->so_proto)) {
return NULL;
}
so = soalloc();
if (so == NULL) {
return (struct socket *)0;
}
/* check if head was closed during the soalloc */
if (head->so_proto == NULL) {
sodealloc(so);
return (struct socket *)0;
return NULL;
}
so->so_type = head->so_type;
@ -411,9 +411,11 @@ sonewconn_internal(struct socket *head, int connstatus)
so->so_traffic_class = head->so_traffic_class;
so->so_netsvctype = head->so_netsvctype;
proto_memacct_add(so->so_proto, sizeof(struct socket));
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
sodealloc(so);
return (struct socket *)0;
return NULL;
}
so->so_rcv.sb_flags |= (head->so_rcv.sb_flags & SB_USRSIZE);
so->so_snd.sb_flags |= (head->so_snd.sb_flags & SB_USRSIZE);
@ -431,7 +433,7 @@ sonewconn_internal(struct socket *head, int connstatus)
if (head->so_proto->pr_unlock) {
socket_lock(head, 0);
}
return (struct socket *)0;
return NULL;
}
if (head->so_proto->pr_unlock) {
socket_lock(head, 0);
@ -442,7 +444,7 @@ sonewconn_internal(struct socket *head, int connstatus)
if ((head->so_options & SO_ACCEPTCONN) == 0) {
so->so_state &= ~SS_NOFDREF;
soclose(so);
return (struct socket *)0;
return NULL;
}
}
@ -1038,45 +1040,6 @@ sbappendstream(struct sockbuf *sb, struct mbuf *m)
return 1;
}
#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
struct mbuf *m;
struct mbuf *n = 0;
u_int32_t len = 0, mbcnt = 0;
lck_mtx_t *mutex_held;
if (sb->sb_so->so_proto->pr_getlock != NULL) {
mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
} else {
mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;
}
LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
if (sbchecking == 0) {
return;
}
for (m = sb->sb_mb; m; m = n) {
n = m->m_nextpkt;
for (; m; m = m->m_next) {
len += m->m_len;
mbcnt += _MSIZE;
/* XXX pretty sure this is bogus */
if (m->m_flags & M_EXT) {
mbcnt += m->m_ext.ext_size;
}
}
}
if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
panic("cc %ld != %ld || mbcnt %ld != %ld", len, sb->sb_cc,
mbcnt, sb->sb_mbcnt);
}
}
#endif
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
@ -1265,7 +1228,7 @@ sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct
}
if (asa != NULL) {
_CASSERT(sizeof(asa->sa_len) == sizeof(__uint8_t));
static_assert(sizeof(asa->sa_len) == sizeof(__uint8_t));
if (MLEN <= UINT8_MAX && asa->sa_len > MLEN) {
return NULL;
}
@ -1713,9 +1676,6 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
continue;
}
if (compress && n != NULL && (n->m_flags & M_EOR) == 0 &&
#ifndef __APPLE__
M_WRITABLE(n) &&
#endif
m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
@ -1724,7 +1684,6 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
n->m_len += m->m_len;
sb->sb_cc += m->m_len;
if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
/* XXX: Probably don't need */
sb->sb_ctl += m->m_len;
}
@ -1738,6 +1697,36 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
m = m_free(m);
continue;
}
if (compress && n != NULL && (n->m_flags & M_EOR) == 0 &&
proto_memacct_limited(sb->sb_so->so_proto) &&
n->m_type == m->m_type) {
int tocopy = min((int)M_TRAILINGSPACE(n), m->m_len);
bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
tocopy);
n->m_len += tocopy;
sb->sb_cc += tocopy;
if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
sb->sb_ctl += m->m_len;
}
/* update send byte count */
if (sb->sb_flags & SB_SNDBYTE_CNT) {
inp_incr_sndbytes_total(sb->sb_so,
m->m_len);
inp_incr_sndbytes_unsent(sb->sb_so,
m->m_len);
}
if (tocopy < m->m_len) {
memmove(mtod(m, caddr_t),
mtod(m, caddr_t) + tocopy, m->m_len - tocopy);
m->m_len -= tocopy;
} else {
m = m_free(m);
continue;
}
}
if (n != NULL) {
n->m_next = m;
} else {
@ -1871,19 +1860,12 @@ sbdrop(struct sockbuf *sb, int len)
if (m == NULL) {
if (next == NULL) {
/*
* temporarily replacing this panic with printf
* because it occurs occasionally when closing
* a socket when there is no harm in ignoring
* it. This problem will be investigated
* further.
* We have reached the end of the mbuf chain before
* freeing the requested amount of data.
* Since there is no data left, zero the counts
* and exit the loop.
*/
/* panic("sbdrop"); */
printf("sbdrop - count not zero\n");
len = 0;
/*
* zero the counts. if we have no mbufs,
* we have no data (PR-2986815)
*/
sb->sb_cc = 0;
sb->sb_mbcnt = 0;
break;
@ -2449,15 +2431,15 @@ sowriteable(struct socket *so)
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
int mbcnt = m_capacity(m);
sb->sb_cc += m->m_len;
if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
sb->sb_ctl += m->m_len;
}
sb->sb_mbcnt += _MSIZE;
if (m->m_flags & M_EXT) {
sb->sb_mbcnt += m->m_ext.ext_size;
}
sb->sb_mbcnt += mbcnt;
proto_memacct_add(sb->sb_so->so_proto, mbcnt);
/*
* If data is being added to the send socket buffer,
@ -2473,14 +2455,15 @@ sballoc(struct sockbuf *sb, struct mbuf *m)
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
int mbcnt = m_capacity(m);
sb->sb_cc -= m->m_len;
if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
sb->sb_ctl -= m->m_len;
}
sb->sb_mbcnt -= _MSIZE;
if (m->m_flags & M_EXT) {
sb->sb_mbcnt -= m->m_ext.ext_size;
}
sb->sb_mbcnt -= mbcnt;
proto_memacct_sub(sb->sb_so->so_proto, mbcnt);
/*
* If data is being removed from the send socket buffer,
@ -2717,8 +2700,8 @@ void
soevent(struct socket *so, uint32_t hint)
{
if (net_wake_pkt_debug > 0 && (hint & SO_FILT_HINT_WAKE_PKT)) {
os_log(OS_LOG_DEFAULT, "%s: SO_FILT_HINT_WAKE_PKT so %p",
__func__, so);
os_log(wake_packet_log_handle, "soevents: SO_FILT_HINT_WAKE_PKT so_gencnt: %llu",
so->so_gencnt);
}
if (so->so_flags & SOF_KNOTE) {
@ -2997,9 +2980,6 @@ SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor,
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters,
CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, njcl,
CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes,
CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, "");

View file

@ -1771,7 +1771,8 @@ static int
sendit_x(proc_ref_t p, socket_ref_t so, struct sendmsg_x_args *uap, u_int *retval)
{
int error = 0;
uio_t __single auio = NULL;
UIO_STACKBUF(uio_buf, UIO_SMALLIOV);
uio_t __single auio;
const bool is_p_64bit_process = IS_64BIT_PROCESS(p);
void *src;
MBUFQ_HEAD() pktlist = {};
@ -1785,15 +1786,10 @@ sendit_x(proc_ref_t p, socket_ref_t so, struct sendmsg_x_args *uap, u_int *retva
*retval = 0;
/* We re-use the uio when possible */
auio = uio_create(1, 0,
auio = uio_createwithbuffer(UIO_SMALLIOV, 0,
(is_p_64bit_process ? UIO_USERSPACE64 : UIO_USERSPACE32),
UIO_WRITE);
if (auio == NULL) {
error = ENOBUFS;
DBG_PRINTF("%s uio_create() failed %d",
__func__, error);
goto done;
}
UIO_WRITE, &uio_buf[0],
UIO_SIZEOF(UIO_SMALLIOV));
src = __unsafe_forge_bidi_indexable(void *, uap->msgp, uap->cnt);
@ -3933,8 +3929,7 @@ sendfile(proc_ref_t p, struct sendfile_args *uap, __unused int *retval)
* large writes only if there is a jumbo cluster pool and
* if the socket is marked accordingly.
*/
jumbocl = sosendjcl && njcl > 0 &&
((so->so_flags & SOF_MULTIPAGES) || sosendjcl_ignore_capab);
jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0;
socket_unlock(so, 0);
alloc_sendpkt(M_WAIT, xfsize, &nbufs, &m0, jumbocl);

View file

@ -2473,7 +2473,7 @@ out:
void
unp_init(void)
{
_CASSERT(UIPC_MAX_CMSG_FD >= (MCLBYTES / sizeof(int)));
static_assert(UIPC_MAX_CMSG_FD >= (MCLBYTES / sizeof(int)));
LIST_INIT(&unp_dhead);
LIST_INIT(&unp_shead);
}

View file

@ -40,39 +40,65 @@
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/atomic.h>
#include <IOKit/IOBSD.h>
#define sotovsockpcb(so) ((struct vsockpcb *)(so)->so_pcb)
#define VSOCK_PORT_RESERVED 1024
#define VSOCK_PRIVATE_ENTITLEMENT "com.apple.private.vsock"
/* VSock Protocol Globals */
static struct vsock_transport * _Atomic the_vsock_transport = NULL;
static ZONE_DEFINE(vsockpcb_zone, "vsockpcbzone",
sizeof(struct vsockpcb), ZC_NONE);
static LCK_GRP_DECLARE(vsock_lock_grp, "vsock");
static struct vsockpcbinfo vsockinfo;
static struct vsock_transport * _Atomic the_vsock_transport[VSOCK_PROTO_MAX];
static ZONE_DEFINE_TYPE(vsockpcb_zone, "vsockpcbzone", struct vsockpcb, ZC_NONE);
static struct vsockpcbinfo vsockinfo[VSOCK_PROTO_MAX];
static uint32_t vsock_sendspace = VSOCK_MAX_PACKET_SIZE * 8;
static uint32_t vsock_recvspace = VSOCK_MAX_PACKET_SIZE * 8;
static uint32_t vsock_sendspace[VSOCK_PROTO_MAX];
static uint32_t vsock_recvspace[VSOCK_PROTO_MAX];
/* VSock Private Entitlements */
static errno_t
vsock_validate_entitlements(uint16_t protocol, struct proc *p)
{
if (protocol != VSOCK_PROTO_PRIVATE) {
return 0;
}
if (!p) {
p = current_proc();
}
if (p == kernproc) {
// Assume kernel callers are entitled.
return 0;
}
if (!IOTaskHasEntitlement(proc_task(p), VSOCK_PRIVATE_ENTITLEMENT)) {
return EPERM;
}
return 0;
}
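A hedged sketch of what the entitlement gate above means for callers: only processes holding com.apple.private.vsock (or kernel callers) may use the private vsock protocol, while the standard protocol remains open. The protocol constants come from this change and may not appear in public userspace headers.
#include <sys/socket.h>
static int
open_private_vsock(void)
{
	/* vsock_attach() returns EPERM unless the caller holds
	 * com.apple.private.vsock; VSOCK_PROTO_STANDARD needs no entitlement. */
	return socket(AF_VSOCK, SOCK_STREAM, VSOCK_PROTO_PRIVATE);
}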
/* VSock PCB Helpers */
static uint32_t
vsock_get_peer_space(struct vsockpcb *pcb)
vsock_get_peer_space(struct vsockpcb *_Nonnull pcb)
{
VERIFY(pcb != NULL);
return pcb->peer_buf_alloc - (pcb->tx_cnt - pcb->peer_fwd_cnt);
}
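The peer-space computation above mirrors the virtio-vsock credit scheme: the peer's advertised buffer size minus the bytes we have transmitted that the peer has not yet forwarded. For example, with peer_buf_alloc = 262144, tx_cnt = 300000 and peer_fwd_cnt = 250000, the remaining credit is 262144 - (300000 - 250000) = 212144 bytes.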
static struct vsockpcb *
vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst)
vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst, uint16_t protocol)
{
struct vsockpcb *preferred = NULL;
struct vsockpcb *match = NULL;
struct vsockpcb *pcb = NULL;
lck_rw_lock_shared(&vsockinfo.bound_lock);
LIST_FOREACH(pcb, &vsockinfo.bound, bound) {
lck_rw_lock_shared(&vsockinfo[protocol].bound_lock);
LIST_FOREACH(pcb, &vsockinfo[protocol].bound, bound) {
// Source cid and port must match. Only destination port must match. (Allows for a changing CID during migration)
socket_lock(pcb->so, 1);
if ((pcb->so->so_state & SS_ISCONNECTED || pcb->so->so_state & SS_ISCONNECTING) &&
@ -90,14 +116,15 @@ vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst)
socket_lock(match->so, 1);
preferred = match;
}
lck_rw_done(&vsockinfo.bound_lock);
lck_rw_done(&vsockinfo[protocol].bound_lock);
return preferred;
}
static errno_t
vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t local_port, uint32_t remote_cid, uint32_t remote_port)
vsock_bind_address_if_free(struct vsockpcb *_Nonnull pcb, uint32_t local_cid, uint32_t local_port, uint32_t remote_cid, uint32_t remote_port)
{
VERIFY(pcb != NULL);
socket_lock_assert_owned(pcb->so);
// Privileged ports.
@ -108,12 +135,13 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo
bool taken = false;
const bool check_remote = (remote_cid != VMADDR_CID_ANY && remote_port != VMADDR_PORT_ANY);
const uint16_t protocol = pcb->so->so_protocol;
struct vsockpcb *pcb_match = NULL;
socket_unlock(pcb->so, 0);
lck_rw_lock_exclusive(&vsockinfo.bound_lock);
LIST_FOREACH(pcb_match, &vsockinfo.bound, bound) {
lck_rw_lock_exclusive(&vsockinfo[protocol].bound_lock);
LIST_FOREACH(pcb_match, &vsockinfo[protocol].bound, bound) {
socket_lock(pcb_match->so, 1);
if (pcb == pcb_match ||
(!check_remote && pcb_match->local_address.port == local_port) ||
@ -129,9 +157,9 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo
if (!taken) {
pcb->local_address = (struct vsock_address) { .cid = local_cid, .port = local_port };
pcb->remote_address = (struct vsock_address) { .cid = remote_cid, .port = remote_port };
LIST_INSERT_HEAD(&vsockinfo.bound, pcb, bound);
LIST_INSERT_HEAD(&vsockinfo[protocol].bound, pcb, bound);
}
lck_rw_done(&vsockinfo.bound_lock);
lck_rw_done(&vsockinfo[protocol].bound_lock);
return taken ? EADDRINUSE : 0;
}
@ -175,14 +203,16 @@ vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsoc
if (laddr.port != VMADDR_PORT_ANY) {
error = vsock_bind_address_if_free(pcb, laddr.cid, laddr.port, raddr.cid, raddr.port);
} else {
const uint16_t protocol = pcb->so->so_protocol;
socket_unlock(pcb->so, 0);
lck_mtx_lock(&vsockinfo.port_lock);
lck_mtx_lock(&vsockinfo[protocol].port_lock);
socket_lock(pcb->so, 0);
const uint32_t first = VSOCK_PORT_RESERVED;
const uint32_t last = VMADDR_PORT_ANY - 1;
uint32_t count = last - first + 1;
uint32_t *last_port = &vsockinfo.last_port;
uint32_t *last_port = &vsockinfo[protocol].last_port;
if (pcb->so->so_flags & SOF_BINDRANDOMPORT) {
uint32_t random = 0;
@ -192,7 +222,7 @@ vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsoc
do {
if (count == 0) {
lck_mtx_unlock(&vsockinfo.port_lock);
lck_mtx_unlock(&vsockinfo[protocol].port_lock);
return EADDRNOTAVAIL;
}
count--;
@ -205,7 +235,7 @@ vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsoc
error = vsock_bind_address_if_free(pcb, laddr.cid, *last_port, raddr.cid, raddr.port);
} while (error);
lck_mtx_unlock(&vsockinfo.port_lock);
lck_mtx_unlock(&vsockinfo[protocol].port_lock);
}
return error;
@ -228,15 +258,17 @@ vsock_unbind_pcb_locked(struct vsockpcb *pcb, bool is_locked)
return;
}
const uint16_t protocol = so->so_protocol;
if (!is_locked) {
socket_unlock(so, 0);
lck_rw_lock_exclusive(&vsockinfo.bound_lock);
lck_rw_lock_exclusive(&vsockinfo[protocol].bound_lock);
socket_lock(so, 0);
// Case where some other thread also called unbind() on this socket while waiting to acquire its lock.
if (!pcb->bound.le_prev) {
soisdisconnected(so);
lck_rw_done(&vsockinfo.bound_lock);
lck_rw_done(&vsockinfo[protocol].bound_lock);
return;
}
}
@ -248,7 +280,7 @@ vsock_unbind_pcb_locked(struct vsockpcb *pcb, bool is_locked)
pcb->bound.le_prev = NULL;
if (!is_locked) {
lck_rw_done(&vsockinfo.bound_lock);
lck_rw_done(&vsockinfo[protocol].bound_lock);
}
}
@ -312,15 +344,16 @@ vsock_pcb_send_message(struct vsockpcb *pcb, enum vsock_operation operation, mbu
src.cid = transport_cid;
}
uint32_t buf_alloc = pcb->so->so_rcv.sb_hiwat;
uint32_t fwd_cnt = pcb->fwd_cnt;
const uint16_t protocol = pcb->so->so_protocol;
const uint32_t buf_alloc = pcb->so->so_rcv.sb_hiwat;
const uint32_t fwd_cnt = pcb->fwd_cnt;
if (src.cid == dst.cid) {
pcb->last_buf_alloc = buf_alloc;
pcb->last_fwd_cnt = fwd_cnt;
socket_unlock(pcb->so, 0);
error = vsock_put_message(src, dst, operation, buf_alloc, fwd_cnt, m);
error = vsock_put_message(src, dst, operation, buf_alloc, fwd_cnt, m, protocol);
socket_lock(pcb->so, 0);
} else {
struct vsock_transport *transport = pcb->transport;
@ -336,7 +369,7 @@ vsock_pcb_send_message(struct vsockpcb *pcb, enum vsock_operation operation, mbu
}
static errno_t
vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst)
vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst, uint16_t protocol)
{
if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) {
return EINVAL;
@ -346,7 +379,7 @@ vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst)
struct vsock_transport *transport = NULL;
if (src.cid == VMADDR_CID_ANY) {
transport = os_atomic_load(&the_vsock_transport, relaxed);
transport = os_atomic_load(&the_vsock_transport[protocol], relaxed);
if (transport == NULL) {
return ENODEV;
}
@ -361,7 +394,7 @@ vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst)
if (src.cid == dst.cid) {
// Reset both sockets.
struct vsockpcb *pcb = vsock_get_matching_pcb(src, dst);
struct vsockpcb *pcb = vsock_get_matching_pcb(src, dst, protocol);
if (pcb) {
socket_lock_assert_owned(pcb->so);
vsock_unbind_pcb(pcb);
@ -369,7 +402,7 @@ vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst)
}
} else {
if (!transport) {
transport = os_atomic_load(&the_vsock_transport, relaxed);
transport = os_atomic_load(&the_vsock_transport[protocol], relaxed);
if (transport == NULL) {
return ENODEV;
}
@ -381,13 +414,13 @@ vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst)
}
static errno_t
vsock_pcb_safe_reset_address(struct vsockpcb *pcb, struct vsock_address src, struct vsock_address dst)
vsock_pcb_safe_reset_address(struct vsockpcb *pcb, struct vsock_address src, struct vsock_address dst, uint16_t protocol)
{
if (pcb) {
socket_lock_assert_owned(pcb->so);
socket_unlock(pcb->so, 0);
}
errno_t error = vsock_pcb_reset_address(src, dst);
errno_t error = vsock_pcb_reset_address(src, dst, protocol);
if (pcb) {
socket_lock(pcb->so, 0);
}
@ -430,6 +463,18 @@ vsock_pcb_credit_update(struct vsockpcb *pcb)
return vsock_pcb_send_message(pcb, VSOCK_CREDIT_UPDATE, NULL);
}
static errno_t
vsock_pcb_credit_update_if_needed(struct vsockpcb *_Nonnull pcb)
{
VERIFY(pcb != NULL);
// Sends a credit update if the credit values have changed since the last sent message.
if (pcb->so->so_rcv.sb_hiwat != pcb->last_buf_alloc || pcb->fwd_cnt != pcb->last_fwd_cnt) {
return vsock_pcb_credit_update(pcb);
}
return 0;
}
static errno_t
vsock_pcb_credit_request(struct vsockpcb *pcb)
{
@ -457,7 +502,7 @@ vsock_disconnect_pcb(struct vsockpcb *pcb)
}
static errno_t
vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr)
vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr, struct proc *p)
{
if (!pcb || !pcb->so || !addr) {
return EINVAL;
@ -478,14 +523,20 @@ vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr)
return EAFNOSUPPORT;
}
errno_t error = vsock_validate_entitlements(pcb->so->so_protocol, p);
if (error) {
return error;
}
return 0;
}
/* VSock Receive Handlers */
static errno_t
vsock_put_message_connected(struct vsockpcb *pcb, enum vsock_operation op, mbuf_t m)
vsock_put_message_connected(struct vsockpcb *_Nonnull pcb, enum vsock_operation op, mbuf_t m)
{
VERIFY(pcb != NULL);
socket_lock_assert_owned(pcb->so);
errno_t error = 0;
@ -520,8 +571,9 @@ vsock_put_message_connected(struct vsockpcb *pcb, enum vsock_operation op, mbuf_
}
static errno_t
vsock_put_message_connecting(struct vsockpcb *pcb, enum vsock_operation op)
vsock_put_message_connecting(struct vsockpcb *_Nonnull pcb, enum vsock_operation op)
{
VERIFY(pcb != NULL);
socket_lock_assert_owned(pcb->so);
errno_t error = 0;
@ -544,14 +596,17 @@ vsock_put_message_connecting(struct vsockpcb *pcb, enum vsock_operation op)
}
static errno_t
vsock_put_message_listening(struct vsockpcb *pcb, enum vsock_operation op, struct vsock_address src, struct vsock_address dst)
vsock_put_message_listening(struct vsockpcb *_Nonnull pcb, enum vsock_operation op, struct vsock_address src, struct vsock_address dst)
{
VERIFY(pcb != NULL);
socket_lock_assert_owned(pcb->so);
struct sockaddr_vm addr;
struct socket *so2 = NULL;
struct vsockpcb *pcb2 = NULL;
const uint16_t protocol = pcb->so->so_protocol;
errno_t error = 0;
switch (op) {
@ -566,7 +621,7 @@ vsock_put_message_listening(struct vsockpcb *pcb, enum vsock_operation op, struc
so2 = sonewconn(pcb->so, 0, (struct sockaddr *)&addr);
if (!so2) {
// It is likely that the backlog is full. Deny this request.
vsock_pcb_safe_reset_address(pcb, dst, src);
vsock_pcb_safe_reset_address(pcb, dst, src, protocol);
error = ECONNREFUSED;
break;
}
@ -597,7 +652,7 @@ done:
soisdisconnected(so2);
}
socket_unlock(so2, 1);
vsock_pcb_reset_address(dst, src);
vsock_pcb_reset_address(dst, src, protocol);
} else {
socket_unlock(so2, 0);
}
@ -605,10 +660,10 @@ done:
break;
case VSOCK_RESET:
error = vsock_pcb_safe_reset_address(pcb, dst, src);
error = vsock_pcb_safe_reset_address(pcb, dst, src, protocol);
break;
default:
vsock_pcb_safe_reset_address(pcb, dst, src);
vsock_pcb_safe_reset_address(pcb, dst, src, protocol);
error = ENOTSUP;
break;
}
@ -621,10 +676,10 @@ done:
errno_t
vsock_add_transport(struct vsock_transport *transport)
{
if (transport == NULL || transport->provider == NULL) {
if (transport == NULL || transport->provider == NULL || transport->protocol >= VSOCK_PROTO_MAX) {
return EINVAL;
}
if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, NULL, transport, acq_rel)) {
if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport[transport->protocol], NULL, transport, acq_rel)) {
return EEXIST;
}
return 0;
@ -633,7 +688,7 @@ vsock_add_transport(struct vsock_transport *transport)
errno_t
vsock_remove_transport(struct vsock_transport *transport)
{
if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, transport, NULL, acq_rel)) {
if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport[transport->protocol], transport, NULL, acq_rel)) {
return ENODEV;
}
return 0;
@ -650,8 +705,8 @@ vsock_reset_transport(struct vsock_transport *transport)
struct vsockpcb *pcb = NULL;
struct vsockpcb *tmp_pcb = NULL;
lck_rw_lock_exclusive(&vsockinfo.bound_lock);
LIST_FOREACH_SAFE(pcb, &vsockinfo.bound, bound, tmp_pcb) {
lck_rw_lock_exclusive(&vsockinfo[transport->protocol].bound_lock);
LIST_FOREACH_SAFE(pcb, &vsockinfo[transport->protocol].bound, bound, tmp_pcb) {
// Disconnect this transport's sockets. Listen and bind sockets must stay alive.
socket_lock(pcb->so, 1);
if (pcb->transport == transport && pcb->so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) {
@ -662,18 +717,18 @@ vsock_reset_transport(struct vsock_transport *transport)
}
socket_unlock(pcb->so, 1);
}
lck_rw_done(&vsockinfo.bound_lock);
lck_rw_done(&vsockinfo[transport->protocol].bound_lock);
return error;
}
errno_t
vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m)
vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m, uint16_t protocol)
{
struct vsockpcb *pcb = vsock_get_matching_pcb(dst, src);
struct vsockpcb *pcb = vsock_get_matching_pcb(dst, src, protocol);
if (!pcb) {
if (op != VSOCK_RESET) {
vsock_pcb_reset_address(dst, src);
vsock_pcb_reset_address(dst, src, protocol);
}
if (m != NULL) {
mbuf_freem_list(m);
@ -731,9 +786,10 @@ vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock
/* VSock Sysctl */
static int
vsock_pcblist SYSCTL_HANDLER_ARGS
common_vsock_pcblist(struct sysctl_oid *oidp __unused, void *arg1, int arg2 __unused, struct sysctl_req *_Nonnull req, uint16_t protocol)
{
#pragma unused(oidp,arg2)
#pragma unused(oidp,arg2)
VERIFY(req != NULL);
int error;
@ -743,10 +799,10 @@ vsock_pcblist SYSCTL_HANDLER_ARGS
}
// Get the generation count and the count of all vsock sockets.
lck_rw_lock_shared(&vsockinfo.all_lock);
uint64_t n = vsockinfo.all_pcb_count;
vsock_gen_t gen_count = vsockinfo.vsock_gencnt;
lck_rw_done(&vsockinfo.all_lock);
lck_rw_lock_shared(&vsockinfo[protocol].all_lock);
uint64_t n = vsockinfo[protocol].all_pcb_count;
vsock_gen_t gen_count = vsockinfo[protocol].vsock_gencnt;
lck_rw_done(&vsockinfo[protocol].all_lock);
const size_t xpcb_len = sizeof(struct xvsockpcb);
struct xvsockpgen xvg;
@ -779,11 +835,11 @@ vsock_pcblist SYSCTL_HANDLER_ARGS
return 0;
}
lck_rw_lock_shared(&vsockinfo.all_lock);
lck_rw_lock_shared(&vsockinfo[protocol].all_lock);
n = 0;
struct vsockpcb *pcb = NULL;
TAILQ_FOREACH(pcb, &vsockinfo.all, all) {
TAILQ_FOREACH(pcb, &vsockinfo[protocol].all, all) {
// Bail if there is not enough user buffer for this next socket.
if (req->oldlen - req->oldidx - sizeof(xvg) < xpcb_len) {
break;
@ -822,9 +878,9 @@ vsock_pcblist SYSCTL_HANDLER_ARGS
}
// Update the generation count to match the sockets being returned.
gen_count = vsockinfo.vsock_gencnt;
gen_count = vsockinfo[protocol].vsock_gencnt;
lck_rw_done(&vsockinfo.all_lock);
lck_rw_done(&vsockinfo[protocol].all_lock);
if (!error) {
/*
@ -845,30 +901,68 @@ vsock_pcblist SYSCTL_HANDLER_ARGS
return error;
}
static int
vsock_pcblist SYSCTL_HANDLER_ARGS
{
return common_vsock_pcblist(oidp, arg1, arg2, req, VSOCK_PROTO_STANDARD);
}
static int
vsock_private_pcblist SYSCTL_HANDLER_ARGS
{
return common_vsock_pcblist(oidp, arg1, arg2, req, VSOCK_PROTO_PRIVATE);
}
#ifdef SYSCTL_DECL
// Standard namespace.
SYSCTL_NODE(_net, OID_AUTO, vsock, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "vsock");
SYSCTL_UINT(_net_vsock, OID_AUTO, sendspace, CTLFLAG_RW | CTLFLAG_LOCKED,
&vsock_sendspace, 0, "Maximum outgoing vsock datagram size");
&vsock_sendspace[VSOCK_PROTO_STANDARD], 0, "Maximum outgoing vsock datagram size");
SYSCTL_UINT(_net_vsock, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED,
&vsock_recvspace, 0, "Maximum incoming vsock datagram size");
&vsock_recvspace[VSOCK_PROTO_STANDARD], 0, "Maximum incoming vsock datagram size");
SYSCTL_PROC(_net_vsock, OID_AUTO, pcblist,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
(caddr_t)(long)SOCK_STREAM, 0, vsock_pcblist, "S,xvsockpcb",
__unsafe_forge_single(caddr_t, SOCK_STREAM), 0, vsock_pcblist, "S,xvsockpcb",
"List of active vsock sockets");
SYSCTL_UINT(_net_vsock, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
(u_int *)&vsockinfo.all_pcb_count, 0, "");
(u_int *)&vsockinfo[VSOCK_PROTO_STANDARD].all_pcb_count, 0, "");
// Private namespace.
SYSCTL_NODE(_net, OID_AUTO, vsock_private, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "vsock_private");
SYSCTL_PROC(_net_vsock_private, OID_AUTO, pcblist,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
__unsafe_forge_single(caddr_t, SOCK_STREAM), 0, vsock_private_pcblist, "S,xvsockpcb",
"List of active private vsock sockets");
SYSCTL_UINT(_net_vsock_private, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
(u_int *)&vsockinfo[VSOCK_PROTO_PRIVATE].all_pcb_count, 0, "");
#endif
/* VSock Protocol */
static int
vsock_attach(struct socket *so, int proto, struct proc *p)
vsock_attach(struct socket *_Nonnull so, int proto, struct proc *p)
{
#pragma unused(proto, p)
VERIFY(so != NULL);
const uint16_t protocol = so->so_protocol;
if (protocol >= VSOCK_PROTO_MAX) {
return EINVAL;
}
errno_t error = vsock_validate_entitlements(protocol, p);
if (error) {
return error;
}
const uint32_t send_space = vsock_sendspace[protocol];
const uint32_t receive_space = vsock_recvspace[protocol];
if (send_space == 0 || receive_space == 0) {
return ENOMEM;
}
// Reserve send and receive buffers.
errno_t error = soreserve(so, vsock_sendspace, vsock_recvspace);
error = soreserve(so, send_space, receive_space);
if (error) {
return error;
}
@@ -880,7 +974,7 @@ vsock_attach(struct socket *so, int proto, struct proc *p)
}
// Get the transport for this socket.
struct vsock_transport *transport = os_atomic_load(&the_vsock_transport, relaxed);
struct vsock_transport *transport = os_atomic_load(&the_vsock_transport[protocol], relaxed);
if (transport == NULL) {
return ENODEV;
}
@@ -908,11 +1002,11 @@ vsock_attach(struct socket *so, int proto, struct proc *p)
}
// Add to the list of all vsock sockets.
lck_rw_lock_exclusive(&vsockinfo.all_lock);
TAILQ_INSERT_TAIL(&vsockinfo.all, pcb, all);
vsockinfo.all_pcb_count++;
pcb->vsock_gencnt = ++vsockinfo.vsock_gencnt;
lck_rw_done(&vsockinfo.all_lock);
lck_rw_lock_exclusive(&vsockinfo[protocol].all_lock);
TAILQ_INSERT_TAIL(&vsockinfo[protocol].all, pcb, all);
vsockinfo[protocol].all_pcb_count++;
pcb->vsock_gencnt = ++vsockinfo[protocol].vsock_gencnt;
lck_rw_done(&vsockinfo[protocol].all_lock);
return 0;
}
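
The attach path above now derives every per-protocol resource (buffer sizes, transport, pcb list) from so->so_protocol, so the VSOCK_PROTO_MAX bounds check is what keeps the later array indexing safe, and a zero-sized buffer is rejected before soreserve runs. A standalone sketch of that guard pattern, with hypothetical names standing in for the kernel arrays:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define PROTO_MAX 2u                       /* stand-in for VSOCK_PROTO_MAX */

static uint32_t send_space_tab[PROTO_MAX] = { 64 * 1024, 64 * 1024 };
static uint32_t recv_space_tab[PROTO_MAX] = { 64 * 1024, 64 * 1024 };

static int
reserve_buffers(uint16_t protocol, uint32_t *snd, uint32_t *rcv)
{
    if (protocol >= PROTO_MAX) {
        return EINVAL;                     /* reject before any array access */
    }
    if (send_space_tab[protocol] == 0 || recv_space_tab[protocol] == 0) {
        return ENOMEM;                     /* unconfigured namespace */
    }
    *snd = send_space_tab[protocol];
    *rcv = recv_space_tab[protocol];
    return 0;
}

int
main(void)
{
    uint32_t snd, rcv;
    printf("proto 1 -> %d\n", reserve_buffers(1, &snd, &rcv));
    printf("proto 7 -> %d\n", reserve_buffers(7, &snd, &rcv));
    return 0;
}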
@@ -920,25 +1014,24 @@ vsock_attach(struct socket *so, int proto, struct proc *p)
static int
vsock_control(struct socket *so, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) data, struct ifnet *ifp, struct proc *p)
{
#pragma unused(ifp)
#pragma unused(ifp, p)
VERIFY(so != NULL || p == kernproc);
VERIFY(so != NULL);
if (cmd != IOCTL_VM_SOCKETS_GET_LOCAL_CID) {
return EINVAL;
}
struct vsock_transport *transport;
if (so) {
struct vsockpcb *pcb = sotovsockpcb(so);
if (pcb == NULL) {
return EINVAL;
}
transport = pcb->transport;
} else {
transport = os_atomic_load(&the_vsock_transport, relaxed);
if (so == NULL) {
return EINVAL;
}
struct vsockpcb *pcb = sotovsockpcb(so);
if (pcb == NULL) {
return EINVAL;
}
struct vsock_transport *transport = pcb->transport;
if (transport == NULL) {
return ENODEV;
}
@@ -971,18 +1064,24 @@ vsock_detach(struct socket *so)
return error;
}
// Remove from the list of all vsock sockets.
lck_rw_lock_exclusive(&vsockinfo.all_lock);
TAILQ_REMOVE(&vsockinfo.all, pcb, all);
pcb->all.tqe_next = NULL;
pcb->all.tqe_prev = NULL;
vsockinfo.all_pcb_count--;
vsockinfo.vsock_gencnt++;
lck_rw_done(&vsockinfo.all_lock);
const uint16_t protocol = so->so_protocol;
// Mark this socket for deallocation.
so->so_flags |= SOF_PCBCLEARING;
// Reorder locks.
socket_unlock(so, 0);
lck_rw_lock_exclusive(&vsockinfo[protocol].all_lock);
socket_lock(so, 0);
// Remove from the list of all vsock sockets.
TAILQ_REMOVE(&vsockinfo[protocol].all, pcb, all);
pcb->all.tqe_next = NULL;
pcb->all.tqe_prev = NULL;
vsockinfo[protocol].all_pcb_count--;
vsockinfo[protocol].vsock_gencnt++;
lck_rw_done(&vsockinfo[protocol].all_lock);
return 0;
}
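
The detach path above marks the socket with SOF_PCBCLEARING, drops the socket lock, takes the per-protocol list lock exclusively, and only then relocks the socket, so the two locks are always acquired in the same order as on other paths and cannot deadlock against each other. A hedged user-space illustration of that reordering with pthreads (stand-ins, not the kernel lck_* primitives):

#include <pthread.h>

static pthread_mutex_t socket_mtx = PTHREAD_MUTEX_INITIALIZER;   /* stands in for the socket lock */
static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;  /* stands in for vsockinfo[proto].all_lock */

/* Unlink an element while honoring the list-lock-before-socket-lock order.
 * The caller enters and leaves with socket_mtx held, as vsock_detach does. */
static void
detach_with_reordered_locks(void (*unlink_cb)(void))
{
    pthread_mutex_unlock(&socket_mtx);      /* drop the socket lock first ...            */
    pthread_rwlock_wrlock(&list_lock);      /* ... so the list lock is taken ahead of it */
    pthread_mutex_lock(&socket_mtx);

    unlink_cb();                            /* e.g. remove the pcb from the global list  */

    pthread_rwlock_unlock(&list_lock);      /* socket_mtx stays held for the caller      */
}

static void noop(void) { }

int
main(void)
{
    pthread_mutex_lock(&socket_mtx);        /* caller convention: enter holding the socket lock */
    detach_with_reordered_locks(noop);
    pthread_mutex_unlock(&socket_mtx);
    return 0;
}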
@@ -1004,7 +1103,7 @@ vsock_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
struct sockaddr_vm *addr = (struct sockaddr_vm *)nam;
errno_t error = vsock_sockaddr_vm_validate(pcb, addr);
errno_t error = vsock_sockaddr_vm_validate(pcb, addr, p);
if (error) {
return error;
}
@@ -1093,7 +1192,7 @@ vsock_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
struct sockaddr_vm *addr = (struct sockaddr_vm *)nam;
errno_t error = vsock_sockaddr_vm_validate(pcb, addr);
errno_t error = vsock_sockaddr_vm_validate(pcb, addr, p);
if (error) {
return error;
}
@@ -1376,9 +1475,9 @@ vsock_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
const uint32_t threshold = VSOCK_MAX_PACKET_SIZE;
// Send a credit update if is possible that the peer will no longer send.
// Send a credit update if it is possible that the peer will no longer send.
if ((pcb->fwd_cnt - pcb->last_fwd_cnt + threshold) >= pcb->last_buf_alloc) {
errno_t error = vsock_pcb_credit_update(pcb);
errno_t error = vsock_pcb_credit_update_if_needed(pcb);
if (!result && error) {
result = error;
}
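
A note on the arithmetic above: rearranged, the condition is last_buf_alloc - (fwd_cnt - last_fwd_cnt) <= VSOCK_MAX_PACKET_SIZE, i.e. once the consumption not yet reported to the peer leaves at most one maximum packet of the advertised buffer, a credit update is pushed so the peer does not stall on stale credit. A small stand-alone sketch of the check (field names mirror the pcb, but the struct here is hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_PACKET 65536u                 /* stand-in for VSOCK_MAX_PACKET_SIZE */

struct credit_state {
    uint32_t fwd_cnt;                     /* total bytes consumed locally */
    uint32_t last_fwd_cnt;                /* consumption last reported to the peer */
    uint32_t last_buf_alloc;              /* buffer size last advertised to the peer */
};

/* True when the peer, working from stale credit, may believe no space remains. */
static bool
credit_update_needed(const struct credit_state *cs)
{
    uint32_t unreported = cs->fwd_cnt - cs->last_fwd_cnt;   /* unsigned subtraction tolerates counter wrap */
    return unreported + MAX_PACKET >= cs->last_buf_alloc;
}

int
main(void)
{
    struct credit_state cs = { .fwd_cnt = 260000, .last_fwd_cnt = 10000, .last_buf_alloc = 262144 };
    printf("update needed: %d\n", credit_update_needed(&cs));
    return 0;
}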
@@ -1408,23 +1507,37 @@ static struct pr_usrreqs vsock_usrreqs = {
};
static void
vsock_init(struct protosw *pp, struct domain *dp)
common_vsock_init(struct protosw *pp, struct domain *dp, uint16_t protocol, lck_grp_t *lock_group)
{
#pragma unused(dp)
static int vsock_initialized = 0;
static int vsock_initialized[VSOCK_PROTO_MAX] = {0};
VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
if (!os_atomic_cmpxchg((volatile int *)&vsock_initialized, 0, 1, acq_rel)) {
if (!os_atomic_cmpxchg((volatile int *)&vsock_initialized[protocol], 0, 1, acq_rel)) {
return;
}
// Setup VSock protocol info struct.
lck_rw_init(&vsockinfo.all_lock, &vsock_lock_grp, LCK_ATTR_NULL);
lck_rw_init(&vsockinfo.bound_lock, &vsock_lock_grp, LCK_ATTR_NULL);
lck_mtx_init(&vsockinfo.port_lock, &vsock_lock_grp, LCK_ATTR_NULL);
TAILQ_INIT(&vsockinfo.all);
LIST_INIT(&vsockinfo.bound);
vsockinfo.last_port = VMADDR_PORT_ANY;
lck_rw_init(&vsockinfo[protocol].all_lock, lock_group, LCK_ATTR_NULL);
lck_rw_init(&vsockinfo[protocol].bound_lock, lock_group, LCK_ATTR_NULL);
lck_mtx_init(&vsockinfo[protocol].port_lock, lock_group, LCK_ATTR_NULL);
TAILQ_INIT(&vsockinfo[protocol].all);
LIST_INIT(&vsockinfo[protocol].bound);
vsockinfo[protocol].last_port = VMADDR_PORT_ANY;
}
static void
vsock_init(struct protosw *pp, struct domain *dp)
{
static LCK_GRP_DECLARE(vsock_lock_grp, "vsock");
common_vsock_init(pp, dp, VSOCK_PROTO_STANDARD, &vsock_lock_grp);
}
static void
vsock_private_init(struct protosw *pp, struct domain *dp)
{
static LCK_GRP_DECLARE(vsock_private_lock_grp, "vsock_private");
common_vsock_init(pp, dp, VSOCK_PROTO_PRIVATE, &vsock_private_lock_grp);
}
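
common_vsock_init guards its one-time setup with a compare-and-swap on a per-protocol flag, so only the caller that flips 0 to 1 performs the initialization and every later call returns immediately. A user-space sketch of the same guard using C11 atomics (illustrative only; the kernel path uses os_atomic_cmpxchg and the real VSOCK_PROTO_MAX):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PROTO_MAX 2                        /* stand-in for VSOCK_PROTO_MAX */

static atomic_int initialized[PROTO_MAX];  /* mirrors vsock_initialized[] */

static bool
init_once(int protocol, void (*setup)(int))
{
    int expected = 0;
    /* Only the winner of the 0 -> 1 race runs setup(); everyone else bails. */
    if (!atomic_compare_exchange_strong(&initialized[protocol], &expected, 1)) {
        return false;
    }
    setup(protocol);
    return true;
}

static void setup_proto(int protocol) { printf("initialized protocol %d\n", protocol); }

int
main(void)
{
    init_once(0, setup_proto);             /* runs setup */
    init_once(0, setup_proto);             /* second call is a no-op */
    return 0;
}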
static int
@@ -1444,8 +1557,10 @@ vsock_sofreelastref(struct socket *so, int dealloc)
}
static int
vsock_unlock(struct socket *so, int refcount, void *lr_saved)
vsock_unlock(struct socket *_Nonnull so, int refcount, void *lr_saved)
{
VERIFY(so != NULL);
lck_mtx_t *mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
@@ -1472,14 +1587,22 @@ vsock_unlock(struct socket *so, int refcount, void *lr_saved)
return 0;
}
static struct protosw vsocksw[] = {
static struct protosw vsocksw[VSOCK_PROTO_MAX] = {
{
.pr_type = SOCK_STREAM,
.pr_protocol = 0,
.pr_protocol = VSOCK_PROTO_STANDARD,
.pr_flags = PR_CONNREQUIRED | PR_WANTRCVD,
.pr_init = vsock_init,
.pr_unlock = vsock_unlock,
.pr_usrreqs = &vsock_usrreqs,
},
{
.pr_type = SOCK_STREAM,
.pr_protocol = VSOCK_PROTO_PRIVATE,
.pr_flags = PR_CONNREQUIRED | PR_WANTRCVD,
.pr_init = vsock_private_init,
.pr_unlock = vsock_unlock,
.pr_usrreqs = &vsock_usrreqs,
}
};
@@ -1490,15 +1613,21 @@ static const int vsock_proto_count = (sizeof(vsocksw) / sizeof(struct protosw));
static struct domain *vsock_domain = NULL;
static void
vsock_dinit(struct domain *dp)
vsock_dinit(struct domain *_Nonnull dp)
{
// The VSock domain is initialized with a singleton pattern.
VERIFY(dp != NULL);
VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
VERIFY(vsock_domain == NULL);
vsock_domain = dp;
const uint32_t default_buffer_size = VSOCK_MAX_PACKET_SIZE * 8;
// Add protocols and initialize.
for (int i = 0; i < vsock_proto_count; i++) {
vsock_sendspace[i] = default_buffer_size;
vsock_recvspace[i] = default_buffer_size;
net_add_proto((struct protosw *)&vsocksw[i], dp, 1);
}
}
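
Finally, a hedged user-space sketch of driving the protocol registered above: open a standard vsock stream socket and ask the transport for the local CID through the single ioctl vsock_control accepts. The header path and constant names are assumed to match Darwin's <sys/vsock.h>; protocol 0 selects the default namespace, and the ioctl fails with ENODEV when no transport is loaded.

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/vsock.h>       /* assumed to provide AF_VSOCK and IOCTL_VM_SOCKETS_GET_LOCAL_CID */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    int fd = socket(AF_VSOCK, SOCK_STREAM, 0);    /* 0: the standard vsock protocol */
    if (fd < 0) {
        perror("socket(AF_VSOCK)");
        return 1;
    }
    uint32_t cid = 0;
    if (ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid) == 0) {
        printf("local CID: %u\n", cid);
    } else {
        perror("ioctl(IOCTL_VM_SOCKETS_GET_LOCAL_CID)");
    }
    close(fd);
    return 0;
}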

View file

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2000-2018 Apple Inc. All rights reserved.
* Copyright (c) 2000-2024 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@@ -43,6 +43,8 @@ struct exec_info {
};
int grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool allow_simulator_binary);
int binary_grade_overrides_update(char *overrides_arg);
size_t bingrade_get_override_string(char *existing_overrides, size_t existing_overrides_bufsize);
boolean_t binary_match(cpu_type_t mask_bits, cpu_type_t req_cpu,
cpu_subtype_t req_subcpu, cpu_type_t test_cpu,
cpu_subtype_t test_subcpu);

Some files were not shown because too many files have changed in this diff.