diff --git a/CHANGES b/CHANGES index aa7f126fdd..5c8a07b00d 100644 --- a/CHANGES +++ b/CHANGES @@ -2,6 +2,32 @@ The list of most significant changes made over time in Intel(R) Threading Building Blocks (Intel(R) TBB). +Intel TBB 2018 Update 2 +TBB_INTERFACE_VERSION == 10002 + +Changes (w.r.t. Intel TBB 2018 Update 1): + +- Added support for Android* NDK r16, macOS* 10.13, Fedora* 26. +- Binaries for Universal Windows Driver (vc14_uwd) now link with static + Microsoft* runtime libraries, and are only available in commercial + releases. +- Extended flow graph documentation with more code samples. + +Preview Features: + +- Added a Python* module for multi-processing computations in numeric + Python* libraries. + +Bugs fixed: + +- Fixed constructors of concurrent_hash_map to be exception-safe. +- Fixed auto-initialization in the main thread to be cleaned up at + shutdown. +- Fixed a crash when tbbmalloc_proxy is used together with dbghelp. +- Fixed static_partitioner to assign tasks properly in case of nested + parallelism. + +------------------------------------------------------------------------ Intel TBB 2018 Update 1 TBB_INTERFACE_VERSION == 10001 diff --git a/Makefile b/Makefile index 40ca412a43..4efc537a9c 100644 --- a/Makefile +++ b/Makefile @@ -50,13 +50,11 @@ rml: mkdir $(MAKE) -C "$(work_dir)_debug" -r -f $(tbb_root)/build/Makefile.rml cfg=debug $(MAKE) -C "$(work_dir)_release" -r -f $(tbb_root)/build/Makefile.rml cfg=release - examples: tbb tbbmalloc $(MAKE) -C examples -r -f Makefile tbb_root=.. release test -python: mkdir - $(MAKE) -C "$(work_dir)_release" -r -f $(tbb_root)/build/Makefile.tbb cfg=release - bash -c ". $(work_dir)_release$(SLASH)tbbvars.sh && $(MAKE) -rC '$(full_tbb_root)/python' CXX=$(compiler) install test-install" +python: tbb + $(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/Makefile install .PHONY: clean clean_examples mkdir info diff --git a/README.md b/README.md index 3e863874ee..2678e2e5aa 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# Intel(R) Threading Building Blocks 2018 Update 1 -[![Stable release](https://img.shields.io/badge/version-2018_U1-green.svg)](https://github.com/01org/tbb/releases/tag/2018_U1) +# Intel(R) Threading Building Blocks 2018 Update 2 +[![Stable release](https://img.shields.io/badge/version-2018_U2-green.svg)](https://github.com/01org/tbb/releases/tag/2018_U2) [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE) Intel(R) Threading Building Blocks (Intel(R) TBB) lets you easily write parallel C++ programs that take diff --git a/build/android.clang.inc b/build/android.clang.inc index a935968c9c..667c21a4a4 100644 --- a/build/android.clang.inc +++ b/build/android.clang.inc @@ -68,8 +68,20 @@ ifeq (0, $(dynamic_load)) endif # Paths to the NDK prebuilt tools and libraries -CPLUS_FLAGS += --sysroot=$(SYSROOT) +ifneq (,$(findstring $(ndk_version),r16 r16b)) + # Since Android* NDK r16 another sysroot and isystem paths have to be specified + CPLUS_FLAGS += --sysroot=$(NDK_ROOT)/sysroot -isystem $(NDK_ROOT)/sysroot/usr/include/$(TRIPLE) + # Android* version flag required since r16 + CPLUS_FLAGS += -D__ANDROID_API__=$(API_LEVEL) +else + CPLUS_FLAGS += --sysroot=$(SYSROOT) +endif + +# Library sysroot flag LIB_LINK_FLAGS += --sysroot=$(SYSROOT) +# Flag for test executables +LINK_FLAGS += --sysroot=$(SYSROOT) + LIBS = -L$(CPLUS_LIB_PATH) -lc++_shared ifeq (,$(findstring $(ndk_version),$(foreach v, 7 8 9 10 11,r$(v) r$(v)b r$(v)c r$(v)d r$(v)e))) LIBS += -lc++abi diff --git a/build/build.py b/build/build.py new file mode 100644 index 0000000000..7fbf21a100 --- /dev/null +++ b/build/build.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python +# +# Copyright (c) 2017 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +# + +# Provides unified tool for preparing TBB for packaging + +from __future__ import print_function +import os +import re +import sys +import shutil +import platform +import argparse +from glob import glob +from collections import OrderedDict + +jp = os.path.join +is_win = (platform.system() == 'Windows') +is_lin = (platform.system() == 'Linux') +is_mac = (platform.system() == 'Darwin') + +default_prefix = os.getenv('PREFIX', 'install_prefix') +if is_win: + default_prefix = jp(default_prefix, 'Library') # conda-specific by default on Windows + +parser = argparse.ArgumentParser() +parser.add_argument('--tbbroot', default='.', help='Take Intel TBB from here') +parser.add_argument('--prefix', default=default_prefix, help='Prefix') +parser.add_argument('--no-rebuild', default=False, action='store_true', help='do not rebuild') +parser.add_argument('--install', default=False, action='store_true', help='install all') +parser.add_argument('--install-libs', default=False, action='store_true', help='install libs') +parser.add_argument('--install-devel', default=False, action='store_true', help='install devel') +parser.add_argument('--install-docs', default=False, action='store_true', help='install docs') +parser.add_argument('--install-python',default=False, action='store_true', help='install python module') +parser.add_argument('--make-tool', default='make', help='Use different make command instead') +parser.add_argument('--copy-tool', default=None, help='Use this command for copying ($ tool file dest-dir)') +parser.add_argument('--build-args', default="", help='specify extra build args') +parser.add_argument('--build-prefix', default='local', help='build dir prefix') +if is_win: + parser.add_argument('--msbuild', default=False, action='store_true', help='Use msbuild') + parser.add_argument('--vs', default="2012", help='select VS version for build') + parser.add_argument('--vs-platform', default="x64", help='select VS platform for build') +parser.add_argument('ignore', nargs='?', help="workaround conda-build issue #2512") + +args = parser.parse_args() + +if args.install: + args.install_libs = True + args.install_devel = True + args.install_docs = True + args.install_python= True + +def custom_cp(src, dst): + assert os.system(' '.join([args.copy_tool, src, dst])) == 0 + +if args.copy_tool: + install_cp = custom_cp # e.g. to use install -p -D -m 755 on Linux +else: + install_cp = shutil.copy + +bin_dir = jp(args.prefix, "bin") +lib_dir = jp(args.prefix, "lib") +inc_dir = jp(args.prefix, 'include') +doc_dir = jp(args.prefix, 'share', 'doc', 'tbb') +if is_win: + os.environ["OS"] = "Windows_NT" # make sure TBB will interpret it corretly + libext = '.dll' + libpref = '' + dll_dir = bin_dir +else: + libext = '.dylib' if is_mac else '.so.2' + libpref = 'lib' + dll_dir = lib_dir + +tbb_names = ["tbb", "tbbmalloc", "tbbmalloc_proxy"] + +############################################################## + +os.chdir(args.tbbroot) +if is_win and args.msbuild: + preview_release_dir = release_dir = jp(args.tbbroot, 'build', 'vs'+args.vs, args.vs_platform, 'Release') + if not args.no_rebuild or not os.path.isdir(release_dir): + assert os.system('msbuild /m /p:Platform=%s /p:Configuration=Release %s build/vs%s/makefile.sln'% \ + (args.vs_platform, args.build_args, args.vs)) == 0 + preview_debug_dir = debug_dir = jp(args.tbbroot, 'build', 'vs'+args.vs, args.vs_platform, 'Debug') + if not args.no_rebuild or not os.path.isdir(debug_dir): + assert os.system('msbuild /m /p:Platform=%s /p:Configuration=Debug %s build/vs%s/makefile.sln'% \ + (args.vs_platform, args.build_args, args.vs)) == 0 +else: + release_dir = jp(args.tbbroot, 'build', args.build_prefix+'_release') + debug_dir = jp(args.tbbroot, 'build', args.build_prefix+'_debug') + if not args.no_rebuild or not (os.path.isdir(release_dir) and os.path.isdir(debug_dir)): + assert os.system('%s -j tbb_build_prefix=%s %s'% \ + (args.make_tool, args.build_prefix, args.build_args)) == 0 + preview_release_dir = jp(args.tbbroot, 'build', args.build_prefix+'_preview_release') + preview_debug_dir = jp(args.tbbroot, 'build', args.build_prefix+'_preview_debug') + if not args.no_rebuild or not (os.path.isdir(preview_release_dir) and os.path.isdir(preview_debug_dir)): + assert os.system('%s -j tbb_build_prefix=%s_preview %s tbb_cpf=1 tbb'% \ + (args.make_tool, args.build_prefix, args.build_args)) == 0 + + +filemap = OrderedDict() +def append_files(files, dst): + global filemap + filemap.update(dict(zip(files, [dst]*len(files)))) + +if args.install_libs: + files = [jp(release_dir, libpref+f+libext) for f in tbb_names] + append_files(files, dll_dir) + +if args.install_devel: + dll_files = [jp(debug_dir, libpref+f+'_debug'+libext) for f in tbb_names] # adding debug libraries + if not is_win or not args.msbuild: + dll_files += [jp(preview_release_dir, libpref+"tbb_preview"+libext), + jp(preview_debug_dir, libpref+"tbb_preview_debug"+libext)] + if is_win: + dll_files += sum( [glob(jp(d, 'tbb*.pdb')) for d in # copying debug info + (release_dir, debug_dir, preview_release_dir, preview_debug_dir)], []) + if is_lin: + dll_files += sum( [glob(jp(d, 'libtbb*.so')) for d in # copying linker scripts + (release_dir, debug_dir, preview_release_dir, preview_debug_dir)], []) + # symlinks .so -> .so.2 should not be created instead + # since linking with -ltbb when using links can result in + # incorrect dependence upon unversioned .so files + append_files(dll_files, dll_dir) + if is_win: + lib_files = sum([glob(jp(d,e)) for d in (release_dir, debug_dir) for e in ('*.lib', '*.def')], []) + append_files(lib_files, lib_dir) # copying linker libs and defs + for rootdir, dirnames, filenames in os.walk(jp(args.tbbroot,'include')): + files = [jp(rootdir, f) for f in filenames if not '.html' in f] + append_files(files, jp(inc_dir, rootdir.split('include')[1][1:])) + +if args.install_python: + assert os.system('%s -j tbb_build_prefix=%s %s python'% \ + (args.make_tool, args.build_prefix, args.build_args)) == 0 + if is_lin: + append_files([jp(release_dir, 'libirml.so.1')], dll_dir) + +if args.install_docs: + files = [jp(args.tbbroot, *f) for f in ( + ('CHANGES',), + ('LICENSE',), + ('README',), + ('README.md',), + ('doc','Release_Notes.txt'), + )] + append_files(files, doc_dir) + +for f in filemap.keys(): + assert os.path.exists(f) + assert os.path.isfile(f) + +if filemap: + print("Copying to prefix =", args.prefix) +for f, dest in filemap.items(): + if not os.path.isdir(dest): + os.makedirs(dest) + print("+ %s to $prefix%s"%(f,dest.replace(args.prefix, ''))) + install_cp(f, dest) + +print("done") diff --git a/build/common_rules.inc b/build/common_rules.inc index 3922413df3..809d5c25c8 100644 --- a/build/common_rules.inc +++ b/build/common_rules.inc @@ -63,7 +63,14 @@ ifeq ($(origin LIB_LINK_LIBS), undefined) LIB_LINK_LIBS = $(LIBDL) $(LIBS) endif +# Define C & C++ compilers according to platform defaults or CXX & CC environment variables +ifneq (,$(findstring environment, $(origin CXX))) +CPLUS = $(CXX) +endif CONLY ?= $(CPLUS) +ifneq (,$(findstring environment, $(origin CC))) +CONLY = $(CC) +endif # The most generic rules #$(1) - is the target pattern diff --git a/build/macos.clang.inc b/build/macos.clang.inc index 3237705343..63ee41e2d9 100644 --- a/build/macos.clang.inc +++ b/build/macos.clang.inc @@ -44,7 +44,7 @@ else CPLUS_FLAGS = -g -O0 -DTBB_USE_DEBUG endif -CPLUS_FLAGS += -DUSE_PTHREAD +CPLUS_FLAGS += -DUSE_PTHREAD $(ITT_NOTIFY) # For Clang, we add the option to support RTM intrinsics *iff* xtest is found in ifneq (,$(shell grep xtest `echo "\#include" | clang -E -M - 2>&1 | grep immintrin.h` 2>/dev/null)) @@ -57,12 +57,14 @@ ifneq (,$(stdlib)) endif ifeq (intel64,$(arch)) + ITT_NOTIFY = -DDO_ITT_NOTIFY CPLUS_FLAGS += -m64 $(RTM_KEY) LINK_FLAGS += -m64 LIB_LINK_FLAGS += -m64 endif ifeq (ia32,$(arch)) + ITT_NOTIFY = -DDO_ITT_NOTIFY CPLUS_FLAGS += -m32 $(RTM_KEY) LINK_FLAGS += -m32 LIB_LINK_FLAGS += -m32 diff --git a/build/macos.gcc.inc b/build/macos.gcc.inc index 65c7047cb1..3890e9fdcc 100644 --- a/build/macos.gcc.inc +++ b/build/macos.gcc.inc @@ -44,15 +44,17 @@ else CPLUS_FLAGS = -g -O0 -DTBB_USE_DEBUG endif -CPLUS_FLAGS += -DUSE_PTHREAD +CPLUS_FLAGS += -DUSE_PTHREAD $(ITT_NOTIFY) ifeq (intel64,$(arch)) + ITT_NOTIFY = -DDO_ITT_NOTIFY CPLUS_FLAGS += -m64 LINK_FLAGS += -m64 LIB_LINK_FLAGS += -m64 endif ifeq (ia32,$(arch)) + ITT_NOTIFY = -DDO_ITT_NOTIFY CPLUS_FLAGS += -m32 LINK_FLAGS += -m32 LIB_LINK_FLAGS += -m32 diff --git a/build/macos.icc.inc b/build/macos.icc.inc index 92a339b29c..c7dafe6f34 100644 --- a/build/macos.icc.inc +++ b/build/macos.icc.inc @@ -58,7 +58,8 @@ else CPLUS_FLAGS = -g -O0 -DTBB_USE_DEBUG endif -CPLUS_FLAGS += -DUSE_PTHREAD +ITT_NOTIFY = -DDO_ITT_NOTIFY +CPLUS_FLAGS += -DUSE_PTHREAD $(ITT_NOTIFY) ifneq (,$(codecov)) CPLUS_FLAGS += -prof-gen=srcpos diff --git a/build/windows.cl.inc b/build/windows.cl.inc index 3906d08676..bfc7d2648b 100644 --- a/build/windows.cl.inc +++ b/build/windows.cl.inc @@ -43,6 +43,11 @@ else endif EH_FLAGS = $(if $(no_exceptions),/EHs-,/EHsc /GR) +# UWD binaries have to use static CRT linkage +ifeq ($(target_app), uwd) + MS_CRT_KEY = /MT$(if $(findstring debug,$(cfg)),d) +endif + ifeq ($(cfg), release) CPLUS_FLAGS = $(MS_CRT_KEY) /O2 /Zi $(EH_FLAGS) /Zc:forScope /Zc:wchar_t /D__TBB_LIB_NAME=$(TBB.LIB) ASM_FLAGS = @@ -54,17 +59,20 @@ endif ZW_KEY = /ZW:nostdlib -ifneq (,$(filter win8ui,$(target_app) $(target_ui))) - CPLUS_FLAGS += $(ZW_KEY) /D "_UNICODE" /D "UNICODE" /D "WINAPI_FAMILY=WINAPI_FAMILY_APP" +# These flags are general for Windows* universal applications +ifneq (,$(target_app)) + CPLUS_FLAGS += $(ZW_KEY) /D "_UNICODE" /D "UNICODE" /D "WINAPI_FAMILY=WINAPI_FAMILY_APP" +endif + +ifeq ($(target_app), win8ui) _WIN32_WINNT = 0x0602 -else ifneq (,$(filter uwp,$(target_app) $(target_ui))) - CPLUS_FLAGS += $(ZW_KEY) /D "_UNICODE" /D "UNICODE" /D "WINAPI_FAMILY=WINAPI_FAMILY_APP" +else ifneq (,$(filter $(target_app),uwp uwd)) _WIN32_WINNT = 0x0A00 LIB_LINK_FLAGS += /NODEFAULTLIB:kernel32.lib OneCore.lib else CPLUS_FLAGS += /DDO_ITT_NOTIFY endif -ifneq (,$(filter store,$(target_mode) $(target_ui_mode))) +ifeq ($(target_mode), store) # it is necessary to source vcvars with 'store' argument in production LIB_LINK_FLAGS += /APPCONTAINER endif diff --git a/doc/Release_Notes.txt b/doc/Release_Notes.txt index 2c352ebdc7..8ed04a1b24 100644 --- a/doc/Release_Notes.txt +++ b/doc/Release_Notes.txt @@ -66,7 +66,7 @@ Software - Supported Operating Systems Systems with Linux* operating systems CentOS 7.1 Debian* 8, 9 - Fedora* 24, 25 + Fedora* 24, 25, 26 Intel(R) Cluster Ready Red Hat* Enterprise Linux* 6, 7 SuSE* Linux* Enterprise Server 11, 12 @@ -75,9 +75,9 @@ Software - Supported Operating Systems Yocto 2.2, 2.3 Systems with OS X* or macOS* operating systems OS X* 10.10, 10.11 - macOS* 10.12 + macOS* 10.12, 10.13 Systems with Android* operating systems - Android* 5.x, 6.x, 7.x + Android* 5.x, 6.x, 7.x, 8.x Software - Supported Compilers @@ -94,8 +94,8 @@ Software - Supported Compilers version provided with that operating system is supported GNU Compilers (gcc) 4.1 - 7.1 GNU C Library (glibc) version 2.4 - 2.19 - Xcode* 6.3 - 8.3 - Android* NDK r10e - r15b + Xcode* 6.3 - 9.1 + Android* NDK r10e - r16 Software - Supported Performance Analysis Tools diff --git a/doc/html/a00040.html b/doc/html/a00040.html index 65836598ca..c92ed3a67c 100644 --- a/doc/html/a00040.html +++ b/doc/html/a00040.html @@ -349,10 +349,10 @@ void internal_copy (const concurrent_hash_map &source)  Copy "source" to *this, where *this must start out empty.
  - + template<typename I > -void internal_copy (I first, I last) -  +void internal_copy (I first, I last, size_type reserve_size) +  const_pointer internal_fast_find (const Key &key) const  Fast find when no concurrent erasure is used. For internal use inside TBB only! More...
  diff --git a/doc/html/a00361.html b/doc/html/a00361.html index 9edce96c2f..72ba936c39 100644 --- a/doc/html/a00361.html +++ b/doc/html/a00361.html @@ -101,7 +101,7 @@ internal::hash_map_iterator (defined in tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >)tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >friend internal::hash_map_range (defined in tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >)tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >friend internal_copy(const concurrent_hash_map &source)tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >protected - internal_copy(I first, I last) (defined in tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >)tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >protected + internal_copy(I first, I last, size_type reserve_size) (defined in tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >)tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >protected internal_equal_range(const Key &key, I end) const tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >protected internal_fast_find(const Key &key) const tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >inlineprotected is_write_access_needed (defined in tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >)tbb::interface5::concurrent_hash_map< Key, T, HashCompare, A >friend diff --git a/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat b/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat index 1322d0fd5f..9f7216803b 100644 --- a/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat +++ b/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat @@ -54,7 +54,7 @@ copy "%TBBROOT%\%interim_path%\%vc_dir%\tbbmalloc%postfix%.pdb" "%output_dir%" copy "%TBBROOT%\%interim_lib_path%\%vc_dir%\tbb%postfix%.lib" "%output_dir%" :: Copying DAT-file -echo Using DAT-file %dat_file% +echo Using DAT-file %dat_file% if exist %dat_file% copy %dat_file% "%output_dir%\Assets\balls.dat" goto end diff --git a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln index a7372953a8..ed64646af5 100644 --- a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln +++ b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln @@ -1,6 +1,8 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2012 +# Visual Studio 2013 +VisualStudioVersion = 12.0.40629.0 +MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tbbTachyon", "tbbTachyon.vcxproj", "{E20CB432-6730-4021-A372-1C81A333518A}" EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{E6DDEA39-7910-47F9-A0E3-56AD7E62ACBD}" diff --git a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj index 076f67ce9f..43f1c96775 100644 --- a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj +++ b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -30,27 +30,27 @@ Application true - v110 + v120 true Application true - v110 + v120 true Application false false - v110 + v120 true Application false false - v110 + v120 true @@ -236,4 +236,4 @@ - \ No newline at end of file + diff --git a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters index 56341ace16..54c9254155 100644 --- a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters +++ b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters @@ -1,5 +1,5 @@  - + Resources\Common @@ -212,4 +212,4 @@ Frontend - \ No newline at end of file + diff --git a/include/tbb/blocked_range2d.h b/include/tbb/blocked_range2d.h index 1e3dfd1711..0ba40794ce 100644 --- a/include/tbb/blocked_range2d.h +++ b/include/tbb/blocked_range2d.h @@ -55,7 +55,7 @@ class blocked_range2d { //! True if range is empty bool empty() const { - // Yes, it is a logical OR here, not AND. + // Range is empty if at least one dimension is empty. return my_rows.empty() || my_cols.empty(); } diff --git a/include/tbb/blocked_range3d.h b/include/tbb/blocked_range3d.h index 5f3b3c860f..1c8b2a8b62 100644 --- a/include/tbb/blocked_range3d.h +++ b/include/tbb/blocked_range3d.h @@ -61,7 +61,7 @@ class blocked_range3d { //! True if range is empty bool empty() const { - // Yes, it is a logical OR here, not AND. + // Range is empty if at least one dimension is empty. return my_pages.empty() || my_rows.empty() || my_cols.empty(); } diff --git a/include/tbb/concurrent_hash_map.h b/include/tbb/concurrent_hash_map.h index 52e41de56b..f75ec0563b 100644 --- a/include/tbb/concurrent_hash_map.h +++ b/include/tbb/concurrent_hash_map.h @@ -770,7 +770,9 @@ class concurrent_hash_map : protected internal::hash_map_base { concurrent_hash_map( const concurrent_hash_map &table, const allocator_type &a = allocator_type() ) : internal::hash_map_base(), my_allocator(a) { + call_clear_on_leave scope_guard(this); internal_copy(table); + scope_guard.dismiss(); } #if __TBB_CPP11_RVALUE_REF_PRESENT @@ -789,7 +791,7 @@ class concurrent_hash_map : protected internal::hash_map_base { this->swap(table); }else{ call_clear_on_leave scope_guard(this); - internal_copy(std::make_move_iterator(table.begin()), std::make_move_iterator(table.end())); + internal_copy(std::make_move_iterator(table.begin()), std::make_move_iterator(table.end()), table.size()); scope_guard.dismiss(); } } @@ -800,8 +802,9 @@ class concurrent_hash_map : protected internal::hash_map_base { concurrent_hash_map( I first, I last, const allocator_type &a = allocator_type() ) : my_allocator(a) { - reserve( std::distance(first, last) ); // TODO: load_factor? - internal_copy(first, last); + call_clear_on_leave scope_guard(this); + internal_copy(first, last, std::distance(first, last)); + scope_guard.dismiss(); } #if __TBB_INITIALIZER_LISTS_PRESENT @@ -809,8 +812,9 @@ class concurrent_hash_map : protected internal::hash_map_base { concurrent_hash_map( std::initializer_list il, const allocator_type &a = allocator_type() ) : my_allocator(a) { - reserve(il.size()); - internal_copy(il.begin(), il.end()); + call_clear_on_leave scope_guard(this); + internal_copy(il.begin(), il.end(), il.size()); + scope_guard.dismiss(); } #endif //__TBB_INITIALIZER_LISTS_PRESENT @@ -847,8 +851,7 @@ class concurrent_hash_map : protected internal::hash_map_base { //! Assignment concurrent_hash_map& operator=( std::initializer_list il ) { clear(); - reserve(il.size()); - internal_copy(il.begin(), il.end()); + internal_copy(il.begin(), il.end(), il.size()); return *this; } #endif //__TBB_INITIALIZER_LISTS_PRESENT @@ -1073,7 +1076,7 @@ class concurrent_hash_map : protected internal::hash_map_base { void internal_copy( const concurrent_hash_map& source ); template - void internal_copy( I first, I last ); + void internal_copy( I first, I last, size_type reserve_size ); //! Fast find when no concurrent erasure is used. For internal use inside TBB only! /** Return pointer to item with given key, or NULL if no such item exists. @@ -1429,9 +1432,9 @@ void concurrent_hash_map::clear() { template void concurrent_hash_map::internal_copy( const concurrent_hash_map& source ) { - reserve( source.my_size ); // TODO: load_factor? hashcode_t mask = source.my_mask; if( my_mask == mask ) { // optimized version + reserve( source.my_size ); // TODO: load_factor? bucket *dst = 0, *src = 0; bool rehash_required = false; for( hashcode_t k = 0; k <= mask; k++ ) { @@ -1448,12 +1451,13 @@ void concurrent_hash_map::internal_copy( const concurrent_h } } if( rehash_required ) rehash(); - } else internal_copy( source.begin(), source.end() ); + } else internal_copy( source.begin(), source.end(), source.my_size ); } template template -void concurrent_hash_map::internal_copy(I first, I last) { +void concurrent_hash_map::internal_copy(I first, I last, size_type reserve_size) { + reserve( reserve_size ); // TODO: load_factor? hashcode_t m = my_mask; for(; first != last; ++first) { hashcode_t h = my_hash_compare.hash( (*first).first ); diff --git a/include/tbb/flow_graph.h b/include/tbb/flow_graph.h index fb839ac184..2672349ef5 100644 --- a/include/tbb/flow_graph.h +++ b/include/tbb/flow_graph.h @@ -34,7 +34,6 @@ #include "internal/_aggregator_impl.h" #include "tbb_profiling.h" #include "task_arena.h" -#include "flow_graph_abstractions.h" #if __TBB_PREVIEW_ASYNC_MSG #include // std::vector in internal::async_storage @@ -761,7 +760,15 @@ inline graph::graph(task_group_context& use_this_context) : my_is_active = true; } -inline void graph::reserve_wait() { +inline graph::~graph() { + wait_for_all(); + my_root_task->set_ref_count(0); + tbb::task::destroy(*my_root_task); + if (own_context) delete my_context; + delete my_task_arena; +} + +inline void graph::reserve_wait() { if (my_root_task) { my_root_task->increment_ref_count(); tbb::internal::fgt_reserve_wait(this); @@ -821,12 +828,32 @@ inline void graph::reset( reset_flags f ) { my_reset_task_list.clear(); } +inline graph::iterator graph::begin() { return iterator(this, true); } + +inline graph::iterator graph::end() { return iterator(this, false); } + +inline graph::const_iterator graph::begin() const { return const_iterator(this, true); } + +inline graph::const_iterator graph::end() const { return const_iterator(this, false); } + +inline graph::const_iterator graph::cbegin() const { return const_iterator(this, true); } + +inline graph::const_iterator graph::cend() const { return const_iterator(this, false); } + #if TBB_PREVIEW_FLOW_GRAPH_TRACE inline void graph::set_name(const char *name) { tbb::internal::fgt_graph_desc(this, name); } #endif +inline graph_node::graph_node(graph& g) : my_graph(g) { + my_graph.register_node(this); +} + +inline graph_node::~graph_node() { + my_graph.remove_node(this); +} + #include "internal/_flow_graph_node_impl.h" //! An executable node that acts as a source, i.e. it has no predecessors diff --git a/include/tbb/flow_graph_opencl_node.h b/include/tbb/flow_graph_opencl_node.h index bf541a10fd..812726fb15 100644 --- a/include/tbb/flow_graph_opencl_node.h +++ b/include/tbb/flow_graph_opencl_node.h @@ -457,7 +457,7 @@ class opencl_async_msg : public async_msg { operator const T&() const { return data(); } protected: - // Overridden in this derived class to inform that + // Overridden in this derived class to inform that // async calculation chain is over void finalize() const __TBB_override { receive_if_memory_object(*this); @@ -530,10 +530,10 @@ class opencl_memory { opencl_async_msg receive(const cl_event *e) { opencl_async_msg d; - if (e) { + if (e) { d = opencl_async_msg(my_host_ptr, *e); - } else { - d = opencl_async_msg(my_host_ptr); + } else { + d = opencl_async_msg(my_host_ptr); } // Concurrent receives are prohibited so we do not worry about synchronization. diff --git a/include/tbb/internal/_flow_graph_impl.h b/include/tbb/internal/_flow_graph_impl.h index ffdd57be83..6cb706e842 100644 --- a/include/tbb/internal/_flow_graph_impl.h +++ b/include/tbb/internal/_flow_graph_impl.h @@ -206,13 +206,7 @@ class graph : tbb::internal::no_copy, public tbb::flow::graph_proxy { //! Destroys the graph. /** Calls wait_for_all, then destroys the root task and context. */ - ~graph() { - wait_for_all(); - my_root_task->set_ref_count(0); - tbb::task::destroy(*my_root_task); - if (own_context) delete my_context; - delete my_task_arena; - } + ~graph(); #if TBB_PREVIEW_FLOW_GRAPH_TRACE void set_name(const char *name); @@ -305,17 +299,17 @@ class graph : tbb::internal::no_copy, public tbb::flow::graph_proxy { // Graph iterator constructors //! start iterator - iterator begin() { return iterator(this, true); } + iterator begin(); //! end iterator - iterator end() { return iterator(this, false); } + iterator end(); //! start const iterator - const_iterator begin() const { return const_iterator(this, true); } + const_iterator begin() const; //! end const iterator - const_iterator end() const { return const_iterator(this, false); } + const_iterator end() const; //! start const iterator - const_iterator cbegin() const { return const_iterator(this, true); } + const_iterator cbegin() const; //! end const iterator - const_iterator cend() const { return const_iterator(this, false); } + const_iterator cend() const; //! return status of graph execution bool is_cancelled() { return cancelled; } @@ -361,12 +355,9 @@ class graph_node : tbb::internal::no_copy { graph& my_graph; graph_node *next, *prev; public: - explicit graph_node(graph& g) : my_graph(g) { - my_graph.register_node(this); - } - virtual ~graph_node() { - my_graph.remove_node(this); - } + explicit graph_node(graph& g); + + virtual ~graph_node(); #if TBB_PREVIEW_FLOW_GRAPH_TRACE virtual void set_name(const char *name) = 0; diff --git a/include/tbb/internal/_flow_graph_trace_impl.h b/include/tbb/internal/_flow_graph_trace_impl.h index 111810c3b2..328f378a64 100644 --- a/include/tbb/internal/_flow_graph_trace_impl.h +++ b/include/tbb/internal/_flow_graph_trace_impl.h @@ -88,7 +88,10 @@ static inline void fgt_internal_create_output_port( void *node, void *p, string_ template void register_input_port(void *node, tbb::flow::receiver* port, string_index name_index) { // TODO: Make fgt_internal_create_input_port a function template? - fgt_internal_create_input_port( node, port, name_index); + // In C++03 dependent name lookup from the template definition context + // works only for function declarations with external linkage: + // http://www.open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#561 + fgt_internal_create_input_port(node, static_cast(port), name_index); } template < typename PortsTuple, int N > @@ -239,7 +242,7 @@ static inline void fgt_async_reserve( void *node, void *graph ) { itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); } -static inline void fgt_async_commit( void *node, void *graph ) { +static inline void fgt_async_commit( void *node, void */*graph*/) { itt_region_end( ITT_DOMAIN_FLOW, node, FLOW_NODE ); } diff --git a/include/tbb/partitioner.h b/include/tbb/partitioner.h index d7ebdbfed4..80006adbb4 100644 --- a/include/tbb/partitioner.h +++ b/include/tbb/partitioner.h @@ -50,6 +50,7 @@ #endif // __TBB_DEFINE_MIC #include "task.h" +#include "task_arena.h" #include "aligned_space.h" #include "atomic.h" #include "internal/_template_helpers.h" @@ -349,16 +350,25 @@ struct proportional_mode : adaptive_mode { #endif // warning 4127 is back }; +static size_t get_initial_partition_head() { + int current_index = tbb::this_task_arena::current_thread_index(); + if (current_index == tbb::task_arena::not_initialized) + current_index = 0; + return size_t(current_index); +} + //! Provides default linear indexing of partitioner's sequence template struct linear_affinity_mode : proportional_mode { size_t my_head; + size_t my_max_affinity; using proportional_mode::self; - linear_affinity_mode() : proportional_mode(), my_head(0) {} + linear_affinity_mode() : proportional_mode(), my_head(get_initial_partition_head()), + my_max_affinity(self().my_divisor) {} linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode(src, split()) - , my_head(src.my_head + src.my_divisor) {} + , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode(src, split_obj) - , my_head(src.my_head + src.my_divisor) {} + , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} void set_affinity( task &t ) { if( self().my_divisor ) t.set_affinity( affinity_id(my_head) + 1 ); diff --git a/include/tbb/tbb_config.h b/include/tbb/tbb_config.h index 3ac2b3e036..d8e0e20166 100644 --- a/include/tbb/tbb_config.h +++ b/include/tbb/tbb_config.h @@ -522,7 +522,8 @@ There are four cases that are supported: #endif /* TBB_PREVIEW_FLOW_GRAPH_TRACE */ #ifndef __TBB_ITT_STRUCTURE_API -#define __TBB_ITT_STRUCTURE_API ( !__TBB_DEFINE_MIC && (__TBB_CPF_BUILD || TBB_PREVIEW_FLOW_GRAPH_TRACE || TBB_PREVIEW_ALGORITHM_TRACE) ) +#define __TBB_ITT_STRUCTURE_API ( (__TBB_CPF_BUILD || TBB_PREVIEW_FLOW_GRAPH_TRACE || TBB_PREVIEW_ALGORITHM_TRACE) \ + && !(__TBB_DEFINE_MIC || __MINGW64__ || __MINGW32__) ) #endif #if TBB_USE_EXCEPTIONS && !__TBB_TASK_GROUP_CONTEXT diff --git a/include/tbb/tbb_stddef.h b/include/tbb/tbb_stddef.h index b4ee781010..9f7d51b536 100644 --- a/include/tbb/tbb_stddef.h +++ b/include/tbb/tbb_stddef.h @@ -26,7 +26,7 @@ #define TBB_VERSION_MINOR 0 // Engineering-focused interface version -#define TBB_INTERFACE_VERSION 10001 +#define TBB_INTERFACE_VERSION 10002 #define TBB_INTERFACE_VERSION_MAJOR TBB_INTERFACE_VERSION/1000 // The oldest major interface version still supported @@ -155,8 +155,8 @@ namespace tbb { #if TBB_USE_ASSERT - //! Assert that x is true. - /** If x is false, print assertion failure message. + //! Assert that predicate is true. + /** If predicate is false, print assertion failure message. If the comment argument is not NULL, it is printed as part of the failure message. The comment argument has no other effect. */ #define __TBB_ASSERT(predicate,message) __TBB_ASSERT_RELEASE(predicate,message) diff --git a/jni/Application.mk b/jni/Application.mk index b9474c2236..940567b153 100644 --- a/jni/Application.mk +++ b/jni/Application.mk @@ -45,20 +45,23 @@ export target?=android ifeq (ia32,$(arch)) APP_ABI:=x86 + export TRIPLE:=i686-linux-android else ifeq (intel64,$(arch)) APP_ABI:=x86_64 + export TRIPLE:=x86_64-linux-android else ifeq (arm,$(arch)) APP_ABI:=armeabi-v7a + export TRIPLE:=arm-linux-androideabi else ifeq (arm64,$(arch)) APP_ABI:=arm64-v8a + export TRIPLE:=aarch64-linux-android else APP_ABI:=$(arch) endif -APP_PLATFORM:=android-21 -ifneq ("","$(api_version)") - APP_PLATFORM:=$(api_version) -endif +api_version?=21 +export API_LEVEL:=$(api_version) +APP_PLATFORM:=android-$(api_version) ifeq (clang,$(compiler)) NDK_TOOLCHAIN_VERSION:=clang diff --git a/python/Makefile b/python/Makefile index a444542c62..bf58ddfcae 100644 --- a/python/Makefile +++ b/python/Makefile @@ -16,23 +16,33 @@ # # -all: release test +tbb_root?=.. +include $(tbb_root)/build/common.inc +.PHONY: all release test install test-install -clean: - python setup.py clean - -rm -rf build/ tbb_wrap.* _TBB.* *.pyc TBB.py* +export TBBROOT=$(abspath $(tbb_root)) +SRC=$(tbb_root)/python/*.py $(tbb_root)/python/tbb/* +PY_SETUP=python $(tbb_root)/python/setup.py + +all: install test -release: TBB.py +clean: + $(PY_SETUP) clean -b$(CURDIR) -TBB.py: tbb.i tbb.src.py setup.py - python setup.py build_ext -f --inplace +release: CC=$(compiler) +release: $(SRC) rml + $(PY_SETUP) build -b$(CURDIR) -f check -test: TBB.py - python TBB.py test +install: CC=$(compiler) +install: $(SRC) rml + $(PY_SETUP) build -b$(CURDIR) install -install: - python setup.py install +test: + python -m tbb test -test-install: - @echo Testing installed module - python -m TBB test +rml: +ifeq (linux,$(tbb_os)) + $(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/rml/Makefile cfg=release rml +rml_%: + $(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/rml/Makefile cfg=release $(subst rml_,,$@) +endif diff --git a/python/TBB.py b/python/TBB.py new file mode 100644 index 0000000000..9e162997d1 --- /dev/null +++ b/python/TBB.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +# +# Copyright (c) 2016-2017 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +# + + +from tbb import * +from tbb import __all__, __doc__ + +if __name__ == "__main__": + from tbb import _main + import sys + sys.exit(_main()) diff --git a/python/index.html b/python/index.html index 480f870b86..a13b6f2298 100644 --- a/python/index.html +++ b/python/index.html @@ -4,10 +4,10 @@

Python* API for Intel® Threading Building Blocks (Intel® TBB).

Overview

-It is a preview Python* module which unlocks opportunities for additional performance in multi-threaded Python programs by enabling threading composability +It is a preview Python* module which unlocks opportunities for additional performance in multi-threaded and multiprocess Python programs by enabling threading composability between two or more thread-enabled libraries like Numpy, Scipy, Sklearn, Dask, Joblib, and etc.

-The biggest improvement can be achieved when a task pool like the ThreadPool from the Python standard library or libraries like Dask or Joblib (used in multi-threading mode) +The biggest improvement can be achieved when a task pool like the ThreadPool or Pool from the Python standard library or libraries like Dask or Joblib (used either in multi-threading or multi-processing mode) execute tasks calling compute-intensive functions of Numpy/Scipy/Sklearn/PyDAAL which in turn are parallelized using Intel® Math Kernel Library or/and Intel® TBB.

The module implements Pool class with the standard interface using Intel® TBB which can be used to replace Python's ThreadPool. @@ -15,43 +15,52 @@

Overview

For more information and examples, please refer to online blog. +

Directories

+
+
rml +
The folder contains sources for building the plugin with cross-process dynamic thread scheduler implementation. +
tbb +
The folder contains Python module sources. +
+

Files

setup.py
Standard Python setup script.
Makefile -
Makefile for building, installing, and testing. See below. -
tbb.i -
SWIG interface description file. -
tbb.src.py -
Python part of module implementation. +
Internal Makefile for building, installing, and testing. See below. +
TBB.py +
Alternative entry point for Python module.

Build and install

-Prior to building it, please set up the environment using corresponding tbbvars script, e.g. `source tbbvars.sh intel64` +For accessing targets defined in python/Makefile, please use +src/Makefile +instead and build runtime libraries before working with Python.
-
make -
Default build and run. Equivalent to 'make release test'. -
make release -
Compile and link against the release version of Intel TBB runtime library. The resulting executable is left in the directory for the example. -
make test -
Run local build of the module previously produced by one of the above commands. -
make install -
Install module into Python. -
make [(above options or targets)] CXX={icl, icc} -
Build and run as above, but use Intel® C++ compiler instead of default, native compilers -(e.g., icl instead of cl.exe on Windows* systems, or icc instead of g++ on Linux* or macOS* systems). -Please note, CXX=icl works on Windows only with Intel® Distribution for Python*. -
make clean -
Remove any intermediate files produced by the above commands. +
make -C ../src python_all +
Install and test as described below. +
make -C ../src python_install +
Install module into Python environment. +
make -C ../src python_test +
Test installed Intel® TBB module for Python. +
make -C ../src python_release +
Recompile Python module. Result is located in Intel® TBB build directory. +
make -C ../src python_clean +
Remove any intermediate files produced by the commands above. Does not remove installed module.

Command-line interface

-
pydoc TBB -
Read built-in documentation for Python interfaces. -
python -m TBB your_script.py -
Run your_script.py in context of `with TBB.Monkey():` when Intel TBB is enabled. +
python -m tbb -h +
Print documentation on command-line interface
+
pydoc tbb +
Read built-in documentation for Python interfaces.
+
python-tbb your_script.py +
python -m tbb your_script.py +
Run your_script.py in context of `with tbb.Monkey():` when Intel® TBB is enabled. By default only multi-threading will be covered.
+
python -m tbb --ipc your_script.py +
Run your_script.py in context of `with tbb.Monkey():` when Intel® TBB enabled in both multi-threading and multi-processing modes.

System Requirements

diff --git a/python/rml/Makefile b/python/rml/Makefile new file mode 100644 index 0000000000..6ca2d21942 --- /dev/null +++ b/python/rml/Makefile @@ -0,0 +1,155 @@ +# Copyright (c) 2017 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +# + + +.NOTPARALLEL: + +tbb_root ?= ../.. +BUILDING_PHASE=0 +TEST_RESOURCE = $(RML.RES) +include $(tbb_root)/build/common.inc +DEBUG_SUFFIX=$(findstring _debug,_$(cfg)) + +ifneq (linux,$(target)) +$(error "IPC RML is supported on Linux only") +endif + +.PHONY: default rml test clean + +# default target +default: rml test + +RML_ROOT ?= $(tbb_root)/src/rml +RML_SERVER_ROOT = $(RML_ROOT)/server +# TODO: new API needs to be added for this server, exposing everything +RML.DEF = + +VPATH = $(tbb_root)/src/tbb $(tbb_root)/src/tbb/$(ASSEMBLY_SOURCE) +VPATH += $(tbb_root)/python/rml $(RML_ROOT)/test $(tbb_root)/src/test +VPATH += $(tbb_root)/src/rml/client + +include $(tbb_root)/build/common_rules.inc + +#-------------------------------------------------------------------------- +# Define rules for making the RML server shared library and client objects. +#-------------------------------------------------------------------------- + +# Object files that make up RML server +RML_SERVER.OBJ = ipc_server.$(OBJ) + +# Object files that RML clients need +RML_TBB_CLIENT.OBJ ?= ipc_utils.$(OBJ) +RML.OBJ = $(RML_SERVER.OBJ) $(RML_TBB_CLIENT.OBJ) +ifeq (windows,$(tbb_os)) +RML_ASM.OBJ = $(if $(findstring intel64,$(arch)),$(TBB_ASM.OBJ)) +endif +ifeq (linux,$(tbb_os)) +RML_ASM.OBJ = $(if $(findstring ia64,$(arch)),$(TBB_ASM.OBJ)) +endif + +RML_TBB_DEP= cache_aligned_allocator_rml.$(OBJ) dynamic_link_rml.$(OBJ) tbb_misc_rml.$(OBJ) tbb_misc_ex_rml.$(OBJ) +TBB_DEP_NON_RML_TEST?= cache_aligned_allocator_rml.$(OBJ) dynamic_link_rml.$(OBJ) $(RML_ASM.OBJ) tbb_misc_rml.$(OBJ) tbb_misc_ex_rml.$(OBJ) +ifeq ($(cfg),debug) +RML_TBB_DEP+= spin_mutex_rml.$(OBJ) +TBB_DEP_RML_TEST?= $(RML_ASM.OBJ) tbb_misc_rml.$(OBJ) +else +TBB_DEP_RML_TEST?= $(RML_ASM.OBJ) +endif +LIBS += $(LIBDL) +TBB_DEP_RML_TEST = rml_tbb.$(OBJ) dynamic_link_rml.$(OBJ) + +INCLUDES += $(INCLUDE_KEY)$(RML_ROOT)/include $(INCLUDE_KEY). +T_INCLUDES = $(INCLUDES) $(INCLUDE_KEY)$(tbb_root)/src/test $(INCLUDE_KEY)$(RML_SERVER_ROOT) + +# Suppress superfluous warnings for RML compilation +R_CPLUS_FLAGS = $(subst DO_ITT_NOTIFY,DO_ITT_NOTIFY=0,$(CPLUS_FLAGS)) $(WARNING_SUPPRESS) \ + $(DEFINE_KEY)TBB_USE_THREADING_TOOLS=0 $(DEFINE_KEY)__TBB_RML_STATIC=1 $(DEFINE_KEY)__TBB_NO_IMPLICIT_LINKAGE=1 + +%.$(OBJ): %.cpp + $(CPLUS) $(COMPILE_ONLY) $(R_CPLUS_FLAGS) $(PIC_KEY) $(DSE_KEY) $(INCLUDES) $< + +tbb_misc_rml.$(OBJ) $(RML_SERVER.OBJ): version_string.ver + +RML_TEST.OBJ = test_job_automaton.$(OBJ) test_thread_monitor.$(OBJ) test_rml_tbb.$(OBJ) + +$(RML_TBB_DEP): %_rml.$(OBJ): %.cpp + $(CPLUS) $(COMPILE_ONLY) $(OUTPUTOBJ_KEY)$@ $(R_CPLUS_FLAGS) $(PIC_KEY) $(DSE_KEY) $(INCLUDES) $< + +$(RML_TEST.OBJ): %.$(OBJ): %.cpp + $(CPLUS) $(COMPILE_ONLY) $(R_CPLUS_FLAGS) $(PIC_KEY) $(T_INCLUDES) $< + +ifneq (,$(RML.DEF)) +rml.def: $(RML.DEF) + $(CPLUS) $(PREPROC_ONLY) $< $(CPLUS_FLAGS) $(INCLUDES) > $@ + +LIB_LINK_FLAGS += $(EXPORT_KEY)rml.def +$(RML.DLL): rml.def +endif + +$(RML.DLL): CPLUS_FLAGS += $(SDL_FLAGS) +$(RML.DLL): BUILDING_LIBRARY = $(RML.DLL) +$(RML.DLL): $(RML_TBB_DEP) $(RML.OBJ) $(RML.RES) $(RML_NO_VERSION.DLL) $(RML_ASM.OBJ) + $(LIB_LINK_CMD) $(LIB_OUTPUT_KEY)$(RML.DLL) $(RML.OBJ) $(RML_TBB_DEP) $(RML_ASM.OBJ) $(RML.RES) $(LIB_LINK_LIBS) $(LIB_LINK_FLAGS) + +ifneq (,$(RML_NO_VERSION.DLL)) +$(RML_NO_VERSION.DLL): + echo "INPUT ($(RML.DLL))" > $(RML_NO_VERSION.DLL) +endif + +rml: rml_dll +rml_dll: $(RML.DLL) + +#------------------------------------------------------ +# End of rules for making the RML server shared library +#------------------------------------------------------ + +#------------------------------------------------------ +# Define rules for making the RML unit tests +#------------------------------------------------------ + +add_debug=$(basename $(1))_debug$(suffix $(1)) +cross_suffix=$(if $(crosstest),$(if $(DEBUG_SUFFIX),$(subst _debug,,$(1)),$(call add_debug,$(1))),$(1)) + +RML_TESTS = test_job_automaton.$(TEST_EXT) test_thread_monitor.$(TEST_EXT) +RML_CUSTOM_TESTS = test_rml_tbb.$(TEST_EXT) + +test_rml_tbb.$(TEST_EXT): test_rml_tbb.$(OBJ) $(RML_TBB_CLIENT.OBJ) $(TBB_DEP_RML_TEST) + $(CPLUS) $(OUTPUT_KEY)$@ $(CPLUS_FLAGS) test_rml_tbb.$(OBJ) $(RML_TBB_CLIENT.OBJ) $(TBB_DEP_RML_TEST) $(LIBS) $(LINK_FLAGS) + +$(RML_TESTS): %.$(TEST_EXT): %.$(OBJ) $(TBB_DEP_NON_RML_TEST) + $(CPLUS) $(OUTPUT_KEY)$@ $(CPLUS_FLAGS) $< $(TBB_DEP_NON_RML_TEST) $(LIBS) $(LINK_FLAGS) + +export IPC_ENABLE=1 +### run_cmd is usually empty +test: $(call cross_suffix,$(RML.DLL)) $(TEST_PREREQUISITE) $(RML_TESTS) $(RML_CUSTOM_TESTS) + $(run_cmd) ./test_job_automaton.$(TEST_EXT) $(args) + $(run_cmd) ./test_thread_monitor.$(TEST_EXT) $(args) +#TODO: $(run_cmd) ./test_rml_tbb.$(TEST_EXT) $(args) +#TODO: IPC_ENABLE=1 LD_PRELOAD=$(abspath libirml.so.1) $(MAKE) -rf $(tbb_root)/src/Makefile cfg=release tbb_test_release + +#------------------------------------------------------ +# End of rules for making the TBBMalloc unit tests +#------------------------------------------------------ + +# Include automatically generated dependencies +-include *.d + +clean: + -rm -rf *.o *.so* *.d *.def version_string.ver + -rm -rf $(work_dir)_release/libirml* + -rm -rf $(work_dir)_debug/libirml* diff --git a/python/rml/ipc_server.cpp b/python/rml/ipc_server.cpp new file mode 100644 index 0000000000..0c28920306 --- /dev/null +++ b/python/rml/ipc_server.cpp @@ -0,0 +1,1119 @@ +/* + Copyright (c) 2017 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + + +*/ + +#include "rml_tbb.h" +#include "../server/thread_monitor.h" +#include "tbb/atomic.h" +#include "tbb/cache_aligned_allocator.h" +#include "tbb/scheduler_common.h" +#include "tbb/governor.h" +#include "tbb/tbb_misc.h" + +#include "ipc_utils.h" + +#include + +namespace rml { +namespace internal { + +static const char* IPC_ENABLE_VAR_NAME = "IPC_ENABLE"; + +typedef versioned_object::version_type version_type; + +extern "C" factory::status_type __RML_open_factory(factory& f, version_type& server_version, version_type client_version) { + if( !tbb::internal::rml::get_enable_flag( IPC_ENABLE_VAR_NAME ) ) { + return factory::st_incompatible; + } + + // Hack to keep this library from being closed + static tbb::atomic one_time_flag; + if( one_time_flag.compare_and_swap(true,false)==false ) { + __TBB_ASSERT( (size_t)f.library_handle!=factory::c_dont_unload, NULL ); +#if _WIN32||_WIN64 + f.library_handle = reinterpret_cast(factory::c_dont_unload); +#else + f.library_handle = reinterpret_cast(factory::c_dont_unload); +#endif + } + // End of hack + + return factory::st_success; +} + +extern "C" void __RML_close_factory(factory& f) { +} + +class ipc_thread_monitor : public thread_monitor { +public: + ipc_thread_monitor() : thread_monitor() {} + +#if USE_WINTHREAD +#elif USE_PTHREAD + static handle_type launch(thread_routine_type thread_routine, void* arg, size_t stack_size); +#endif +}; + +#if USE_WINTHREAD +#elif USE_PTHREAD +inline ipc_thread_monitor::handle_type ipc_thread_monitor::launch(void* (*thread_routine)(void*), void* arg, size_t stack_size) { + pthread_attr_t s; + if( pthread_attr_init( &s ) ) return 0; + if( stack_size>0 ) { + if( pthread_attr_setstacksize( &s, stack_size ) ) return 0; + } + pthread_t handle; + if( pthread_create( &handle, &s, thread_routine, arg ) ) return 0; + if( pthread_attr_destroy( &s ) ) return 0; + return handle; +} +#endif + +}} //rml::internal + +using rml::internal::ipc_thread_monitor; + +namespace tbb { +namespace internal { +namespace rml { + +typedef ipc_thread_monitor::handle_type thread_handle; + +class ipc_server; + +static const char* IPC_MAX_THREADS_VAR_NAME = "MAX_THREADS"; +static const char* IPC_ACTIVE_SEM_PREFIX = "/__IPC_active"; +static const char* IPC_STOP_SEM_PREFIX = "/__IPC_stop"; +static const char* IPC_ACTIVE_SEM_VAR_NAME = "IPC_ACTIVE_SEMAPHORE"; +static const char* IPC_STOP_SEM_VAR_NAME = "IPC_STOP_SEMAPHORE"; +static const mode_t IPC_SEM_MODE = 0660; + +static tbb::atomic my_global_thread_count; + +char* get_active_sem_name() { + char* value = getenv( IPC_ACTIVE_SEM_VAR_NAME ); + if( value!=NULL && strlen( value )>0 ) { + char* sem_name = new char[strlen( value ) + 1]; + __TBB_ASSERT( sem_name!=NULL, NULL ); + strcpy( sem_name, value ); + return sem_name; + } else { + return get_shared_name( IPC_ACTIVE_SEM_PREFIX ); + } +} + +char* get_stop_sem_name() { + char* value = getenv( IPC_STOP_SEM_VAR_NAME ); + if( value!=NULL && strlen( value )>0 ) { + char* sem_name = new char[strlen( value ) + 1]; + __TBB_ASSERT( sem_name!=NULL, NULL ); + strcpy( sem_name, value ); + return sem_name; + } else { + return get_shared_name( IPC_STOP_SEM_PREFIX ); + } +} + +static void release_thread_sem(sem_t* my_sem) { + int old; + do { + old = my_global_thread_count; + if( old<=0 ) return; + } while( my_global_thread_count.compare_and_swap(old-1, old)!=old ); + if( old>0 ) { + sem_post( my_sem ); + } +} + +extern "C" void set_active_sem_name() { + char* templ = new char[strlen( IPC_ACTIVE_SEM_PREFIX ) + strlen( "_XXXXXX" ) + 1]; + __TBB_ASSERT( templ!=NULL, NULL ); + strcpy( templ, IPC_ACTIVE_SEM_PREFIX ); + strcpy( templ + strlen( IPC_ACTIVE_SEM_PREFIX ), "_XXXXXX" ); + char* sem_name = mktemp( templ ); + if( sem_name!=NULL ) { + int status = setenv( IPC_ACTIVE_SEM_VAR_NAME, sem_name, 1 ); + __TBB_ASSERT( status==0, NULL ); + } + delete[] templ; +} + +extern "C" void set_stop_sem_name() { + char* templ = new char[strlen( IPC_STOP_SEM_PREFIX ) + strlen( "_XXXXXX" ) + 1]; + __TBB_ASSERT( templ!=NULL, NULL ); + strcpy( templ, IPC_STOP_SEM_PREFIX ); + strcpy( templ + strlen( IPC_STOP_SEM_PREFIX ), "_XXXXXX" ); + char* sem_name = mktemp( templ ); + if( sem_name!=NULL ) { + int status = setenv( IPC_STOP_SEM_VAR_NAME, sem_name, 1 ); + __TBB_ASSERT( status==0, NULL ); + } + delete[] templ; +} + +extern "C" void release_resources() { + if( my_global_thread_count!=0 ) { + char* active_sem_name = get_active_sem_name(); + sem_t* my_active_sem = sem_open( active_sem_name, O_CREAT ); + __TBB_ASSERT( my_active_sem, "Unable to open active threads semaphore" ); + delete[] active_sem_name; + + do { + release_thread_sem( my_active_sem ); + } while( my_global_thread_count!=0 ); + } +} + +extern "C" void release_semaphores() { + int status = 0; + char* sem_name = NULL; + + sem_name = get_active_sem_name(); + if( sem_name==NULL ) { + runtime_warning("Can not get RML semaphore name"); + return; + } + status = sem_unlink( sem_name ); + if( status!=0 ) { + if( errno==ENOENT ) { + /* There is no semaphore with the given name, nothing to do */ + } else { + runtime_warning("Can not release RML semaphore"); + return; + } + } + delete[] sem_name; + + sem_name = get_stop_sem_name(); + if( sem_name==NULL ) { + runtime_warning( "Can not get RML semaphore name" ); + return; + } + status = sem_unlink( sem_name ); + if( status!=0 ) { + if( errno==ENOENT ) { + /* There is no semaphore with the given name, nothing to do */ + } else { + runtime_warning("Can not release RML semaphore"); + return; + } + } + delete[] sem_name; +} + +class ipc_worker: no_copy { +protected: + //! State in finite-state machine that controls the worker. + /** State diagram: + /----------stop---\ + | ^ | + V | | + init --> starting --> normal | + | | | | + | V | | + \------> quit <-------/<----/ + */ + enum state_t { + //! *this is initialized + st_init, + //! *this has associated thread that is starting up. + st_starting, + //! Associated thread is doing normal life sequence. + st_normal, + //! Associated thread is stopped but can be started again. + st_stop, + //! Associated thread has ended normal life sequence and promises to never touch *this again. + st_quit + }; + atomic my_state; + + //! Associated server + ipc_server& my_server; + + //! Associated client + tbb_client& my_client; + + //! index used for avoiding the 64K aliasing problem + const size_t my_index; + + //! Monitor for sleeping when there is no work to do. + /** The invariant that holds for sleeping workers is: + "my_slack<=0 && my_state==st_normal && I am on server's list of asleep threads" */ + ipc_thread_monitor my_thread_monitor; + + //! Handle of the OS thread associated with this worker + thread_handle my_handle; + + //! Link for list of workers that are sleeping or have no associated thread. + ipc_worker* my_next; + + friend class ipc_server; + + //! Actions executed by the associated thread + void run(); + + //! Wake up associated thread (or launch a thread if there is none) + bool wake_or_launch(); + + //! Called by a thread (usually not the associated thread) to commence termination. + void start_shutdown(bool join); + + //! Called by a thread (usually not the associated thread) to commence stopping. + void start_stopping(bool join); + + static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg); + + static void release_handle(thread_handle my_handle, bool join); + +protected: + ipc_worker(ipc_server& server, tbb_client& client, const size_t i) : + my_server(server), + my_client(client), + my_index(i) + { + my_state = st_init; + } +}; + +static const size_t cache_line_size = tbb::internal::NFS_MaxLineSize; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress overzealous compiler warnings about uninstantiable class + #pragma warning(push) + #pragma warning(disable:4510 4610) +#endif +class padded_ipc_worker: public ipc_worker { + char pad[cache_line_size - sizeof(ipc_worker)%cache_line_size]; +public: + padded_ipc_worker(ipc_server& server, tbb_client& client, const size_t i) + : ipc_worker( server,client,i ) { suppress_unused_warning(pad); } +}; +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +class ipc_waker : public padded_ipc_worker { +private: + static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg); + void run(); + bool wake_or_launch(); + + friend class ipc_server; + +public: + ipc_waker(ipc_server& server, tbb_client& client, const size_t i) + : padded_ipc_worker( server, client, i ) {} +}; + +class ipc_stopper : public padded_ipc_worker { +private: + static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg); + void run(); + bool wake_or_launch(); + + friend class ipc_server; + +public: + ipc_stopper(ipc_server& server, tbb_client& client, const size_t i) + : padded_ipc_worker( server, client, i ) {} +}; + +class ipc_server: public tbb_server, no_copy { +private: + tbb_client& my_client; + //! Maximum number of threads to be created. + /** Threads are created lazily, so maximum might not actually be reached. */ + tbb_client::size_type my_n_thread; + + //! Stack size for each thread. */ + const size_t my_stack_size; + + //! Number of jobs that could use their associated thread minus number of active threads. + /** If negative, indicates oversubscription. + If positive, indicates that more threads should run. + Can be lowered asynchronously, but must be raised only while holding my_asleep_list_mutex, + because raising it impacts the invariant for sleeping threads. */ + atomic my_slack; + + //! Counter used to determine when to delete this. + atomic my_ref_count; + + padded_ipc_worker* my_thread_array; + + //! List of workers that are asleep or committed to sleeping until notified by another thread. + tbb::atomic my_asleep_list_root; + + //! Protects my_asleep_list_root + typedef scheduler_mutex_type asleep_list_mutex_type; + asleep_list_mutex_type my_asleep_list_mutex; + + //! Should server wait workers while terminate + const bool my_join_workers; + + //! Service thread for waking of workers + ipc_waker* my_waker; + + //! Service thread to stop threads + ipc_stopper* my_stopper; + + //! Semaphore to account active threads + sem_t* my_active_sem; + + //! Semaphore to account stop threads + sem_t* my_stop_sem; + +#if TBB_USE_ASSERT + atomic my_net_slack_requests; +#endif /* TBB_USE_ASSERT */ + + //! Wake up to two sleeping workers, if there are any sleeping. + /** The call is used to propagate a chain reaction where each thread wakes up two threads, + which in turn each wake up two threads, etc. */ + void propagate_chain_reaction() { + // First test of a double-check idiom. Second test is inside wake_some(0). + if( my_slack>0 ) { + int active_threads = 0; + if( try_get_active_thread() ) { + ++active_threads; + if( try_get_active_thread() ) { + ++active_threads; + } + wake_some( 0, active_threads ); + } + } + } + + //! Try to add t to list of sleeping workers + bool try_insert_in_asleep_list(ipc_worker& t); + + //! Try to add t to list of sleeping workers even if there is some work to do + bool try_insert_in_asleep_list_forced(ipc_worker& t); + + //! Equivalent of adding additional_slack to my_slack and waking up to 2 threads if my_slack permits. + void wake_some(int additional_slack, int active_threads); + + //! Equivalent of adding additional_slack to my_slack and waking up to 1 thread if my_slack permits. + void wake_one_forced(int additional_slack); + + //! Stop one thread from asleep list + bool stop_one(); + + //! Wait for active thread + bool wait_active_thread(); + + //! Try to get active thread + bool try_get_active_thread(); + + //! Release active thread + void release_active_thread(); + + //! Wait for thread to stop + bool wait_stop_thread(); + + //! Add thread to stop list + void add_stop_thread(); + + void remove_server_ref() { + if( --my_ref_count==0 ) { + my_client.acknowledge_close_connection(); + this->~ipc_server(); + tbb::cache_aligned_allocator().deallocate( this, 1 ); + } + } + + friend class ipc_worker; + friend class ipc_waker; + friend class ipc_stopper; +public: + ipc_server(tbb_client& client); + virtual ~ipc_server(); + + version_type version() const __TBB_override { + return 0; + } + + void request_close_connection(bool /*exiting*/) __TBB_override { + my_waker->start_shutdown(false); + my_stopper->start_shutdown(false); + for( size_t i=0; i=2 && !__MINGW64__ +// ensure that stack is properly aligned +__attribute__((force_align_arg_pointer)) +#endif +__RML_DECL_THREAD_ROUTINE ipc_worker::thread_routine(void* arg) { + ipc_worker* self = static_cast(arg); + AVOID_64K_ALIASING( self->my_index ); + self->run(); + return 0; +} +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +void ipc_worker::release_handle(thread_handle handle, bool join) { + if( join ) + ipc_thread_monitor::join( handle ); + else + ipc_thread_monitor::detach_thread( handle ); +} + +void ipc_worker::start_shutdown(bool join) { + state_t s; + + do { + s = my_state; + __TBB_ASSERT( s!=st_quit, NULL ); + } while( my_state.compare_and_swap( st_quit, s )!=s ); + if( s==st_normal || s==st_starting ) { + // May have invalidated invariant for sleeping, so wake up the thread. + // Note that the notify() here occurs without maintaining invariants for my_slack. + // It does not matter, because my_state==st_quit overrides checking of my_slack. + my_thread_monitor.notify(); + // Do not need release handle in st_init state, + // because in this case the thread wasn't started yet. + // For st_starting release is done at launch site. + if( s==st_normal ) + release_handle( my_handle, join ); + } +} + +void ipc_worker::start_stopping(bool join) { + state_t s; + + do { + s = my_state; + } while( my_state.compare_and_swap( st_stop, s )!=s ); + if( s==st_normal || s==st_starting ) { + // May have invalidated invariant for sleeping, so wake up the thread. + // Note that the notify() here occurs without maintaining invariants for my_slack. + // It does not matter, because my_state==st_quit overrides checking of my_slack. + my_thread_monitor.notify(); + // Do not need release handle in st_init state, + // because in this case the thread wasn't started yet. + // For st_starting release is done at launch site. + if( s==st_normal ) + release_handle( my_handle, join ); + } +} + +void ipc_worker::run() { + my_server.propagate_chain_reaction(); + + // Transiting to st_normal here would require setting my_handle, + // which would create race with the launching thread and + // complications in handle management on Windows. + + ::rml::job& j = *my_client.create_one_job(); + state_t state = my_state; + while( state!=st_quit && state!=st_stop ) { + if( my_server.my_slack>=0 ) { + my_client.process(j); + } else { + ipc_thread_monitor::cookie c; + // Prepare to wait + my_thread_monitor.prepare_wait(c); + // Check/set the invariant for sleeping + state = my_state; + if( state!=st_quit && state!=st_stop && my_server.try_insert_in_asleep_list(*this) ) { + if( my_server.my_n_thread > 1 ) my_server.release_active_thread(); + my_thread_monitor.commit_wait(c); + my_server.propagate_chain_reaction(); + } else { + // Invariant broken + my_thread_monitor.cancel_wait(); + } + } + state = my_state; + } + my_client.cleanup(j); + + my_server.remove_server_ref(); +} + +inline bool ipc_worker::wake_or_launch() { + if( ( my_state==st_init && my_state.compare_and_swap( st_starting, st_init )==st_init ) || + ( my_state==st_stop && my_state.compare_and_swap( st_starting, st_stop )==st_stop ) ) { + // after this point, remove_server_ref() must be done by created thread +#if USE_WINTHREAD + my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index ); +#elif USE_PTHREAD + { + affinity_helper fpa; + fpa.protect_affinity_mask( /*restore_process_mask=*/true ); + my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size ); + if( my_handle == 0 ) { + // Unable to create new thread for process + // However, this is expected situation for the use cases of this coordination server + state_t s = my_state.compare_and_swap( st_init, st_starting ); + if (st_starting != s) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT( s==st_quit, NULL ); + release_handle( my_handle, my_server.my_join_workers ); + } + return false; + } else { + my_server.my_ref_count++; + } + // Implicit destruction of fpa resets original affinity mask. + } +#endif /* USE_PTHREAD */ + state_t s = my_state.compare_and_swap( st_normal, st_starting ); + if( st_starting!=s ) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT( s==st_quit, NULL ); + release_handle( my_handle, my_server.my_join_workers ); + } + } + else { + my_thread_monitor.notify(); + } + + return true; +} + +//------------------------------------------------------------------------ +// Methods of ipc_waker +//------------------------------------------------------------------------ +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced + #pragma warning(push) + #pragma warning(disable:4189) +#endif +#if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__ +// ensure that stack is properly aligned +__attribute__((force_align_arg_pointer)) +#endif +__RML_DECL_THREAD_ROUTINE ipc_waker::thread_routine(void* arg) { + ipc_waker* self = static_cast(arg); + AVOID_64K_ALIASING( self->my_index ); + self->run(); + return 0; +} +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +void ipc_waker::run() { + // Transiting to st_normal here would require setting my_handle, + // which would create race with the launching thread and + // complications in handle management on Windows. + + while( my_state!=st_quit ) { + bool have_to_sleep = false; + if( my_server.my_slack>0 ) { + if( my_server.wait_active_thread() ) { + if( my_server.my_slack>0 ) { + my_server.wake_some( 0, 1 ); + } else { + my_server.release_active_thread(); + have_to_sleep = true; + } + } + } else { + have_to_sleep = true; + } + if( have_to_sleep ) { + ipc_thread_monitor::cookie c; + // Prepare to wait + my_thread_monitor.prepare_wait(c); + // Check/set the invariant for sleeping + if( my_state!=st_quit && my_server.my_slack<0 ) { + my_thread_monitor.commit_wait(c); + } else { + // Invariant broken + my_thread_monitor.cancel_wait(); + } + } + } + + my_server.remove_server_ref(); +} + +inline bool ipc_waker::wake_or_launch() { + if( my_state==st_init && my_state.compare_and_swap( st_starting, st_init )==st_init ) { + // after this point, remove_server_ref() must be done by created thread +#if USE_WINTHREAD + my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index ); +#elif USE_PTHREAD + { + affinity_helper fpa; + fpa.protect_affinity_mask( /*restore_process_mask=*/true ); + my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size ); + if( my_handle == 0 ) { + runtime_warning( "Unable to create new thread for process %d", getpid() ); + state_t s = my_state.compare_and_swap( st_init, st_starting ); + if (st_starting != s) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT( s==st_quit, NULL ); + release_handle( my_handle, my_server.my_join_workers ); + } + return false; + } else { + my_server.my_ref_count++; + } + // Implicit destruction of fpa resets original affinity mask. + } +#endif /* USE_PTHREAD */ + state_t s = my_state.compare_and_swap( st_normal, st_starting ); + if( st_starting!=s ) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT( s==st_quit, NULL ); + release_handle( my_handle, my_server.my_join_workers ); + } + } + else { + my_thread_monitor.notify(); + } + + return true; +} + +//------------------------------------------------------------------------ +// Methods of ipc_stopper +//------------------------------------------------------------------------ +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced + #pragma warning(push) + #pragma warning(disable:4189) +#endif +#if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__ +// ensure that stack is properly aligned +__attribute__((force_align_arg_pointer)) +#endif +__RML_DECL_THREAD_ROUTINE ipc_stopper::thread_routine(void* arg) { + ipc_stopper* self = static_cast(arg); + AVOID_64K_ALIASING( self->my_index ); + self->run(); + return 0; +} +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +void ipc_stopper::run() { + // Transiting to st_normal here would require setting my_handle, + // which would create race with the launching thread and + // complications in handle management on Windows. + + while( my_state!=st_quit ) { + if( my_server.wait_stop_thread() ) { + if( my_state!=st_quit ) { + if( !my_server.stop_one() ) { + my_server.add_stop_thread(); + prolonged_pause(); + } + } + } + } + + my_server.remove_server_ref(); +} + +inline bool ipc_stopper::wake_or_launch() { + if( my_state==st_init && my_state.compare_and_swap( st_starting, st_init )==st_init ) { + // after this point, remove_server_ref() must be done by created thread +#if USE_WINTHREAD + my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index ); +#elif USE_PTHREAD + { + affinity_helper fpa; + fpa.protect_affinity_mask( /*restore_process_mask=*/true ); + my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size ); + if( my_handle == 0 ) { + runtime_warning( "Unable to create new thread for process %d", getpid() ); + state_t s = my_state.compare_and_swap( st_init, st_starting ); + if (st_starting != s) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT( s==st_quit, NULL ); + release_handle( my_handle, my_server.my_join_workers ); + } + return false; + } else { + my_server.my_ref_count++; + } + // Implicit destruction of fpa resets original affinity mask. + } +#endif /* USE_PTHREAD */ + state_t s = my_state.compare_and_swap( st_normal, st_starting ); + if( st_starting!=s ) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT( s==st_quit, NULL ); + release_handle( my_handle, my_server.my_join_workers ); + } + } + else { + my_thread_monitor.notify(); + } + + return true; +} + +//------------------------------------------------------------------------ +// Methods of ipc_server +//------------------------------------------------------------------------ +ipc_server::ipc_server(tbb_client& client) : + my_client( client ), + my_stack_size( client.min_stack_size() ), + my_thread_array(NULL), + my_waker(NULL), + my_stopper(NULL), + my_join_workers(false) +{ + my_ref_count = 1; + my_slack = 0; +#if TBB_USE_ASSERT + my_net_slack_requests = 0; +#endif /* TBB_USE_ASSERT */ + my_n_thread = get_num_threads(IPC_MAX_THREADS_VAR_NAME); + if( my_n_thread==0 ) { + my_n_thread = AvailableHwConcurrency(); + __TBB_ASSERT( my_n_thread>0, NULL ); + } + + my_asleep_list_root = NULL; + my_thread_array = tbb::cache_aligned_allocator().allocate( my_n_thread ); + memset( my_thread_array, 0, sizeof(padded_ipc_worker)*my_n_thread ); + for( size_t i=0; imy_next = my_asleep_list_root; + my_asleep_list_root = t; + } + + my_waker = tbb::cache_aligned_allocator().allocate(1); + memset( my_waker, 0, sizeof(ipc_waker) ); + new( my_waker ) ipc_waker( *this, client, my_n_thread ); + + my_stopper = tbb::cache_aligned_allocator().allocate(1); + memset( my_stopper, 0, sizeof(ipc_stopper) ); + new( my_stopper ) ipc_stopper( *this, client, my_n_thread + 1 ); + + char* active_sem_name = get_active_sem_name(); + my_active_sem = sem_open( active_sem_name, O_CREAT, IPC_SEM_MODE, my_n_thread - 1 ); + __TBB_ASSERT( my_active_sem, "Unable to open active threads semaphore" ); + delete[] active_sem_name; + + char* stop_sem_name = get_stop_sem_name(); + my_stop_sem = sem_open( stop_sem_name, O_CREAT, IPC_SEM_MODE, 0 ); + __TBB_ASSERT( my_stop_sem, "Unable to open stop threads semaphore" ); + delete[] stop_sem_name; +} + +ipc_server::~ipc_server() { + __TBB_ASSERT( my_net_slack_requests==0, NULL ); + + for( size_t i=my_n_thread; i--; ) + my_thread_array[i].~padded_ipc_worker(); + tbb::cache_aligned_allocator().deallocate( my_thread_array, my_n_thread ); + tbb::internal::poison_pointer( my_thread_array ); + + my_waker->~ipc_waker(); + tbb::cache_aligned_allocator().deallocate( my_waker, 1 ); + tbb::internal::poison_pointer( my_waker ); + + my_stopper->~ipc_stopper(); + tbb::cache_aligned_allocator().deallocate( my_stopper, 1 ); + tbb::internal::poison_pointer( my_stopper ); + + sem_close( my_active_sem ); + sem_close( my_stop_sem ); +} + +inline bool ipc_server::try_insert_in_asleep_list(ipc_worker& t) { + asleep_list_mutex_type::scoped_lock lock; + if( !lock.try_acquire( my_asleep_list_mutex ) ) + return false; + // Contribute to slack under lock so that if another takes that unit of slack, + // it sees us sleeping on the list and wakes us up. + int k = ++my_slack; + if( k<=0 ) { + t.my_next = my_asleep_list_root; + my_asleep_list_root = &t; + return true; + } else { + --my_slack; + return false; + } +} + +inline bool ipc_server::try_insert_in_asleep_list_forced(ipc_worker& t) { + asleep_list_mutex_type::scoped_lock lock; + if( !lock.try_acquire( my_asleep_list_mutex ) ) + return false; + // Contribute to slack under lock so that if another takes that unit of slack, + // it sees us sleeping on the list and wakes us up. + ++my_slack; + t.my_next = my_asleep_list_root; + my_asleep_list_root = &t; + return true; +} + +inline bool ipc_server::wait_active_thread() { + if( sem_wait( my_active_sem ) == 0 ) { + ++my_global_thread_count; + return true; + } + return false; +} + +inline bool ipc_server::try_get_active_thread() { + if( sem_trywait( my_active_sem ) == 0 ) { + ++my_global_thread_count; + return true; + } + return false; +} + +inline void ipc_server::release_active_thread() { + release_thread_sem( my_active_sem ); +} + +inline bool ipc_server::wait_stop_thread() { + struct timespec ts; + if( clock_gettime( CLOCK_REALTIME, &ts )==0 ) { + ts.tv_sec++; + if( sem_timedwait( my_stop_sem, &ts )==0 ) { + return true; + } + } + return false; +} + +inline void ipc_server::add_stop_thread() { + sem_post( my_stop_sem ); +} + +void ipc_server::wake_some( int additional_slack, int active_threads ) { + __TBB_ASSERT( additional_slack>=0, NULL ); + ipc_worker* wakee[2]; + ipc_worker **w = wakee; + { + asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); + while( active_threads>0 && my_asleep_list_root && w0 ) { + if( additional_slack+my_slack<=0 ) // additional demand does not exceed surplus supply + break; + --additional_slack; + } else { + // Chain reaction; Try to claim unit of slack + int old; + do { + old = my_slack; + if( old<=0 ) goto done; + } while( my_slack.compare_and_swap( old-1, old )!=old ); + } + // Pop sleeping worker to combine with claimed unit of slack + my_asleep_list_root = (*w++ = my_asleep_list_root)->my_next; + --active_threads; + } + if( additional_slack ) { + // Contribute our unused slack to my_slack. + my_slack += additional_slack; + } + } +done: + while( w>wakee ) { + if( !(*--w)->wake_or_launch() ) { + add_stop_thread(); + do { + } while( !try_insert_in_asleep_list_forced(**w) ); + release_active_thread(); + } + } + while( active_threads ) { + release_active_thread(); + --active_threads; + } +} + +void ipc_server::wake_one_forced( int additional_slack ) { + __TBB_ASSERT( additional_slack>=0, NULL ); + ipc_worker* wakee[1]; + ipc_worker **w = wakee; + { + asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); + while( my_asleep_list_root && w0 ) { + if( additional_slack+my_slack<=0 ) // additional demand does not exceed surplus supply + break; + --additional_slack; + } else { + // Chain reaction; Try to claim unit of slack + int old; + do { + old = my_slack; + if( old<=0 ) goto done; + } while( my_slack.compare_and_swap( old-1, old )!=old ); + } + // Pop sleeping worker to combine with claimed unit of slack + my_asleep_list_root = (*w++ = my_asleep_list_root)->my_next; + } + if( additional_slack ) { + // Contribute our unused slack to my_slack. + my_slack += additional_slack; + } + } +done: + while( w>wakee ) { + if( !(*--w)->wake_or_launch() ) { + add_stop_thread(); + do { + } while( !try_insert_in_asleep_list_forced(**w) ); + } + } +} + +bool ipc_server::stop_one() { + ipc_worker* current = NULL; + ipc_worker* next = NULL; + { + asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); + if( my_asleep_list_root ) { + current = my_asleep_list_root; + if( current->my_state==ipc_worker::st_normal ) { + next = current->my_next; + while( next!= NULL && next->my_state==ipc_worker::st_normal ) { + current = next; + next = current->my_next; + } + current->start_stopping( my_join_workers ); + return true; + } + } + } + return false; +} + +void ipc_server::adjust_job_count_estimate( int delta ) { +#if TBB_USE_ASSERT + my_net_slack_requests+=delta; +#endif /* TBB_USE_ASSERT */ + if( my_n_thread > 1 ) { + if( delta<0 ) { + my_slack+=delta; + } else if( delta>0 ) { + int active_threads = 0; + if( try_get_active_thread() ) { + ++active_threads; + if( try_get_active_thread() ) { + ++active_threads; + } + } + wake_some( delta, active_threads ); + + if( !my_waker->wake_or_launch() ) { + add_stop_thread(); + } + if( !my_stopper->wake_or_launch() ) { + add_stop_thread(); + } + } + } else { // Corner case when RML shouldn't provide any worker thread but client has to have at least one + if( delta<0 ) { + my_slack += delta; + } else { + wake_one_forced( delta ); + } + } +} + +//------------------------------------------------------------------------ +// RML factory methods +//------------------------------------------------------------------------ + +#if USE_PTHREAD + +static tbb_client* my_global_client = NULL; +static tbb_server* my_global_server = NULL; + +void rml_atexit() { + release_resources(); +} + +void rml_atfork_child() { + if( my_global_server!=NULL && my_global_client!=NULL ) { + ipc_server* server = static_cast( my_global_server ); + server->~ipc_server(); + memset( server, 0, sizeof(ipc_server) ); + new( server ) ipc_server( *my_global_client ); + pthread_atfork( NULL, NULL, rml_atfork_child ); + atexit( rml_atexit ); + } +} + +#endif /* USE_PTHREAD */ + +extern "C" tbb_factory::status_type __TBB_make_rml_server(tbb_factory& f, tbb_server*& server, tbb_client& client) { + server = new( tbb::cache_aligned_allocator().allocate(1) ) ipc_server(client); +#if USE_PTHREAD + my_global_client = &client; + my_global_server = server; + pthread_atfork( NULL, NULL, rml_atfork_child ); + atexit( rml_atexit ); +#endif /* USE_PTHREAD */ + if( getenv( "RML_DEBUG" ) ) { + runtime_warning("IPC server is started"); + } + return tbb_factory::st_success; +} + +extern "C" void __TBB_call_with_my_server_info(::rml::server_info_callback_t cb, void* arg) { +} + +} // namespace rml +} // namespace internal + +} // namespace tbb diff --git a/python/rml/ipc_utils.cpp b/python/rml/ipc_utils.cpp new file mode 100644 index 0000000000..a7a62c6332 --- /dev/null +++ b/python/rml/ipc_utils.cpp @@ -0,0 +1,144 @@ +/* + Copyright (c) 2017 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + + +*/ + +#include "ipc_utils.h" + +#include +#include +#include +#include +#include + +namespace tbb { +namespace internal { +namespace rml { + +#define MAX_STR_LEN 255 +#define STARTTIME_ITEM_ID 21 + +static char* get_stat_item(char* line, int item_id) { + int id = 0, i = 0; + + while( id!=item_id ) { + while( line[i]!='(' && line[i]!=' ' && line[i]!='\0' ) { + ++i; + } + if( line[i]==' ' ) { + ++id; + ++i; + } else if( line[i]=='(' ) { + while( line[i]!=')' && line[i]!='\0' ) { + ++i; + } + if( line[i]==')' ) { + ++i; + } else { + return NULL; + } + } else { + return NULL; + } + } + + return line + i; +} + +unsigned long long get_start_time(int pid) { + const char* stat_file_path_template = "/proc/%d/stat"; + char stat_file_path[MAX_STR_LEN + 1]; + sprintf( stat_file_path, stat_file_path_template, pid ); + + FILE* stat_file = fopen( stat_file_path, "rt" ); + if( stat_file==NULL ) { + return 0; + } + + char stat_line[MAX_STR_LEN + 1]; + char* line = fgets( stat_line, MAX_STR_LEN, stat_file ); + if( line==NULL ) { + return 0; + } + + char* starttime_str = get_stat_item( stat_line, STARTTIME_ITEM_ID ); + if( starttime_str==NULL ) { + return 0; + } + + unsigned long long starttime = strtoull( starttime_str, NULL, 10 ); + if( starttime==ULLONG_MAX ) { + return 0; + } + + return starttime; +} + +char* get_shared_name(const char* prefix, int pid, unsigned long long time) { + const char* name_template = "%s_%d_%llu"; + const int digits_in_int = 10; + const int digits_in_long = 20; + + int len = strlen( name_template ) + strlen( prefix ) + digits_in_int + digits_in_long + 1; + char* name = new char[len]; + sprintf( name, name_template, prefix, pid, time ); + + return name; +} + +char* get_shared_name(const char* prefix) { + int pid = getpgrp(); + unsigned long long time = get_start_time( pid ); + return get_shared_name( prefix, pid, time ); +} + +int get_num_threads(const char* env_var) { + if( env_var==NULL ) { + return 0; + } + + char* value = getenv( env_var ); + if( value==NULL ) { + return 0; + } + + int num_threads = (int)strtol( value, NULL, 10 ); + return num_threads; +} + +bool get_enable_flag(const char* env_var) { + if( env_var==NULL ) { + return false; + } + + char* value = getenv( env_var ); + if( value==NULL ) { + return false; + } + + if( strcmp( value, "0" ) == 0 || + strcmp( value, "false" ) == 0 || + strcmp( value, "False" ) == 0 || + strcmp( value, "FALSE" ) == 0 ) { + return false; + } + + return true; +} + +}}} //tbb::internal::rml diff --git a/python/rml/ipc_utils.h b/python/rml/ipc_utils.h new file mode 100644 index 0000000000..2ca4626bc0 --- /dev/null +++ b/python/rml/ipc_utils.h @@ -0,0 +1,34 @@ +/* + Copyright (c) 2017 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + + +*/ + +#ifndef __IPC_UTILS_H +#define __IPC_UTILS_H + +namespace tbb { +namespace internal { +namespace rml { + +char* get_shared_name(const char* prefix); +int get_num_threads(const char* env_var); +bool get_enable_flag(const char* env_var); + +}}} //tbb::internal::rml + +#endif diff --git a/python/setup.py b/python/setup.py index 89e74a694a..62e8e08ce7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -18,14 +18,19 @@ # # + # System imports from __future__ import print_function +from glob import glob import platform import os from distutils.core import * from distutils.command.build import build +rundir = os.getcwd() +os.chdir(os.path.abspath(os.path.dirname(__file__))) + if any(i in os.environ for i in ["CC", "CXX"]): if "CC" not in os.environ: os.environ['CC'] = os.environ['CXX'] @@ -33,17 +38,18 @@ os.environ['CXX'] = os.environ['CC'] if platform.system() == 'Linux': os.environ['LDSHARED'] = os.environ['CXX'] + " -shared" + print("Environment specifies CC=%s CXX=%s"%(os.environ['CC'], os.environ['CXX'])) intel_compiler = os.getenv('CC', '') in ['icl', 'icpc', 'icc'] try: tbb_root = os.environ['TBBROOT'] print("Using TBBROOT=", tbb_root) except: - tbb_root = '.' + tbb_root = '..' if not intel_compiler: print("Warning: TBBROOT env var is not set and Intel's compiler is not used. It might lead\n" " !!!: to compile/link problems. Source tbbvars.sh/.csh file to set environment") -use_compiler_tbb = intel_compiler and tbb_root == '.' +use_compiler_tbb = intel_compiler and tbb_root == '..' if use_compiler_tbb: print("Using Intel TBB from Intel's compiler") if platform.system() == 'Windows': @@ -52,21 +58,24 @@ os.environ['MSSdk'] = '1' print("Using compiler settings from environment") tbb_flag = ['/Qtbb'] if use_compiler_tbb else [] + tbb_flag += ['/EHsc'] # for Python 2 compile_flags = ['/Qstd=c++11'] if intel_compiler else [] else: tbb_flag = ['-tbb'] if use_compiler_tbb else [] compile_flags = ['-std=c++11', '-Wno-unused-variable'] -_tbb = Extension("_TBB", ["tbb.i"], +_tbb = Extension("tbb._api", ["tbb/api.i"], include_dirs=[os.path.join(tbb_root, 'include')] if not use_compiler_tbb else [], swig_opts =['-c++', '-O', '-threads'] + ( # add '-builtin' later ['-I' + os.path.join(tbb_root, 'include')] if not use_compiler_tbb else []), extra_compile_args=compile_flags + tbb_flag, extra_link_args=tbb_flag, - libraries =['tbb'] if not use_compiler_tbb else [], - library_dirs=[os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'), # for Linux - os.path.join(tbb_root, 'lib'), # for MacOS - os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'), # for Windows + libraries =(['tbb'] if not use_compiler_tbb else []) + + (['irml'] if platform.system() == "Linux" else []), # TODO: why do we need this? + library_dirs=[ rundir, # for custom-builds + os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'), # for Linux + os.path.join(tbb_root, 'lib'), # for MacOS + os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'), # for Windows ] if not use_compiler_tbb else [], language ='c++', ) @@ -86,7 +95,7 @@ class TBBBuild(build): url ="https://software.intel.com/en-us/intel-tbb", author ="Intel Corporation", author_email="inteltbbdevelopers@intel.com", - license ="Dual license: Apache or Intel Simplified Software License", + license ="Dual license: Apache or Proprietary", version ="0.1", classifiers =[ 'Development Status :: 4 - Beta', @@ -97,11 +106,9 @@ class TBBBuild(build): 'Intended Audience :: Other Audience', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: Apache Software License', - 'License :: Other/Intel Simplified Software License', 'Operating System :: MacOS :: MacOS X', 'Operating System :: Microsoft :: Windows', - 'Operating System :: POSIX', - 'Operating System :: Unix', + 'Operating System :: POSIX :: Linux', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 3', @@ -109,8 +116,9 @@ class TBBBuild(build): 'Topic :: System :: Hardware :: Symmetric Multi-processing', 'Topic :: Software Development :: Libraries', ], - keywords='tbb multiprocessing multithreading composable parallelism', + keywords='TBB multiprocessing multithreading composable parallelism', ext_modules=[_tbb], + packages=['tbb'], py_modules=['TBB'], cmdclass={'build': TBBBuild} ) diff --git a/python/tbb/__init__.py b/python/tbb/__init__.py new file mode 100644 index 0000000000..20d9a78450 --- /dev/null +++ b/python/tbb/__init__.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python +# +# Copyright (c) 2016-2017 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +# + + +from __future__ import print_function + +import multiprocessing.pool +import ctypes +import atexit +import sys +import os + +from .api import * +from .api import __all__ as api__all +from .pool import * +from .pool import __all__ as pool__all + +__all__ = ["Monkey", "is_active"] + api__all + pool__all + +__doc__ = """ +Python API for Intel(R) Threading Building Blocks library (Intel(R) TBB) +extended with standard Python's pools implementation and monkey-patching. + +Command-line interface example: +$ python -m tbb $your_script.py +Runs your_script.py in context of tbb.Monkey +""" + +is_active = False +""" Indicates whether TBB context is activated """ + +ipc_enabled = False +""" Indicates whether IPC mode is enabled """ + +libirml = "libirml.so.1" + + +def _test(arg=None): + """Some tests""" + import platform + if platform.system() == "Linux": + ctypes.CDLL(libirml) + from .test import test + test(arg) + print("done") + + +def tbb_process_pool_worker27(inqueue, outqueue, initializer=None, initargs=(), + maxtasks=None): + from multiprocessing.pool import worker + worker(inqueue, outqueue, initializer, initargs, maxtasks) + if ipc_enabled: + try: + librml = ctypes.CDLL(libirml) + librml.release_resources() + except: + print("Warning: Can not load ", libirml, file=sys.stderr) + + +class TBBProcessPool27(multiprocessing.pool.Pool): + def _repopulate_pool(self): + """Bring the number of pool processes up to the specified number, + for use after reaping workers which have exited. + """ + from multiprocessing.util import debug + + for i in range(self._processes - len(self._pool)): + w = self.Process(target=tbb_process_pool_worker27, + args=(self._inqueue, self._outqueue, + self._initializer, + self._initargs, self._maxtasksperchild) + ) + self._pool.append(w) + w.name = w.name.replace('Process', 'PoolWorker') + w.daemon = True + w.start() + debug('added worker') + + def __del__(self): + self.close() + for p in self._pool: + p.join() + + def __exit__(self, *args): + self.close() + for p in self._pool: + p.join() + + +def tbb_process_pool_worker3(inqueue, outqueue, initializer=None, initargs=(), + maxtasks=None, wrap_exception=False): + from multiprocessing.pool import worker + worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception) + if ipc_enabled: + try: + librml = ctypes.CDLL(libirml) + librml.release_resources() + except: + print("Warning: Can not load ", libirml, file=sys.stderr) + + +class TBBProcessPool3(multiprocessing.pool.Pool): + def _repopulate_pool(self): + """Bring the number of pool processes up to the specified number, + for use after reaping workers which have exited. + """ + from multiprocessing.util import debug + + for i in range(self._processes - len(self._pool)): + w = self.Process(target=tbb_process_pool_worker3, + args=(self._inqueue, self._outqueue, + self._initializer, + self._initargs, self._maxtasksperchild, + self._wrap_exception) + ) + self._pool.append(w) + w.name = w.name.replace('Process', 'PoolWorker') + w.daemon = True + w.start() + debug('added worker') + + def __del__(self): + self.close() + for p in self._pool: + p.join() + + def __exit__(self, *args): + self.close() + for p in self._pool: + p.join() + + +class Monkey: + """ + Context manager which replaces standard multiprocessing.pool + implementations with tbb.pool using monkey-patching. It also enables TBB + threading for Intel(R) Math Kernel Library (Intel(R) MKL). For example: + + with tbb.Monkey(): + run_my_numpy_code() + + It allows multiple parallel tasks to be executed on the same thread pool + and coordinate number of threads across multiple processes thus avoiding + overheads from oversubscription. + """ + _items = {} + _modules = {} + + def __init__(self, max_num_threads=None, benchmark=False): + """ + Create context manager for running under TBB scheduler. + :param max_num_threads: if specified, limits maximal number of threads + :param benchmark: if specified, blocks in initialization until requested number of threads are ready + """ + if max_num_threads: + self.ctl = global_control(global_control.max_allowed_parallelism, int(max_num_threads)) + if benchmark: + if not max_num_threads: + max_num_threads = default_num_threads() + from .api import _concurrency_barrier + _concurrency_barrier(int(max_num_threads)) + + def _patch(self, class_name, module_name, obj): + m = self._modules[class_name] = __import__(module_name, globals(), + locals(), [class_name]) + if m == None: + return + oldattr = getattr(m, class_name, None) + if oldattr == None: + self._modules[class_name] = None + return + self._items[class_name] = oldattr + setattr(m, class_name, obj) + + def __enter__(self): + global is_active + assert is_active == False, "tbb.Monkey does not support nesting yet" + is_active = True + self.env = os.getenv('MKL_THREADING_LAYER') + os.environ['MKL_THREADING_LAYER'] = 'TBB' + + if ipc_enabled: + if sys.version_info.major == 2 and sys.version_info.minor >= 7: + self._patch("Pool", "multiprocessing.pool", TBBProcessPool27) + elif sys.version_info.major == 3 and sys.version_info.minor >= 5: + self._patch("Pool", "multiprocessing.pool", TBBProcessPool3) + self._patch("ThreadPool", "multiprocessing.pool", Pool) + return self + + def __exit__(self, exc_type, exc_value, traceback): + global is_active + assert is_active == True, "modified?" + is_active = False + if self.env is None: + del os.environ['MKL_THREADING_LAYER'] + else: + os.environ['MKL_THREADING_LAYER'] = self.env + for name in self._items.keys(): + setattr(self._modules[name], name, self._items[name]) + + +def init_sem_name(): + try: + librml = ctypes.CDLL(libirml) + librml.set_active_sem_name() + librml.set_stop_sem_name() + except Exception as e: + print("Warning: Can not initialize name of shared semaphores:", e, + file=sys.stderr) + + +def tbb_atexit(): + if ipc_enabled: + try: + librml = ctypes.CDLL(libirml) + librml.release_semaphores() + except: + print("Warning: Can not release shared semaphores", + file=sys.stderr) + + +def _main(): + # Run the module specified as the next command line argument + # python -m TBB user_app.py + global ipc_enabled + + import platform + import argparse + parser = argparse.ArgumentParser(prog="python -m tbb", description=""" + Run your Python script in context of tbb.Monkey, which + replaces standard Python pools and threading layer of + Intel(R) Math Kernel Library by implementation based on + Intel(R) Threading Building Blocks. It enables multiple parallel + tasks to be executed on the same thread pool and coordinate + number of threads across multiple processes thus avoiding + overheads from oversubscription. + """, formatter_class=argparse.ArgumentDefaultsHelpFormatter) + if platform.system() == "Linux": + parser.add_argument('--ipc', action='store_true', + help="Enable inter-process (IPC) coordination between Intel TBB schedulers") + parser.add_argument('-a', '--allocator', action='store_true', + help="Enable Intel TBB scalable allocator as a replacement for standard memory allocator") + parser.add_argument('--allocator-huge-pages', action='store_true', + help="Enable huge pages for Intel TBB allocator (implies: -a)") + parser.add_argument('-p', '--max-num-threads', default=default_num_threads(), type=int, + help="Initialize Intel TBB with P max number of threads per process", metavar='P') + parser.add_argument('-b', '--benchmark', action='store_true', + help="Block Intel TBB initialization until all the threads are created before continue the script. " + "This is necessary for performance benchmarks that want to exclude lazy scheduler initialization effects from the measurements") + parser.add_argument('-v', '--verbose', action='store_true', + help="Request verbose and version information") + parser.add_argument('-m', action='store_true', dest='module', + help="Executes following as a module") + parser.add_argument('name', help="Script or module name") + parser.add_argument('args', nargs=argparse.REMAINDER, + help="Command line arguments") + args = parser.parse_args() + + if args.verbose: + os.environ["TBB_VERSION"] = "1" + if platform.system() == "Linux": + if args.allocator_huge_pages: + args.allocator = True + if args.allocator and not os.environ.get("_TBB_MALLOC_PRELOAD"): + libtbbmalloc_lib = 'libtbbmalloc_proxy.so.2' + ld_preload = 'LD_PRELOAD' + os.environ["_TBB_MALLOC_PRELOAD"] = "1" + preload_list = filter(None, os.environ.get(ld_preload, "").split(':')) + if libtbbmalloc_lib in preload_list: + print('Info:', ld_preload, "contains", libtbbmalloc_lib, "already\n") + else: + os.environ[ld_preload] = ':'.join([libtbbmalloc_lib] + list(preload_list)) + + if args.allocator_huge_pages: + assert platform.system() == "Linux" + try: + with open('/proc/sys/vm/nr_hugepages', 'r') as f: + pages = int(f.read()) + if pages == 0: + print("TBB: Pre-allocated huge pages are not currently reserved in the system. To reserve, run e.g.:\n" + "\tsudo sh -c 'echo 2000 > /proc/sys/vm/nr_hugepages'") + os.environ["TBB_MALLOC_USE_HUGE_PAGES"] = "1" + except: + print("TBB: Failed to read number of pages from /proc/sys/vm/nr_hugepages\n" + "\tIs the Linux kernel configured with the huge pages feature?") + sys.exit(1) + + os.execl(sys.executable, sys.executable, '-m', 'tbb', *sys.argv[1:]) + assert False, "Re-execution failed" + + sys.argv = [args.name] + args.args + ipc_enabled = platform.system() == "Linux" and args.ipc + os.environ["IPC_ENABLE"] = "1" if ipc_enabled else "0" + if ipc_enabled: + atexit.register(tbb_atexit) + init_sem_name() + if not os.environ.get("KMP_BLOCKTIME"): # TODO move + os.environ["KMP_BLOCKTIME"] = "0" + if '_' + args.name in globals(): + return globals()['_' + args.name](*args.args) + else: + import runpy + runf = runpy.run_module if args.module else runpy.run_path + with Monkey(max_num_threads=args.max_num_threads, benchmark=args.benchmark): + runf(args.name, run_name='__main__') diff --git a/python/tbb/__main__.py b/python/tbb/__main__.py new file mode 100644 index 0000000000..20fea9a88f --- /dev/null +++ b/python/tbb/__main__.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# +# Copyright (c) 2016-2017 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +# + + +from . import _main +from sys import exit +exit(_main()) diff --git a/python/tbb.i b/python/tbb/api.i similarity index 58% rename from python/tbb.i rename to python/tbb/api.i index 87cc95e9b0..bbc887a6d9 100644 --- a/python/tbb.i +++ b/python/tbb/api.i @@ -18,50 +18,31 @@ # # -# Based on the software developed by: -# Copyright (c) 2008,2016 david decotigny (Pool of threads) -# Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. Neither the name of author nor the names of any contributors may be -# used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -# SUCH DAMAGE. -# -from __future__ import print_function +__all__ = ["task_arena", "task_group", "task_scheduler_init", "global_control", "default_num_threads"] %} %begin %{ /* Defines Python wrappers for Intel(R) Threading Building Blocks (Intel TBB).*/ %} -%module TBB +%module api #if SWIG_VERSION < 0x030001 #error SWIG version 3.0.6 or newer is required for correct functioning #endif %{ +#define TBB_PREVIEW_GLOBAL_CONTROL 1 +#define TBB_PREVIEW_WAITING_FOR_WORKERS 1 #include +#include +#if TBB_IMPLEMENT_CPP0X +namespace std { using tbb::mutex; } +#define unique_ptr auto_ptr +#else +#include +#include +#include +#endif using namespace tbb; class PyCaller : public swig::SwigPtr_PyObject { @@ -91,8 +72,47 @@ struct ArenaPyCaller { } }; +struct barrier_data { + std::condition_variable event; + std::mutex m; + int worker_threads, full_threads; +}; + +class barrier_task : public tbb::task { + barrier_data &b; +public: + barrier_task(barrier_data &d) : b(d) {} + /*override*/ tbb::task *execute() { + std::unique_lock lock(b.m); + if(++b.worker_threads >= b.full_threads) + b.event.notify_all(); + else while(b.worker_threads < b.full_threads) + b.event.wait(lock); + return NULL; + } +}; + +void _concurrency_barrier(int threads = tbb::task_scheduler_init::automatic) { + if(threads == task_scheduler_init::automatic) + threads = task_scheduler_init::default_num_threads(); + if(threads < 2) + return; + std::unique_ptr g( + (global_control::active_value(global_control::max_allowed_parallelism) < unsigned(threads))? + new global_control(global_control::max_allowed_parallelism, threads) : NULL); + barrier_data b; + b.worker_threads = 0; + b.full_threads = threads-1; + for(int i = 0; i < b.full_threads; i++) + tbb::task::enqueue( *new( tbb::task::allocate_root() ) barrier_task(b) ); + std::unique_lock lock(b.m); + b.event.wait(lock); +}; + %} +void _concurrency_barrier(int threads = tbb::task_scheduler_init::automatic); + namespace tbb { class task_scheduler_init { public: @@ -100,13 +120,14 @@ namespace tbb { static const int automatic = -1; //! Argument to initialize() or constructor that causes initialization to be deferred. static const int deferred = -2; - task_scheduler_init( int max_threads=automatic, + task_scheduler_init( int max_threads=automatic, size_t thread_stack_size=0 ); ~task_scheduler_init(); void initialize( int max_threads=automatic ); void terminate(); static int default_num_threads(); bool is_active() const; + void blocking_terminate(); }; class task_arena { @@ -139,7 +160,21 @@ namespace tbb { }; }; -} + class global_control { + public: + enum parameter { + max_allowed_parallelism, + thread_stack_size, + parameter_max // insert new parameters above this point + }; + global_control(parameter param, size_t value); + ~global_control(); + static size_t active_value(parameter param); + }; -// Python part of the module -%pythoncode "tbb.src.py" +} // tbb + +// Additional definitions for Python part of the module +%pythoncode %{ +default_num_threads = task_scheduler_init_default_num_threads +%} diff --git a/python/tbb.src.py b/python/tbb/pool.py similarity index 79% rename from python/tbb.src.py rename to python/tbb/pool.py index b2d3312d19..4747b22a5a 100644 --- a/python/tbb.src.py +++ b/python/tbb/pool.py @@ -1,4 +1,4 @@ - +#!/usr/bin/env python # # Copyright (c) 2016-2017 Intel Corporation # @@ -73,20 +73,14 @@ import sys import threading import traceback +from .api import * -__all__ = ["Pool", "Monkey", "task_arena", "task_group", "task_scheduler_init"] +__all__ = ["Pool", "TimeoutError"] __doc__ = """ -Python API to Intel(R) Threading Building Blocks library (Intel(R) TBB) -extended with standard Pool implementation and monkey-patching. - -Command-line interface: -$ python -m TBB $your_script.py - -Runs your_script.py in context of `with Monkey():` +Standard Python Pool implementation based on Python API +for Intel(R) Threading Building Blocks library (Intel(R) TBB) """ -default_num_threads = task_scheduler_init_default_num_threads - class TimeoutError(Exception): """Raised when a result is not available within the given timeout""" @@ -96,7 +90,7 @@ class TimeoutError(Exception): class Pool(object): """ The Pool class provides standard multiprocessing.Pool interface - which is mapped onto Intel TBB tasks executing in its thread pool + which is mapped onto Intel(R) TBB tasks executing in its thread pool """ def __init__(self, nworkers=0, name="Pool"): @@ -639,202 +633,3 @@ def notify_ready(self, apply_result): self._to_notify._set_exception() else: self._to_notify._set_value(lst) - - -def _test(arg=None): - """Some tests""" - if arg == "-v": - def say(*x): - print(*x) - else: - def say(*x): - pass - say("Start Pool testing") - import time - - get_tid = lambda: threading.current_thread().ident - - def return42(): - return 42 - - def f(x): - return x * x - - def work(mseconds): - res = str(mseconds) - if mseconds < 0: - mseconds = -mseconds - say("[%d] Start to work for %fms..." % (get_tid(), mseconds*10)) - time.sleep(mseconds/100.) - say("[%d] Work done (%fms)." % (get_tid(), mseconds*10)) - return res - - ### Test copy/pasted from multiprocessing - pool = Pool(4) # start worker threads - - # edge cases - assert pool.map(return42, []) == [] - assert pool.apply_async(return42, []).get() == 42 - assert pool.apply(return42, []) == 42 - assert list(pool.imap(return42, iter([]))) == [] - assert list(pool.imap_unordered(return42, iter([]))) == [] - assert pool.map_async(return42, []).get() == [] - assert list(pool.imap_async(return42, iter([])).get()) == [] - assert list(pool.imap_unordered_async(return42, iter([])).get()) == [] - - # basic tests - result = pool.apply_async(f, (10,)) # evaluate "f(10)" asynchronously - assert result.get(timeout=1) == 100 # ... unless slow computer - assert list(pool.map(f, range(10))) == list(map(f, range(10))) - it = pool.imap(f, range(10)) - assert next(it) == 0 - assert next(it) == 1 - assert next(it) == 4 - - # Test apply_sync exceptions - result = pool.apply_async(time.sleep, (3,)) - try: - say(result.get(timeout=1)) # raises `TimeoutError` - except TimeoutError: - say("Good. Got expected timeout exception.") - else: - assert False, "Expected exception !" - assert result.get() is None # sleep() returns None - - def cb(s): - say("Result ready: %s" % s) - - # Test imap() - assert list(pool.imap(work, range(10, 3, -1), chunksize=4)) == list(map( - str, range(10, 3, -1))) - - # Test imap_unordered() - assert sorted(pool.imap_unordered(work, range(10, 3, -1))) == sorted(map( - str, range(10, 3, -1))) - - # Test map_async() - result = pool.map_async(work, range(10), callback=cb) - try: - result.get(timeout=0.01) # raises `TimeoutError` - except TimeoutError: - say("Good. Got expected timeout exception.") - else: - assert False, "Expected exception !" - say(result.get()) - - # Test imap_async() - result = pool.imap_async(work, range(3, 10), callback=cb) - try: - result.get(timeout=0.01) # raises `TimeoutError` - except TimeoutError: - say("Good. Got expected timeout exception.") - else: - assert False, "Expected exception !" - for i in result.get(): - say("Item:", i) - say("### Loop again:") - for i in result.get(): - say("Item2:", i) - - # Test imap_unordered_async() - result = pool.imap_unordered_async(work, range(10, 3, -1), callback=cb) - try: - say(result.get(timeout=0.01)) # raises `TimeoutError` - except TimeoutError: - say("Good. Got expected timeout exception.") - else: - assert False, "Expected exception !" - for i in result.get(): - say("Item1:", i) - for i in result.get(): - say("Item2:", i) - r = result.get() - for i in r: - say("Item3:", i) - for i in r: - say("Item4:", i) - for i in r: - say("Item5:", i) - - # - # The case for the exceptions - # - - # Exceptions in imap_unordered_async() - result = pool.imap_unordered_async(work, range(2, -10, -1), callback=cb) - time.sleep(3) - try: - for i in result.get(): - say("Got item:", i) - except (IOError, ValueError): - say("Good. Got expected exception") - - # Exceptions in imap_async() - result = pool.imap_async(work, range(2, -10, -1), callback=cb) - time.sleep(3) - try: - for i in result.get(): - say("Got item:", i) - except (IOError, ValueError): - say("Good. Got expected exception") - - # Stop the test: need to stop the pool !!! - pool.terminate() - pool.join() - print("done") - - -# End of david's derived file content - -class Monkey: - """ - Context manager which replaces standard multiprocessing.pool.ThreadPool - implementation with TBB.Pool using monkey-patching. It also enables TBB - threading for Intel(R) Math Kernel Library (Intel(R) MKL). For example: - - with TBB.Monkey(): - run_my_numpy_code() - - """ - _items = {'ThreadPool': None} - - def __init__(self): - pass - - def __enter__(self): - import os - self.env = os.getenv('MKL_THREADING_LAYER') - os.environ['MKL_THREADING_LAYER'] = 'TBB' - self.module = __import__('multiprocessing.pool', globals(), locals(), self._items.keys()) - for name in self._items.keys(): - oldattr = getattr(self.module, name) - self._items[name] = oldattr - setattr(self.module, name, Pool) - return self - - def __exit__(self, exc_type, exc_value, traceback): - import os - if self.env is None: - del os.environ['MKL_THREADING_LAYER'] - else: - os.environ['MKL_THREADING_LAYER'] = self.env - for name in self._items.keys(): - setattr(self.module, name, self._items[name]) - - -def _main(): - # Run the module specified as the next command line argument - # python -m TBB user_app.py - del sys.argv[0] # shift arguments - if len(sys.argv) < 1: - print("No file name specified for execution", file=sys.stderr) - elif '_' + sys.argv[0] in globals(): - globals()['_' + sys.argv[0]](*sys.argv[1:]) - else: - import runpy - with Monkey(): - runpy.run_path(sys.argv[0], run_name='__main__') - - -if __name__ == "__main__": - sys.exit(_main()) diff --git a/python/tbb/test.py b/python/tbb/test.py new file mode 100644 index 0000000000..8a8238d6d5 --- /dev/null +++ b/python/tbb/test.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +# +# Copyright (c) 2016-2017 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +# + +# Based on the software developed by: +# Copyright (c) 2008,2016 david decotigny (Pool of threads) +# Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Neither the name of author nor the names of any contributors may be +# used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +from __future__ import print_function +import time +import threading + +from .api import * +from .pool import * + + +def test(arg=None): + if arg == "-v": + def say(*x): + print(*x) + else: + def say(*x): + pass + say("Start Pool testing") + + get_tid = lambda: threading.current_thread().ident + + def return42(): + return 42 + + def f(x): + return x * x + + def work(mseconds): + res = str(mseconds) + if mseconds < 0: + mseconds = -mseconds + say("[%d] Start to work for %fms..." % (get_tid(), mseconds*10)) + time.sleep(mseconds/100.) + say("[%d] Work done (%fms)." % (get_tid(), mseconds*10)) + return res + + ### Test copy/pasted from multiprocessing + pool = Pool(4) # start worker threads + + # edge cases + assert pool.map(return42, []) == [] + assert pool.apply_async(return42, []).get() == 42 + assert pool.apply(return42, []) == 42 + assert list(pool.imap(return42, iter([]))) == [] + assert list(pool.imap_unordered(return42, iter([]))) == [] + assert pool.map_async(return42, []).get() == [] + assert list(pool.imap_async(return42, iter([])).get()) == [] + assert list(pool.imap_unordered_async(return42, iter([])).get()) == [] + + # basic tests + result = pool.apply_async(f, (10,)) # evaluate "f(10)" asynchronously + assert result.get(timeout=1) == 100 # ... unless slow computer + assert list(pool.map(f, range(10))) == list(map(f, range(10))) + it = pool.imap(f, range(10)) + assert next(it) == 0 + assert next(it) == 1 + assert next(it) == 4 + + # Test apply_sync exceptions + result = pool.apply_async(time.sleep, (3,)) + try: + say(result.get(timeout=1)) # raises `TimeoutError` + except TimeoutError: + say("Good. Got expected timeout exception.") + else: + assert False, "Expected exception !" + assert result.get() is None # sleep() returns None + + def cb(s): + say("Result ready: %s" % s) + + # Test imap() + assert list(pool.imap(work, range(10, 3, -1), chunksize=4)) == list(map( + str, range(10, 3, -1))) + + # Test imap_unordered() + assert sorted(pool.imap_unordered(work, range(10, 3, -1))) == sorted(map( + str, range(10, 3, -1))) + + # Test map_async() + result = pool.map_async(work, range(10), callback=cb) + try: + result.get(timeout=0.01) # raises `TimeoutError` + except TimeoutError: + say("Good. Got expected timeout exception.") + else: + assert False, "Expected exception !" + say(result.get()) + + # Test imap_async() + result = pool.imap_async(work, range(3, 10), callback=cb) + try: + result.get(timeout=0.01) # raises `TimeoutError` + except TimeoutError: + say("Good. Got expected timeout exception.") + else: + assert False, "Expected exception !" + for i in result.get(): + say("Item:", i) + say("### Loop again:") + for i in result.get(): + say("Item2:", i) + + # Test imap_unordered_async() + result = pool.imap_unordered_async(work, range(10, 3, -1), callback=cb) + try: + say(result.get(timeout=0.01)) # raises `TimeoutError` + except TimeoutError: + say("Good. Got expected timeout exception.") + else: + assert False, "Expected exception !" + for i in result.get(): + say("Item1:", i) + for i in result.get(): + say("Item2:", i) + r = result.get() + for i in r: + say("Item3:", i) + for i in r: + say("Item4:", i) + for i in r: + say("Item5:", i) + + # + # The case for the exceptions + # + + # Exceptions in imap_unordered_async() + result = pool.imap_unordered_async(work, range(2, -10, -1), callback=cb) + time.sleep(3) + try: + for i in result.get(): + say("Got item:", i) + except (IOError, ValueError): + say("Good. Got expected exception") + + # Exceptions in imap_async() + result = pool.imap_async(work, range(2, -10, -1), callback=cb) + time.sleep(3) + try: + for i in result.get(): + say("Got item:", i) + except (IOError, ValueError): + say("Good. Got expected exception") + + # Stop the test: need to stop the pool !!! + pool.terminate() + pool.join() + + diff --git a/src/Makefile b/src/Makefile index 195304d5f1..070c634b4f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -94,8 +94,8 @@ else @$(MAKE) -C "$(work_dir)_$(cfg)" -r -f $(tbb_root)/build/Makefile.test $@ endif -python_%: tbb_release - bash -c ". $(work_dir)_release$(SLASH)tbbvars.sh && $(MAKE) -rC '$(full_tbb_root)/python' CXX=$(compiler) $(subst python_,,$@)" +python_%: + $(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/Makefile $(subst python_,,$@) .PHONY: test_release test_debug test_release_no_depends test_debug_no_depends .PHONY: tbb_release tbb_debug tbb_test_release tbb_test_debug tbb_test_release_no_depends tbb_test_debug_no_depends diff --git a/src/tbb/arena.cpp b/src/tbb/arena.cpp index abe4d35bea..70ad17583c 100644 --- a/src/tbb/arena.cpp +++ b/src/tbb/arena.cpp @@ -641,28 +641,27 @@ void generic_scheduler::nested_arena_entry(arena* a, size_t slot_index) { attach_arena( a, slot_index, /*is_master*/true ); __TBB_ASSERT( my_arena == a, NULL ); governor::assume_scheduler( this ); -#if __TBB_ARENA_OBSERVER - my_last_local_observer = 0; // TODO: try optimize number of calls - my_arena->my_observers.notify_entry_observers( my_last_local_observer, /*worker=*/false ); -#endif // TODO? ITT_NOTIFY(sync_acquired, a->my_slots + index); // TODO: it requires market to have P workers (not P-1) // TODO: a preempted worker should be excluded from assignment to other arenas e.g. my_slack-- if( !is_worker() && slot_index >= my_arena->my_num_reserved_slots ) my_arena->my_market->adjust_demand(*my_arena, -1); +#if __TBB_ARENA_OBSERVER + my_last_local_observer = 0; // TODO: try optimize number of calls + my_arena->my_observers.notify_entry_observers( my_last_local_observer, /*worker=*/false ); +#endif } void generic_scheduler::nested_arena_exit() { - if( !is_worker() && my_arena_index >= my_arena->my_num_reserved_slots ) - my_arena->my_market->adjust_demand(*my_arena, 1); #if __TBB_ARENA_OBSERVER my_arena->my_observers.notify_exit_observers( my_last_local_observer, /*worker=*/false ); #endif /* __TBB_ARENA_OBSERVER */ - #if __TBB_TASK_PRIORITY if ( my_offloaded_tasks ) my_arena->orphan_offloaded_tasks( *this ); #endif + if( !is_worker() && my_arena_index >= my_arena->my_num_reserved_slots ) + my_arena->my_market->adjust_demand(*my_arena, 1); // Free the master slot. __TBB_ASSERT(my_arena->my_slots[my_arena_index].my_scheduler, "A slot is already empty"); __TBB_store_with_release(my_arena->my_slots[my_arena_index].my_scheduler, (generic_scheduler*)NULL); diff --git a/src/tbb/concurrent_vector.cpp b/src/tbb/concurrent_vector.cpp index 4ecf09903a..bf405cf61b 100644 --- a/src/tbb/concurrent_vector.cpp +++ b/src/tbb/concurrent_vector.cpp @@ -43,7 +43,7 @@ using namespace std; namespace tbb { namespace internal { - class concurrent_vector_base_v3::helper :no_assign { +class concurrent_vector_base_v3::helper :no_assign { public: //! memory page size static const size_type page_size = 4096; @@ -197,7 +197,7 @@ namespace internal { func( begin, n ); } }; -}; +}; // class helper void concurrent_vector_base_v3::helper::extend_segment_table(concurrent_vector_base_v3 &v, concurrent_vector_base_v3::size_type start) { if( start > segment_size(pointers_per_short_table) ) start = segment_size(pointers_per_short_table); diff --git a/src/tbb/condition_variable.cpp b/src/tbb/condition_variable.cpp index 49ed609f4a..cf5b0f074d 100644 --- a/src/tbb/condition_variable.cpp +++ b/src/tbb/condition_variable.cpp @@ -141,8 +141,18 @@ static const dynamic_link_descriptor CondVarLinkTable[] = { void init_condvar_module() { __TBB_ASSERT( (uintptr_t)__TBB_init_condvar==(uintptr_t)&init_condvar_using_event, NULL ); - if( dynamic_link( "Kernel32.dll", CondVarLinkTable, 4 ) ) +#if __TBB_WIN8UI_SUPPORT + // We expect condition variables to be always available for Windows* store applications, + // so there is no need to check presense and use alternative implementation. + __TBB_init_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&InitializeConditionVariable; + __TBB_condvar_wait = (BOOL(WINAPI *)(PCONDITION_VARIABLE, LPCRITICAL_SECTION, DWORD))&SleepConditionVariableCS; + __TBB_condvar_notify_one = (void (WINAPI *)(PCONDITION_VARIABLE))&WakeConditionVariable; + __TBB_condvar_notify_all = (void (WINAPI *)(PCONDITION_VARIABLE))&WakeAllConditionVariable; + __TBB_destroy_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_noop; +#else + if (dynamic_link("Kernel32.dll", CondVarLinkTable, 4)) __TBB_destroy_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_noop; +#endif } #endif /* _WIN32||_WIN64 */ diff --git a/src/tbb/mac32-tbb-export.lst b/src/tbb/mac32-tbb-export.lst index 8ce79f2dee..07d274fc5b 100644 --- a/src/tbb/mac32-tbb-export.lst +++ b/src/tbb/mac32-tbb-export.lst @@ -191,6 +191,15 @@ __TBB_SYMBOL( _ZN3tbb8internal33itt_store_pointer_with_release_v3EPvS1_ ) __TBB_SYMBOL( _ZN3tbb8internal18call_itt_notify_v5EiPv ) __TBB_SYMBOL( _ZN3tbb8internal19itt_load_pointer_v3EPKv ) __TBB_SYMBOL( _ZN3tbb8internal20itt_set_sync_name_v3EPvPKc ) +#if __TBB_ITT_STRUCTURE_API +__TBB_SYMBOL( _ZN3tbb8internal22itt_make_task_group_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE ) +__TBB_SYMBOL( _ZN3tbb8internal23itt_metadata_str_add_v7ENS0_15itt_domain_enumEPvyNS0_12string_indexEPKc ) +__TBB_SYMBOL( _ZN3tbb8internal19itt_relation_add_v7ENS0_15itt_domain_enumEPvyNS0_12itt_relationES2_y ) +__TBB_SYMBOL( _ZN3tbb8internal17itt_task_begin_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE ) +__TBB_SYMBOL( _ZN3tbb8internal15itt_task_end_v7ENS0_15itt_domain_enumE ) +__TBB_SYMBOL( _ZN3tbb8internal19itt_region_begin_v9ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE ) +__TBB_SYMBOL( _ZN3tbb8internal17itt_region_end_v9ENS0_15itt_domain_enumEPvy ) +#endif // pipeline.cpp __TBB_SYMBOL( _ZTIN3tbb6filterE ) diff --git a/src/tbb/mac64-tbb-export.lst b/src/tbb/mac64-tbb-export.lst index 67d3676d00..d410e69bd6 100644 --- a/src/tbb/mac64-tbb-export.lst +++ b/src/tbb/mac64-tbb-export.lst @@ -188,6 +188,15 @@ __TBB_SYMBOL( _ZN3tbb8internal33itt_store_pointer_with_release_v3EPvS1_ ) __TBB_SYMBOL( _ZN3tbb8internal18call_itt_notify_v5EiPv ) __TBB_SYMBOL( _ZN3tbb8internal19itt_load_pointer_v3EPKv ) __TBB_SYMBOL( _ZN3tbb8internal20itt_set_sync_name_v3EPvPKc ) +#if __TBB_ITT_STRUCTURE_API +__TBB_SYMBOL( _ZN3tbb8internal23itt_metadata_str_add_v7ENS0_15itt_domain_enumEPvyNS0_12string_indexEPKc ) +__TBB_SYMBOL( _ZN3tbb8internal22itt_make_task_group_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE ) +__TBB_SYMBOL( _ZN3tbb8internal17itt_task_begin_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE ) +__TBB_SYMBOL( _ZN3tbb8internal19itt_relation_add_v7ENS0_15itt_domain_enumEPvyNS0_12itt_relationES2_y ) +__TBB_SYMBOL( _ZN3tbb8internal15itt_task_end_v7ENS0_15itt_domain_enumE ) +__TBB_SYMBOL( _ZN3tbb8internal19itt_region_begin_v9ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE ) +__TBB_SYMBOL( _ZN3tbb8internal17itt_region_end_v9ENS0_15itt_domain_enumEPvy ) +#endif // pipeline.cpp __TBB_SYMBOL( _ZTIN3tbb6filterE ) diff --git a/src/tbb/market.cpp b/src/tbb/market.cpp index 3b6efc630e..9fb6936e24 100644 --- a/src/tbb/market.cpp +++ b/src/tbb/market.cpp @@ -329,7 +329,7 @@ void market::try_destroy_arena ( arena* a, uintptr_t aba_epoch ) { assert_market_valid(); #if __TBB_TASK_PRIORITY // scan all priority levels, not only in [my_global_bottom_priority;my_global_top_priority] - // range, because arena to be destoyed can have no outstanding request for workers + // range, because arena to be destroyed can have no outstanding request for workers for ( int p = num_priority_levels-1; p >= 0; --p ) { priority_level_info &pl = my_priority_levels[p]; arena_list_type &my_arenas = pl.arenas; diff --git a/src/tbb/tbb_main.cpp b/src/tbb/tbb_main.cpp index 685d28ec3d..a1fd8d964d 100644 --- a/src/tbb/tbb_main.cpp +++ b/src/tbb/tbb_main.cpp @@ -241,12 +241,16 @@ void DoOneTimeInitializations() { #if (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED //! Windows "DllMain" that handles startup and shutdown of dynamic library. -extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID /*lpvReserved*/ ) { +extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID lpvReserved ) { switch( reason ) { case DLL_PROCESS_ATTACH: __TBB_InitOnce::add_ref(); break; case DLL_PROCESS_DETACH: + // Since THREAD_DETACH is not called for the main thread, call auto-termination + // here as well - but not during process shutdown (due to risk of a deadlock). + if( lpvReserved==NULL ) // library unload + governor::terminate_auto_initialized_scheduler(); __TBB_InitOnce::remove_ref(); // It is assumed that InitializationDone is not set after DLL_PROCESS_DETACH, // and thus no race on InitializationDone is possible. diff --git a/src/tbb/tbb_main.h b/src/tbb/tbb_main.h index 24d77d1128..b675fe0703 100644 --- a/src/tbb/tbb_main.h +++ b/src/tbb/tbb_main.h @@ -22,6 +22,7 @@ #define _TBB_tbb_main_H #include "tbb/atomic.h" +#include "governor.h" namespace tbb { @@ -73,6 +74,7 @@ class __TBB_InitOnce { //! Remove the initial reference to resources. /** This is not necessarily the last reference if other threads are still running. **/ ~__TBB_InitOnce() { + governor::terminate_auto_initialized_scheduler(); // TLS dtor not called for the main thread remove_ref(); // We assume that InitializationDone is not set after file-scope destructors // start running, and thus no race on InitializationDone is possible. diff --git a/src/tbbmalloc/proxy.cpp b/src/tbbmalloc/proxy.cpp index 5ef279da34..1c8e6657ea 100644 --- a/src/tbbmalloc/proxy.cpp +++ b/src/tbbmalloc/proxy.cpp @@ -337,13 +337,23 @@ void* __TBB_malloc_safer__aligned_realloc_##CRTLIB( void *ptr, size_t size, size return __TBB_malloc_safer_aligned_realloc( ptr, size, aligment, &func_ptrs ); \ } -// Limit is 30 bytes/60 symbols per line, * can be used to match any digit in bytecodes. -// Purpose of the pattern is to mark an instruction bound, it should consist of several -// full instructions plus one more byte. It's not required for the patterns to be unique -// (i.e., it's OK to have same pattern for unrelated functions). +// Only for ucrtbase: substitution for _o_free +void (*orig__o_free)(void*); +void __TBB_malloc__o_free(void *ptr) +{ + __TBB_malloc_safer_free( ptr, orig__o_free ); +} + +// Size limit is MAX_PATTERN_SIZE (28) byte codes / 56 symbols per line. +// * can be used to match any digit in byte codes. +// # followed by several * indicate a relative address that needs to be corrected. +// Purpose of the pattern is to mark an instruction bound; it should consist of several +// full instructions plus one extra byte code. It's not required for the patterns +// to be unique (i.e., it's OK to have same pattern for unrelated functions). // TODO: use hot patch prologues if exist const char* known_bytecodes[] = { #if _WIN64 +// "========================================================" - 56 symbols "4883EC284885C974", // release free() "4883EC284885C975", // release _msize() "4885C974375348", // release free() 8.0.50727.42, 10.0 @@ -354,12 +364,14 @@ const char* known_bytecodes[] = { "48894C24084883EC28BA", // debug prologue "4C894424184889542410", // debug _aligned_msize() 10.0 "48894C24084883EC2848", // debug _aligned_free 10.0 + "488BD1488D0D#*******E9", // _o_free(), ucrtbase.dll #if __TBB_OVERLOAD_OLD_MSVCR "48895C2408574883EC3049", // release _aligned_msize 9.0 "4883EC384885C975", // release _msize() 9.0 "4C8BC1488B0DA6E4040033", // an old win64 SDK #endif #else // _WIN32 +// "========================================================" - 56 symbols "8BFF558BEC8B", // multiple "8BFF558BEC83", // release free() & _msize() 10.0.40219.325, _msize() ucrtbase.dll "8BFF558BECFF", // release _aligned_msize ucrtbase.dll @@ -641,8 +653,13 @@ void doMallocReplacement() { ReplaceFunctionWithStore( modules_to_replace[j].name, c_routines_to_replace[i]._func, c_routines_to_replace[i]._fptr, NULL, NULL, c_routines_to_replace[i]._on_error ); } - // ucrtbase.dll does not export operator new/delete. - if ( strcmp(modules_to_replace[j].name, "ucrtbase.dll") == 0 ){ + if ( strcmp(modules_to_replace[j].name, "ucrtbase.dll") == 0 ) { + // If _o_free function is present and patchable, redirect it to tbbmalloc as well + // This prevents issues with other _o_* functions which might allocate memory with malloc + if ( IsPrologueKnown(GetModuleHandle("ucrtbase.dll"), "_o_free", known_bytecodes) ) { + ReplaceFunctionWithStore( "ucrtbase.dll", "_o_free", (FUNCPTR)__TBB_malloc__o_free, known_bytecodes, (FUNCPTR*)&orig__o_free, FRR_FAIL ); + } + // ucrtbase.dll does not export operator new/delete, so skip the rest of the loop. continue; } diff --git a/src/tbbmalloc/tbb_function_replacement.cpp b/src/tbbmalloc/tbb_function_replacement.cpp index b0157856c1..f93483286e 100644 --- a/src/tbbmalloc/tbb_function_replacement.cpp +++ b/src/tbbmalloc/tbb_function_replacement.cpp @@ -28,9 +28,9 @@ #include #include #include +#include #include "tbb_function_replacement.h" -#include "tbb/tbb_config.h" #include "tbb/tbb_stddef.h" #include "../tbb/tbb_assert_impl.h" @@ -66,7 +66,7 @@ inline bool IsInDistance(UINT_PTR addr1, UINT_PTR addr2, __int64 dist) * doesn't allocate memory dynamically. * * The struct MemoryBuffer holds the data about a page in the memory used for - * replacing functions in Intel64 where the target is too far to be replaced + * replacing functions in 64-bit code where the target is too far to be replaced * with a short jump. All the calculations of m_base and m_next are in a multiple * of SIZE_OF_ADDRESS (which is 8 in Win64). */ @@ -180,14 +180,13 @@ static MemoryProvider memProvider; // Compare opcodes from dictionary (str1) and opcodes from code (str2) // str1 might contain '*' to mask addresses -// RETURN: NULL if opcodes did not match, string length of str1 on success +// RETURN: 0 if opcodes did not match, 1 on success size_t compareStrings( const char *str1, const char *str2 ) { - size_t str1Length = strlen(str1); - for (size_t i=0; i= SIZE_OF_RELJUMP, "Incorrect bytecode pattern?" ); - UINT_PTR strdAddr = memProvider.GetLocation(srcAddr); - if (!strdAddr) - return 0; - *storedAddr = Addrint2Ptr(strdAddr); - // Set 'executable' flag for original instructions in the new place - DWORD pageFlags = PAGE_EXECUTE_READWRITE; - if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0; - // Copy original instructions to the new place - memcpy(*storedAddr, codePtr, opcodesNumber); - // Set jump to the code after replacement - offset = srcAddr - strdAddr - SIZE_OF_RELJUMP; - offset32 = (UINT)((offset & 0xFFFFFFFF)); - *((UCHAR*)*storedAddr+opcodesNumber) = 0xE9; - memcpy(((UCHAR*)*storedAddr+opcodesNumber+1), &offset32, sizeof(offset32)); - } + bytesToMove = strlen(pattern)/2-1; // The last byte matching the pattern must not be copied + __TBB_ASSERT_RELEASE( bytesToMove >= SIZE_OF_RELJUMP, "Incorrect bytecode pattern?" ); + UINT_PTR trampAddr = memProvider.GetLocation(srcAddr); + if (!trampAddr) + return 0; + *storedAddr = Addrint2Ptr(trampAddr); + // Set 'executable' flag for original instructions in the new place + DWORD pageFlags = PAGE_EXECUTE_READWRITE; + if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0; + // Copy original instructions to the new place + memcpy(*storedAddr, codePtr, bytesToMove); + offset = srcAddr - trampAddr; + offset32 = (UINT)(offset & 0xFFFFFFFF); + CorrectOffset( trampAddr, pattern, offset32 ); + // Set jump to the code after replacement + offset32 -= SIZE_OF_RELJUMP; + *(UCHAR*)(trampAddr+bytesToMove) = 0xE9; + memcpy((UCHAR*)(trampAddr+bytesToMove+1), &offset32, sizeof(offset32)); } // The following will work correctly even if srcAddr>tgtAddr, as long as @@ -287,7 +296,7 @@ static DWORD InsertTrampoline32(void *inpAddr, void *targetAddr, const char ** o memcpy(codePtr+1, &offset32, sizeof(offset32)); // Fill the rest with NOPs to correctly see disassembler of old code in debugger. - for( unsigned i=SIZE_OF_RELJUMP; i= SIZE_OF_INDJUMP, "Incorrect bytecode pattern?" ); - UINT_PTR strdAddr = memProvider.GetLocation(srcAddr); - if (!strdAddr) - return 0; - *storedAddr = Addrint2Ptr(strdAddr); - // Set 'executable' flag for original instructions in the new place - DWORD pageFlags = PAGE_EXECUTE_READWRITE; - if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0; - // Copy original instructions to the new place - memcpy(*storedAddr, codePtr, opcodesNumber); - // Set jump to the code after replacement. It is within the distance of relative jump! - offset = srcAddr - strdAddr - SIZE_OF_RELJUMP; - offset32 = (UINT)((offset & 0xFFFFFFFF)); - *((UCHAR*)*storedAddr+opcodesNumber) = 0xE9; - memcpy(((UCHAR*)*storedAddr+opcodesNumber+1), &offset32, sizeof(offset32)); - } + bytesToMove = strlen(pattern)/2-1; // The last byte matching the pattern must not be copied + __TBB_ASSERT_RELEASE( bytesToMove >= SIZE_OF_INDJUMP, "Incorrect bytecode pattern?" ); + UINT_PTR trampAddr = memProvider.GetLocation(srcAddr); + if (!trampAddr) + return 0; + *storedAddr = Addrint2Ptr(trampAddr); + // Set 'executable' flag for original instructions in the new place + DWORD pageFlags = PAGE_EXECUTE_READWRITE; + if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0; + // Copy original instructions to the new place + memcpy(*storedAddr, codePtr, bytesToMove); + offset = srcAddr - trampAddr; + offset32 = (UINT)(offset & 0xFFFFFFFF); + CorrectOffset( trampAddr, pattern, offset32 ); + // Set jump to the code after replacement. It is within the distance of relative jump! + offset32 -= SIZE_OF_RELJUMP; + *(UCHAR*)(trampAddr+bytesToMove) = 0xE9; + memcpy((UCHAR*)(trampAddr+bytesToMove+1), &offset32, sizeof(offset32)); } // Fill the buffer @@ -353,7 +357,7 @@ static DWORD InsertTrampoline64(void *inpAddr, void *targetAddr, const char ** o memcpy(codePtr+2, &offset32, sizeof(offset32)); // Fill the rest with NOPs to correctly see disassembler of old code in debugger. - for( unsigned i=SIZE_OF_INDJUMP; i 0, "abortOnError ignored in CheckOpcodes?" ); + } + } + + const char* pattern = opcodeIdx>0? opcodes[opcodeIdx-1]: NULL; // -1 compensates for +1 in CheckOpcodes + probeSize = InsertTrampoline32(inpAddr, targetAddr, pattern, origFunc); if (!probeSize) - probeSize = InsertTrampoline64(inpAddr, targetAddr, opcodes, origFunc); + probeSize = InsertTrampoline64(inpAddr, targetAddr, pattern, origFunc); // Restore original protection VirtualProtect(inpAddr, MAX_PROBE_SIZE, origProt, &origProt); diff --git a/src/tbbmalloc/tbb_function_replacement.h b/src/tbbmalloc/tbb_function_replacement.h index e986ab18b2..3595667004 100644 --- a/src/tbbmalloc/tbb_function_replacement.h +++ b/src/tbbmalloc/tbb_function_replacement.h @@ -56,7 +56,7 @@ union Int2Ptr { inline UINT_PTR Ptr2Addrint(LPVOID ptr); inline LPVOID Addrint2Ptr(UINT_PTR ptr); -// Use this value as the maximum size the trampoline region +// The size of a trampoline region const unsigned MAX_PROBE_SIZE = 32; // The size of a jump relative instruction "e9 00 00 00 00" @@ -68,6 +68,10 @@ const unsigned SIZE_OF_INDJUMP = 6; // The size of address we put in the location (in Intel64) const unsigned SIZE_OF_ADDRESS = 8; +// The size limit (in bytes) for an opcode pattern to fit into a trampoline +// There should be enough space left for a relative jump; +1 is for the extra pattern byte. +const unsigned MAX_PATTERN_SIZE = MAX_PROBE_SIZE - SIZE_OF_RELJUMP + 1; + // The max distance covered in 32 bits: 2^31 - 1 - C // where C should not be smaller than the size of a probe. // The latter is important to correctly handle "backward" jumps. diff --git a/src/test/harness_allocator.h b/src/test/harness_allocator.h index b0ecc6196c..b013ccdc97 100644 --- a/src/test/harness_allocator.h +++ b/src/test/harness_allocator.h @@ -127,7 +127,7 @@ struct arena { //! Allocate space for n objects, starting on a cache/sector line. pointer allocate( size_type n, const void* =0) { size_t new_size = (my_data->my_allocated += n*sizeof(T)); - __TBB_ASSERT(my_data->my_allocated <= my_data->my_size,"trying to allocate more than was reserved"); + ASSERT(my_data->my_allocated <= my_data->my_size,"trying to allocate more than was reserved"); char* result = &(my_data->my_buffer[new_size - n*sizeof(T)]); return reinterpret_cast(result); } @@ -135,8 +135,8 @@ struct arena { //! Free block of memory that starts on a cache line void deallocate( pointer p_arg, size_type n) { char* p = reinterpret_cast(p_arg); - __TBB_ASSERT(p >=my_data->my_buffer && p <= my_data->my_buffer + my_data->my_size, "trying to deallocate pointer not from arena ?"); - __TBB_ASSERT(p + n*sizeof(T) <= my_data->my_buffer + my_data->my_size, "trying to deallocate incorrect number of items?"); + ASSERT(p >=my_data->my_buffer && p <= my_data->my_buffer + my_data->my_size, "trying to deallocate pointer not from arena ?"); + ASSERT(p + n*sizeof(T) <= my_data->my_buffer + my_data->my_size, "trying to deallocate incorrect number of items?"); tbb::internal::suppress_unused_warning(p, n); } diff --git a/src/test/harness_defs.h b/src/test/harness_defs.h index 26bd9055f3..369c70ac75 100644 --- a/src/test/harness_defs.h +++ b/src/test/harness_defs.h @@ -175,10 +175,18 @@ #endif #endif -#ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES - #if __TBB_CPF_BUILD +#if __TBB_CPF_BUILD + #ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES #define TBB_PREVIEW_FLOW_GRAPH_FEATURES 1 #endif + #if __TBB_ITT_STRUCTURE_API + #ifndef TBB_PREVIEW_FLOW_GRAPH_TRACE + #define TBB_PREVIEW_FLOW_GRAPH_TRACE 1 + #endif + #ifndef TBB_PREVIEW_ALGORITHM_TRACE + #define TBB_PREVIEW_ALGORITHM_TRACE 1 + #endif + #endif #endif // std::is_copy_constructible::value returns 'true' for non copyable type when MSVC compiler is used. diff --git a/src/test/test_blocked_range2d.cpp b/src/test/test_blocked_range2d.cpp index b2a0ae97fd..1ebc52edac 100644 --- a/src/test/test_blocked_range2d.cpp +++ b/src/test/test_blocked_range2d.cpp @@ -68,25 +68,25 @@ static void SerialTest() { typedef AbstractValueType row_type; typedef AbstractValueType col_type; typedef tbb::blocked_range2d range_type; - for( int rowx=-10; rowx<10; ++rowx ) { - for( int rowy=rowx; rowy<10; ++rowy ) { - row_type rowi = MakeAbstractValueType(rowx); - row_type rowj = MakeAbstractValueType(rowy); - for( int rowg=1; rowg<10; ++rowg ) { - for( int colx=-10; colx<10; ++colx ) { - for( int coly=colx; coly<10; ++coly ) { - col_type coli = MakeAbstractValueType(colx); - col_type colj = MakeAbstractValueType(coly); - for( int colg=1; colg<10; ++colg ) { - range_type r( rowi, rowj, rowg, coli, colj, colg ); + for( int row_x=-10; row_x<10; ++row_x ) { + for( int row_y=row_x; row_y<10; ++row_y ) { + row_type row_i = MakeAbstractValueType(row_x); + row_type row_j = MakeAbstractValueType(row_y); + for( int row_grain=1; row_grain<10; ++row_grain ) { + for( int col_x=-10; col_x<10; ++col_x ) { + for( int col_y=col_x; col_y<10; ++col_y ) { + col_type col_i = MakeAbstractValueType(col_x); + col_type col_j = MakeAbstractValueType(col_y); + for( int col_grain=1; col_grain<10; ++col_grain ) { + range_type r( row_i, row_j, row_grain, col_i, col_j, col_grain ); AssertSameType( r.is_divisible(), true ); AssertSameType( r.empty(), true ); AssertSameType( static_cast(0), static_cast(0) ); AssertSameType( static_cast(0), static_cast(0) ); - AssertSameType( r.rows(), tbb::blocked_range( rowi, rowj, 1 )); - AssertSameType( r.cols(), tbb::blocked_range( coli, colj, 1 )); - ASSERT( r.empty()==(rowx==rowy||colx==coly), NULL ); - ASSERT( r.is_divisible()==(rowy-rowx>rowg||coly-colx>colg), NULL ); + AssertSameType( r.rows(), tbb::blocked_range( row_i, row_j, 1 )); + AssertSameType( r.cols(), tbb::blocked_range( col_i, col_j, 1 )); + ASSERT( r.empty()==(row_x==row_y||col_x==col_y), NULL ); + ASSERT( r.is_divisible()==(row_y-row_x>row_grain||col_y-col_x>col_grain), NULL ); if( r.is_divisible() ) { range_type r2(r,tbb::split()); if( GetValueOf(r2.rows().begin())==GetValueOf(r.rows().begin()) ) { diff --git a/src/test/test_blocked_range3d.cpp b/src/test/test_blocked_range3d.cpp index aea20223f9..242fcfd97e 100644 --- a/src/test/test_blocked_range3d.cpp +++ b/src/test/test_blocked_range3d.cpp @@ -70,22 +70,22 @@ static void SerialTest() { typedef AbstractValueType row_type; typedef AbstractValueType col_type; typedef tbb::blocked_range3d range_type; - for( int pagex=-4; pagex<4; ++pagex ) { - for( int pagey=pagex; pagey<4; ++pagey ) { - page_type pagei = MakeAbstractValueType(pagex); - page_type pagej = MakeAbstractValueType(pagey); - for( int pageg=1; pageg<4; ++pageg ) { - for( int rowx=-4; rowx<4; ++rowx ) { - for( int rowy=rowx; rowy<4; ++rowy ) { - row_type rowi = MakeAbstractValueType(rowx); - row_type rowj = MakeAbstractValueType(rowy); - for( int rowg=1; rowg<4; ++rowg ) { - for( int colx=-4; colx<4; ++colx ) { - for( int coly=colx; coly<4; ++coly ) { - col_type coli = MakeAbstractValueType(colx); - col_type colj = MakeAbstractValueType(coly); - for( int colg=1; colg<4; ++colg ) { - range_type r( pagei, pagej, pageg, rowi, rowj, rowg, coli, colj, colg ); + for( int page_x=-4; page_x<4; ++page_x ) { + for( int page_y=page_x; page_y<4; ++page_y ) { + page_type page_i = MakeAbstractValueType(page_x); + page_type page_j = MakeAbstractValueType(page_y); + for( int page_grain=1; page_grain<4; ++page_grain ) { + for( int row_x=-4; row_x<4; ++row_x ) { + for( int row_y=row_x; row_y<4; ++row_y ) { + row_type row_i = MakeAbstractValueType(row_x); + row_type row_j = MakeAbstractValueType(row_y); + for( int row_grain=1; row_grain<4; ++row_grain ) { + for( int col_x=-4; col_x<4; ++col_x ) { + for( int col_y=col_x; col_y<4; ++col_y ) { + col_type col_i = MakeAbstractValueType(col_x); + col_type col_j = MakeAbstractValueType(col_y); + for( int col_grain=1; col_grain<4; ++col_grain ) { + range_type r( page_i, page_j, page_grain, row_i, row_j, row_grain, col_i, col_j, col_grain ); AssertSameType( r.is_divisible(), true ); AssertSameType( r.empty(), true ); @@ -94,13 +94,13 @@ static void SerialTest() { AssertSameType( static_cast(0), static_cast(0) ); AssertSameType( static_cast(0), static_cast(0) ); - AssertSameType( r.pages(), tbb::blocked_range( pagei, pagej, 1 )); - AssertSameType( r.rows(), tbb::blocked_range( rowi, rowj, 1 )); - AssertSameType( r.cols(), tbb::blocked_range( coli, colj, 1 )); + AssertSameType( r.pages(), tbb::blocked_range( page_i, page_j, 1 )); + AssertSameType( r.rows(), tbb::blocked_range( row_i, row_j, 1 )); + AssertSameType( r.cols(), tbb::blocked_range( col_i, col_j, 1 )); - ASSERT( r.empty()==(pagex==pagey||rowx==rowy||colx==coly), NULL ); + ASSERT( r.empty()==(page_x==page_y||row_x==row_y||col_x==col_y), NULL ); - ASSERT( r.is_divisible()==(pagey-pagex>pageg||rowy-rowx>rowg||coly-colx>colg), NULL ); + ASSERT( r.is_divisible()==(page_y-page_x>page_grain||row_y-row_x>row_grain||col_y-col_x>col_grain), NULL ); if( r.is_divisible() ) { range_type r2(r,tbb::split()); diff --git a/src/test/test_concurrent_hash_map.cpp b/src/test/test_concurrent_hash_map.cpp index 03c281af94..a4a376b638 100644 --- a/src/test/test_concurrent_hash_map.cpp +++ b/src/test/test_concurrent_hash_map.cpp @@ -208,7 +208,6 @@ typedef local_counting_allocator > MyAllocator; typedef tbb::concurrent_hash_map MyTable; typedef tbb::concurrent_hash_map MyTable2; typedef tbb::concurrent_hash_map YourTable; -typedef tbb::concurrent_hash_map MyTable; template inline void CheckAllocator(MyTable &table, size_t expected_allocs, size_t expected_frees, bool exact = true) { diff --git a/src/test/test_opencl_node.cpp b/src/test/test_opencl_node.cpp index 7bb3149e0e..03a81ccc4d 100644 --- a/src/test/test_opencl_node.cpp +++ b/src/test/test_opencl_node.cpp @@ -477,7 +477,7 @@ class ConcurrencyTestBody : NoAssign { for ( int i = 0; i < numChecks; i += 2 ) { for ( int j = 0; j < 2; ++j ) { opencl_buffer b1( f, N ); - std::fill( b1.begin(), b1.end(), 1 ); + std::fill( b1.begin(), b1.end(), cl_char(1) ); input_port<0>( *n2 ).try_put( b1 ); } @@ -485,12 +485,12 @@ class ConcurrencyTestBody : NoAssign { opencl_buffer b( f, 4*N ); size_t id0 = (rnd.get() % N) & alignmentMask; opencl_subbuffer sb1( b, id0, N ); - std::fill( sb1.begin(), sb1.end(), 0 ); + std::fill( sb1.begin(), sb1.end(), cl_short(0) ); input_port<1>( *n2 ).try_put( sb1 ); size_t id1 = (rnd.get() % N) & alignmentMask; opencl_subbuffer sb2 = b.subbuffer( 2*N + id1, N ); - std::fill( sb2.begin(), sb2.end(), 0 ); + std::fill( sb2.begin(), sb2.end(), cl_short(0) ); input_port<1>( *n2 ).try_put( sb2 ); } } else { @@ -498,7 +498,7 @@ class ConcurrencyTestBody : NoAssign { // output_port<1> of the previous node. for ( int i = 0; i < numChecks; ++i ) { opencl_buffer b( f, N ); - std::fill( b.begin(), b.end(), 1 ); + std::fill( b.begin(), b.end(), cl_char(1) ); input_port<0>( *n2 ).try_put( b ); } } diff --git a/src/test/test_partitioner.h b/src/test/test_partitioner.h index 40ec34db05..b3da6cbf4d 100644 --- a/src/test/test_partitioner.h +++ b/src/test/test_partitioner.h @@ -29,6 +29,7 @@ #endif #include "tbb/tbb_stddef.h" #include "harness.h" +#include namespace test_partitioner_utils { @@ -346,6 +347,9 @@ class BinaryTree { visualize_node(m_root); } + bool operator ==(const BinaryTree& other_tree) const { return compare_nodes(m_root, other_tree.m_root); } + void fill_leafs(std::vector& leafs) const { fill_leafs_impl(m_root, leafs); } + private: TreeNode *m_root; @@ -431,6 +435,20 @@ class BinaryTree { if (node->m_right) visualize_node(node->m_right, indent + 1); } + + bool compare_nodes(TreeNode* node1, TreeNode* node2) const { + if (node1 == NULL && node2 == NULL) return true; + if (node1 == NULL || node2 == NULL) return false; + return are_nodes_equal(node1, node2) && compare_nodes(node1->m_left, node2->m_left) + && compare_nodes(node1->m_right, node2->m_right); + } + + void fill_leafs_impl(TreeNode* node, std::vector& leafs) const { + if (node->m_left == NULL && node->m_right == NULL) + leafs.push_back(node); + if (node->m_left != NULL) fill_leafs_impl(node->m_left, leafs); + if (node->m_right != NULL) fill_leafs_impl(node->m_right, leafs); + } }; class SimpleBody { diff --git a/src/test/test_partitioner_whitebox.cpp b/src/test/test_partitioner_whitebox.cpp index 80026eb66c..35eaddea88 100644 --- a/src/test/test_partitioner_whitebox.cpp +++ b/src/test/test_partitioner_whitebox.cpp @@ -145,5 +145,7 @@ class ParallelBody: public ParallelTestBody { int TestMain() { uniform_iterations_distribution::test >(); uniform_iterations_distribution::test >(); + uniform_iterations_distribution::test_task_affinity(); + uniform_iterations_distribution::test_task_affinity(); return Harness::Done; } diff --git a/src/test/test_partitioner_whitebox.h b/src/test/test_partitioner_whitebox.h index a4cc44d4a0..aaa2ec64bb 100644 --- a/src/test/test_partitioner_whitebox.h +++ b/src/test/test_partitioner_whitebox.h @@ -28,6 +28,7 @@ #include "string.h" #include "harness_assert.h" #include "test_partitioner.h" +#include #if TBB_USE_DEBUG // reducing number of simulations due to test timeout @@ -40,6 +41,11 @@ typedef tbb::enumerable_thread_specific ThreadNumsType; size_t g_threadNumInitialValue = 10; ThreadNumsType g_threadNums(g_threadNumInitialValue); +namespace whitebox_simulation { +size_t whitebox_thread_index = 0; +test_partitioner_utils::BinaryTree reference_tree; +} + // simulate a subset of task.h namespace tbb { namespace internal { @@ -64,9 +70,16 @@ class fake_task { fake_task *my_parent; affinity_id my_affinity; }; +namespace task_arena { +static const int not_initialized = -2;//should match corresponding value in task_arena.h +}//namespace task_arena +namespace this_task_arena { +inline int current_thread_index() { return (int)whitebox_simulation::whitebox_thread_index; } } +}//namespace tbb #define __TBB_task_H +#define __TBB_task_arena_H #define get_initial_auto_partitioner_divisor my_get_initial_auto_partitioner_divisor #define affinity_partitioner_base_v3 my_affinity_partitioner_base_v3 #define task fake_task @@ -393,4 +406,66 @@ void test() { ParallelTestBody(parallel_group_thread_starting_index)); } +namespace task_affinity_whitebox { +size_t range_begin = 0; +size_t range_end = 20; +} + +template +void check_tree(const test_partitioner_utils::BinaryTree&); + +template<> +void check_tree(const test_partitioner_utils::BinaryTree& tree) { + ASSERT(tree == whitebox_simulation::reference_tree, + "affinity_partitioner distributes tasks differently from run to run"); +} + +template<> +void check_tree(const test_partitioner_utils::BinaryTree& tree) { + std::vector tree_leafs; + tree.fill_leafs(tree_leafs); + typedef std::vector Slots; + Slots affinity_slots(tree_leafs.size() + 1, 0); + + for (std::vector::iterator i = tree_leafs.begin(); i != tree_leafs.end(); ++i) { + affinity_slots[(*i)->m_affinity]++; + if ((*i)->m_affinity == 0) + ASSERT((*i)->m_range_begin == task_affinity_whitebox::range_begin, + "Task with affinity 0 was executed with wrong range"); + } + + typedef std::iterator_traits::difference_type slots_difference_type; + ASSERT(std::count(affinity_slots.begin(), affinity_slots.end(), size_t(0)) == slots_difference_type(1), + "static_partitioner incorrectly distributed tasks by threads"); + ASSERT(std::count(affinity_slots.begin(), affinity_slots.end(), size_t(1)) == slots_difference_type(g_threadNums.local()), + "static_partitioner incorrectly distributed tasks by threads"); + ASSERT(affinity_slots[tbb::this_task_arena::current_thread_index() + 1] == 0, + "static_partitioner incorrectly assigns task with 0 affinity"); + ASSERT(std::accumulate(affinity_slots.begin(), affinity_slots.end(), size_t(0)) == g_threadNums.local(), + "static_partitioner has created more tasks than the number of threads"); +} + +template +void test_task_affinity() { + using namespace task_affinity_whitebox; + test_partitioner_utils::SimpleBody body; + for (size_t p = 1; p <= 50; ++p) { + g_threadNums.local() = p; + whitebox_simulation::whitebox_thread_index = 0; + test_partitioner_utils::TestRanges::BlockedRange range(range_begin, range_end, /*statData*/NULL, + /*provide_feedback*/false, /*ensure_non_empty_size*/false); + Partitioner partitioner; + whitebox_simulation::reference_tree = test_partitioner_utils::BinaryTree(); + whitebox_simulation::parallel_for(range, body, partitioner, &(whitebox_simulation::reference_tree)); + while (whitebox_simulation::whitebox_thread_index < p) { + test_partitioner_utils::BinaryTree tree; + whitebox_simulation::parallel_for(range, body, partitioner, &tree); + check_tree(tree); + whitebox_simulation::whitebox_thread_index++; + } + range_begin++; + range_end += 2; + } +} + } /* namespace uniform_iterations_distribution */ diff --git a/src/test/test_tbb_version.cpp b/src/test/test_tbb_version.cpp index bc6606ed19..62f788241d 100644 --- a/src/test/test_tbb_version.cpp +++ b/src/test/test_tbb_version.cpp @@ -229,7 +229,7 @@ int main(int argc, char *argv[] ) { void initialize_strings_vector(std::vector * vector) { vector->push_back(string_pair("TBB: VERSION\t\t2018.0", required)); // check TBB_VERSION - vector->push_back(string_pair("TBB: INTERFACE VERSION\t10001", required)); // check TBB_INTERFACE_VERSION + vector->push_back(string_pair("TBB: INTERFACE VERSION\t10002", required)); // check TBB_INTERFACE_VERSION vector->push_back(string_pair("TBB: BUILD_DATE", required)); vector->push_back(string_pair("TBB: BUILD_HOST", required)); vector->push_back(string_pair("TBB: BUILD_OS", required));