Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
dune-codegen
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
Christian Heinigk
dune-codegen
Commits
c522ea7a
Commit
c522ea7a
authored
7 years ago
by
Dominic Kempf
Browse files
Options
Downloads
Patches
Plain Diff
Use a divide and conquer approach to reduce the complexity of opportunity generation
parent
49298e64
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
python/dune/perftool/sumfact/vectorization.py
+49
-41
49 additions, 41 deletions
python/dune/perftool/sumfact/vectorization.py
with
49 additions
and
41 deletions
python/dune/perftool/sumfact/vectorization.py
+
49
−
41
View file @
c522ea7a
...
...
@@ -53,7 +53,7 @@ def attach_vectorization_info(sf):
def
position_penalty_factor
(
sf
):
if
isinstance
(
sf
,
SumfactKernel
):
if
isinstance
(
sf
,
SumfactKernel
)
or
sf
.
vertical_width
>
1
:
return
1
else
:
return
1
+
sum
(
abs
(
sf
.
kernels
[
i
].
position_priority
-
i
)
if
sf
.
kernels
[
i
].
position_priority
is
not
None
else
0
for
i
in
range
(
sf
.
length
))
...
...
@@ -64,8 +64,13 @@ def costmodel(sf):
# Penalize vertical vectorization
vertical_penalty
=
1
+
math
.
log
(
sf
.
vertical_width
)
# Penalize scalar sum factorization kernels
scalar_penalty
=
1
if
isinstance
(
sf
,
SumfactKernel
):
scalar_penalty
=
get_vcl_type_size
(
np
.
float64
)
# Return total operations
return
sf
.
operations
*
position_penalty_factor
(
sf
)
*
vertical_penalty
return
sf
.
operations
*
position_penalty_factor
(
sf
)
*
vertical_penalty
*
scalar_penalty
@backend
(
interface
=
"
vectorization_strategy
"
,
name
=
"
explicit
"
)
...
...
@@ -89,8 +94,6 @@ def explicit_costfunction(sf):
def
strategy_cost
(
strategy
):
qp
,
strategy
=
strategy
set_quadrature_points
(
qp
)
func
=
get_backend
(
interface
=
"
vectorization_strategy
"
,
selector
=
lambda
:
get_option
(
"
vectorization_strategy
"
))
keys
=
set
(
sf
.
cache_key
for
sf
in
strategy
.
values
())
...
...
@@ -160,35 +163,14 @@ def decide_vectorization_strategy():
# Find the best vectorization strategy by using a costmodel
width
=
get_vcl_type_size
(
np
.
float64
)
strategy
=
min
(
vectorization_opportunity_generator
(
active_sumfacts
,
width
),
key
=
strategy_cost
)
# Treat the quadrature points
qp
,
sfdict
=
strategy
set_quadrature_points
(
qp
)
logger
.
debug
(
"
decide_vectorization_strategy: Decided for the following strategy:
"
"
\n
"
.
join
(
stringify_vectorization_strategy
(
strategy
)))
# We map inactive sum factorization kernels to 0
sfdict
=
add_to_frozendict
(
sfdict
,
{
sf
:
0
for
sf
in
inactive_sumfacts
})
# Register the results
for
sf
in
all_sumfacts
:
_cache_vectorization_info
(
sf
,
sfdict
[
sf
])
def
vectorization_opportunity_generator
(
sumfacts
,
width
):
"""
Generator that yields all vectorization opportunities for the given
sum factorization kernels as tuples of quadrature point tuple and vectorization
dictionary
"""
#
#
Find
all the possible quadrature point tuples
#
Optimize over
all the possible quadrature point tuples
#
quad_points
=
[
quadrature_points_per_direction
()]
if
get_option
(
"
vectorization_allow_quadrature_changes
"
):
sf
=
next
(
iter
(
sumfacts
))
sf
=
next
(
iter
(
active_
sumfacts
))
depth
=
1
while
depth
<=
width
:
i
=
0
if
sf
.
matrix_sequence
[
0
].
face
is
None
else
1
...
...
@@ -198,12 +180,44 @@ def vectorization_opportunity_generator(sumfacts, width):
depth
=
depth
*
2
quad_points
=
list
(
set
(
quad_points
))
for
qp
in
quad_points
:
#
# Determine vectorization opportunities given a fixed quadrature point number
#
for
opp
in
fixed_quad_vectorization_opportunity_generator
(
frozenset
(
sumfacts
),
width
,
qp
):
yield
qp
,
opp
# Find the minimum cost strategy between all the quadrature point tuples
optimal_strategies
=
{
qp
:
fixed_quadrature_optimal_vectorization
(
active_sumfacts
,
width
,
qp
)
for
qp
in
quad_points
}
qp
=
min
(
optimal_strategies
,
key
=
lambda
qp
:
strategy_cost
(
optimal_strategies
[
qp
]))
sfdict
=
optimal_strategies
[
qp
]
set_quadrature_points
(
qp
)
logger
.
debug
(
"
decide_vectorization_strategy: Decided for the following strategy:
"
"
\n
"
.
join
(
stringify_vectorization_strategy
((
qp
,
sfdict
))))
# We map inactive sum factorization kernels to 0
sfdict
=
add_to_frozendict
(
sfdict
,
{
sf
:
0
for
sf
in
inactive_sumfacts
})
# Register the results
for
sf
in
all_sumfacts
:
_cache_vectorization_info
(
sf
,
sfdict
[
sf
])
def fixed_quadrature_optimal_vectorization(sumfacts, width, qp):
    """Pick the cheapest vectorization strategy for one quadrature point tuple.

    Scoring every global vectorization opportunity individually does not
    scale, so the kernels are split into groups sharing the same
    ``input_key`` and each group is minimized on its own (divide and
    conquer). The per-group winners are then combined into one dictionary.

    :param sumfacts: Iterable of sum factorization kernels to decide on.
        It is traversed several times, so it must be re-iterable.
    :param width: Vector width, forwarded unchanged to
        ``fixed_quad_vectorization_opportunity_generator``.
    :param qp: Quadrature point tuple; installed via
        ``set_quadrature_points`` before any opportunity is generated.
    :returns: The frozendict obtained by folding each group's minimum-cost
        opportunity together with ``add_to_frozendict``.
    """
    # Side effect: the quadrature points are switched to qp before any
    # opportunity is generated or scored.
    set_quadrature_points(qp)

    # Kernels with equal input_key form the sets of simultaneously
    # realizable kernels (that's an equivalence relation).
    input_keys = frozenset(kernel.input_key for kernel in sumfacts)

    # Minimize within each set independently and accumulate the winners.
    combined = frozendict()
    for input_key in input_keys:
        group = frozenset(
            kernel for kernel in sumfacts if kernel.input_key == input_key
        )
        opportunities = fixed_quad_vectorization_opportunity_generator(
            group, width, qp
        )
        # NOTE: min() raises ValueError if the generator yields nothing.
        cheapest = min(opportunities, key=strategy_cost)
        combined = add_to_frozendict(combined, cheapest)

    return combined
def
fixed_quad_vectorization_opportunity_generator
(
sumfacts
,
width
,
qp
,
already
=
frozendict
()):
...
...
@@ -227,20 +241,14 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
):
yield
opp
# Find all the sum factorization kernels that the chosen kernel can be parallelized with
#
# TODO: Right now we check for same input, which is not actually needed in order
# to be a suitable candidate! We should relax this concept at some point!
candidates
=
filter
(
lambda
sf
:
sf
.
input_key
==
sf_to_decide
.
input_key
,
sumfacts
)
horizontal
=
1
while
horizontal
<=
width
:
# Iterate over the possible combinations of sum factorization kernels
# taking into account all the permutations of kernels. This also includes
# combinations which use a padding of 1 - but only for pure horizontality.
generators
=
[
it
.
permutations
(
candidate
s
,
horizontal
)]
generators
=
[
it
.
permutations
(
sumfact
s
,
horizontal
)]
if
horizontal
>=
4
:
generators
.
append
(
it
.
permutations
(
candidate
s
,
horizontal
-
1
))
generators
.
append
(
it
.
permutations
(
sumfact
s
,
horizontal
-
1
))
for
combo
in
it
.
chain
(
*
generators
):
# The chosen kernels must be part of the kernels for recursion
# to work correctly
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment