Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * Copyright (C) 2022 Jijoong Moon <jijoong.moon@samsung.com>
4 : *
5 : * @file optimized_v2_planner.cpp
6 : * @date 29 December 2022
7 : * @see https://github.com/nnstreamer/nntrainer
8 : * @author Parichay Kapoor <pk.kapoor@samsung.com>
9 : * @author Jijoong Moon <jijoong.moon@samsung.com>
10 : * @bug No known bugs except for NYI items
11 : * @brief This is Optimized V2 Memory Planner
12 : *
13 : */
14 :
15 : #include <algorithm>
16 : #include <memory>
17 : #include <nntrainer_error.h>
18 : #include <nntrainer_log.h>
19 : #include <stdexcept>
20 : #include <vector>
21 :
22 : #include <optimized_v2_planner.h>
23 :
24 : namespace nntrainer {
25 :
/**
 * @brief Memory Request data structure clubbing all the requests
 *
 */
struct MemoryRequest {
  unsigned int start; /**< start of the validity (inclusive) */
  unsigned int end;   /**< end of the validity (exclusive) */
  unsigned int loc;   /**< index/location of this request */
  size_t size;        /**< size of the request */
  size_t offset;      /**< offset for this request */

  /**
   * @brief Constructor for the Memory Request
   *
   * @param s size of the requested memory
   * @param valid validity interval as (start inclusive, end exclusive)
   * @param idx index of this request in the original request list
   */
  MemoryRequest(size_t s, const std::pair<unsigned int, unsigned int> &valid,
                unsigned int idx) :
    start(valid.first), end(valid.second), loc(idx), size(s), offset(0) {}
};
45 :
46 : /**
47 : * @brief Memory Request data structure clubbing for the weight gradient
48 : * requests
49 : */
50 0 : struct WGradMemoryRequest {
51 : MemoryRequest *mem_req;
52 : std::vector<std::pair<unsigned int, unsigned int>> start_end;
53 :
54 0 : WGradMemoryRequest(MemoryRequest *req) : mem_req(req) {}
55 : };
56 :
57 : /**
58 : * @brief check if validate interval is overlapping in a very naive way.
59 : *
60 : * @param memory_validity validity
61 : * @param memory_size size
62 : * @param memory_offset offset
63 : * @param memory_req request
64 : */
65 : [[maybe_unused]] static void validateIntervalOverlap(
66 : const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
67 : const std::vector<size_t> &memory_size,
68 : const std::vector<size_t> &memory_offset, size_t memory_req) {
69 : auto bits = std::make_unique<bool[]>(memory_req);
70 :
71 : for (size_t i = 0; i < memory_req; ++i) {
72 : bits[i] = 0;
73 : }
74 :
75 : auto exec_start =
76 : std::min_element(memory_validity.begin(), memory_validity.end(),
77 : [](auto &a, auto &b) { return a.first < b.first; });
78 :
79 : auto exec_end =
80 : std::max_element(memory_validity.begin(), memory_validity.end(),
81 : [](auto &a, auto &b) { return a.second < b.second; });
82 :
83 : auto set = [&](int offset, size_t size, int idx) {
84 : for (unsigned int i = offset; i < size; ++i) {
85 : NNTR_THROW_IF(bits[i], std::invalid_argument)
86 : << " bits taken at i: " << i << " offset: " << offset
87 : << " size: " << size << " idx: " << idx;
88 : bits[i] = 1;
89 : }
90 : };
91 :
92 : auto unset = [&](int offset, size_t size, int idx) {
93 : for (unsigned int i = offset; i < size; ++i) {
94 : NNTR_THROW_IF(!bits[i], std::invalid_argument)
95 : << "double freeing bits at i: " << i << " offset: " << offset
96 : << " size: " << size << " idx: " << idx;
97 : bits[i] = 0;
98 : }
99 : };
100 :
101 : for (unsigned int exec = exec_start->first; exec <= exec_end->second;
102 : ++exec) {
103 :
104 : for (unsigned int idx = 0; idx < memory_validity.size(); ++idx) {
105 : auto &validity = memory_validity.at(idx);
106 : auto &sz = memory_size.at(idx);
107 : auto &offset = memory_offset.at(idx);
108 : if (validity.first == exec) {
109 : set(offset, sz, idx);
110 : }
111 : if (validity.second == exec) {
112 : unset(offset, sz, idx);
113 : }
114 : }
115 : }
116 : // check if there is any dangling memory
117 : set(0, memory_req, memory_validity.size());
118 : }
119 :
120 : /**
121 : * @copydoc MemoryPlanner::planLayout(
122 : * const std::vector<size_t> &memory_size,
123 : * const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
124 : * std::vector<size_t> &memory_offset,
125 : * std::vector<bool> &memory_is_wgrad);
126 : *
 * @details The optimized v2 memory planner assigns memory to the requests whose
 * validity starts first.
 * The requested memories are sorted based on the ascending order of the start
 * timestamps, breaking ties with the ascending order of the end timestamps. The
 * sorted memories are given increasing offset based on the memory size.
132 : * At the end of each timestamp, invalid memories are freed, and offset updated
133 : * for reuse. This planner allocates overlapping memory for all the required
134 : * memories.
135 : *
136 : */
size_t OptimizedV2Planner::planLayout(
  const std::vector<size_t> &memory_size,
  const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
  std::vector<size_t> &memory_offset, std::vector<bool> &memory_is_wgrad,
  size_t n_wgrad) const {

  /** weight-gradient requests are planned separately after the regular ones,
   * so they can share offsets among themselves */
  std::vector<MemoryRequest> wgrad_requests;
  wgrad_requests.reserve(n_wgrad);

  /** create memory requests structure array for easier management */
  std::vector<MemoryRequest> requests;
  requests.reserve(memory_size.size() - n_wgrad);
  if (n_wgrad) {
    for (unsigned int idx = 0; idx < memory_size.size(); idx++) {
      if (!memory_is_wgrad[idx]) {
        requests.emplace_back(memory_size[idx], memory_validity[idx], idx);
      } else {
        wgrad_requests.emplace_back(memory_size[idx], memory_validity[idx],
                                    idx);
      }
    }
  } else {
    for (unsigned int idx = 0; idx < memory_size.size(); idx++) {
      requests.emplace_back(memory_size[idx], memory_validity[idx], idx);
    }
  }

  /**
   * sort the memory requests with ascending order of start time first, and
   * then end time
   *
   * NOTE(review): the comparator is declared `-> int` but std::sort expects a
   * bool-convertible strict weak ordering; it works because the returned 0/1
   * converts, but the return type should be bool — confirm before changing.
   */
  std::sort(requests.begin(), requests.end(),
            [](auto const &v1, auto const &v2) -> int {
              if (v1.start == v2.start)
                return v1.end < v2.end;
              return v1.start < v2.start;
              /** TODO: try this */
              // if (v1.end == v2.end)
              //   return v1.start < v2.start;
              // return v1.end > v2.end;
            });

  /** all the memories in use sorted by their assigned offset and size */
  std::vector<MemoryRequest *> sorted_req;

  /** iterate over the sorted requests and start allocation of the requests */
  memory_offset.resize(memory_size.size());
  size_t memory_req = 0;
  for (auto &req : requests) {
    /** remove expired memories and update offset; only the tail of the queue
     * is popped, interior expired entries are handled by the reuse scan below */
    while (!sorted_req.empty() && sorted_req.back()->end <= req.start)
      sorted_req.pop_back();

    /** if there exists an expired memory with same size (not at the edge),
     * reuse it */
    bool replace_and_fill = false;
    for (int idx = sorted_req.size() - 1; idx >= 0; idx--) {
      auto const &sr = sorted_req[idx];
      /** TODO: reuse if memory size not exactly match */
      if (sr->end <= req.start && sr->size == req.size) {
        req.offset = sr->offset;
        memory_offset[req.loc] = req.offset;
        sorted_req[idx] = &req;
        replace_and_fill = true;
        break;
      }
    }
    if (replace_and_fill) {
      continue;
    }

    /** no reusable slot: append right after the last live allocation */
    size_t offset = 0;
    if (!sorted_req.empty())
      offset = sorted_req.back()->offset + sorted_req.back()->size;

    /** assign offset to the new request and push to queue */
    req.offset = offset;
    memory_offset[req.loc] = offset;
    memory_req = std::max(memory_req, req.offset + req.size);
    sorted_req.push_back(&req);
  }

  if (wgrad_requests.size()) {
    /** TODO: We do not need to start from memory_req. We might find proper
     * offset considering execution order */
    size_t last_offset = memory_req;

    /* sort the memory requests with descending order of size, so that the
     * first request assigned to a slot is always the largest one sharing it */
    std::sort(
      wgrad_requests.begin(), wgrad_requests.end(),
      [](auto const &v1, auto const &v2) -> int { return v1.size > v2.size; });

    std::vector<WGradMemoryRequest> wgrad_sorted_req;

    bool replace_and_fill = false;
#ifdef DEBUG
    unsigned int new_grad_cnt = 0;
    unsigned int reused_grad_cnt = 0;
    size_t new_grad_size = 0;
    size_t reused_grad_size = 0;
#endif
    for (auto &req : wgrad_requests) {
      /** try to merge req into an existing slot whose recorded intervals do
       * not conflict with req's validity */
      for (unsigned int idx = 0; idx < wgrad_sorted_req.size(); idx++) {
        const auto &sr = wgrad_sorted_req[idx];
        bool merge = true;
        /** the >= size guard holds for every slot because requests are
         * processed in descending size order; if the ordering ever changes,
         * a too-small slot would be merged into unchecked */
        if (sr.mem_req->size >= req.size) {
          for (auto &interval : sr.start_end) {
            /** NOTE(review): this rejects partial overlaps and exact matches,
             * but does not appear to reject full containment (req strictly
             * inside interval, or interval strictly inside req) — presumably
             * wgrad validity intervals cannot nest; verify against callers */
            if ((interval.first < req.start && interval.first < req.end &&
                 req.end < interval.second) ||
                (req.start > interval.first && req.start < interval.second &&
                 req.end > interval.second) ||
                (req.start == interval.first && req.end == interval.second)) {
              merge = false;
              break;
            }
          }
        }

        if (merge) {
          /** share the slot's offset and record req's interval on the slot */
          req.offset = sr.mem_req->offset;
          memory_offset[req.loc] = req.offset;
          replace_and_fill = true;
          wgrad_sorted_req[idx].start_end.push_back(
            std::make_pair(req.start, req.end));
#ifdef DEBUG
          reused_grad_size += req.size;
          reused_grad_cnt++;
#endif
          break;
        } else {
          replace_and_fill = false;
        }
      }
      if (replace_and_fill) {
        continue;
      }

      /** no compatible slot: open a new one right after the last slot */
      size_t offset = last_offset;
      if (!wgrad_sorted_req.empty())
        offset = wgrad_sorted_req.back().mem_req->offset +
                 wgrad_sorted_req.back().mem_req->size;

      req.offset = offset;
      memory_offset[req.loc] = offset;
      memory_req = std::max(memory_req, req.offset + req.size);
      wgrad_sorted_req.push_back(WGradMemoryRequest(&req));
      wgrad_sorted_req.back().start_end.push_back(
        std::make_pair(req.start, req.end));
#ifdef DEBUG
      new_grad_cnt++;
      new_grad_size += req.size;
#endif
    }
  }

  // validateIntervalOverlap(memory_validity, memory_size, memory_offset,
  // memory_req);

  return memory_req;
}
297 :
298 : } // namespace nntrainer
|