@@ -332,7 +332,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
332332
333333 // Write uncompressed data to a temporary zero-initialized buffer.
334334 auto buf = std::make_unique<uint8_t []>(size);
335- writeTo<ELFT>(buf.get ());
335+ {
336+ parallel::TaskGroup tg;
337+ writeTo<ELFT>(buf.get (), tg);
338+ }
336339 // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
337340 // the fastest. If -O2 is given, we use level 6 to compress debug info more by
338341 // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
@@ -386,7 +389,8 @@ static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
386389 llvm_unreachable (" unsupported Size argument" );
387390}
388391
389- template <class ELFT > void OutputSection::writeTo (uint8_t *buf) {
392+ template <class ELFT >
393+ void OutputSection::writeTo (uint8_t *buf, parallel::TaskGroup &tg) {
390394 llvm::TimeTraceScope timeScope (" Write sections" , name);
391395 if (type == SHT_NOBITS)
392396 return ;
@@ -419,41 +423,68 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
419423 }
420424
421425 // Write leading padding.
422- SmallVector<InputSection *, 0 > storage;
423426 ArrayRef<InputSection *> sections = getInputSections (*this , storage);
424427 std::array<uint8_t , 4 > filler = getFiller ();
425428 bool nonZeroFiller = read32 (filler.data ()) != 0 ;
426429 if (nonZeroFiller)
427430 fill (buf, sections.empty () ? size : sections[0 ]->outSecOff , filler);
428431
429- parallelFor (0 , sections.size (), [&](size_t i) {
430- InputSection *isec = sections[i];
431- if (auto *s = dyn_cast<SyntheticSection>(isec))
432- s->writeTo (buf + isec->outSecOff );
433- else
434- isec->writeTo <ELFT>(buf + isec->outSecOff );
435-
436- // Fill gaps between sections.
437- if (nonZeroFiller) {
438- uint8_t *start = buf + isec->outSecOff + isec->getSize ();
439- uint8_t *end;
440- if (i + 1 == sections.size ())
441- end = buf + size;
432+ auto fn = [=](size_t begin, size_t end) {
433+ size_t numSections = sections.size ();
434+ for (size_t i = begin; i != end; ++i) {
435+ InputSection *isec = sections[i];
436+ if (auto *s = dyn_cast<SyntheticSection>(isec))
437+ s->writeTo (buf + isec->outSecOff );
442438 else
443- end = buf + sections[i + 1 ]->outSecOff ;
444- if (isec->nopFiller ) {
445- assert (target->nopInstrs );
446- nopInstrFill (start, end - start);
447- } else
448- fill (start, end - start, filler);
439+ isec->writeTo <ELFT>(buf + isec->outSecOff );
440+
441+ // Fill gaps between sections.
442+ if (nonZeroFiller) {
443+ uint8_t *start = buf + isec->outSecOff + isec->getSize ();
444+ uint8_t *end;
445+ if (i + 1 == numSections)
446+ end = buf + size;
447+ else
448+ end = buf + sections[i + 1 ]->outSecOff ;
449+ if (isec->nopFiller ) {
450+ assert (target->nopInstrs );
451+ nopInstrFill (start, end - start);
452+ } else
453+ fill (start, end - start, filler);
454+ }
449455 }
450- }) ;
456+ };
451457
452- // Linker scripts may have BYTE()-family commands with which you
453- // can write arbitrary bytes to the output. Process them if any.
458+ // If there is any BYTE()-family command (rare), write the section content
459+ // first then process BYTE to overwrite the filler content. The write is
460+ // serial due to the limitation of llvm/Support/Parallel.h.
461+ bool written = false ;
462+ size_t numSections = sections.size ();
454463 for (SectionCommand *cmd : commands)
455- if (auto *data = dyn_cast<ByteCommand>(cmd))
464+ if (auto *data = dyn_cast<ByteCommand>(cmd)) {
465+ if (!std::exchange (written, true ))
466+ fn (0 , numSections);
456467 writeInt (buf + data->offset , data->expression ().getValue (), data->size );
468+ }
469+ if (written || !numSections)
470+ return ;
471+
472+ // There is no data command. Write content asynchronously to overlap the write
473+ // time with other output sections. Note, if a linker script specifies
474+ // overlapping output sections (needs --noinhibit-exec or --no-check-sections
475+ // to suppress the error), the output may be non-deterministic.
476+ const size_t taskSizeLimit = 4 << 20 ;
477+ for (size_t begin = 0 , i = 0 , taskSize = 0 ;;) {
478+ taskSize += sections[i]->getSize ();
479+ bool done = ++i == numSections;
480+ if (done || taskSize >= taskSizeLimit) {
481+ tg.execute ([=] { fn (begin, i); });
482+ if (done)
483+ break ;
484+ begin = i;
485+ taskSize = 0 ;
486+ }
487+ }
457488}
458489
459490static void finalizeShtGroup (OutputSection *os, InputSection *section) {
@@ -673,10 +704,14 @@ template void OutputSection::writeHeaderTo<ELF32BE>(ELF32BE::Shdr *Shdr);
673704template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
674705template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
675706
676- template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
677- template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
678- template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
679- template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
707+ template void OutputSection::writeTo<ELF32LE>(uint8_t *,
708+ llvm::parallel::TaskGroup &);
709+ template void OutputSection::writeTo<ELF32BE>(uint8_t *,
710+ llvm::parallel::TaskGroup &);
711+ template void OutputSection::writeTo<ELF64LE>(uint8_t *,
712+ llvm::parallel::TaskGroup &);
713+ template void OutputSection::writeTo<ELF64BE>(uint8_t *,
714+ llvm::parallel::TaskGroup &);
680715
681716template void OutputSection::maybeCompress<ELF32LE>();
682717template void OutputSection::maybeCompress<ELF32BE>();
0 commit comments