Commit 6dfba800f965ef6546d303f7c3bea1a0b80f39bb
1 parent
ee2ff04c62
Exists in
master
complement a la partie filtre et inclusion du rapport de fin de M2 dont on pourr…
…a s'inspirer pour expliquer la simu
Showing 3 changed files with 560 additions and 12 deletions Inline Diff
Makefile
# source: https://tex.stackexchange.com/questions/40738/how-to-properly-make-a-latex-project | 1 | 1 | # source: https://tex.stackexchange.com/questions/40738/how-to-properly-make-a-latex-project | |
2 | 2 | |||
TEX = pdflatex -shell-escape -interaction=nonstopmode -file-line-error | 3 | 3 | TEX = pdflatex -shell-escape -interaction=nonstopmode -file-line-error | |
BIB = bibtex | 4 | 4 | BIB = bibtex | |
TARGET = ifcs2018 | 5 | 5 | TARGET = ifcs2018 | |
6 | 6 | |||
all: $(TARGET)_abstract $(TARGET)_poster $(TARGET)_proceeding | 7 | 7 | all: $(TARGET)_abstract $(TARGET)_poster $(TARGET)_proceeding | |
8 | 8 | |||
view: $(TARGET) | 9 | 9 | view: $(TARGET) | |
evince $(TARGET).pdf | 10 | 10 | evince $(TARGET).pdf | |
11 | 11 | |||
view_poster: $(TARGET)_poster | 12 | 12 | view_poster: $(TARGET)_poster | |
evince $(TARGET)_poster.pdf | 13 | 13 | evince $(TARGET)_poster.pdf | |
14 | 14 | |||
$(TARGET)_abstract: $(TARGET)_abstract.tex references.bib | 15 | 15 | $(TARGET)_abstract: $(TARGET)_abstract.tex references.bib biblio.bib | |
$(TEX) $@.tex | 16 | 16 | $(TEX) $@.tex | |
$(BIB) $@ | 17 | 17 | $(BIB) $@ | |
$(TEX) $@.tex | 18 | 18 | $(TEX) $@.tex | |
$(TEX) $@.tex | 19 | 19 | $(TEX) $@.tex | |
20 | 20 | |||
$(TARGET)_poster: | 21 | 21 | $(TARGET)_poster: | |
$(TEX) $@.tex | 22 | 22 | $(TEX) $@.tex | |
$(TEX) $@.tex | 23 | 23 | $(TEX) $@.tex | |
24 | 24 | |||
$(TARGET)_proceeding: $(TARGET)_proceeding.tex references.bib | 25 | 25 | $(TARGET)_proceeding: $(TARGET)_proceeding.tex references.bib biblio.bib | |
$(TEX) $@.tex | 26 | 26 | $(TEX) $@.tex | |
$(BIB) $@ | 27 | 27 | $(BIB) $@ | |
$(TEX) $@.tex | 28 | 28 | $(TEX) $@.tex | |
$(TEX) $@.tex | 29 | 29 | $(TEX) $@.tex |
biblio.bib
File was created | 1 | @thesis{gwen-cogen, | ||
2 | author = {Gwenhaël Goavec-Merou}, | |||
3 | title = {Générateur de coprocesseur pour le traitement de données en flux (vidéo ou similaire) sur FPGA}, | |||
4 | institution = {FEMTO-ST}, | |||
5 | year = {2014} | |||
6 | } | |||
7 | ||||
8 | @article{hide, | |||
9 | title={HIDE: A hardware intelligent description environment}, | |||
10 | author={Benkrid, Khaled and Belkacemi, S and Benkrid, Abdsamad}, | |||
11 | journal={Microprocessors and Microsystems}, | |||
12 | volume={30}, | |||
13 | number={6}, | |||
14 | pages={283--300}, | |||
15 | year={2006}, | |||
16 | publisher={Elsevier} | |||
17 | } | |||
18 | ||||
19 | @inproceedings{skeleton, | |||
20 | title={High level programming for FPGA based image and video processing using hardware skeletons}, | |||
21 | author={Benkrid, Khaled and Crookes, Danny and Smith, J and Benkrid, Abdsamad}, | |||
22 | booktitle={Field-Programmable Custom Computing Machines, 2001. FCCM'01. The 9th Annual IEEE Symposium on}, | |||
23 | pages={219--226}, | |||
24 | year={2001}, | |||
25 | organization={IEEE} | |||
26 | } | |||
27 | ||||
28 | @article{benkrid2004application, | |||
29 | title={From application descriptions to hardware in seconds: a logic-based approach to bridging the gap}, | |||
30 | author={Benkrid, Khaled and Crookes, Danny}, | |||
31 | journal={Very Large Scale Integration (VLSI) Systems, IEEE Transactions on}, | |||
32 | volume={12}, | |||
33 | number={4}, | |||
34 | pages={420--436}, | |||
35 | year={2004}, | |||
36 | publisher={IEEE} | |||
37 | } | |||
38 | ||||
39 | @phdthesis{these-dsp-fpga, | |||
40 | title={Design methodologies and architectures for digital signal processing on FPGAs}, | |||
41 | author={Mirzaei, Shahnam}, | |||
42 | year={2010}, | |||
43 | school={UNIVERSITY OF CALIFORNIA SANTA BARBARA} | |||
44 | } | |||
45 | ||||
46 | @book{def1-ordo, | |||
47 | title={Algorithmique Parallèle-Cours Et Exercices Corrigés}, | |||
48 | author={Legrand, Arnaud and Robert, Yves}, | |||
49 | year={2003}, | |||
50 | publisher={Dunod} | |||
51 | } | |||
52 | ||||
53 | @article{these-mathias, | |||
54 | title={Optimisation du débit pour des applications linéaires multi-tâches sur plateformes distribuées incluant des temps de reconfiguration}, | |||
55 | author={Coqblin, Mathias}, | |||
56 | institution = {FEMTO-ST}, | |||
57 | year={2012} | |||
58 | } | |||
59 | ||||
60 | @thesis{these-alex, | |||
61 | author = {Alexandru Dobrila}, | |||
62 | title = {Optimisation du débit en environnement distribué incertain}, | |||
63 | institution = {FEMTO-ST}, | |||
64 | year = {2011} | |||
65 | } | |||
66 | ||||
67 | @book{def2-ordo, | |||
68 | title={Handbook of scheduling: algorithms, models, and performance analysis}, | |||
69 | author={Leung, Joseph YT}, | |||
70 | year={2004}, | |||
71 | publisher={CRC Press} | |||
72 | } | |||
73 | ||||
74 | @inproceedings{def-ordo-en-ligne, | |||
75 | title={On the Definition of "On-Line" in Job Scheduling Problems}, | |||
76 | author={Feitelson, Dror G and Mu'alem, Ahuva W}, | |||
77 | booktitle={SIGACT NEWS}, | |||
78 | year={2000}, | |||
79 | organization={Citeseer} | |||
80 | } | |||
81 | ||||
82 | @article{shmueli2005backfilling, | |||
83 | title={Backfilling with lookahead to optimize the packing of parallel jobs}, | |||
84 | author={Shmueli, Edi and Feitelson, Dror G}, | |||
85 | journal={Journal of Parallel and Distributed Computing}, | |||
86 | volume={65}, | |||
87 | number={9}, | |||
88 | pages={1090--1107}, | |||
89 | year={2005}, | |||
90 | publisher={Elsevier} | |||
91 | } | |||
92 | ||||
93 | @article{graham1979optimization, | |||
94 | title={Optimization and approximation in deterministic sequencing and scheduling: a survey}, | |||
95 | author={Graham, Ronald L and Lawler, Eugene L and Lenstra, Jan Karel and Kan, AHG Rinnooy}, | |||
96 | journal={Annals of discrete mathematics}, | |||
97 | volume={5}, | |||
98 | pages={287--326}, | |||
99 | year={1979}, | |||
100 | publisher={Elsevier} | |||
101 | } | |||
102 | ||||
103 | @article{salvador2012accelerating, | |||
104 | title={Accelerating FPGA-based evolution of wavelet transform filters by optimized task scheduling}, | |||
105 | author={Salvador, Ruben and Vidal, Alberto and Moreno, Felix and Riesgo, Teresa and Sekanina, Lukas}, | |||
106 | journal={Microprocessors and Microsystems}, | |||
107 | volume={36}, | |||
108 | number={5}, | |||
109 | pages={427--438}, | |||
110 | year={2012}, | |||
111 | publisher={Elsevier} | |||
112 | } | |||
113 | ||||
114 | @article{zhuo2007scalable, | |||
115 | title={Scalable and modular algorithms for floating-point matrix multiplication on reconfigurable computing systems}, | |||
116 | author={Zhuo, Ling and Prasanna, Viktor K}, | |||
117 | journal={Parallel and Distributed Systems, IEEE Transactions on}, | |||
118 | volume={18}, | |||
119 | number={4}, | |||
120 | pages={433--448}, | |||
121 | year={2007}, | |||
122 | publisher={IEEE} | |||
123 | } | |||
124 | ||||
125 | @article{olariu1993computing, | |||
126 | title={Computing the Hough transform on reconfigurable meshes}, | |||
127 | author={Olariu, Stephan and Schwing, James L and Zhang, Jingyuan}, | |||
128 | journal={Image and vision computing}, | |||
129 | volume={11}, | |||
130 | number={10}, | |||
131 | pages={623--628}, | |||
132 | year={1993}, | |||
133 | publisher={Elsevier} | |||
134 | } | |||
135 | ||||
136 | @article{pan1999improved, | |||
137 | title={An improved constant-time algorithm for computing the Radon and Hough transforms on a reconfigurable mesh}, | |||
138 | author={Pan, Yi and Li, Keqin and Hamdi, Mounir}, | |||
139 | journal={Systems, Man and Cybernetics, Part A: Systems and Humans, IEEE Transactions on}, | |||
140 | volume={29}, | |||
141 | number={4}, | |||
142 | pages={417--421}, | |||
143 | year={1999}, | |||
144 | publisher={IEEE} | |||
145 | } | |||
146 | ||||
147 | @article{kasbah2008multigrid, | |||
148 | title={Multigrid solvers in reconfigurable hardware}, | |||
149 | author={Kasbah, Safaa J and Damaj, Issam W and Haraty, Ramzi A}, | |||
150 | journal={Journal of Computational and Applied Mathematics}, | |||
151 | volume={213}, | |||
152 | number={1}, | |||
153 | pages={79--94}, | |||
154 | year={2008}, | |||
155 | publisher={Elsevier} | |||
156 | } | |||
157 | ||||
158 | @inproceedings{crookes1998environment, | |||
159 | title={An environment for generating FPGA architectures for image algebra-based algorithms}, | |||
160 | author={Crookes, Danny and Alotaibi, Khalid and Bouridane, Ahmed and Donachy, Paul and Benkrid, Abdsamad}, | |||
161 | booktitle={Image Processing, 1998. ICIP 98. Proceedings. 1998 International Conference on}, | |||
162 | pages={990--994}, | |||
163 | year={1998}, | |||
164 | organization={IEEE} | |||
165 | } | |||
166 | ||||
167 | @article{crookes2000design, | |||
168 | title={Design and implementation of a high level programming environment for FPGA-based image processing}, |
ifcs2018_proceeding.tex
\documentclass[a4paper,conference]{IEEEtran/IEEEtran} | 1 | 1 | \documentclass[a4paper,conference]{IEEEtran/IEEEtran} | |
\usepackage{graphicx,color,hyperref} | 2 | 2 | \usepackage{graphicx,color,hyperref} | |
\usepackage{amsfonts} | 3 | 3 | \usepackage{amsfonts} | |
4 | \usepackage{amsthm} | |||
5 | \usepackage{amssymb} | |||
6 | \usepackage{amsmath} | |||
\usepackage{url} | 4 | 7 | \usepackage{url} | |
\usepackage[normalem]{ulem} | 5 | 8 | \usepackage[normalem]{ulem} | |
\graphicspath{{/home/jmfriedt/gpr/170324_avalanche/}{/home/jmfriedt/gpr/1705_homemade/}} | 6 | 9 | \graphicspath{{/home/jmfriedt/gpr/170324_avalanche/}{/home/jmfriedt/gpr/1705_homemade/}} | |
% correct bad hyphenation here | 7 | 10 | % correct bad hyphenation here | |
\hyphenation{op-tical net-works semi-conduc-tor} | 8 | 11 | \hyphenation{op-tical net-works semi-conduc-tor} | |
\textheight=26cm | 9 | 12 | \textheight=26cm | |
\setlength{\footskip}{30pt} | 10 | 13 | \setlength{\footskip}{30pt} | |
\pagenumbering{gobble} | 11 | 14 | \pagenumbering{gobble} | |
\begin{document} | 12 | 15 | \begin{document} | |
\title{Filter optimization for real time digital processing of radiofrequency signals: application | 13 | 16 | \title{Filter optimization for real time digital processing of radiofrequency signals: application | |
to oscillator metrology} | 14 | 17 | to oscillator metrology} | |
15 | 18 | |||
\author{\IEEEauthorblockN{A. Hugeat\IEEEauthorrefmark{1}\IEEEauthorrefmark{2}, J. Bernard\IEEEauthorrefmark{2}, | 16 | 19 | \author{\IEEEauthorblockN{A. Hugeat\IEEEauthorrefmark{1}\IEEEauthorrefmark{2}, J. Bernard\IEEEauthorrefmark{2}, | |
G. Goavec-M\'erou\IEEEauthorrefmark{1}, | 17 | 20 | G. Goavec-M\'erou\IEEEauthorrefmark{1}, | |
P.-Y. Bourgeois\IEEEauthorrefmark{1}, J.-M. Friedt\IEEEauthorrefmark{1}} | 18 | 21 | P.-Y. Bourgeois\IEEEauthorrefmark{1}, J.-M. Friedt\IEEEauthorrefmark{1}} | |
\IEEEauthorblockA{\IEEEauthorrefmark{1}FEMTO-ST, Time \& Frequency department, Besan\c con, France } | 19 | 22 | \IEEEauthorblockA{\IEEEauthorrefmark{1}FEMTO-ST, Time \& Frequency department, Besan\c con, France } | |
\IEEEauthorblockA{\IEEEauthorrefmark{2}FEMTO-ST, Computer Science department DISC, Besan\c con, France \\ | 20 | 23 | \IEEEauthorblockA{\IEEEauthorrefmark{2}FEMTO-ST, Computer Science department DISC, Besan\c con, France \\ | |
Email: \{pyb2,jmfriedt\}@femto-st.fr} | 21 | 24 | Email: \{pyb2,jmfriedt\}@femto-st.fr} | |
} | 22 | 25 | } | |
\maketitle | 23 | 26 | \maketitle | |
\thispagestyle{plain} | 24 | 27 | \thispagestyle{plain} | |
\pagestyle{plain} | 25 | 28 | \pagestyle{plain} | |
29 | \newtheorem{definition}{Definition} | |||
26 | 30 | |||
\begin{abstract} | 27 | 31 | \begin{abstract} | |
Software Defined Radio (SDR) provides stability, flexibility and reconfigurability to | 28 | 32 | Software Defined Radio (SDR) provides stability, flexibility and reconfigurability to | |
radiofrequency signal processing. Applied to oscillator characterization in the context | 29 | 33 | radiofrequency signal processing. Applied to oscillator characterization in the context | |
of ultrastable clocks, stringent filtering requirements are defined by spurious signal or | 30 | 34 | of ultrastable clocks, stringent filtering requirements are defined by spurious signal or | |
noise rejection needs. Since real time radiofrequency processing must be performed in a | 31 | 35 | noise rejection needs. Since real time radiofrequency processing must be performed in a | |
Field Programmable Gate Array to meet timing constraints, we investigate optimization strategies | 32 | 36 | Field Programmable Gate Array to meet timing constraints, we investigate optimization strategies | |
to design filters meeting rejection characteristics while limiting the hardware resources | 33 | 37 | to design filters meeting rejection characteristics while limiting the hardware resources | |
required and keeping timing constraints within the targeted measurement bandwidths. | 34 | 38 | required and keeping timing constraints within the targeted measurement bandwidths. | |
\end{abstract} | 35 | 39 | \end{abstract} | |
36 | 40 | |||
\begin{IEEEkeywords} | 37 | 41 | \begin{IEEEkeywords} | |
Software Defined Radio, Mixed-Integer Linear Programming, Finite Impulse Response filter | 38 | 42 | Software Defined Radio, Mixed-Integer Linear Programming, Finite Impulse Response filter | |
\end{IEEEkeywords} | 39 | 43 | \end{IEEEkeywords} | |
40 | 44 | |||
\section{Digital signal processing of ultrastable clock signals} | 41 | 45 | \section{Digital signal processing of ultrastable clock signals} | |
42 | 46 | |||
Analog oscillator phase noise characterization is classically performed by downconverting | 43 | 47 | Analog oscillator phase noise characterization is classically performed by downconverting | |
the radiofrequency signal using a saturated mixer to bring the radiofrequency signal to baseband, | 44 | 48 | the radiofrequency signal using a saturated mixer to bring the radiofrequency signal to baseband, | |
followed by a Fourier analysis of the beat signal to analyze phase fluctuations close to carrier. In | 45 | 49 | followed by a Fourier analysis of the beat signal to analyze phase fluctuations close to carrier. In | |
a fully digital approach, the radiofrequency signal is digitized and numerically downconverted by | 46 | 50 | a fully digital approach, the radiofrequency signal is digitized and numerically downconverted by | |
multiplying the samples with a local numerically controlled oscillator (Fig. \ref{schema}) \cite{rsi}. | 47 | 51 | multiplying the samples with a local numerically controlled oscillator (Fig. \ref{schema}) \cite{rsi}. | |
48 | 52 | |||
\begin{figure}[h!tb] | 49 | 53 | \begin{figure}[h!tb] | |
\begin{center} | 50 | 54 | \begin{center} | |
\includegraphics[width=.8\linewidth]{images/schema} | 51 | 55 | \includegraphics[width=.8\linewidth]{images/schema} | |
\end{center} | 52 | 56 | \end{center} | |
\caption{Fully digital oscillator phase noise characterization: the Device Under Test | 53 | 57 | \caption{Fully digital oscillator phase noise characterization: the Device Under Test | |
(DUT) signal is sampled by the radiofrequency grade Analog to Digital Converter (ADC) and | 54 | 58 | (DUT) signal is sampled by the radiofrequency grade Analog to Digital Converter (ADC) and | |
downconverted by mixing with a Numerically Controlled Oscillator (NCO). Unwanted signals | 55 | 59 | downconverted by mixing with a Numerically Controlled Oscillator (NCO). Unwanted signals | |
and noise aliases are rejected by a Low Pass Filter (LPF) implemented as a cascade of Finite | 56 | 60 | and noise aliases are rejected by a Low Pass Filter (LPF) implemented as a cascade of Finite | |
Impulse Response (FIR) filters. The signal is then decimated before a Fourier analysis displays | 57 | 61 | Impulse Response (FIR) filters. The signal is then decimated before a Fourier analysis displays | |
the spectral characteristics of the phase fluctuations.} | 58 | 62 | the spectral characteristics of the phase fluctuations.} | |
\label{schema} | 59 | 63 | \label{schema} | |
\end{figure} | 60 | 64 | \end{figure} | |
61 | 65 | |||
As with the analog mixer, | 62 | 66 | As with the analog mixer, | |
the non-linear behavior of the downconverter introduces noise or spurious signal aliasing as | 63 | 67 | the non-linear behavior of the downconverter introduces noise or spurious signal aliasing as | |
well as the generation of the frequency sum signal in addition to the frequency difference. | 64 | 68 | well as the generation of the frequency sum signal in addition to the frequency difference. | |
These unwanted spectral characteristics must be rejected before decimating the data stream | 65 | 69 | These unwanted spectral characteristics must be rejected before decimating the data stream | |
for the phase noise spectral characterization. The characteristics introduced between the downconverter | 66 | 70 | for the phase noise spectral characterization. The characteristics introduced between the downconverter | |
and the decimation processing blocks are core characteristics of an oscillator characterization | 67 | 71 | and the decimation processing blocks are core characteristics of an oscillator characterization | |
system, and must reject out-of-band signals below the targeted phase noise -- typically in the | 68 | 72 | system, and must reject out-of-band signals below the targeted phase noise -- typically in the | |
sub -170~dBc/Hz range for the ultrastable oscillators we aim at characterizing. The filter blocks will | 69 | 73 | sub -170~dBc/Hz range for the ultrastable oscillators we aim at characterizing. The filter blocks will | |
use most resources of the Field Programmable Gate Array (FPGA) used to process the radiofrequency | 70 | 74 | use most resources of the Field Programmable Gate Array (FPGA) used to process the radiofrequency | |
datastream: optimizing the performance of the filter while reducing the needed resources is | 71 | 75 | datastream: optimizing the performance of the filter while reducing the needed resources is | |
hence tackled in a systematic approach using optimization techniques. Most significantly, we | 72 | 76 | hence tackled in a systematic approach using optimization techniques. Most significantly, we | |
tackle the issue by attempting to cascade multiple Finite Impulse Response (FIR) filters with | 73 | 77 | tackle the issue by attempting to cascade multiple Finite Impulse Response (FIR) filters with | |
tunable number of coefficients and tunable number of bits representing the coefficients and the | 74 | 78 | tunable number of coefficients and tunable number of bits representing the coefficients and the | |
data being processed. | 75 | 79 | data being processed. | |
76 | 80 | |||
\section{Finite impulse response filter} | 77 | 81 | \section{Finite impulse response filter} | |
78 | 82 | |||
We select FIR filters for their unconditional stability and ease of design. A FIR filter is defined | 79 | 83 | We select FIR filters for their unconditional stability and ease of design. A FIR filter is defined | |
by a set of weights $b_k$ applied to the inputs $x_k$ through a convolution to generate the outputs $y_k$ | 80 | 84 | by a set of weights $b_k$ applied to the inputs $x_k$ through a convolution to generate the outputs $y_k$ | |
$$y_n=\sum_{k=0}^N b_k x_{n-k}$$ | 81 | 85 | $$y_n=\sum_{k=0}^N b_k x_{n-k}$$ | |
82 | 86 | |||
As opposed to an implementation on a general purpose processor in which word size is defined by the | 83 | 87 | As opposed to an implementation on a general purpose processor in which word size is defined by the | |
processor architecture, implementing such a filter on an FPGA offers more degrees of freedom since | 84 | 88 | processor architecture, implementing such a filter on an FPGA offers more degrees of freedom since | |
not only the coefficient values and number of taps must be defined, but also the number of bits defining | 85 | 89 | not only the coefficient values and number of taps must be defined, but also the number of bits defining | |
the coefficients and the sample size. | 86 | 90 | the coefficients and the sample size. | |
87 | 91 | |||
The coefficients are classically expressed as floating point values. However, this binary | 88 | 92 | The coefficients are classically expressed as floating point values. However, this binary | |
number representation is not efficient for fast arithmetic computation by an FPGA. Instead, | 89 | 93 | number representation is not efficient for fast arithmetic computation by an FPGA. Instead, | |
we choose to quantize these floating point values into integer values. This quantization | 90 | 94 | we choose to quantize these floating point values into integer values. This quantization | |
will result in some precision loss. As illustrated in Fig. \ref{float_vs_int}, we see that we aren't | 91 | 95 | will result in some precision loss. | |
need too coefficients or too sample size. If we have lot of coefficients but a small sample size, | 92 | |||
the first and last are equal to zero. But if we have too sample size for few coefficients that not improve the quality. | 93 | |||
94 | 96 | |||
97 | %As illustrated in Fig. \ref{float_vs_int}, we see that we aren't | |||
98 | %need too coefficients or too sample size. If we have lot of coefficients but a small sample size, | |||
99 | %the first and last are equal to zero. But if we have too sample size for few coefficients that not improve the quality. | |||
100 | ||||
% JMF je ne comprends pas la derniere phrase ci-dessus ni la figure ci dessous | 95 | 101 | % JMF je ne comprends pas la derniere phrase ci-dessus ni la figure ci dessous | |
\begin{figure}[h!tb] | 96 | 102 | %\begin{figure}[h!tb] | |
\includegraphics[width=\linewidth]{images/float-vs-integer.pdf} | 97 | 103 | %\includegraphics[width=\linewidth]{images/float-vs-integer.pdf} | |
\caption{Impact of the quantization resolution of the coefficients} | 98 | 104 | %\caption{Impact of the quantization resolution of the coefficients} | |
%\label{float_vs_int} | 99 | 105 | %\label{float_vs_int} | |
\end{figure} | 100 | 106 | %\end{figure} | |
101 | 107 | |||
108 | The tradeoff between quantization resolution and number of coefficients when considering | |||
109 | integer operations is not trivial. As an illustration of the issue related to the | |||
110 | relation between number of filter taps and quantization, Fig. \ref{float_vs_int} exhibits | |||
111 | a 128-coefficient FIR bandpass filter designed using floating point numbers (blue). Upon | |||
112 | quantization on 6~bit integers, 60 of the 128~coefficients in the beginning and end of the | |||
113 | taps become null, making the large number of coefficients irrelevant and allowing us to save | |||
114 | processing resources by shrinking the filter length. This tradeoff aimed at minimizing resources | |||
115 | to reach a given rejection level, or maximizing out of band rejection for a given computational | |||
116 | resource, will drive the investigation on cascading filters designed with varying tap resolution | |||
117 | and tap length, as will be shown in the next section. | |||
118 | ||||
\begin{figure}[h!tb] | 102 | 119 | \begin{figure}[h!tb] | |
\includegraphics[width=\linewidth]{images/demo_filtre} | 103 | 120 | \includegraphics[width=\linewidth]{images/demo_filtre} | |
\caption{Impact of the quantization resolution of the coefficients: the quantization is | 104 | 121 | \caption{Impact of the quantization resolution of the coefficients: the quantization is | |
set to 6~bits, setting the 30~first and 30~last coefficients out of the initial 128~band-pass | 105 | 122 | set to 6~bits, setting the 30~first and 30~last coefficients out of the initial 128~band-pass | |
filter coefficients to 0.} | 106 | 123 | filter coefficients to 0.} | |
\label{float_vs_int} | 107 | 124 | \label{float_vs_int} | |
\end{figure} | 108 | 125 | \end{figure} | |
109 | 126 | |||
110 | ||||
\section{Filter optimization} | 111 | 127 | \section{Filter optimization} | |
112 | 128 | |||
A basic approach for implementing the FIR filter is to compute the transfer function of | 113 | 129 | A basic approach for implementing the FIR filter is to compute the transfer function of | |
a monolithic filter: this single filter defines all coefficients with the same resolution | 114 | 130 | a monolithic filter: this single filter defines all coefficients with the same resolution | |
(number of bits) and processes data represented with their own resolution. Meeting the | 115 | 131 | (number of bits) and processes data represented with their own resolution. Meeting the | |
filter shape requires a large number of coefficients, limited by resources of the FPGA since | 116 | 132 | filter shape requires a large number of coefficients, limited by resources of the FPGA since | |
this filter must process data stream at the radiofrequency sampling rate after the mixer. | 117 | 133 | this filter must process data stream at the radiofrequency sampling rate after the mixer. | |
118 | 134 | |||
An optimization problem \cite{leung2004handbook} aims at improving one or many | 119 | 135 | An optimization problem \cite{leung2004handbook} aims at improving one or many | |
performance criteria within a constrained resource environment. Amongst the tools | 120 | 136 | performance criteria within a constrained resource environment. Amongst the tools | |
developed to meet this aim, Mixed-Integer Linear Programming (MILP) provides the framework to | 121 | 137 | developed to meet this aim, Mixed-Integer Linear Programming (MILP) provides the framework to | |
provide a formal definition of the stated problem and search for an optimal use of available | 122 | 138 | provide a formal definition of the stated problem and search for an optimal use of available | |
resources \cite{yu2007design, kodek1980design}. | 123 | 139 | resources \cite{yu2007design, kodek1980design}. | |
124 | 140 | |||
The degrees of freedom when addressing the problem of replacing the single monolithic | 125 | 141 | The degrees of freedom when addressing the problem of replacing the single monolithic | |
FIR with a cascade of optimized filters are the number of coefficients $N_i$ of each filter $i$, | 126 | 142 | FIR with a cascade of optimized filters are the number of coefficients $N_i$ of each filter $i$, | |
the number of bits $c_i$ representing the coefficients and the number of bits $d_i$ representing | 127 | 143 | the number of bits $c_i$ representing the coefficients and the number of bits $d_i$ representing | |
the data fed to the filter. Because each FIR in the chain is fed the output of the previous stage, | 128 | 144 | the data fed to the filter. Because each FIR in the chain is fed the output of the previous stage, | |
the optimization of the complete processing chain within a constrained resource environment is not | 129 | 145 | the optimization of the complete processing chain within a constrained resource environment is not | |
trivial. The resource occupation of a FIR filter is considered as $c_i+d_i+\log_2(N_i)$ which is | 130 | 146 | trivial. The resource occupation of a FIR filter is considered as $c_i+d_i+\log_2(N_i)$ which is | |
the number of bits needed in a worst case condition to represent the output of the FIR. | 131 | 147 | the number of bits needed in a worst case condition to represent the output of the FIR. | |
132 | 148 | |||
133 | ||||
\begin{figure}[h!tb] | 134 | 149 | \begin{figure}[h!tb] | |
\includegraphics[width=\linewidth]{images/noise-rejection.pdf} | 135 | 150 | \includegraphics[width=\linewidth]{images/noise-rejection.pdf} | |
\caption{Rejection as a function of number of coefficients and number of bits} | 136 | 151 | \caption{Rejection as a function of number of coefficients and number of bits} | |
\label{noise-rejection} | 137 | 152 | \label{noise-rejection} | |
\end{figure} | 138 | 153 | \end{figure} | |
139 | 154 | |||
The objective function maximizes the noise rejection while keeping resource occupation below | 140 | 155 | The objective function maximizes the noise rejection while keeping resource occupation below | |
a user-defined threshold. The MILP solver is allowed to choose the number of successive | 141 | 156 | a user-defined threshold. The MILP solver is allowed to choose the number of successive | |
filters, within an upper bound. The last problem is to model the noise rejection. Since filter | 142 | 157 | filters, within an upper bound. The last problem is to model the noise rejection. Since filter | |
noise rejection capability is not modeled by a linear equation, a look-up-table is generated | 143 | 158 | noise rejection capability is not modeled by a linear equation, a look-up-table is generated | |
for multiple filter configurations in which the $c_i$, $d_i$ and $N_i$ parameters are varied: for each | 144 | 159 | for multiple filter configurations in which the $c_i$, $d_i$ and $N_i$ parameters are varied: for each | |
one of these conditions, the low-pass filter rejection defined as the mean power between | 145 | 160 | one of these conditions, the low-pass filter rejection defined as the mean power between | |
half the Nyquist frequency and the Nyquist frequency is stored as computed by the frequency response | 146 | 161 | half the Nyquist frequency and the Nyquist frequency is stored as computed by the frequency response | |
of the digital filter (Fig. \ref{noise-rejection}). | 147 | 162 | of the digital filter (Fig. \ref{noise-rejection}). | |
148 | 163 | |||
Linear program formalism for solving the problem is well documented: an objective function is | 149 | 164 | Linear program formalism for solving the problem is well documented: an objective function is | |
defined which is linearly dependent on the parameters to be optimized. Constraints are expressed | 150 | 165 | defined which is linearly dependent on the parameters to be optimized. Constraints are expressed | |
as linear equations and solved using one of the available solvers, in our case GLPK\cite{glpk}. | 151 | 166 | as linear equations and solved using one of the available solvers, in our case GLPK\cite{glpk}. | |
152 | 167 | |||
The MILP solver provides a solution to the problem by selecting a series of small FIR with | 153 | 168 | The MILP solver provides a solution to the problem by selecting a series of small FIR with | |
increasing number of bits representing data and coefficients as well as an increasing number | 154 | 169 | increasing number of bits representing data and coefficients as well as an increasing number | |
of coefficients, instead of a single monolithic filter. Fig. \ref{compare-fir} exhibits the | 155 | 170 | of coefficients, instead of a single monolithic filter. Fig. \ref{compare-fir} exhibits the | |
performance comparison between one solution and a monolithic FIR when selecting a cutoff | 156 | 171 | performance comparison between one solution and a monolithic FIR when selecting a cutoff | |
frequency of half the Nyquist frequency: a series of 5 FIR and a series of 10 FIR with the | 157 | 172 | frequency of half the Nyquist frequency: a series of 5 FIR and a series of 10 FIR with the | |
same space usage are provided as selected by the MILP solver. The FIR cascade provides better | 158 | 173 | same space usage are provided as selected by the MILP solver. The FIR cascade provides better | |
rejection than the monolithic FIR at the expense of a lower cutoff frequency which remains to | 159 | 174 | rejection than the monolithic FIR at the expense of a lower cutoff frequency which remains to | |
be tuned or compensated for. | 160 | 175 | be tuned or compensated for. | |
161 | 176 | |||
\begin{figure}[h!tb] | 162 | 177 | \begin{figure}[h!tb] | |
% \includegraphics[width=\linewidth]{images/compare-fir.pdf} | 163 | 178 | % \includegraphics[width=\linewidth]{images/compare-fir.pdf} | |
\includegraphics[width=\linewidth]{images/fir-mono-vs-fir-series-200dB.pdf} | 164 | 179 | \includegraphics[width=\linewidth]{images/fir-mono-vs-fir-series-200dB.pdf} | |
\caption{Comparison of the rejection capability between a series of FIR and a monolithic FIR | 165 | 180 | \caption{Comparison of the rejection capability between a series of FIR and a monolithic FIR | |
with a cutoff frequency set at half the Nyquist frequency.} | 166 | 181 | with a cutoff frequency set at half the Nyquist frequency.} | |
\label{compare-fir} | 167 | 182 | \label{compare-fir} | |
\end{figure} | 168 | 183 | \end{figure} | |
169 | 184 | |||
The resource occupation when synthesizing such FIR on a Xilinx FPGA is summarized in Tab. \ref{t1}. | 170 | 185 | The resource occupation when synthesizing such FIR on a Xilinx FPGA is summarized in Tab. \ref{t1}. | |
171 | 186 | |||
\begin{table}[h!tb] | 172 | 187 | \begin{table}[h!tb] | |
\caption{Resource occupation on a Xilinx Zynq-7000 series FPGA when synthesizing the FIR cascade | 173 | 188 | \caption{Resource occupation on a Xilinx Zynq-7000 series FPGA when synthesizing the FIR cascade | |
identified as optimal by the MILP solver within a finite resource criterion. The last line refers | 174 | 189 | identified as optimal by the MILP solver within a finite resource criterion. The last line refers | |
to available resources on a Zynq-7010 as found on the Redpitaya board. The rejection is the mean | 175 | 190 | to available resources on a Zynq-7010 as found on the Redpitaya board. The rejection is the mean | |
value from 0.6 to 1 Nyquist frequency.} | 176 | 191 | value from 0.6 to 1 Nyquist frequency.} | |
\begin{center} | 177 | 192 | \begin{center} | |
\begin{tabular}{|c|cccc|}\hline | 178 | 193 | \begin{tabular}{|c|cccc|}\hline | |
FIR & BlockRAM & LookUpTables & DSP & rejection (dB)\\\hline\hline | 179 | 194 | FIR & BlockRAM & LookUpTables & DSP & rejection (dB)\\\hline\hline | |
1 (monolithic) & 1 & 4064 & 40 & -72 \\ | 180 | 195 | 1 (monolithic) & 1 & 4064 & 40 & -72 \\ | |
5 & 5 & 12332 & 0 & -217 \\ | 181 | 196 | 5 & 5 & 12332 & 0 & -217 \\ | |
10 & 10 & 12717 & 0 & -251 \\\hline\hline | 182 | 197 | 10 & 10 & 12717 & 0 & -251 \\\hline\hline | |
Zynq 7010 & 60 & 17600 & 80 & \\\hline | 183 | 198 | Zynq 7010 & 60 & 17600 & 80 & \\\hline | |
\end{tabular} | 184 | 199 | \end{tabular} | |
\end{center} | 185 | 200 | \end{center} | |
%\vspace{-0.7cm} | 186 | 201 | %\vspace{-0.7cm} | |
\label{t1} | 187 | 202 | \label{t1} | |
\end{table} | 188 | 203 | \end{table} | |
189 | 204 | |||
\section{Filter coefficient selection} | 190 | 205 | \section{Filter coefficient selection} | |
191 | 206 | |||
The coefficients of a single monolithic filter are computed as the impulse response | 192 | 207 | The coefficients of a single monolithic filter are computed as the impulse response | |
of the filter transfer function, and practically approximated by a multitude of methods | 193 | 208 | of the filter transfer function, and practically approximated by a multitude of methods | |
including least square optimization (Matlab's {\tt firls} function), Hamming or Kaiser windowing | 194 | 209 | including least square optimization (Matlab's {\tt firls} function), Hamming or Kaiser windowing | |
(Matlab's {\tt fir1} function). Cascading filters opens a new optimization opportunity by | 195 | 210 | (Matlab's {\tt fir1} function). Cascading filters opens a new optimization opportunity by | |
selecting various coefficient sets depending on the number of coefficients. Fig. \ref{2} | 196 | 211 | selecting various coefficient sets depending on the number of coefficients. Fig. \ref{2} | |
illustrates that for a number of coefficients ranging from 8 to 47, {\tt fir1} provides a better | 197 | 212 | illustrates that for a number of coefficients ranging from 8 to 47, {\tt fir1} provides a better | |
rejection than {\tt firls}: since the linear solver increases the number of coefficients along | 198 | 213 | rejection than {\tt firls}: since the linear solver increases the number of coefficients along | |
the processing chain, the type of selected filter also changes depending on the number of coefficients | 199 | 214 | the processing chain, the type of selected filter also changes depending on the number of coefficients | |
and evolves along the processing chain. | 200 | 215 | and evolves along the processing chain. | |
201 | 216 | |||
\begin{figure}[h!tb] | 202 | 217 | \begin{figure}[h!tb] | |
\includegraphics[width=\linewidth]{images/fir1-vs-firls} | 203 | 218 | \includegraphics[width=\linewidth]{images/fir1-vs-firls} | |
\caption{Evolution of the rejection capability of least-square optimized filters and Hamming | 204 | 219 | \caption{Evolution of the rejection capability of least-square optimized filters and Hamming | |
FIR filters as a function of the number of coefficients, for floating point numbers and 8-bit | 205 | 220 | FIR filters as a function of the number of coefficients, for floating point numbers and 8-bit | |
encoded integers.} | 206 | 221 | encoded integers.} | |
\label{2} | 207 | 222 | \label{2} | |
\end{figure} | 208 | 223 | \end{figure} | |
209 | 224 | |||
\section{Conclusion} | 210 | 225 | \section{Conclusion} | |
211 | 226 | |||
We address the optimization problem of designing a low-pass filter chain in a Field Programmable Gate | 212 | 227 | We address the optimization problem of designing a low-pass filter chain in a Field Programmable Gate | |
Array for improved noise rejection within constrained resource occupation, as needed for | 213 | 228 | Array for improved noise rejection within constrained resource occupation, as needed for | |
real time processing of radiofrequency signal when characterizing spectral phase noise | 214 | 229 | real time processing of radiofrequency signal when characterizing spectral phase noise | |
characteristics of stable oscillators. The flexibility of the digital approach makes the result | 215 | 230 | characteristics of stable oscillators. The flexibility of the digital approach makes the result | |
best suited for closing the loop and using the measurement output in a feedback loop for | 216 | 231 | best suited for closing the loop and using the measurement output in a feedback loop for | |
controlling clocks, e.g. in a quartz-stabilized high performance clock whose long term behavior | 217 | 232 | controlling clocks, e.g. in a quartz-stabilized high performance clock whose long term behavior | |
is controlled by a non-piezoelectric resonator (sapphire resonator, microwave or optical | 218 | 233 | is controlled by a non-piezoelectric resonator (sapphire resonator, microwave or optical | |
atomic transition). | 219 | 234 | atomic transition). | |
220 | 235 | |||
\section*{Acknowledgement} | 221 | 236 | \section*{Acknowledgement} | |
222 | 237 | |||
This work is supported by the ANR Programme d'Investissement d'Avenir in | 223 | 238 | This work is supported by the ANR Programme d'Investissement d'Avenir in | |
progress at the Time and Frequency Departments of the FEMTO-ST Institute | 224 | 239 | progress at the Time and Frequency Departments of the FEMTO-ST Institute | |
(Oscillator IMP, First-TF and Refimeve+), and by R\'egion de Franche-Comt\'e. | 225 | 240 | (Oscillator IMP, First-TF and Refimeve+), and by R\'egion de Franche-Comt\'e. | |
The authors would like to thank E. Rubiola, F. Vernotte, G. Cabodevila for support and | 226 | 241 | The authors would like to thank E. Rubiola, F. Vernotte, G. Cabodevila for support and | |
fruitful discussions. | 227 | 242 | fruitful discussions. | |
228 | 243 | |||
244 | ||||
245 | ||||
246 | \subsubsection{Contraintes} | |||
247 | \label{def-contraintes} | |||
248 | Maintenant que nous avons d\'efini ce qu'\'etait une chaine de traitement, nous allons voir | |||
249 | quelles sont les contraintes li\'ees à celle-ci. | |||
250 | ||||
251 | Le temps d'ex\'ecution des t\^aches se compte en front montant d'horloge souvent appel\'e | |||
252 | coup d'horloge. On a donc une unit\'e de temps discr\'etis\'ee car un coup d'horloge est indivisible. | |||
253 | Les dates sont donc cadenc\'ees par l'horloge du FPGA. | |||
254 | ||||
255 | Chaque t\^ache doit pouvoir traiter chaque donn\'ee qui arrive, ce qui impose une contrainte | |||
256 | forte de d\'ebit d'entr\'ee. En effet, dans le cadre du traitement du signal, il est primordial | |||
257 | d'avoir toutes les donn\'ees de manière cons\'ecutive. Si la moindre donn\'ee est perdue, le r\'esultat | |||
258 | obtenu n'est plus valide. Cette contrainte se traduit la plupart du temps par un m\'ecanisme de | |||
259 | FIFO qui bufferise les donn\'ees entrantes (dans le cas où la t\^ache n\'ecessite un tableau de donn\'ees, | |||
260 | par exemple). Ou cela peut aussi se mettre en place par un m\'ecanisme de pipeline ou de parall\'elisme | |||
261 | à l'int\'erieur du bloc. Mais cela relève de l'impl\'ementation bas niveau du bloc. | |||
262 | ||||
263 | Le temps d'ex\'ecution d'une t\^ache correspond à la latence d'un bloc. Il s'agit donc du | |||
264 | temps que passe une donn\'ee brute dans le bloc avant de ressortir trait\'ee. Dans notre contexte | |||
265 | la latence n'est pas importante. En effet, puisqu'on a un flux de donn\'ees continu, après un court laps | |||
266 | de temps toutes les t\^aches ont d\'epass\'e leur temps de latence et elles produisent les donn\'ees | |||
267 | r\'egulièrement. | |||
268 | ||||
269 | Il y a tout de même une exception à cela, c'est lors d'un traitement parallèle. Dans l'exemple de la | |||
270 | figure \ref{exemple-chaine-traitement}, on voit un bloc qui divise le flux en deux branches. Dans le | |||
271 | cas où on resynchronise le flux, il est imp\'eratif que la somme des latences des deux branches soit la | |||
272 | même. Cela peut donc imposer la pr\'esence de blocs qui ajoutent de la latence sans faire de traitements utiles. | |||
273 | ||||
274 | En revanche, une t\^ache se caract\'erise par un d\'ebit de sortie et celui-ci doit rester fixe. | |||
275 | Cela s'explique par la contrainte du d\'ebit d'entr\'ee du bloc de traitement suivant. Si un bloc a un d\'ebit de sortie | |||
276 | fluctuant, il est \'evident que la contrainte d'entr\'ee ne sera pas possible à formaliser. | |||
277 | ||||
278 | Une autre contrainte li\'ee de manière plus globale est la consommation de ressources. Comme nous l'avons | |||
279 | dit dans la section \ref{def-fpga}, le FPGA dispose d'un nombre de portes logiques limit\'e. | |||
280 | Il faut donc que la chaine de traitement ne d\'epasse pas le nombre de ressources dont dispose la puce | |||
281 | FPGA. | |||
282 | ||||
283 | La consommation de ressources est influenc\'ee par les blocs de traitement. En effet, pour pouvoir | |||
284 | tenir les d\'ebits d'entr\'ee \'elev\'es, cela consomme \'enorm\'ement de ressources. Plus le d\'ebit est rapide, plus | |||
285 | la consommation de ressources sera grande. | |||
286 | ||||
287 | \subsection{Travaux traitant du sujet} | |||
288 | Nous avons commenc\'e notre recherche en lisant des articles traitant de l'optimisation dans un FPGA. | |||
289 | Dans sa thèse, S. Mirzaei \cite{these-dsp-fpga} donne surtout des bonnes pratiques pour d\'evelopper | |||
290 | des composants FPGA bas niveau. Ce n'est pas exactement ce que nous cherchions. | |||
291 | ||||
292 | Dans les r\'ef\'erences \cite{zhuo2007scalable, olariu1993computing, pan1999improved}, les auteurs | |||
293 | proposent tous des optimisations hardware uniquement. Cependant ces articles sont focalis\'es sur des optimisations mat\'erielles | |||
294 | or notre objectif est de trouver une formalisation math\'ematique d'un FPGA. | |||
295 | ||||
296 | Une autre approche est propos\'ee par S. Kasbah et al. dans leur article \cite{kasbah2008multigrid}. | |||
297 | En effet, ils utilisent une approche HLS de leur problème. Ils ont utilis\'e un synth\'etiseur optimis\'e et | |||
298 | un langage d\'eriv\'e du C++ pour d\'ecrire leur algorithme. Bien qu'ils obtiennent de bons r\'esultats, | |||
299 | leur m\'ethode n'est pas exploitable dans notre cas, car ils n'ont pas les mêmes contraintes de d\'ebit et | |||
300 | de temps r\'eel que nous. | |||
301 | ||||
302 | Une dernière approche que nous avons \'etudi\'ee est l'utilisation de \emph{skeletons}. D. Crookes et A. Benkrid | |||
303 | ont beaucoup parl\'e de cette m\'ethode dans leurs articles \cite{crookes1998environment, crookes2000design, benkrid2002towards}. | |||
304 | L'id\'ee essentielle est qu'ils r\'ealisent des composants très optimis\'es et param\'etrables. Ainsi lorsqu'ils | |||
305 | veulent faire un d\'eveloppement, ils utilisent les blocs d\'ejà faits. | |||
306 | ||||
307 | Ces blocs repr\'esentent une \'etape de calcul (une d\'ecimation, un filtrage, une modulation, une | |||
308 | d\'emodulation etc...). En prenant le cas du FIR, on rend param\'etrables les valeurs des coefficients | |||
309 | utilis\'es pour le produit de convolutions ainsi que leur nombre. Le facteur de d\'ecimation est | |||
310 | lui aussi param\'etrable. | |||
311 | ||||
312 | On gagne ainsi beaucoup de temps de d\'eveloppement car on r\'eutilise des composants d\'ejà \'eprouv\'es et optimis\'es. | |||
313 | De plus, au fil des projets, on constitue une bibliothèque de composants nous | |||
314 | permettant de faire une chaine complète très simplement. | |||
315 | ||||
316 | K. Benkrid, S. Belkacemi et A. Benkrid dans leur article\cite{hide} caract\'erisent | |||
317 | ces blocs en Prolog pour faire un langage descriptif permettant d'assembler les blocs de manière | |||
318 | optimale. En partant de cette description, ils arrivent à g\'en\'erer directement le code VHDL. | |||
319 | ||||
320 | G. Goavec-Merou, dans sa thèse\cite{gwen-cogen}, pr\'esente un outil, CoGen, bas\'e sur l'approche en skeletons. Son id\'ee | |||
321 | est de caract\'eriser des blocs \'ecrits en VHDL, en donnant diff\'erentes caract\'eristiques : | |||
322 | \begin{itemize} | |||
323 | \item la latence du bloc repr\'esente, en coups d'horloge, le temps entre l'entr\'ee de la donn\'ee | |||
324 | et le temps où la même donn\'ee ressort du bloc. | |||
325 | \item l'acceptance repr\'esente le nombre de donn\'ees par coup d'horloge que le bloc est capable | |||
326 | de traiter. | |||
327 | \item la sortance repr\'esente le nombre de donn\'ees qui sortent par coup d'horloge. | |||
328 | \end{itemize} | |||
329 | ||||
330 | Gr\^ace à cela, le logiciel est capable de donner une impl\'ementation optimale d'un problème qu'on lui | |||
331 | soumet. Le problème ne se d\'efinit pas uniquement par un r\'esultat attendu mais aussi par des | |||
332 | contraintes de d\'ebit et/ou de pr\'ecision. | |||
333 | ||||
334 | Dans un second temps, nous nous sommes aussi int\'eress\'es à des articles d'ordonnancement. | |||
335 | Nous avons notamment lu des documents parlant des cas des micro-usines. | |||
336 | ||||
337 | Les micro-usines ressemblent un peu à des FPGA dans le sens où on connait à l'avance les | |||
338 | t\^aches à effectuer et leurs caract\'eristiques. Nous allons donc nous inspirer | |||
339 | de leur modèle pour essayer de construire le nôtre. | |||
340 | ||||
341 | Dans sa thèse A. Dobrila \cite{these-alex} traite d'un problème de tol\'erance aux pannes | |||
342 | dans le contexte des micro-usines. Mais les FPGA ne sont pas concern\'es dans la mesure | |||
343 | où si le composant tombe en panne, tout le traitement est paralys\'e. Cette thèse nous a n\'eanmoins | |||
344 | permis d'avoir un exemple de formalisation de problème. | |||
345 | ||||
346 | Pour finir nous avons lu la thèse de M. Coqblin \cite{these-mathias} qui elle aussi traite du sujet | |||
347 | des micro-usines. Le travail de M. Coqblin porte surtout sur une chaine de traitement | |||
348 | reconfigurable, il tient compte dans ses travaux du surcoût engendr\'e par la reconfiguration d'une machine. | |||
349 | Cela n'est pas tout à fait exploitable dans notre contexte puisqu'une | |||
350 | puce FPGA dès qu'elle est programm\'ee n'a pas la possibilit\'e de reconfigurer une partie de sa chaine de | |||
351 | traitement. Là encore, nous avions un exemple de formalisation d'un problème. | |||
352 | ||||
353 | Pour conclure, nous avons vu deux approches li\'ees à deux domaines diff\'erents. La première est le | |||
354 | point de vue \'electronique qui se focalise principalement sur des optimisations mat\'erielles ou algorithmiques. | |||
355 | La seconde est le point de vue informatique : les modèles sont très g\'en\'eriques et ne sont pas | |||
356 | adapt\'es au cas des FPGA. La suite de ce rapport se concentrera donc sur la recherche d'un compromis | |||
357 | entre ces deux points de vue. | |||
358 | ||||
359 | \section{Contexte d'ordonnancement} | |||
360 | Dans cette partie, nous donnerons des d\'efinitions de termes rattach\'es au domaine de l'ordonnancement | |||
361 | et nous verrons que le sujet trait\'e se rapproche beaucoup d'un problème d'ordonnancement. De ce fait | |||
362 | nous pourrons aller plus loin que les travaux vus pr\'ec\'edemment et nous tenterons des approches d'ordonnancement | |||
363 | et d'optimisation. | |||
364 | ||||
365 | \subsection{D\'efinition du vocabulaire} | |||
366 | Avant tout, il faut d\'efinir ce qu'est un problème d'optimisation. Il y a deux d\'efinitions | |||
367 | importantes à donner. La première est propos\'ee par Legrand et Robert dans leur livre \cite{def1-ordo} : | |||
368 | \begin{definition} | |||
369 | \label{def-ordo1} | |||
370 | Un ordonnancement d'un système de t\^aches $G\ =\ (V,\ E,\ w)$ est une fonction $\sigma$ : | |||
371 | $V \rightarrow \mathbb{N}$ telle que $\sigma(u) + w(u) \leq \sigma(v)$ pour toute arête $(u,\ v) \in E$. | |||
372 | \end{definition} | |||
373 | ||||
374 | Dit plus simplement, l'ensemble $V$ repr\'esente les t\^aches à ex\'ecuter, l'ensemble $E$ repr\'esente les d\'ependances | |||
375 | des t\^aches et $w$ les temps d'ex\'ecution de la t\^ache. La fonction $\sigma$ donne donc l'heure de d\'ebut de | |||
376 | chacune des t\^aches. La d\'efinition dit que si une t\^ache $v$ d\'epend d'une t\^ache $u$ alors | |||
377 | la date de d\'ebut de $v$ sera plus grande ou \'egale au d\'ebut de l'ex\'ecution de la t\^ache $u$ plus son | |||
378 | temps d'ex\'ecution. | |||
379 | ||||
380 | Une autre d\'efinition importante qui est propos\'ee par Leung et al. \cite{def2-ordo} est : | |||
381 | \begin{definition} | |||
382 | \label{def-ordo2} | |||
383 | L'ordonnancement traite de l'allocation de ressources rares à des activit\'es avec | |||
384 | l'objectif d'optimiser un ou plusieurs critères de performance. | |||
385 | \end{definition} | |||
386 | ||||
387 | Cette d\'efinition est plus g\'en\'erique mais elle nous int\'eresse davantage que la d\'efinition \ref{def-ordo1}. | |||
388 | En effet, la partie qui nous int\'eresse dans cette première d\'efinition est le respect de la pr\'ec\'edence des t\^aches. | |||
389 | Dans les faits les dates de d\'ebut ne nous int\'eressent pas r\'eellement. | |||
390 | ||||
391 | En revanche la d\'efinition \ref{def-ordo2} sera au c\oe{}ur du projet. Pour se convaincre de cela, | |||
392 | il nous faut d'abord d\'efinir quel est le type de problème d'ordonnancement qu'on traite et quelles | |||
393 | sont les m\'ethodes qu'on peut appliquer. | |||
394 | ||||
395 | Les problèmes d'ordonnancement peuvent être class\'es en diff\'erentes cat\'egories : | |||
396 | \begin{itemize} | |||
397 | \item T\^aches ind\'ependantes : dans cette cat\'egorie de problèmes, les t\^aches sont complètement ind\'ependantes | |||
398 | les unes des autres. Dans notre cas, ce n'est pas le plus adapt\'e. | |||
399 | \item Graphe de t\^aches : la d\'efinition \ref{def-ordo1} d\'ecrit cette cat\'egorie. La plupart du temps, | |||
400 | les t\^aches sont repr\'esent\'ees par un DAG. Cette cat\'egorie est très proche de notre cas puisque nous devons \'egalement ex\'ecuter | |||
401 | des t\^aches qui ont un certain nombre de d\'ependances. On pourra même dire que dans certains cas, | |||
402 | on a des anti-arbres, c'est à dire que nous avons une multitude de t\^aches d'entr\'ees qui convergent vers une | |||
403 | t\^ache de fin. | |||
404 | \item Workflow : cette cat\'egorie est une sous cat\'egorie des graphes de t\^aches dans le sens où | |||
405 | il s'agit d'un graphe de t\^aches r\'ep\'et\'e de nombreuses fois. C'est exactement ce type de problème | |||
406 | que nous traitons ici. | |||
407 | \end{itemize} | |||
408 | ||||
409 | Bien entendu, cette liste n'est pas exhaustive et il existe de nombreuses autres classifications et sous-classifications | |||
410 | de ces problèmes. Nous n'avons parl\'e ici que des cat\'egories les plus communes. | |||
411 | ||||
412 | Un autre point à d\'efinir, est le critère d'optimisation. Il y a là encore un grand nombre de | |||
413 | critères possibles. Nous allons donc parler des principaux : | |||
414 | \begin{itemize} | |||
415 | \item Temps de compl\'etion total (ou Makespan en anglais) : ce critère est l'un des critères d'optimisation | |||
416 | les plus courants. Il s'agit donc de minimiser la date de fin de la dernière t\^ache de l'ensemble des | |||
417 | t\^aches à ex\'ecuter. L'enjeu de cette optimisation est donc de trouver l'ordonnancement optimal permettant | |||
418 | la fin d'ex\'ecution au plus tôt. | |||
419 | \item Somme des temps d'ex\'ecution (Flowtime en anglais) : il s'agit de faire la somme des temps d'ex\'ecution de toutes les t\^aches | |||
420 | et d'optimiser ce r\'esultat. | |||
421 | \item Le d\'ebit : ce critère quant à lui, vise à augmenter au maximum le d\'ebit de traitement des donn\'ees. | |||
422 | \end{itemize} | |||
423 | ||||
424 | En plus de cela, on peut avoir besoin de plusieurs critères d'optimisation. Il s'agit dans ce cas d'une optimisation | |||
425 | multi-critères. Bien entendu, cela complexifie d'autant plus le problème car la solution la plus optimale pour un | |||
426 | des critères peut être très mauvaise pour un autre critère. Dans ce cas, il s'agira de trouver une solution qui permet | |||
427 | de faire le meilleur compromis entre tous les critères. | |||
428 | ||||
429 | ||||
430 | \subsection{Formalisation du problème} | |||
431 | \label{formalisation} | |||
432 | Maintenant que nous avons donn\'e le vocabulaire li\'e à l'ordonnancement, nous allons pouvoir essayer de caract\'eriser | |||
433 | formellement notre problème. En effet, nous allons reprendre les contraintes \'enonc\'ees dans la section \ref{def-contraintes} | |||
434 | et nous essayerons de les formaliser le plus finement possible. | |||
435 | ||||
436 | Comme nous l'avons dit, une t\^ache est un bloc de traitement. Chaque t\^ache $i$ dispose d'un ensemble de paramètres | |||
437 | que nous nommerons $\mathcal{P}_{i}$. Cet ensemble $\mathcal{P}_i$ est propre à chaque t\^ache et il variera d'une | |||
438 | t\^ache à l'autre. Nous reviendrons plus tard sur les paramètres qui peuvent composer cet ensemble. | |||
439 | ||||
440 | Outre cet ensemble $\mathcal{P}_i$, chaque t\^ache dispose de paramètres communs : | |||
441 | \begin{itemize} | |||
442 | \item Dur\'ee de la t\^ache : Comme nous l'avons dit auparavant, dans le cadre d'un FPGA le temps est compt\'e en nombre de coups d'horloge. | |||
443 | En outre, les blocs sont toujours sollicit\'es, certains même sont capables de lire et de renvoyer un r\'esultat à chaque coup d'horloge. | |||
444 | Donc la dur\'ee d'une t\^ache ne peut être le laps de temps entre l'entr\'ee d'une donn\'ee et la sortie d'une autre. Nous d\'efinirons la | |||
445 | dur\'ee comme le temps de traitement d'une donn\'ee, c'est à dire la diff\'erence de temps entre la date de sortie d'une donn\'ee | |||
446 | et de sa date d'entr\'ee. Nous nommerons cette dur\'ee $\delta_i$. % Je devrais la nomm\'ee w comme dans la def2 | |||
447 | \item La pr\'ecision : La pr\'ecision d'une donn\'ee est le nombre de bits significatifs qu'elle compte. En effet, au fil des traitements | |||
448 | les pr\'ecisions peuvent varier. On nomme donc la pr\'ecision d'entr\'ee d'une t\^ache $i$ comme $\pi_i^-$ et la pr\'ecision en sortie $\pi_i^+$. | |||
449 | \item La fr\'equence du flux en entr\'ee (ou sortie) : Cette fr\'equence repr\'esente la fr\'equence des donn\'ees qui arrivent (resp. sortent). | |||
450 | Selon les t\^aches, les fr\'equences varieront. En effet, certains blocs ralentissent le flux c'est pourquoi on distingue la fr\'equence du | |||
451 | flux en entr\'ee et la fr\'equence en sortie. Nous nommerons donc la fr\'equence du flux en entr\'ee $f_i^-$ et la fr\'equence en sortie $f_i^+$. | |||
452 | \item La quantit\'e de donn\'ees en entr\'ee (ou en sortie) : Il s'agit de la quantit\'e de donn\'ees que le bloc s'attend à traiter (resp. | |||
453 | est capable de produire). Les t\^aches peuvent avoir à traiter des gros volumes de donn\'ees et n'en ressortir qu'une partie. Cette | |||
454 | fois encore, il nous faut donc diff\'erencier l'entr\'ee et la sortie. Nous nommerons donc la quantit\'e de donn\'ees entrantes $q_i^-$ | |||
455 | et la quantit\'e de donn\'ees sortantes $q_i^+$ pour une t\^ache $i$. | |||
456 | \item Le d\'ebit d'entr\'ee (ou de sortie) : Ce paramètre correspond au d\'ebit de donn\'ees que la t\^ache est capable de traiter ou qu'elle | |||
457 | fournit en sortie. Il s'agit simplement de l'expression des deux pr\'ec\'edents paramètres. Nous d\'efinirons donc le d\'ebit entrant de la | |||
458 | t\^ache $i$ comme $d_i^-\ =\ q_i^-\ *\ f_i^-$ et le d\'ebit sortant comme $d_i^+\ =\ q_i^+\ *\ f_i^+$. | |||
459 | \item La taille de la t\^ache : La taille dans les FPGA \'etant limit\'ee, ce paramètre exprime donc la place qu'occupe la t\^ache au sein du bloc. | |||
460 | Nous nommerons $\mathcal{A}_i$ cette taille. | |||
461 | \item Les pr\'ed\'ecesseurs et successeurs d'une t\^ache : cela nous permet de connaître les t\^aches requises pour pouvoir traiter | |||
462 | la t\^ache $i$ ainsi que les t\^aches qui en d\'ependent. Ces ensembles sont not\'es $\Gamma _i ^-$ et $ \Gamma _i ^+$ \\ | |||
463 | %TODO Est-ce vraiment un paramètre ? | |||
464 | \end{itemize} | |||
465 | ||||
466 | Ces diff\'erents paramètres communs sont fortement li\'es aux \'el\'ements de $\mathcal{P}_i$. Voici quelques exemples de relations | |||
467 | que nous avons identifi\'ees : | |||
468 | \begin{itemize} | |||
469 | \item $ \delta _i \ = \ \mathcal{F}_{\delta}(\pi_i^-,\ \pi_i^+,\ d_i^-,\ d_i^+,\ \mathcal{P}_i) $ donne le temps d'ex\'ecution | |||
470 | de la t\^ache en fonction de la pr\'ecision voulue, du d\'ebit et des paramètres internes. | |||
471 | \item $ \pi _i ^+ \ = \ \mathcal{F}_{p}(\pi_i^-,\ \mathcal{P}_i) $, la fonction $\mathcal{F}_p$ donne la pr\'ecision en sortie selon la pr\'ecision de d\'epart | |||
472 | et les paramètres internes de la t\^ache. | |||
473 | \item $d_i^+\ =\ \mathcal{F}_d(d_i^-, \mathcal{P}_i)$, la fonction $\mathcal{F}_d$ donne le d\'ebit sortant de la t\^ache en fonction du d\'ebit | |||
474 | entrant et des variables internes de la t\^ache. | |||
475 | \item $\mathcal{A}_i\ =\ \mathcal{F}_A(\pi_i^-,\ \pi_i^+,\ d_i^-,\ d_i^+, \mathcal{P}_i)$, la fonction $\mathcal{F}_A$ donne la taille de la t\^ache en fonction des pr\'ecisions, des d\'ebits et des paramètres internes. | |||
476 | \end{itemize} | |||
477 | Pour le moment, nous ne sommes pas capables de donner une d\'efinition g\'en\'erale de ces fonctions. Mais en revanche, | |||
478 | sur quelques exemples simples (cf. section \ref{exemples-modeles}), nous parvenons à donner une \'evaluation de ces fonctions. | |||
479 | ||||
480 | Maintenant que nous avons donn\'e toutes les notations utiles, nous allons \'enoncer des contraintes relatives à notre problème. Soit | |||
481 | un DAG $G(V,\ E)$, on a pour toute arête $(i, j)\ \in\ E$ les in\'equations suivantes : | |||
482 | ||||
483 | \paragraph{Contrainte de pr\'ecision :} | |||
484 | Cette in\'equation traduit la contrainte de pr\'ecision d'une t\^ache à l'autre : | |||
485 | \begin{align*} | |||
486 | \pi _i ^+ \geq \pi _j ^- | |||
487 | \end{align*} | |||
488 | ||||
489 | \paragraph{Contrainte de d\'ebit :} | |||
490 | Cette in\'equation traduit la contrainte de d\'ebit d'une t\^ache à l'autre : | |||
491 | \begin{align*} | |||
492 | d _i ^+ = q _j ^- * (f_i + (1 / s_j) ) & \text{ où } s_j \text{ est une valeur positive de temporisation de la t\^ache} | |||
493 | \end{align*} | |||
494 | ||||
495 | \paragraph{Contrainte de synchronisation :} | |||
496 | Il s'agit de la contrainte qui impose que si à un moment du traitement, le DAG se s\'epare en plusieurs branches parallèles | |||
497 | et qu'elles se rejoignent plus tard, la somme des latences sur chacune des branches soit la même. | |||
498 | Plus formellement, s'il existe plusieurs chemins disjoints, partant de la t\^ache $s$ et allant à la t\^ache de $f$ alors : | |||
499 | \begin{align*} | |||
500 | \forall \text{ chemin } \mathcal{C}1(s, .., f), | |||
501 | \forall \text{ chemin } \mathcal{C}2(s, .., f) | |||
502 | \text{ tel que } \mathcal{C}1 \neq \mathcal{C}2 | |||
503 | \Rightarrow | |||
504 | \sum _{i} ^{i \in \mathcal{C}1} \delta_i = \sum _{i} ^{i \in \mathcal{C}2} \delta_i | |||
505 | \end{align*} | |||
506 | ||||
507 | \paragraph{Contrainte de place :} | |||
508 | Cette in\'equation traduit la contrainte de place dans le FPGA. La taille max de la puce FPGA est nomm\'ee $\mathcal{A}_{FPGA}$ : | |||
509 | \begin{align*} | |||
510 | \sum ^{\text{t\^ache } i} \mathcal{A}_i \leq \mathcal{A}_{FPGA} | |||
511 | \end{align*} | |||
512 | ||||
513 | \subsection{Exemples de mod\'elisation} | |||
514 | \label{exemples-modeles} | |||
515 | Nous allons maintenant prendre quelques blocs de traitement simples afin d'illustrer au mieux notre modèle. | |||
516 | Pour tous nos exemples, nous prendrons un d\'ebit en entr\'ee de 200 Mo/s avec une pr\'ecision de 16 bits. | |||
517 | ||||
518 | Prenons tout d'abord l'exemple d'un bloc de d\'ecimation. Le but de ce bloc est de ralentir le flux en ne gardant | |||
519 | que certaines donn\'ees à intervalle r\'egulier. Cet intervalle est appel\'e le facteur de d\'ecimation, on le notera $N$. | |||
520 | ||||
521 | Donc d'après notre mod\'elisation : | |||
522 | \begin{itemize} | |||
523 | \item $N \in \mathcal{P}_i$ | |||
524 | %TODO N ou 1 ? | |||
525 | \item $\delta _i = N\ c.h.$ (coup d'horloge) | |||
526 | \item $\pi _i ^+ = \pi _i ^- = 16 bits$ | |||
527 | \item $f _i ^+ = f _i ^-$ | |||
528 | \item $q _i ^+ = q _i ^- / N$ | |||
529 | \item $d _i ^+ = q _i ^- / N * f _i ^-$ | |||
530 | \item $\Gamma _i ^+ = \Gamma _i ^- = 1$\\ | |||
531 | %TODO Je ne sais pas trouver la taille... | |||
532 | \end{itemize} | |||
533 | ||||
534 | Un autre exemple int\'eressant que l'on peut donner, c'est le cas des spliters. Il s'agit là aussi d'un bloc très | |||
535 | simple qui permet de dupliquer un flux. On peut donc donner un nombre de sorties à cr\'eer, on note ce paramètre | |||
536 | %TODO pas très inspir\'e... | |||
537 | $X$. Voici ce que donne notre mod\'elisation : | |||
538 | \begin{itemize} | |||
539 | \item $X \in \mathcal{P}_i$ | |||
540 | \item $\delta _i = 1\ c.h.$ | |||
541 | \item $\pi _i ^+ = \pi _i ^- = 16 bits$ | |||
542 | \item $f _i ^+ = f _i ^-$ | |||
543 | \item $q _i ^+ = q _i ^-$ | |||
544 | \item $d _i ^+ = d _i ^-$ | |||
545 | \item $\Gamma _i ^- = 1$ | |||
546 | \item $\Gamma _i ^+ = X$\\ | |||
547 | \end{itemize} | |||
548 | ||||
549 | L'exemple suivant traite du cas du shifter. Il s'agit d'un bloc qui a pour but de diminuer le nombre de bits des | |||
550 | donn\'ees afin d'acc\'el\'erer les traitements sur les blocs suivants. On peut donc donner le nombre de bits à shifter, | |||
551 | on note ce paramètre $S$. Voici ce que donne notre mod\'elisation : | |||
552 | \begin{itemize} | |||
553 | \item $S \in \mathcal{P}_i$ | |||
554 | \item $\delta _i = 1\ c.h.$ | |||
555 | \item $\pi _i ^+ = \pi _i ^- - S$ | |||
556 | \item $f _i ^+ = f _i ^-$ | |||
557 | \item $q _i ^+ = q _i ^-$ | |||
558 | \item $d _i ^+ = d _i ^-$ | |||
559 | \item $\Gamma _i ^+ = \Gamma _i ^- = 1$\\ | |||
560 | \end{itemize} | |||
561 | ||||
562 | Nous allons traiter un dernier exemple un peu plus complexe, le cas d'un filtre d\'ecimateur (ou FIR). Ce bloc |