From 50d8773b73ae66f7a174f85a5daa20739e088ccc Mon Sep 17 00:00:00 2001
From: Yida Wang <yidawa@gmail.com>
Date: Tue, 9 Jan 2018 15:39:35 -0800
Subject: [PATCH] small fixes on docs (#769)

* small fixes on docs

* add IR output after parallelization
---
 docs/how_to/contribute.md      |  1 -
 tutorials/optimize/opt_gemm.py | 19 +++++++++++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/docs/how_to/contribute.md b/docs/how_to/contribute.md
index a0ba99bdf..bd31a62e5 100644
--- a/docs/how_to/contribute.md
+++ b/docs/how_to/contribute.md
@@ -14,7 +14,6 @@ Everyone is more than welcome to contribute. It is a way to make the project bet
   - [What is the consequence of force push](#what-is-the-consequence-of-force-push)
 * [Document](#document)
 * [Testcases](#testcases)
-* [Examples](#examples)
 * [Core Library](#core-library)
 * [Python Package](#python-package)
 
diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 3f24767ab..9a4264c9d 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -26,7 +26,7 @@ Actually, all the methodologies used in this tutorial is a subset of tricks ment
 abstraction automatically, but some of them cannot be simply applied due to TVM constraints.
 
 All the experiment results mentioned below, are executed on 2015's 15' MacBook equiped with
-Intel i7-4770QH CPU. The cache line size should be 64 bytes for all the x86 CPU.
+Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all the x86 CPUs.
 """
 
 ###############################################################################
@@ -230,7 +230,7 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
 ###################################################################################################
 # Parallel
 # -------------
-# Futhermore, we can also utilize multi-core processors to parallelize computation.
+# Furthermore, we can also utilize multi-core processors for thread-level parallelization.
 
 s = tvm.create_schedule(C.op)
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
@@ -251,11 +251,18 @@ evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=50)
 opt5_time = evaluator(a, b, c).mean
 print('Opt5: %f' % opt5_time)
 
+################################################################################################
+# Here is the generated IR after parallelization.
+
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+###################################################################################################
+
 ##################################################################################################
 # Summary
 # -------
 # After applying the above simple optimizations with only 6 lines of code,
-# our generated code can achieve 30% of numpy performance with Apple implemented BLAS.
-#
-# We can see TVM is very powerful tool to optimize low level computation.
-
+# our generated code can achieve 30% of the `numpy` performance backed by Apple's BLAS implementation.
+# Note that the outputs on the webpage reflect the running times on a non-exclusive
+# Docker container, and are therefore *unreliable*. You are highly encouraged to run the
+# tutorial yourself to observe the performance gain achieved by TVM.
-- 
GitLab
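
For reference, the call that actually marks the schedule as parallel is elided from the
hunk above. Below is a minimal, self-contained sketch of that step using the same TVM 0.x
schedule API the tutorial already uses (tvm.placeholder, tile, parallel, tvm.lower). The
sizes M, K, N and the blocking factor bn are placeholder values here, and the tutorial may
choose a different axis to parallelize; this only illustrates the technique.

import tvm

# Placeholder sizes; the tutorial defines its own M, K, N and blocking factor bn.
M = K = N = 1024
bn = 32

# Declare the GEMM computation C = A * B.
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')
C = tvm.compute((M, N),
                lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
                name='C')

# Tile the two spatial axes, then spread the outer tiled axis across CPU threads.
s = tvm.create_schedule(C.op)
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
s[C].parallel(xo)

# The lowered IR should now mark the loop over xo as "parallel".
print(tvm.lower(s, [A, B, C], simple_mode=True))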