Commit 367946c

Don't tell users to use a bad number of threads (#243)
The readme tells people to use the command line option "-t 8", causing 8 threads to be started. On systems with fewer than 8 cores, this causes a significant slowdown. Remove the option from the example command lines and use /proc/cpuinfo on Linux to determine a sensible default.
1 parent 6b0df5c commit 367946c
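
The core of the change is counting logical CPUs by scanning /proc/cpuinfo for "processor" entries. Here is a minimal standalone sketch of that technique (illustrative only, not part of the commit; the file and variable names are made up):

```cpp
// count_threads.cpp -- mirrors the defaulting logic added in utils.cpp below
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <thread>

int main() {
    int n_threads = 0;
#ifdef __linux__
    // /proc/cpuinfo contains one "processor : N" line per logical CPU,
    // so counting whitespace-delimited "processor" tokens counts logical CPUs
    std::ifstream cpuinfo("/proc/cpuinfo");
    n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
                           std::istream_iterator<std::string>(),
                           std::string("processor"));
#endif
    if (n_threads == 0) {
        // fallback: hardware_concurrency() may itself return 0, so clamp to >= 1
        n_threads = std::max(1, (int) std::thread::hardware_concurrency());
    }
    std::cout << "sensible default: " << n_threads << " threads\n";
}
```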

File tree

5 files changed (+19 −11 lines)

.devops/tools.sh (+1 −1)

@@ -34,7 +34,7 @@ else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo "  --run (-r): Run a model previously converted into ggml"
-    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
     echo "  --convert (-c): Convert a llama model into ggml"
     echo "      ex: \"/models/7B/\" 1"
     echo "  --quantize (-q): Optimize with quantization process ggml"

README.md (+5 −5)

@@ -39,7 +39,7 @@ Supported platforms:
 Here is a typical run using LLaMA-7B:
 
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
@@ -150,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -164,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
 
 Here is an example few-shot interaction, invoked with the command
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -218,13 +218,13 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-on
 On complete, you are ready to play!
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with light image:
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 ## Limitations

ggml.c (−4)

@@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {

utils.cpp (+12)

@@ -16,6 +16,18 @@
 #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
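
Because the default is computed at the top of gpt_params_parse, before the argument loop runs, an explicit -t on the command line still overrides it. A hypothetical call site (the return-value handling is assumed, not shown in this diff):

```cpp
// hypothetical usage; gpt_params and gpt_params_parse come from the
// utils.h / utils.cpp patched in this commit
#include <cstdio>
#include "utils.h"

int main(int argc, char ** argv) {
    gpt_params params;  // n_threads is filled in by gpt_params_parse
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;  // assumed: false signals a parse error
    }
    fprintf(stderr, "using %d threads\n", params.n_threads);
    return 0;
}
```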

utils.h (+1 −1)

@@ -14,7 +14,7 @@
 
 struct gpt_params {
     int32_t seed          = -1;  // RNG seed
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_threads;
     int32_t n_predict     = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_ctx         = 512; //context size