@@ -4,6 +4,7 @@ SCRIPT_DIR=$(dirname "$0")
# Absolute path of the directory holding the sample media, resolved relative
# to the directory this script lives in (SCRIPT_DIR).
MEDIA_DIR=$(realpath "${SCRIPT_DIR}/../../third_party")

# Sample inputs referenced by the request examples in this script.
IMG_PATH="${MEDIA_DIR}/organ.jpg"
IMG_PATH2="${MEDIA_DIR}/Cajun_instruments.jpg"
AUDIO_PATH="${MEDIA_DIR}/sample.mp3"
VIDEO_PATH="${MEDIA_DIR}/Big_Buck_Bunny.mp4"
PDF_PATH="${MEDIA_DIR}/test.pdf"
@@ -38,43 +39,136 @@ curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:s
3839
echo "[START text_gen_multimodal_one_image_prompt]"
# [START text_gen_multimodal_one_image_prompt]
# Temporary files: one for the base64-encoded image, one for the JSON payload.
TEMP_B64=$(mktemp)
TEMP_JSON=$(mktemp)
# Register ONE exit handler for both files. A later `trap ... EXIT` replaces
# the earlier one rather than adding to it, so two separate traps would leave
# the first file behind.
trap 'rm -f "$TEMP_B64" "$TEMP_JSON"' EXIT

# B64FLAGS is left unquoted, as in the original, so it can expand to zero or
# more separate options (presumably set earlier in the script -- confirm).
base64 $B64FLAGS "$IMG_PATH" > "$TEMP_B64"

# Build the request body; the unquoted EOF delimiter lets $(cat ...) splice
# the encoded image into the "data" field.
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
        {"text": "Tell me about this instrument"},
        {
          "inline_data": {
            "mime_type":"image/jpeg",
            "data": "$(cat "$TEMP_B64")"
          }
        }
    ]
  }]
}
EOF

# -d "@file" makes curl read the request body from the file.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null
# [END text_gen_multimodal_one_image_prompt]
5872
echo "[START text_gen_multimodal_one_image_prompt_streaming]"
# [START text_gen_multimodal_one_image_prompt_streaming]
# Rewrite the payload in $TEMP_JSON, re-using the base64 image data already
# stored in $TEMP_B64 by the one-image prompt section.
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
        {"text": "Tell me about this instrument"},
        {
          "inline_data": {
            "mime_type":"image/jpeg",
            "data": "$(cat "$TEMP_B64")"
          }
        }
    ]
  }]
}
EOF

# Same request as the non-streaming example, sent to the
# streamGenerateContent endpoint with alt=sse; the body is read from the file.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:streamGenerateContent?alt=sse&key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null
# [END text_gen_multimodal_one_image_prompt_streaming]
96+
echo "[START text_gen_multimodal_two_image_prompt]"
# [START text_gen_multimodal_two_image_prompt]
# Base64 encode both images into temporary files.
TEMP_B64_1=$(mktemp)
TEMP_B64_2=$(mktemp)
# Setting an EXIT trap replaces any handler installed earlier, so list the
# temporary files from the one-image section as well; otherwise $TEMP_B64 and
# $TEMP_JSON would no longer be removed on exit.
trap 'rm -f "$TEMP_B64" "$TEMP_JSON" "$TEMP_B64_1" "$TEMP_B64_2"' EXIT

# B64FLAGS is left unquoted, as in the original, so it can expand to zero or
# more separate options.
base64 $B64FLAGS "$IMG_PATH" > "$TEMP_B64_1"
base64 $B64FLAGS "$IMG_PATH2" > "$TEMP_B64_2"

# Create the JSON payload using the base64 data from both images
# ($TEMP_JSON is the payload file created in the one-image section).
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
      {
        "inline_data": {
          "mime_type": "image/jpeg",
          "data": "$(cat "$TEMP_B64_1")"
        }
      },
      {
        "inline_data": {
          "mime_type": "image/jpeg",
          "data": "$(cat "$TEMP_B64_2")"
        }
      },
      {
        "text": "Generate a list of all the objects contained in both images."
      }
    ]
  }]
}
EOF

# Make the API request using the JSON file; the response body is saved to
# response.json in the current directory and curl's stderr is discarded.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null > response.json

# Display the response
cat response.json
# [END text_gen_multimodal_two_image_prompt]
140+
echo "[START text_gen_multimodal_one_image_bounding_box_prompt]"
# [START text_gen_multimodal_one_image_bounding_box_prompt]
# Re-use TEMP_B64_2 (from the previous two-image prompt) and TEMP_JSON;
# neither is re-created here, so this section depends on the earlier ones
# having run.

# Create the JSON payload for bounding box detection.
cat > "$TEMP_JSON" << EOF
{
  "contents": [{
    "parts":[
      {
        "inline_data": {
          "mime_type": "image/jpeg",
          "data": "$(cat "$TEMP_B64_2")"
        }
      },
      {
        "text": "Generate bounding boxes for each of the objects in this image in [y_min, x_min, y_max, x_max] format."
      }
    ]
  }]
}
EOF

# Make the API request using the JSON file. This example targets the
# gemini-1.5-pro model; the response body overwrites response.json.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?key=$GOOGLE_API_KEY" \
    -H 'Content-Type: application/json' \
    -X POST \
    -d "@$TEMP_JSON" 2> /dev/null > response.json

# Display the response
cat response.json
# [END text_gen_multimodal_one_image_bounding_box_prompt]
78172
79173echo " [START text_gen_multimodal_audio]"
80174# [START text_gen_multimodal_audio]
@@ -184,7 +278,7 @@ DISPLAY_NAME=VIDEO
184278# Initial resumable request defining metadata.
185279# The upload url is in the response headers dump them to a file.
186280curl " ${BASE_URL} /upload/v1beta/files?key=${GOOGLE_API_KEY} " \
187- -D upload-header.tmp \
281+ -D " ${tmp_header_file} " \
188282 -H " X-Goog-Upload-Protocol: resumable" \
189283 -H " X-Goog-Upload-Command: start" \
190284 -H " X-Goog-Upload-Header-Content-Length: ${NUM_BYTES} " \
@@ -226,7 +320,7 @@ curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:g
226320 -d ' {
227321 "contents": [{
228322 "parts":[
229- {"text": "Please describe this file ."},
323+ {"text": "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions ."},
230324 {"file_data":{"mime_type": "video/mp4", "file_uri": ' $file_uri ' }}]
231325 }]
232326 }' 2> /dev/null > response.json
0 commit comments